In [1]:
from psaw import PushshiftAPI
from pymongo import MongoClient, errors
import pandas as pd
import numpy as np

from bson.json_util import dumps

import boto3

In [2]:
# Pushshift client used to search Reddit activity.
api = PushshiftAPI()

# Connect to the local MongoDB server and select the 'cap2' database.
client = MongoClient(host='localhost', port=27017)
db = client['cap2']

In [3]:
# Load the user activity table and discard the 'Unnamed: 0' column that the
# CSV carries alongside the real data.
usr_actvty = (
    pd.read_csv('data/user_activity_dataframe.csv')
    .drop(columns='Unnamed: 0')
)

In [9]:
# Fetch the comments of each suspicious user and store them in MongoDB.
# Only the 'comment' activity type is processed here; each activity type is
# paired with the Pushshift search function used to retrieve it.
activity_types = ['comment']
search_funcs = [api.search_comments]
for typ, api_search in zip(activity_types, search_funcs):
    table = db[typ]
    # Start from an empty collection: any existing documents are discarded.
    table.drop()
    # Unique index on 'id' so the same item cannot be stored twice.
    table.create_index('id', unique=True)
    # Distinct usernames that have activity of this type.
    is_typ = usr_actvty['type'] == typ
    users = np.unique(usr_actvty.loc[is_typ, 'username'])
    for usr in users:
        print(f'searching for comments by {usr} ...')
        for a in api_search(author=usr):
            try:
                table.insert_one(a.d_)
            except errors.DuplicateKeyError:
                # This id is already in the collection; move on to the next one.
                pass

searching for comments by 1488Reasons ...
searching for comments by AdoraronDoomworker ...
searching for comments by AlsagelvBuriron ...
searching for comments by AndromajurusAragrel ...
searching for comments by AriundisVugar ...
searching for comments by AxeseekerMightwind ...
searching for comments by BeazerneMem ...
searching for comments by BerskyN ...
searching for comments by BlackToLive ...
searching for comments by BleepThePolice ...
searching for comments by CererinKarn ...
searching for comments by DalabandisAndroma ...
searching for comments by DalahelmFelharil ...
searching for comments by DarusCege ...
searching for comments by Dazzle6 ...
searching for comments by DeusXYX ...
searching for comments by DorothieBell ...
searching for comments by FaurnFlamebreaker ...
searching for comments by FreddieGi ...
searching for comments by GaranaTobar ...
searching for comments by GavinraraFonara ...
searching for comments by GoltijindGoltishura ...
searching for comments by Graha

In [4]:
# Print one line per collection in the database: its name and document count.
for name in db.list_collection_names():
    n_docs = db[name].count_documents({})
    print(f'{name:14}: {n_docs:6}')

submission    :  14523
first_and_last:    952
comment       :   6704


In [24]:
# Upload the exported comments file to S3.
# NOTE(review): no cell visible in this notebook writes the file at `filepath`;
# presumably it was exported separately -- confirm before a clean re-run.
filepath = '/home/ubuntu/src/troll_classifier/data/6704_comments_by_suspicious_users.jsonarr'
bucket_name = 'cfdv-classify-reddit-trolls'
# `filename` is the object key inside the bucket, not a local file name.
filename = 'susp_users_comments/6704_comments_by_suspicious_users.jsonarr'
s3_client = boto3.client('s3')
s3_client.upload_file(Filename=filepath, Bucket=bucket_name, Key=filename)

In [11]:
import praw

# Read the Reddit API credentials from a tab-separated file with one
# "KEY<TAB>VALUE" pair per line, so no secret is hard-coded in the notebook.
creds = {}
with open('/opt/cap1/.cap1', 'r') as fp:
    for line in fp:
        line = line.rstrip('\n')
        # Skip blank lines (e.g. a trailing empty line) instead of failing on
        # the tuple unpacking below.
        if not line.strip():
            continue
        # Split on the first tab only, so a value may itself contain tabs.
        # A non-blank line without any tab still raises ValueError.
        k, v = line.split('\t', 1)
        creds[k] = v

# Authenticated PRAW client; a missing credential key raises KeyError here.
reddit = praw.Reddit(
    client_id=creds['REDDIT_ID'],
    client_secret=creds['REDDIT_SECRET'],
    password=creds['REDDIT_PASSWORD'],
    username=creds['REDDIT_USERNAME'],
    user_agent='accessAPI:v0.0.1 (by /u/{})'.format(creds['REDDIT_USERNAME']),
)

In [34]:
# Look up a single submission by id and display the entries of its comment
# forest.
submission_id = '100ikq'
submission = reddit.submission(id=submission_id)
# Go through the forest's public sequence interface instead of reading its
# private `_comments` attribute.
list(submission.comments)

[Comment(id='c69bqat'),
 Comment(id='c69bv9c'),
 Comment(id='c69chu6'),
 Comment(id='c69cjhd'),
 Comment(id='c69dfwh'),
 Comment(id='c69f3xq'),
 Comment(id='c69f5b1'),
 Comment(id='c69g25b'),
 Comment(id='c69i7gm'),
 Comment(id='c69jww5'),
 Comment(id='c69lt3f'),
 Comment(id='c69m0qi'),
 Comment(id='c69pqgn'),
 Comment(id='c69cl6w')]

In [38]:
# Take the first entry of the submission's comment forest, using public
# indexing rather than the private `_comments` list.
comment = submission.comments[0]

In [50]:
# Shallow copy of the comment's instance attributes, shown for inspection.
d = vars(comment).copy()
d

{'_replies': <praw.models.comment_forest.CommentForest at 0x7f3539b2e6a0>,
 '_submission': Submission(id='100ikq'),
 '_reddit': <praw.reddit.Reddit at 0x7f35643d1490>,
 'total_awards_received': 0,
 'approved_at_utc': None,
 'ups': 5,
 'awarders': [],
 'mod_reason_by': None,
 'banned_by': None,
 'author_flair_type': 'text',
 'removal_reason': None,
 'link_id': 't3_100ikq',
 'author_flair_template_id': None,
 'likes': None,
 'user_reports': [],
 'saved': False,
 'id': 'c69bqat',
 'banned_at_utc': None,
 'mod_reason_title': None,
 'gilded': 0,
 'archived': True,
 'no_follow': False,
 'author': Redditor(name='rikker_'),
 'can_mod_post': False,
 'send_replies': True,
 'parent_id': 't3_100ikq',
 'score': 5,
 'author_fullname': 't2_6ey3m',
 'report_reasons': None,
 'approved_by': None,
 'all_awardings': [],
 'subreddit_id': 't5_2skv6',
 'body': 'Prepare yourself for several "FTL for Borderlands 2" offers! :P',
 'edited': False,
 'downs': 0,
 'author_flair_css_class': 'mod-rikker',
 'is_submit

In [47]:
# Copy the comment's instance attributes into a plain dict, leaving out the
# listed keys. '_replies', '_submission' and '_reddit' hold PRAW objects;
# 'mod' is excluded as well if present.
# A comprehension is used so nothing is assigned to `_`, which IPython uses
# for the most recent cell output.
keys_to_remove = ['_replies', '_submission', '_reddit', 'mod']
d = {
    name: value
    for name, value in vars(comment).items()
    if name not in keys_to_remove
}
d

{'total_awards_received': 0,
 'approved_at_utc': None,
 'ups': 5,
 'awarders': [],
 'mod_reason_by': None,
 'banned_by': None,
 'author_flair_type': 'text',
 'removal_reason': None,
 'link_id': 't3_100ikq',
 'author_flair_template_id': None,
 'likes': None,
 'user_reports': [],
 'saved': False,
 'id': 'c69bqat',
 'banned_at_utc': None,
 'mod_reason_title': None,
 'gilded': 0,
 'archived': True,
 'no_follow': False,
 'author': Redditor(name='rikker_'),
 'can_mod_post': False,
 'send_replies': True,
 'parent_id': 't3_100ikq',
 'score': 5,
 'author_fullname': 't2_6ey3m',
 'report_reasons': None,
 'approved_by': None,
 'all_awardings': [],
 'subreddit_id': 't5_2skv6',
 'body': 'Prepare yourself for several "FTL for Borderlands 2" offers! :P',
 'edited': False,
 'downs': 0,
 'author_flair_css_class': 'mod-rikker',
 'is_submitter': False,
 'collapsed': False,
 'author_flair_richtext': [],
 'author_patreon_flair': False,
 'body_html': '<div class="md"><p>Prepare yourself for several &quot;FTL