In [2]:
from psaw import PushshiftAPI
import praw
from prawcore import Forbidden
from praw.exceptions import ClientException
import datetime as dt
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')
import seaborn as sns

from scipy import stats

from pymongo import MongoClient, errors

from bson.json_util import loads, dumps
from bson.objectid import ObjectId

from datetime import datetime as dt

import boto3

import time

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, precision_score, recall_score, roc_curve, auc, roc_auc_score
from sklearn.metrics import plot_confusion_matrix

from imblearn.ensemble import BalancedRandomForestClassifier

from scipy import interp

import string
from pprint import pprint

In [3]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /home/ubuntu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/ubuntu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/ubuntu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
porter = PorterStemmer()
wordnet = WordNetLemmatizer()

In [5]:
creds = {}
with open ('/opt/cap1/.cap1', 'r') as fp:
    for line in fp:
        k, v = line.replace('\n','').split('\t')
        creds[k] = v

In [6]:
reddit = praw.Reddit(client_id=creds['REDDIT_ID'], 
    client_secret=creds['REDDIT_SECRET'],
    password=creds['REDDIT_PASSWORD'], 
    username=creds['REDDIT_USERNAME'],
    user_agent='accessAPI:v0.0.1 (by /u/{})'.format(creds['REDDIT_USERNAME']))

In [7]:
api = PushshiftAPI()

In [8]:
client = MongoClient('localhost', 27017)
db = client['cap2']

In [9]:
# get all comments into a list called 'docs', then make dataframe
query = {}
fields = { '_id': 0 }
# docs is about 1.3 GB in RAM
docs = list(db['comment'].find( query, fields ))

In [10]:
comms = pd.DataFrame(docs)

In [None]:
# TODO add submission ids to the set of ids from trolls, probably append 't1_' to comments
# for ease of use

## assign labels, 'troll?', and 'child_of_troll?', and 'parent_of_troll?'

In [11]:
# create array of classes for target
classes = ['other', 'parent', 'child', 'troll']

In [12]:
# assign the label, 'troll?'' to each comment
troll_comment_ids_set = set(comms[~comms['author'].isna()]['id'])
comms['troll?'] = [int(mybool) for mybool in [
                    commid in troll_comment_ids_set for commid in comms['id']
                    ]]
np.sum(comms['troll?'])

6704

In [13]:
# is this comment in reply to a troll?
comms['child_of_troll?'] = [int(mybool) for mybool in [
                    pid.split('_')[1] in troll_comment_ids_set for pid in comms['parent_id']
                    ]]
np.sum(comms['child_of_troll?'])

2985

### only 57, praw never gave us children of troll comments...the only ones we have are from trolls replying to trolls

#### FIXED, used praw to get the troll comments, then used .refresh() to load the replies. worked ok

In [14]:
# did a troll reply to this?
troll_parent_ids_set = set(
                            [p.split('_')[1] for p in comms[~comms['author'].isna()]['parent_id']]
                        )
comms['parent_of_troll?'] = [int(myid in troll_parent_ids_set) for myid in comms['id']]

np.sum(comms['parent_of_troll?'])

1824

In [15]:
np.unique([p.split('_')[0] for p in np.unique(comms[~comms['author'].isna()]['parent_id'])], return_counts=True)

(array(['t1', 't3'], dtype='<U2'), array([1839, 4348]))

#### hm, 1144 comments are parents of trolls, but there are 1839 't1'-style parent_ids among troll comments. Which of the unique parent ids aren't getting labeled as parent?

In [16]:
parent_ids = np.unique(comms[~comms['author'].isna()]['parent_id'])
len(parent_ids)

6187

In [17]:
t1_parent_ids=[]
for p in parent_ids:
    if p[:2]=='t1':
        t1_parent_ids.append(p.split('_')[1])
set_parent_ids = set(t1_parent_ids)
set_parent_of_troll = set(comms[comms['parent_of_troll?']==1]['id'])
# how does a differ from 
len(set_parent_ids), len(set_parent_of_troll)

(1839, 1824)

In [18]:
# here are the 695 comment ids that probably need to be set as parent_of_troll
missing_commentids = np.array(t1_parent_ids)[[myid not in set_parent_of_troll for myid in t1_parent_ids]]

In [19]:
# are these comments simply missing from the data? YES
# FIXED, only 15 remain after starting with 1824 ish
num_hits = []
for cid in missing_commentids:
    num_hits.append(len(comms[comms['id']==cid]))
np.unique(num_hits, return_counts=True)

(array([0]), array([15]))

In [20]:
len(missing_commentids)

15

In [21]:
def printall(pd_obj):
    '''
    print every row and column in a pandas object
    '''
    with pd.option_context('display.max_rows', None, 'display.max_columns', None):
        print(pd_obj)

In [22]:
# print the first non-troll comment, making sure to 
r = np.where(comms['author'].isna())
printall(comms.iloc[r[0], :].iloc[0,:])

author                                                                           NaN
author_created_utc                                                               NaN
author_flair_css_class                                                    mod-rikker
author_flair_text                  http://steamcommunity.com/profiles/76561198054...
author_fullname                                                             t2_6ey3m
body                               Prepare yourself for several "FTL for Borderla...
controversiality                                                                   0
created_utc                                                              1.34787e+09
distinguished                                                                   None
gilded                                                                             0
id                                                                           c69bqat
link_id                                                          

In [169]:
for coll in db.list_collection_names():
    print(f'{coll:14}: {db[coll].count_documents({}):6}')

submission    :  14523
first_and_last:    964
comment       : 230853


In [26]:
','.join(missing_commentids)

'c55cs4v,c5dwrdv,c69cm8a,c6eunvf,c78b3gw,c7b8vqn,c7bbduz,c7besrl,c7bguhc,c7c9krf,c7efzzm,c7his1t,c7hxdsi,c7i284w,c7if5z9,c7ikkce,c7iocqi,c7ipk1q,c7rzgab,c7wmj1b,c7wmk1b,c7wml2a,c8nt11a,c8ntmh0,c8o7nwz,cd714a1,cdw1dqk,ce1cp59,cestd6k,cfa9d26,cfaa1u5,cfffzjj,cffug3b,cfrt5a9,cfte0f6,cfthw86,cftmb63,cftn2sf,cftnpi7,cfto0bq,cftol0k,cftpl2v,cftptm9,cftzoeo,cg0z82u,cg1momq,cg5p2ru,cgfwynp,cgfxmpj,ch7qkl4,chx1tqc,cit9kt1,cla7hsy,cms6p22,cmx9dim,copkoou,cpfnvgt,cq9nusb,cqe9a6b,cqhtj4q,cqn2sk6,cqpo84t,crud3qp,crv6vb4,crv8y56,crv9qf9,crva5iw,crxk5ok,crxk9wh,crxmakj,cry9dbi,cry9lfz,cry9saj,cryaar6,crybk2d,cryc2jn,crycsmf,crye6rs,cryenpi,cryf4xc,cryfckw,cryjjtr,cs0t3uw,cs39w0u,cs43hro,cs622nd,csbt0mq,csh8dik,cshboie,csoor1q,cszkuf2,cszl0hb,cszl5re,ctd7bv4,cudpmov,cuec9uh,cuewltu,cufl8js,cujkrrl,cujwdox,cuui4li,cv9xyrn,cvfzk0q,cvh4ckj,cvh58gr,cvy704o,cw5aipy,cwas0ak,cwb1g5f,cwzywoo,cx0gufr,cx4g56i,cx7u0p5,cx7uqso,cx7vugn,cx8ombk,cx90vp5,cxceyio,cxcf6ts,cxcflk3,cxcfxef,cxcgeqx,cxcglff,cxcgxu9,cxcj30j

In [170]:
# Reddit's api changed, dict(vars(comment)) only worked after running a ._fetch()
# get missing comments from pushshift
# gen = api.search_comments(id=','.join(missing_commentids))
# gen = api.search_comments(id='c55cs4v')
ids =','.join(missing_commentids[:2])
#print(ids)
#gen = api.search_comments(id=ids)
comment = reddit.comment(id="cqe9a6b")
pprint(vars(comment))
#for c in gen:
#    print(c.d_['body'])
    #db['comment'].insert_one(c.d_)

{'_fetched': False,
 '_reddit': <praw.reddit.Reddit object at 0x7f05277a2b50>,
 '_replies': [],
 '_submission': None,
 'id': 'cqe9a6b'}


In [23]:
# https://snew.notabug.io/r/unixporn/comments/vjbgg/archlinux_dwm/c551ecd/
# comment = reddit.comment(id="c55cs4v")
comment = reddit.comment(id="c551ecd")
# comment = reddit.comment(id="cqe9a6b")
dict(vars(comment))

{'_replies': [],
 '_submission': None,
 '_reddit': <praw.reddit.Reddit at 0x7fdd967f7970>,
 '_fetched': False,
 'id': 'c551ecd'}

In [196]:
# okay, so c55cs4v is here -- 
# https://snew.notabug.io/r/unixporn/comments/vjbgg/archlinux_dwm/c55cs4v/
# and you can see shomyo's reply with permalink -- 
# https://snew.notabug.io/r/unixporn/comments/vjbgg/archlinux_dwm/c55hjc4/

In [24]:
comment._fetch()
dict(vars(comment))

{'_replies': [],
 '_submission': None,
 '_reddit': <praw.reddit.Reddit at 0x7fdd967f7970>,
 '_fetched': True,
 'id': 'c551ecd',
 'total_awards_received': 0,
 'approved_at_utc': None,
 'edited': False,
 'mod_reason_by': None,
 'banned_by': None,
 'author_flair_type': 'text',
 'removal_reason': None,
 'link_id': 't3_vjbgg',
 'author_flair_template_id': None,
 'likes': None,
 'user_reports': [],
 'saved': False,
 'banned_at_utc': None,
 'mod_reason_title': None,
 'gilded': 0,
 'archived': True,
 'no_follow': True,
 'author': Redditor(name='ProbableRepost'),
 'can_mod_post': False,
 'created_utc': 1340577203.0,
 'send_replies': True,
 'parent_id': 't3_vjbgg',
 'score': 7,
 'author_fullname': 't2_5w9yd',
 'approved_by': None,
 'mod_note': None,
 'all_awardings': [],
 'subreddit_id': 't5_2sx2i',
 'body': "Finally some decent font rendering on a non-Ubuntu setup. Care to share?\n\nHowever, maybe it's the resolution, but text seems all-round too big for my taste. Also, I'm not a huge fan of th

In [156]:
# top-up the parent comments, worked great.
log = [] 
i = 0 # had clientexception at 59, id='cqe9a6b'
for comment_id in missing_commentids[i:]: 
    i += 1 
    print(f'searching for comment id: {comment_id}') 
    comment = reddit.comment(id=comment_id) 
    try: 
        comment._fetch() 
    except (Forbidden, ClientException): 
        log.append(i) 
        continue 
    d = dict(vars(comment)) 
    for key in ['_replies', '_submission', 
            '_reddit', 'mod', 'author', 
            'subreddit']: 
        _ = d.pop(key, None) 
    try: 
        db['comment'].insert_one(d) 
    except (errors.DuplicateKeyError, errors.InvalidDocument): 
        log.append[i] 
        continue 

[]

In [205]:
for coll in db.list_collection_names():
    print(f'{coll:14}: {db[coll].count_documents({}):6}')

submission    :  14523
first_and_last:    964
comment       : 231533


In [206]:
# these are the before and after numbers of records in mongo
# 15 records failed
num_records_added = 231533 - 230853
num_records_added

680

In [207]:
# from the ipython session in which I ran the scrape
log = [59, 63, 84, 98, 109, 187, 500, 580, 632, 667, 670, 676, 678, 684, 689]
missing_commentids[log]

array(['cqhtj4q', 'crv6vb4', 'cs43hro', 'cujkrrl', 'cwzywoo', 'cz0c7qt',
       'd4ssri9', 'd6rt3gs', 'dd5b62b', 'dk5x3ny', 'dmk6hhx', 'dndiu8i',
       'dp9fs27', 'drqqc9v', 'dsc2qbw'], dtype='<U7')

### cqhtj4q
```
[–]zenzog3 points 5 years ago 
https://mobile.twitter.com/shoxCSGO/status/588523685419601922
permalink save report give gold reply
```
https://snew.notabug.io/r/csgobetting/comments/3324gu/envyus_vs_dignitas_bo3_190415_2100_cest/cqhtj4q/

### d4ssri9
```
[–][censored]3 points 4 years ago 
[censored]
permalink save report give gold reply[removed by moderators]
```
https://snew.notabug.io/r/rage/comments/4qgm68/woman_26_wielded_hatchet_after_her_demands_for/d4ssri9/

### dp9fs27
```
–][deleted]2 points 3 years ago 
[deleted]
permalink save report give gold reply
```
https://snew.notabug.io/r/Sissies/comments/7ad3vx/any_requests/dp9fs27/

In [26]:
# get the replies to the trolls
# get missing comments from pushshift
troll_commentids = comms[~comms['author'].isna()]['id']
np.random.choice(troll_commentids)

'd2j7hcz'

In [44]:
comment = reddit.comment(np.random.choice(troll_commentids))
comment.refresh()

Comment(id='d659sbh')

In [46]:
for reply in comment._replies:
    print(dict(vars(reply)))

{'_replies': <praw.models.comment_forest.CommentForest object at 0x7fdd76ff6940>, '_submission': Submission(id='4wa6mn'), '_reddit': <praw.reddit.Reddit object at 0x7fdd967f7970>, 'total_awards_received': 0, 'approved_at_utc': None, 'ups': 1, 'awarders': [], 'mod_reason_by': None, 'banned_by': None, 'author_flair_type': 'text', 'removal_reason': None, 'link_id': 't3_4wa6mn', 'author_flair_template_id': None, 'likes': None, 'user_reports': [], 'saved': False, 'id': 'd65ber0', 'banned_at_utc': None, 'mod_reason_title': None, 'gilded': 0, 'archived': True, 'no_follow': True, 'author': Redditor(name='kempff'), 'can_mod_post': False, 'send_replies': True, 'parent_id': 't1_d659sbh', 'score': 1, 'author_fullname': 't2_6ayqq', 'report_reasons': None, 'approved_by': None, 'all_awardings': [], 'subreddit_id': 't5_2qil2', 'collapsed': True, 'body': "There is a possibility it just did. I noticed the wife was not interviewed. I'm keeping an eye on this story to see if it turns out to be a hoax.", '

In [264]:
for coll in db.list_collection_names():
    print(f'{coll:14}: {db[coll].count_documents({}):6}')

submission    :  14523
first_and_last:    964
comment       : 231533


In [None]:
# top-up the child comments
log = [] 
i = 0 
for comment_id in troll_commentids:  
    i += 1  
    print(f'searching for comment id: {comment_id}')  
    comment = reddit.comment(id=comment_id)  
    try:  
        comment.refresh() 
    except (Forbidden, ClientException):  
        log.append(i)  
        continue 
    for reply in comment._replies: 
        d = dict(vars(reply)) 
        did = d['id'] 
        print(f'found reply id: {did}')  
        for key in ['_replies', '_submission',  
                '_reddit', 'mod', 'author',  
                'subreddit']:  
            _ = d.pop(key, None)  
        try:  
            db['comment'].insert_one(d) 
        except (errors.DuplicateKeyError, errors.InvalidDocument): 
            log.append(i)  
            continue 

In [48]:
for coll in db.list_collection_names():
    print(f'{coll:14}: {db[coll].count_documents({}):6}')

submission    :  14523
first_and_last:    964
comment       : 234008


## NLP

In [None]:
orig_stopwords_ = set("a,able,about,across,after,all,almost,also,am,among,an,and,any,\
are,as,at,be,because,been,but,by,can,could,dear,did,do,does,either,\
else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,\
how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,\
me,might,most,must,my,neither,no,of,off,often,on,only,or,other,our,\
own,rather,said,say,says,she,should,since,so,some,than,that,the,their,\
them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,\
what,when,where,which,while,who,whom,why,will,with,would,yet,you,your,u,s".split(','))

# new stopwords after first run through
# sws_to_add_brf1 = 'thanks,op,tie,deleted,game,crypto,invest,\
# edit,platform,play,giveaway,ok,yeah,blockchain,enter,remove,\
# m,token,awesome,ethereum,exchange,steam,dude'.split(',')

# stopwords_ = set(list(orig_stopwords_) + sws_to_add_brf1)
stopwords_ = orig_stopwords_

punctuation_ = set(string.punctuation + '’' + '“')

def rm_punctuation(a_string):
    table = str.maketrans('', '', ''.join(punctuation_))
    return a_string.translate(table)
        
def tokens_lower(tokens):
    return [word.lower() for word in tokens]

def filter_tokens(tokens):
#     for w in sent: print(w)
#     if w == "’": print("this is ': " + w)
    return [w for w in tokens if not w in stopwords_]

def stem_tokens(ntlk_stem_obj, tokens):
    # visualize what stemming and lemmitization does!
    # str(porter.__class__) = "<class 'nltk.stem.porter.PorterStemmer'>"
#     name = str(ntlk_stem_lemm_obj.__class__).split("'")[1].split('.')[-1]
#     count_ident = 0
#     count_alter = 0
    return [ntlk_stem_obj.stem(tok) for tok in tokens]
#     elif ''
# for tok in tokens:
#         return ntlk_stem_lemm_obj.stem(tok)
#         if tok == stem_lemm_f_l_tok:
#             count_ident += 1
#         else:
#             print(tok, stem_lemm_f_l_tok)
#             count_alter += 1
#     print('{}:\nNumber of unchanged words: {}\nchanged words: {}\n'.format(name, count_ident, count_alter))

def lemm_tokens(ntlk_lemm_obj, tokens):
    return [ntlk_lemm_obj.lemmatize(tok) for tok in tokens]

In [None]:
def nlp_pre_proc_doc(input_string, stemming=False):
    '''
    given document:
        tokenizes the document
        sets tokens to lower case
        filters punctuation and stop words from tokens
        returns porter, snowball, and wordnet stem/lemm tokens
    '''
    no_punct_string = rm_punctuation(input_string)
    tokens = word_tokenize(no_punct_string)
    lo_tokens = tokens_lower(tokens)
    f_lo_tokens = filter_tokens(lo_tokens)
    if stemming:
        s_f_lo_tokens = stem_tokens(porter, f_lo_tokens)
        le_s_f_lo_tokens = lemm_tokens(wordnet, s_f_lo_tokens)
        return ' '.join(le_s_f_lo_tokens)
    else:
        le_f_lo_tokens = lemm_tokens(wordnet, f_lo_tokens)
        return ' '.join(le_f_lo_tokens)

In [None]:
# this takes a few minutes, and uses about 100 MB of RAM
corpus = [nlp_pre_proc_doc(d) for d in comms['body']]

## Train classifier(s)

In [None]:
# keep a dictionary of trained classifiers for comparison
clfs = {}

### train test split

In [None]:
X_train_corp, X_test_corp, y_train, y_test = train_test_split(
        corpus, y, test_size=0.2, random_state=30, shuffle=True)

In [None]:
n_features = 5000

tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2,
                                   max_features=n_features,
                                   stop_words='english')

In [None]:
t0 = time.time()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train_corp)
print("done in %0.3fs." % (time.time() - t0))

In [None]:
X_test_tfidf = tfidf_vectorizer.transform(X_test_corp)

In [None]:
X_train = X_train_tfidf.toarray()
X_test = X_test_tfidf.toarray()

In [None]:
# fit the model
# at first I did 100 estimators, but 100*25 is only 2500 
#  whereas we have 5000 features in tfidf. Increase to 400
model_param = {'n_estimators': 400,
                   'max_depth': 5,
                   'max_features': 25,
                   'oob_score': True,
                   'n_jobs': -1,
                   'random_state': 30}

In [None]:
brf = RandomForestClassifier(**model_param, class_weight='balanced_subsample')
brf.fit(X_train, y_train)

In [None]:
imbrf = BalancedRandomForestClassifier(**model_param, class_weight='balanced_subsample')
imbrf.fit(X_train, y_train)

## model inspection

In [None]:
def standard_confusion_matrix(y_true, y_predict):
    """
    y_true = [1, 1, 1, 1, 1, 0, 0]

    y_predict = [1, 1, 1, 1, 0, 0, 0]

    In [1]: standard_confusion_matrix(y_true, y_predict)
    >> array([[4., 1.],
    >>       [0., 2.]])
    """
    cm = np.zeros((2,2))
    X = np.array([y_true, y_predict])
    values, counts = np.unique(X, axis=1, return_counts=True)
    for i, v in enumerate(values.T):
        cm[tuple([1, 1] - v)] = counts[i]
    return cm.T.astype(int)

# from the lecture
# Just handy function to make our confusion matrix pretty 
def plot_confusion_matrix(cm, # confusion matrix
                          classes_x, # test to describe what the output of the classes may be (commonly 1 or 0)
                          classes_y,
                          normalize=False, 
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes_x))
    plt.xticks(tick_marks, classes_x, rotation=45)
    plt.yticks(tick_marks, classes_y)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i,  format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('Predicted label')
    plt.xlabel('True label')

In [None]:
def plot_roc_nofit(ax, X_test, y_test, clf, clf_name, **kwargs):
    y_prob = np.zeros((len(y_test),2))
    mean_tpr = 0.0
    mean_fpr = np.linspace(0, 1, 100)
    all_tpr = []
    # Predict probabilities, not classes
    y_prob = clf.predict_proba(X_test)
    fpr, tpr, thresholds = roc_curve(y_test, y_prob[:, 1])
    mean_tpr += interp(mean_fpr, fpr, tpr)
    mean_tpr[0] = 0.0
    roc_auc = auc(fpr, tpr)
    if len(ax.lines) == 0:
        plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Random')
    ax.plot(fpr, tpr, lw=1, label='%s (area = %0.2f)' % (clf_name, roc_auc))
    mean_tpr /= 1
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
#     plt.plot(mean_fpr, mean_tpr, 'k--',label='Mean ROC (area = %0.2f)' % mean_auc, lw=2)
    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic')
    plt.legend(loc="lower right", )

In [None]:
# include the model for comparisons
clfs['Balanced_RF'] = brf

In [None]:
np.set_printoptions(precision=2)

# Plot non-normalized confusion matrix
titles_options = [("Confusion matrix, without normalization", None),
                  ("Normalized confusion matrix", 'true')]
for title, normalize in titles_options:
    disp = plot_confusion_matrix(brf, X_test, y_test,
                                 cmap=plt.cm.Blues,
                                 normalize=normalize)
    disp.ax_.set_title(title)

    print(title)
    print(disp.confusion_matrix)

plt.show()

In [None]:
clfs.items()

In [None]:
# rnd_smp = np.random.random_sample(len(X)) < 0.
#fig, ax = plt.subplots(1, figsize=(6, 5))
#classifier_labels = ['Random_Forest (RF)', 'Balanced_RF']
#classifiers = {'Random_Forest (RF)': rf, 'Balanced_RF': brf}
#for label, clf in clfs.items():
    #plot_roc_nofit(ax, X_test, y_test, clf, label)
    #multi_class='ovo'
>>> import numpy as np
>>> from sklearn.metrics import roc_auc_score
>>> y_true = np.array([0, 0, 1, 1])
>>> y_scores = np.array([0.1, 0.4, 0.35, 0.8])
>>> roc_auc_score(y_true, y_scores)

In [None]:
brf.predict_proba(X_test)

In [None]:
troll_comments = comms[comms['troll?']==1]

In [None]:
len(troll_comments)

In [None]:
# let's get the ids for all the troll_comments, then retrieve all the replies to it from Pushshift
troll_comments['id']

In [None]:
api = PushshiftAPI()

In [None]:
gen = api.search_comments(parent_id='t1_dr3b6ce,t1_cpe9ci5')

In [None]:
next(gen).d_

In [None]:
gen = api.search_comments(id='cax4ng0,cax1t9x')
next(gen).d_, next(gen).d_

#### so, let's recap the tomfoolery between praw and psaw
0) psaw returns comments from trolls, praw does not ... true? FALSE, psaw doesn't give results when parent_id is from a troll
1) psaw returns nothing when searching on threads with comments from trolls ... true? mostly. I believe it's not 100%, but e.g. https://api.pushshift.io/reddit/submission/comment_ids/4rdu5x returns []
2) re: 1, reddit does, but does not include any responses to the trolls ... true?

#### TODO
* classify on pca / increase the number of maximum features per tree
* data completeness
  - get nest_level for everything
  - do we have all comments multiple levels below troll comments?
  - authors, subreddits, link- and comment-karma
  - permalink
* one-hot encode the subreddit to add to the tfidf
* add user activity profile to the tfidf
* start with id, (comment)
  - is the comment trollish? proba > threshold
  - if yes, then 
    - classify all other comments from the user (max=1000?)
    - for each trollish comment, recurse on other trollish comments in its thread (max=1000?)
  - given all the comments (including non-trollish from threads), view the author creation date and test for spike


In [None]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    troll_comments = comms[comms['troll?']==1]
    print(troll_comments[troll_comments['subreddit']=='politics'])
# comms.tail().iloc[0, :]

In [None]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    not_troll_comments = comms[comms['troll?']!=1]
    print(not_troll_comments[not_troll_comments['subreddit_id']=='t5_2cneq'].head())

In [None]:
y = comms['class_label'].values
np.unique(y, return_counts=True)

In [None]:
dict(zip(comms['subreddit_id'], comms['subreddit']))

In [None]:
# np.unique(comms['subreddit_id'] + '_' + comms['subreddit'], return_counts=True)
comms['subreddit_id']

#### join the [71.6 million usernames](https://old.reddit.com/r/pushshift/search?q=karma&restrict_sr=on&sort=relevance&t=all) to the user activity dataframe so we can efficiently load the link and comment karma of all these users into mongo
```bash
comm <(\
    sed '1d' user_activity_dataframe.csv | cut -d, -f2 | sort -u\
    ) \
    <(\
    cut -d, -f1 user_activity_dataframe_reddit_accounts.csv | sort -u\
    ) | 
awk -F"\t" '{print NF;}' | sort | uniq -c
```
``` 
    56 1
   277 3
```

In [None]:
# get the user link- and comment-karma from reddit praw
for item in reddit.redditor("Kevin_Milner").downvoted():
    print(item.id)
    break

In [None]:
# reddit.redditor("spez")
reddit.user.me()

In [None]:
reddit.redditor("spez")

In [None]:
str(reddit.redditor(fullname="t2_z919g"))

In [None]:
reddit.subreddit(display_name="t5_2r99w").subreddit

In [None]:
gen = reddit.subreddits.search("t5_2r99w,t5_2r99w")

# assume you have a Reddit instance bound to variable `reddit`
# str(reddit.submission(id="39zje0,").subreddit)
# print(submission.title) # to make it non-lazy
# pprint.pprint(vars(submission))

In [None]:
next(gen)

In [None]:
next(gen)

In [None]:
gen = reddit.subreddits.search("t5_2r99w,t5_2r99w")
next(gen), next(gen)

In [None]:
comment = reddit.comment(id="dxolpyc")
# assume you have a Reddit instance bound to variable `reddit`
# str(comment)
# print(submission.title) # to make it non-lazy


In [None]:
comment.depth

In [None]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
#     print(comms[[not value for value in comms['nest_level'].isna()]].head())
    for i, row in enumerate(comms[['nest_level', 'depth']].values):
        print(i, all([np.isnan(val) for val in row]))

In [None]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(comms[np.logical_and(np.isnan(comms['nest_level']), np.isnan(comms['depth']))])

#### For the purposes of combining the depth and the nest_level, 750 comments have neither a depth nor a nest_level. They are all from trolls. With a few dozen exceptions, all have parent_ids starting with t3, meaning they have depth=1

In [None]:
# oops, parent_of_troll isn't working -- FIXED
np.logical_and(comms['troll?']==1, comms['parent_of_troll?']==1).sum()

## TODONE
* what is the human readable subreddit given the fullname, eg. t5_2r99w (DONE)
```python
gen = reddit.subreddits.search("t5_2r99w,t5_2r99w")
next(gen), next(gen)
```
```
(Subreddit(display_name='whisky'),
 Subreddit(display_name='BaseballbytheNumbers'))
```
* get user link and comment karma (partially done - https://old.reddit.com/r/pushshift/comments/9i8s23/dataset_metadata_for_69_million_reddit_users_in/)
 - still missing karma for 623 userids
```bash
$ wc -l data/user_activity_dataframe*
  3826 data/user_activity_dataframe.csv
  3203 data/user_activity_dataframe_RA_2018-09.csv
  3199 data/user_activity_dataframe_reddit_accounts.csv
 10228 total
```
* how to get the permalink for each comment (DONE)
```python
comment = reddit.comment(id="dxolpyc")
comment.permalink
```
```
'/r/redditdev/comments/8dmv8z/is_there_no_distinguish_method_for_comments_in/dxolpyc/'
```
* what is the author given its id? e.g. t2_105z8m (DONE)
```python
str(reddit.redditor(fullname="t2_z919g"))
```
```
'BlackToLive'
```
* is their greater controversiality in troll comments than others? (DONE)
  - yes, slightly -- 4% to 3%
* retrieve missing comments
  - parents of trolls (missing 695, ~50%) (DONE)
  - replies to trolls (missing ALL of them, unless the reply is from a troll) (DONE)