In [1]:
import os
import json
import numpy as np
import pandas as pd
import networkx as nx

from pprint import pprint
from datetime import datetime
from collections import Counter

### Data Retrieval Functions

In [2]:
# read in comment dictionary
def get_comment_dictionary(body=False):
    """Load the comment dictionary from its JSON dump.

    body -- truthy selects 'comments.json'; otherwise
            'comments-no-body.json' is read (the file name suggests this
            variant omits the comment text -- confirm against the data).

    Returns whatever json.load parses from the selected file.
    """
    json_path = ('../../data/final/dictionaries/comments.json' if body
                 else '../../data/final/dictionaries/comments-no-body.json')

    with open(json_path, 'r') as handle:
        return json.load(handle)

In [3]:
# read in author dictionary
def get_author_dictionary():
    """Load the author dictionary from its JSON dump.

    Returns the object json.load parses from
    '../../data/final/dictionaries/authors.json'.
    """
    authors_path = '../../data/final/dictionaries/authors.json'

    with open(authors_path, 'r') as handle:
        parsed = json.load(handle)

    return parsed

In [4]:
# read in submissions dictionary
def get_submission_dictionary(body=False):
    """Load the submission dictionary from its JSON dump.

    body -- truthy selects 'submissions.json'; otherwise
            'submissions-no-body.json' is read.

    Returns the object json.load parses from the selected file.
    """
    # map the flag's truthiness onto the file to read
    path_by_flag = {
        True: '../../data/final/dictionaries/submissions.json',
        False: '../../data/final/dictionaries/submissions-no-body.json',
    }

    with open(path_by_flag[bool(body)], 'r') as handle:
        loaded = json.load(handle)
    return loaded

### Get dictionaries and preview contents

In [5]:
# create comment dict instance
comments = get_comment_dictionary()
len(comments)

14105159

In [6]:
# create author dict instance
authors = get_author_dictionary()
len(authors)

505476

In [7]:
# create instance of submission dict
submissions = get_submission_dictionary()
len(submissions)

1052598

In [8]:
# preview author records 1..4 in dict order; only the first five keys are
# pulled, so nothing the size of the whole dictionary is built.
# (dict.items() cannot be sliced on Python 3 and copies every item on Python 2)
[(name, authors[name]) for name, _ in zip(authors, range(5))][1:5]

[(u'seven_71',
  {u'comment_id': [u'dplzxhh'],
   u'submission_id': [u'7553hy',
    u'75tz4x',
    u'78a2h5',
    u'7bze5x',
    u'7d1m8l',
    u'71mh1d']}),
 (u'umbertostrange',
  {u'comment_id': [u'dq2m1ur',
    u'dq2ma06',
    u'dq2me8p',
    u'dq2ny6l',
    u'dq37pt6',
    u'dpmgpuh',
    u'dpmh9as',
    u'dppjh42',
    u'dppjj83',
    u'dppn98b',
    u'dpppl0x',
    u'dpppufm',
    u'dppq4x8',
    u'dppqewp',
    u'dppqq74',
    u'dppquo9',
    u'dppr8qt',
    u'dpprobr',
    u'dpq5g96',
    u'dpr0n2a',
    u'dpr0r51',
    u'dpsesjx',
    u'dpseu1t',
    u'dpsh22b',
    u'dpt1d29',
    u'dpt1gcl',
    u'dpt1t9z',
    u'dpt2maa',
    u'dpubi3g',
    u'dpubz9x',
    u'dpvhqw1',
    u'dpvi3cq',
    u'dpvi6s4',
    u'dpvikk3',
    u'dpviq85',
    u'dpw6k8e',
    u'dpw6lrq',
    u'dpwsqq1',
    u'dpxpo2n',
    u'dpxppru',
    u'dpxqect',
    u'dpymhm6',
    u'dpzahle',
    u'dq0kmis',
    u'dq18s6f',
    u'dq192u2',
    u'dq198kl',
    u'dq48ywu',
    u'dq4c8z3',
    u'dqdghsn',
    u'

In [9]:
# preview the first five comment records in dict order; only five keys are
# pulled instead of materialising every (key, value) pair.
# (dict.items() cannot be sliced on Python 3 and copies every item on Python 2)
[(comment_id, comments[comment_id]) for comment_id, _ in zip(comments, range(5))]

[(u'dt6g6rt',
  {u'author_id': u'ensoul',
   u'link_id': u't3_7sl6wo',
   u'parent_id': u't1_dt6dq5m',
   u'score': 1,
   u'subreddit': u'ethtrader',
   u'time': u'2018-01-25 02:48'}),
 (u'cfm9pti',
  {u'author_id': u'coins1204',
   u'link_id': u't3_1ynczs',
   u'parent_id': u't1_cfm1ylx',
   u'score': u'7',
   u'subreddit': u'cryptocurrency'}),
 (u'c88rr1t',
  {u'author_id': u'stormsbrewing',
   u'link_id': u't3_17ti07',
   u'parent_id': u't1_c88qms2',
   u'score': u'1',
   u'subreddit': u'bitcoin'}),
 (u'cqzmgta',
  {u'author_id': u'Sukrim',
   u'link_id': u't3_34zj70',
   u'parent_id': u't1_cqzldn7',
   u'score': u'2',
   u'subreddit': u'bitcoin'}),
 (u'cn9anjy',
  {u'author_id': u'anti09',
   u'link_id': u't3_2qpozx',
   u'parent_id': u't1_cn9514j',
   u'score': u'0',
   u'subreddit': u'bitcoin'})]

In [10]:
# preview the first five submission records in dict order; only five keys are
# pulled instead of materialising every (key, value) pair.
# (dict.items() cannot be sliced on Python 3 and copies every item on Python 2)
[(submission_id, submissions[submission_id]) for submission_id, _ in zip(submissions, range(5))]

[(u'4jkdhy',
  {u'author': [u'munteanualex_ro'],
   u'body': [u'Original thread: https://redd.it/4iwfsx\nAfter the execution, the malware send some data to bnaf12.no-ip.biz.  This is the attacker website http://paulie[.]rocks/\n\nhttp://imgur.com/pPinOgK\n\nhttps://bitcointalk.org/index.php?action=profile;u=838056\nhttps://www.reddit.com/user/PaulieGolding\nhttps://bazaarbay.org/@pauliegolding\nhttps://www.bustabit.com/user/PaulieGolding'],
   u'comment_id': [u'd37fjem',
    u'd37gxkg',
    u'd37k317',
    u'd37kbpv',
    u'd37lxoy',
    u'd37lz3g',
    u'd38qxcv',
    u'd39fzj8'],
   u'subreddit': [u'btc'],
   u'time': [u'2016-05-16 09:28:23'],
   u'title': [u'I found some information about the guy who stole $10,000 in cryptos']}),
 (u'4l0fxz',
  {u'author': [u'heliumcraft'],
   u'body': [u''],
   u'comment_id': [u'd3jx5z0'],
   u'subreddit': [u'ethereum'],
   u'time': [u'2016-05-25 16:18:15'],
   u'title': [u'EtherSim 0.4.0 - Ethereum simulator for testing and development purposes']}

In [11]:
# create DIRECTED graph of comment_poster from edge_list
def get_edge_list(score=False, multi=False):
    """Read the commenter/poster edge list into a directed networkx graph.

    score -- truthy reads 'commenter-poster.txt'; otherwise
             'commenter-poster_no-score.txt'.
    multi -- truthy loads into an nx.MultiDiGraph; otherwise an nx.DiGraph.

    Returns the graph built by nx.read_edgelist.
    """
    edge_file = 'commenter-poster.txt' if score else 'commenter-poster_no-score.txt'
    edge_path = '../../data/final/graph_tools/' + edge_file

    # the two modes differ only in the container handed to the reader
    container = nx.MultiDiGraph() if multi else nx.DiGraph()
    return nx.read_edgelist(edge_path, create_using=container)

### Reddit Network as Directed Graph

In [12]:
# read in graph of edge-list
G = get_edge_list(score=False, multi=False)

In [13]:
# drop specific nodes
drop_nodes = ['AutoModerator','[deleted]']
G.remove_nodes_from(drop_nodes)

In [14]:
# show number of edges (replies tied to submissions)
G.number_of_edges()

5011529

In [15]:
# show number of nodes (ids of commenters or submitters)
G.number_of_nodes()

463536

In [16]:
# create dictionary of nodes as keys and PageRank score as value
pr_dict = nx.algorithms.pagerank_scipy(G)

In [17]:
# ensure matches previous number of nodes
len(pr_dict)

463536

In [18]:
# create list of (node, PageRank) tuples, sorted by descending PR score.
# Sorting the items by their value gives the same order (ties keep dict order)
# without a second dict lookup per node and without the Python-2-only iterkeys().
pr_sorted = sorted(pr_dict.items(), key=lambda pair: pair[1], reverse=True)

In [19]:
# ensure matches previous number of nodes
len(pr_sorted)

463536

In [20]:
# show top 5 values
pr_sorted[:5]

[(u'BashCo', 0.0017785651779523015),
 (u'Fast0rer', 0.0017214375424149978),
 (u'AnalyzerX7', 0.0017114932301918408),
 (u'Gabriel-Lewis', 0.001635383832111839),
 (u'LeeWallis', 0.0016163610858741888)]

In [46]:
# list every node name in descending PageRank order (no truncation happens
# here; write_top_pr_subs / write_top_pr_coms slice off the first n themselves)
top_nodes_pr = [entry[0] for entry in pr_sorted]
top_nodes_pr[:5]

[u'BashCo', u'Fast0rer', u'AnalyzerX7', u'Gabriel-Lewis', u'LeeWallis']

### Reddit Network as Directed Multi-Graph

In [22]:
# read in graph of edge-list
G_multi = get_edge_list(score=False, multi=True)

In [23]:
# drop specific nodes
drop_nodes = ['AutoModerator','[deleted]']
G_multi.remove_nodes_from(drop_nodes)

In [24]:
# show number of edges (replies tied to submissions)
G_multi.number_of_edges()

9923573

In [25]:
# show number of nodes (ids of commenters or submitters)
G_multi.number_of_nodes()

463536

In [26]:
# create dictionary of nodes as keys and PageRank score as value
pr_dict_multi = nx.algorithms.pagerank_scipy(G_multi)

In [27]:
# ensure the multigraph PageRank dict matches the previous number of nodes
# (this cell used to re-check pr_dict, the simple-graph result)
len(pr_dict_multi)

463536

In [28]:
# create list of (node, PageRank) tuples for the multigraph, sorted by
# descending PR score. Sorting the items by their value gives the same order
# (ties keep dict order) without the Python-2-only iterkeys().
pr_sorted_multi = sorted(pr_dict_multi.items(), key=lambda pair: pair[1], reverse=True)

In [29]:
# ensure matches previous number of nodes
len(pr_sorted_multi)

463536

In [30]:
# show top 5 values
pr_sorted_multi[:5]

[(u'AnalyzerX7', 0.004304594902323775),
 (u'Fast0rer', 0.0027710972514412347),
 (u'Egon_1', 0.0025582695732947894),
 (u'LeeWallis', 0.0024388230345278334),
 (u'EthTrader_Mod', 0.002371139272686851)]

In [47]:
# list every node name in descending multigraph-PageRank order (no truncation
# happens here; write_top_pr_subs / write_top_pr_coms slice off the first n)
top_nodes_pr_multi = [entry[0] for entry in pr_sorted_multi]
top_nodes_pr_multi[:5]

[u'AnalyzerX7', u'Fast0rer', u'Egon_1', u'LeeWallis', u'EthTrader_Mod']

### Getting Counts of Subreddits per Author

In [33]:
test_node = top_nodes_pr[0]
test_node

u'BashCo'

In [34]:
def get_author_subreddits_submitted(author_id):
    """Count, per subreddit, the submissions posted by one author.

    Reads the module-level ``authors`` and ``submissions`` dictionaries.

    author_id -- key into ``authors``.

    Returns a Counter mapping subreddit -> number of the author's submission
    ids that resolve to that subreddit. Submission ids that cannot be resolved
    are skipped. An empty Counter is returned when the author's record is a
    list rather than a dictionary.

    Raises ValueError if ``author_id`` cannot be looked up in ``authors``.
    """
    try:
        author_data = authors[author_id]
    except (KeyError, TypeError):
        raise ValueError('{} not found in author dictionary'.format(author_id))

    # a record may be a list instead of a dict; there is nothing to count then
    if isinstance(author_data, list):
        return Counter()

    sub_counts = Counter()
    for submission_id in author_data['submission_id']:
        try:
            # the subreddit is stored as a one-element list on each submission
            sub_counts[submissions[submission_id]['subreddit'][0]] += 1
        except (KeyError, IndexError, TypeError):
            # best effort: skip ids with no usable subreddit entry, but let
            # unrelated errors (e.g. KeyboardInterrupt) propagate
            continue

    return sub_counts

In [35]:
sub_dict = get_author_subreddits_submitted(test_node)
sub_dict

Counter({u'bitcoin': 121, u'bitcoinbeginners': 1})

In [36]:
def get_author_subreddits_commented(author_id):
    """Count, per subreddit, the comments posted by one author.

    Reads the module-level ``authors`` and ``comments`` dictionaries.

    author_id -- key into ``authors``.

    Returns a Counter mapping subreddit -> number of the author's comment ids
    that resolve to that subreddit. Comment ids that cannot be resolved are
    skipped. An empty Counter is returned when the author's record is a list
    rather than a dictionary.

    Raises ValueError if ``author_id`` cannot be looked up in ``authors``.
    """
    try:
        author_data = authors[author_id]
    except (KeyError, TypeError):
        raise ValueError('{} not found in author dictionary'.format(author_id))

    # a record may be a list instead of a dict; there is nothing to count then
    if isinstance(author_data, list):
        return Counter()

    com_counts = Counter()
    for comment_id in author_data['comment_id']:
        try:
            com_counts[comments[comment_id]['subreddit']] += 1
        except (KeyError, TypeError):
            # best effort: skip ids with no usable subreddit entry, but let
            # unrelated errors (e.g. KeyboardInterrupt) propagate
            continue

    return com_counts

In [37]:
com_dict = get_author_subreddits_commented(test_node)
com_dict

Counter({u'bitcoin': 14455,
         u'bitcoinbeginners': 178,
         u'bitcoindiscussion': 9,
         u'bitcoinmarkets': 623,
         u'bitcoinmining': 2,
         u'btc': 245,
         u'cryptocurrency': 38,
         u'cryptomarkets': 9})

In [38]:
def get_author_subreddits_total(author_id):
    """Count, per subreddit, one author's comments plus submissions.

    Reads the module-level ``authors``, ``submissions`` and ``comments``
    dictionaries.

    author_id -- key into ``authors``.

    Returns a Counter mapping subreddit -> (comments + submissions) for the
    ids that can be resolved. Unresolvable ids are skipped. An empty Counter
    is returned when the author's record is a list rather than a dictionary,
    matching get_author_subreddits_submitted / get_author_subreddits_commented
    (previously this case raised TypeError here).

    Raises ValueError if ``author_id`` cannot be looked up in ``authors``.
    """
    try:
        author_data = authors[author_id]
    except (KeyError, TypeError):
        raise ValueError('{} not found in author dictionary'.format(author_id))

    # a record may be a list instead of a dict; there is nothing to count then
    if isinstance(author_data, list):
        return Counter()

    sub_counts = Counter()
    for submission_id in author_data['submission_id']:
        try:
            # the subreddit is stored as a one-element list on each submission
            sub_counts[submissions[submission_id]['subreddit'][0]] += 1
        except (KeyError, IndexError, TypeError):
            # best effort: skip ids with no usable subreddit entry
            continue

    com_counts = Counter()
    for comment_id in author_data['comment_id']:
        try:
            com_counts[comments[comment_id]['subreddit']] += 1
        except (KeyError, TypeError):
            # best effort: skip ids with no usable subreddit entry
            continue

    return com_counts + sub_counts

In [39]:
tot_dict = get_author_subreddits_total(test_node)
tot_dict

Counter({u'bitcoin': 14576,
         u'bitcoinbeginners': 179,
         u'bitcoindiscussion': 9,
         u'bitcoinmarkets': 623,
         u'bitcoinmining': 2,
         u'btc': 245,
         u'cryptocurrency': 38,
         u'cryptomarkets': 9})

### Write Out Stats on Top PR Nodes

In [40]:
def write_top_pr_subs(n=100, multi=False):
    """Write the submission subreddit counts of the top-n PageRank authors to CSV.

    n     -- number of highest-ranked authors to aggregate.
    multi -- truthy: use the multigraph ranking (``top_nodes_pr_multi``) and
             write 'top_{n}_PR_subs-multi.csv'; otherwise use the simple
             digraph ranking (``top_nodes_pr``) and write 'top_{n}_PR_subs.csv'.

    Returns None; the only result is the CSV written under
    '../../data/final/output/'. Propagates ValueError from
    get_author_subreddits_submitted for a ranked node missing from ``authors``.
    """
    # pick the ranking that matches the output file name; these branches used
    # to be swapped, so each file held the other graph's top authors
    if multi:
        ranked = top_nodes_pr_multi
        out_path = '../../data/final/output/top_{}_PR_subs-multi.csv'.format(n)
    else:
        ranked = top_nodes_pr
        out_path = '../../data/final/output/top_{}_PR_subs.csv'.format(n)

    # sum the per-author subreddit counters
    out_count = Counter()
    for auth in ranked[:n]:
        out_count += get_author_subreddits_submitted(auth)

    pd.Series(out_count).to_csv(out_path)

In [41]:
def write_top_pr_coms(n=100, multi=False):
    """Write the comment subreddit counts of the top-n PageRank authors to CSV.

    n     -- number of highest-ranked authors to aggregate.
    multi -- truthy: use the multigraph ranking (``top_nodes_pr_multi``) and
             write 'top_{n}_PR_coms-multi.csv'; otherwise use the simple
             digraph ranking (``top_nodes_pr``) and write 'top_{n}_PR_coms.csv'.

    Returns None; the only result is the CSV written under
    '../../data/final/output/'. Propagates ValueError from
    get_author_subreddits_commented for a ranked node missing from ``authors``.
    """
    # pick the ranking that matches the output file name; these branches used
    # to be swapped, so each file held the other graph's top authors
    if multi:
        ranked = top_nodes_pr_multi
        out_path = '../../data/final/output/top_{}_PR_coms-multi.csv'.format(n)
    else:
        ranked = top_nodes_pr
        out_path = '../../data/final/output/top_{}_PR_coms.csv'.format(n)

    # sum the per-author subreddit counters
    out_count = Counter()
    for auth in ranked[:n]:
        out_count += get_author_subreddits_commented(auth)

    pd.Series(out_count).to_csv(out_path)

In [42]:
def write_total_sub_counts():
    """Sum submission subreddit counts over every author and write them to CSV.

    Iterates all keys of the module-level ``authors`` dictionary, printing a
    progress line every 50000 authors, and writes the combined counts to
    '../../data/final/output/Total_subs.csv'. Returns None.
    """
    combined = Counter()
    for position, author_name in enumerate(authors.keys()):
        # progress marker for the long full-dictionary pass
        if position % 50000 == 0:
            print('parsing author {}...'.format(position))
        combined += get_author_subreddits_submitted(author_name)

    # one row per subreddit
    pd.Series(combined).to_csv('../../data/final/output/Total_subs.csv')

In [43]:
def write_total_com_counts():
    """Sum comment subreddit counts over every author and write them to CSV.

    Iterates all keys of the module-level ``authors`` dictionary, printing a
    progress line every 50000 authors, and writes the combined counts to
    '../../data/final/output/Total_coms.csv'. Returns None.
    """
    combined = Counter()
    for position, author_name in enumerate(authors.keys()):
        # progress marker for the long full-dictionary pass
        if position % 50000 == 0:
            print('parsing author {}...'.format(position))
        combined += get_author_subreddits_commented(author_name)

    # one row per subreddit
    pd.Series(combined).to_csv('../../data/final/output/Total_coms.csv')

### Writing Out Top-N PageRank Users' Subreddits

In [49]:
# write top 100 people's subs
# NOTE(review): write_top_pr_subs has no return statement, so both names below
# are bound to None -- the CSV files on disk are the only output. The names are
# also the reverse of the `multi` flag passed (df_100_single <- multi=True).
df_100_single = write_top_pr_subs(n=100, multi=True)
df_100_multi = write_top_pr_subs(n=100,multi=False)

In [50]:
# write top 500 people's subs
write_top_pr_subs(n=500, multi=True)
write_top_pr_subs(n=500, multi=False)

In [51]:
# write top 1000 people's subs
write_top_pr_subs(n=1000, multi=True)
write_top_pr_subs(n=1000, multi=False)

### Writing Out Top-N PageRank Users' Comments

In [52]:
# write top 100 people's comment counts per subreddit (write_top_pr_coms)
write_top_pr_coms(n=100, multi=True)
write_top_pr_coms(n=100, multi=False)

In [53]:
# write top 500 people's comment counts per subreddit (write_top_pr_coms)
write_top_pr_coms(n=500, multi=True)
write_top_pr_coms(n=500, multi=False)

In [54]:
# write top 1000 people's comment counts per subreddit (write_top_pr_coms)
write_top_pr_coms(n=1000, multi=True)
write_top_pr_coms(n=1000, multi=False)

In [55]:
write_total_sub_counts()

parsing author 0...
parsing author 50000...
parsing author 100000...
parsing author 150000...
parsing author 200000...
parsing author 250000...
parsing author 300000...
parsing author 350000...
parsing author 400000...
parsing author 450000...
parsing author 500000...


In [56]:
write_total_com_counts()

parsing author 0...
parsing author 50000...
parsing author 100000...
parsing author 150000...
parsing author 200000...
parsing author 250000...
parsing author 300000...
parsing author 350000...
parsing author 400000...
parsing author 450000...
parsing author 500000...
