In [None]:
import jsonlist

# Read the whole (large) CMV dump into memory; every later cell works off `data`.
DATASET_PATH = 'cmv_20161111.jsonlist'
data = jsonlist.load_file(DATASET_PATH)

In [None]:
# Peek at the schema: fields of the first post, then of its first comment.
first_post = data[0]
first_comment = first_post['comments'][0]
print(first_post.keys())
print()
print(first_comment.keys())

In [None]:
import networkx as nx

# Per-post lookup table, keyed by the post's 'name'.
gAll = {}

# Authors whose comments stay in the graph but are not tracked as responders.
IGNORED_AUTHORS = ('[deleted]', 'DeltaBot', 'AutoModerator')

# Turn every post into its own reply graph (edges run parent -> child).
for post in data:
    g = nx.DiGraph()
    userOP = post['author']
    userResponders = {}   # author -> names of the comments they wrote
    userToplevels = []    # authors who replied directly to the post
    for comment in post['comments']:
        # Entries without an 'author' field are left out of the graph entirely.
        if 'author' not in comment:
            continue
        author = comment['author']
        parentID = comment['parent_id']
        currentID = comment['name']
        g.add_edge(parentID, currentID)
        g.nodes[currentID]['comment_author'] = author
        if author in IGNORED_AUTHORS:
            continue
        userResponders.setdefault(author, []).append(currentID)
        # A parent equal to the post's own name marks a top-level reply.
        if parentID == post['name']:
            userToplevels.append(author)
    gAll[post['name']] = {
        'post_graph': g,
        'user_op': userOP,
        'user_toplevels': list(set(userToplevels)),  # de-duplicated, order arbitrary
        'user_responders': userResponders,
        'raw_data': post,
    }

In [None]:
# Sanity-check the graph building on one arbitrarily chosen post.
# The hard-coded index needs at least 3865 posts in gAll, otherwise IndexError.
postID = list(gAll)[3864]
sample = gAll[postID]
postG = sample['post_graph']
# print(postG.nodes.data())
# Disabled experiment: pass node_color built from each node's 'comment_author',
# falling back to the post's 'user_op' for nodes without that attribute.
nx.draw_kamada_kawai(postG)
print(postID)
for field in ('user_op', 'user_toplevels', 'user_responders'):
    print(sample[field])

In [None]:
# For each post, report the first top-level responder (other than the OP) who
# touched more than 10 branches, together with the post's comment total.
# NOTE: branch_count is defined in the next cell; on a fresh kernel that cell
# has to be executed before this one, otherwise this raises NameError.
for postID, post in gAll.items():
    op = post['user_op']
    for u in post['user_toplevels']:  # user_toplevels
        if u == op:
            continue
        if branch_count(u, postID) > 10:
            print(postID, u, post['raw_data']['num_comments'])
            break  # one hit per post is enough; move on to the next post

In [None]:
def branch_count(username, postID):
    """Count how many branches of a post's reply tree a user has touched.

    Metric: the number of unique root-to-comment paths needed to reach all of
    the user's comments. A comment that lies on the path to a deeper comment
    by the same user is already covered by that path and is not counted again.

    Args:
        username: author name, as keyed in ``gAll[postID]['user_responders']``.
        postID: key of the post in ``gAll``; also the root node of its graph.

    Returns:
        int: number of branches touched, or 0 when the user has no recorded
        comments on the post.

    Raises:
        KeyError: if ``postID`` is not a key of ``gAll``.
        Errors raised by ``nx.shortest_path`` (e.g. when a comment cannot be
        reached from the post root) propagate unchanged.
    """
    # retrieve graph we are working with
    post = gAll[postID]
    g = post['post_graph']

    # A user who never commented on this post touched no branch
    # (previously this lookup raised KeyError).
    commentIDs = set(post['user_responders'].get(username, ()))
    if not commentIDs:
        return 0

    # Gather every node that sits strictly above one of the user's comments.
    ancestors = set()
    for cID in commentIDs:
        ancestors.update(nx.shortest_path(g, postID, cID)[:-1])

    # The graph gets one parent -> child edge per comment, so (assuming comment
    # names are unique) the path to a deep comment contains every shallower
    # comment on the same branch. The comments that are nobody's ancestor are
    # therefore exactly the branch ends that the former "remove the longest
    # path, repeat" loop counted one at a time -- without re-sorting all
    # remaining paths on every pass.
    return sum(1 for cID in commentIDs if cID not in ancestors)

# Spot-check the metric on one hand-picked user / post pair.
username, postID = "ralpher313", "t3_5c8xdc"
print(branch_count(username, postID))

In [None]:
# Collect branch counts for every top-level responder of every post.
distribCount = {}  # branch count -> users who reached it (one entry per post)
userCount = {}     # user -> their branch counts, one per post
for postID, post in gAll.items():
    for u in post['user_toplevels']:  # user_responders , user_toplevels
        # The OP replying inside their own thread is not of interest here.
        if u == post['user_op']:
            continue
        count = branch_count(u, postID)
        distribCount.setdefault(count, []).append(u)
        userCount.setdefault(u, []).append(count)

In [None]:
# Build rows of (branch count, distinct users - 1, set of those users),
# drop rows whose adjusted user count is zero, and order by branch count.
# NOTE(review): the "- 1" means the second field is one less than the number
# of distinct users -- confirm this offset is intended.
t = []
for branches, users in distribCount.items():
    distinct_users = set(users)
    adjusted = len(distinct_users) - 1
    if adjusted != 0:
        t.append((branches, adjusted, distinct_users))
t.sort(key=lambda row: row[0])
print([(branches, adjusted) for (branches, adjusted, _) in t])

In [None]:
import math
import matplotlib.pyplot as plt

# Scatter the branch-count distribution on a log scale.
# The rows of `t` built by the previous cell have three fields
# (branch count, user count, user set). Unpacking them as pairs raised
# "ValueError: too many values to unpack", so the fields are read by position.
plt.scatter(
    [row[0] for row in t],
    [math.log(row[1]) for row in t],
)
plt.xlabel('# of branches (X)')
plt.ylabel('log(# of users who touched X branches)')
plt.show()

In [None]:
# Per-user rows: (user, total branches, number of posts, max branches in a post),
# ordered from the most to the least active user by number of posts.
t = [
    (user, sum(counts), len(counts), max(counts))
    for user, counts in userCount.items()
]
t.sort(key=lambda row: row[2], reverse=True)
# Show the ten most active users as (user, mean branches, max branches, posts).
for user, total, n_posts, peak in t[:10]:
    print((user, total / n_posts, peak, n_posts))

In [None]:
import math
import matplotlib.pyplot as plt

# Activity vs. mean branching. Expects `t` as rebuilt by the per-user summary
# cell: rows of (user, total branches, number of posts, max branches).
posts_per_user = [row[2] for row in t]
log_mean_branches = [math.log(row[1] / row[2]) for row in t]
plt.scatter(posts_per_user, log_mean_branches)
plt.xlabel('# of posts commented on')
plt.ylabel('log(Average number of branches touched)')
plt.show()

In [None]:
import math
import matplotlib.pyplot as plt

# Activity vs. peak branching. Expects `t` as rebuilt by the per-user summary
# cell: rows of (user, total branches, number of posts, max branches).
posts_per_user = [row[2] for row in t]
max_branches = [row[3] for row in t]
plt.scatter(posts_per_user, max_branches)
plt.xlabel('# of posts commented on')
plt.ylabel('Max number of branches touched')
plt.show()

In [None]:
import re

# Walk through the notable fields of the original post (first record only).
op = data[0]

print('-> title')
print(op['title'])
print()

print('-> author')
print(op['author'])
print()

print('-> selftext')
# Keep only the text before the auto-added CMV post footer
body, *_footer = re.split(r'\n\_\_\_\_\_', op['selftext'])
print(body)
print()

print('-> created, link_flair_text, permalink')
print(op['created'], op['link_flair_text'], op['permalink'])
print()

print('-> score, ups, downs')
print(op['score'], op['ups'], op['downs'])
print()