In [None]:
import jsonlist

# Load the whole dataset file into `data`. Later cells index it as a sequence
# of post dicts (e.g. data[i]['author'], data[i]['comments']).
data = jsonlist.load_file('cmv_20161111.jsonlist')

In [None]:
# Show which fields a post carries, then which fields one of its comments carries.
first_post = data[0]
print(first_post.keys())
print()
first_comment = first_post['comments'][0]
print(first_comment.keys())

In [None]:
import networkx as nx

gAll = {}

# Build one directed reply graph per post (edges point parent -> reply) and
# index that post's comments by author. Results are stored in gAll under the
# post's 'name'.
for post in data:
    g = nx.DiGraph()
    postName = post['name']
    userOP = post['author']
    userResponders = {}     # author -> IDs of that author's comments, in data order
    userToplevels = []      # authors who replied directly to the post, first-seen order
    seenToplevels = set()   # membership helper so userToplevels stays duplicate-free
    commentInfo = {}        # comment ID -> raw comment dict
    for comment in post['comments']:
        # A comment without an 'author' key is treated as deleted.
        author = comment.get('author', '[deleted]')
        parentID = comment['parent_id']
        currentID = comment['name']
        g.add_edge(parentID, currentID)
        g.nodes[currentID]['comment_author'] = author
        commentInfo[currentID] = comment
        # These three author names are excluded from the responder statistics.
        if author not in ('[deleted]', 'DeltaBot', 'AutoModerator'):
            userResponders.setdefault(author, []).append(currentID)
            if parentID == postName and author not in seenToplevels:
                # De-duplicate in first-seen order instead of list(set(...)):
                # set iteration order for strings varies between interpreter
                # runs, which made downstream list order (and printed output)
                # change on every kernel restart.
                seenToplevels.add(author)
                userToplevels.append(author)
    gAll[postName] = {
        'post_graph': g,
        'user_op': userOP,
        'user_toplevels': userToplevels,
        'user_responders': userResponders,
        'comment_data': commentInfo,
        'raw_data': post,
    }

In [None]:
# Sanity-check the graph construction on a single post, picked by position.
# (Colouring nodes by comment author was tried here and left disabled.)
postID = list(gAll)[3864]
samplePost = gAll[postID]
postG = samplePost['post_graph']
nx.draw_kamada_kawai(postG)
print(postID)
for field in ('user_op', 'user_toplevels', 'user_responders'):
    print(samplePost[field])

In [None]:
# count how many branches a user has touched
# path metric:
# count of how many unique paths are needed for all of the user's posts to be reached
def branch_count_paths(username, postID):
    """Count the post-to-comment paths needed to cover all of a user's comments.

    Args:
        username: key into gAll[postID]['user_responders'].
        postID: key into gAll; also the source node of every path.

    Returns:
        The number of paths chosen. Paths are taken deepest-first, and a
        comment that already lies on a chosen path does not start a new one.

    Raises:
        KeyError: if postID is not in gAll or username has no recorded
            comments for that post.
        Whatever nx.shortest_path raises when a comment cannot be reached
        from the post node.
    """
    # retrieve graph we are working with
    g = gAll[postID]['post_graph']
    # path from the post node to each of the user's comments
    commentPaths = {
        cID: nx.shortest_path(g, postID, cID)
        for cID in gAll[postID]['user_responders'][username]
    }
    # Sort once, deepest first, rather than re-sorting the remaining paths on
    # every iteration. A comment sitting on an already chosen (deeper) path is
    # skipped, which matches removing it from the candidate pool.
    covered = set()
    pathCount = 0
    for cID in sorted(commentPaths, key=lambda c: len(commentPaths[c]), reverse=True):
        if cID in covered:
            continue
        # each path ends at its own comment, so cID itself is marked as well
        covered.update(commentPaths[cID])
        pathCount += 1
    return pathCount

# Example: path metric for one hard-coded user/post pair.
# Note this rebinds the notebook-level names `username` and `postID`.
username = "ralpher313"
postID = "t3_5c8xdc"
print(branch_count_paths(username, postID))

In [None]:
# topic metric:
# number of distinct top-level comments under which the user has commented, at any depth
def branch_count_tlc(username, postID):
    """Return how many different top-level comments the user's comments sit under.

    For each comment recorded for `username` in post `postID`, the path from
    the post node to that comment is looked up; the node right after the post
    on that path is taken as the comment's top-level comment.
    """
    post = gAll[postID]
    graph = post['post_graph']
    touchedTopLevel = set()
    for commentID in post['user_responders'][username]:
        pathFromPost = nx.shortest_path(graph, postID, commentID)
        # index 0 is the post itself, index 1 the top-level comment on this path
        touchedTopLevel.add(pathFromPost[1])
    return len(touchedTopLevel)
    
# Example: topic metric for one hard-coded user/post pair.
# Note this rebinds the notebook-level names `username` and `postID`.
username = "ralpher313"
postID = "t3_5c8xdc"
print(branch_count_tlc(username, postID))

In [None]:
# Tally the topic-metric branch count of every top-level responder (the OP is skipped).
distribCount = {}  # branch count -> users who reached that count (one entry per post)
userCount = {}     # username -> branch counts, one per post they replied to
for postID, postEntry in gAll.items():
    originalPoster = postEntry['user_op']
    # iterate 'user_toplevels' here; 'user_responders' is the other candidate population
    for responder in postEntry['user_toplevels']:
        if responder == originalPoster:
            continue
        branches = branch_count_tlc(responder, postID)
        distribCount.setdefault(branches, []).append(responder)
        userCount.setdefault(responder, []).append(branches)

In [None]:
import math
import matplotlib.pyplot as plt

# Rows of (branch count, number of distinct users, those users), ordered by branch count.
t = []
for branchCount, userList in distribCount.items():
    distinctUsers = set(userList)
    if len(distinctUsers) != 0:
        t.append((branchCount, len(distinctUsers), distinctUsers))
t.sort(key=lambda row: row[0])
print([(row[0], row[1]) for row in t])

branchAxis = [row[0] for row in t]
logUserAxis = [math.log(row[1]) for row in t]
plt.scatter(branchAxis, logUserAxis)
plt.xlabel('# of branches (X)')
plt.ylabel('log(# of users who touched X branches)')
plt.show()

In [None]:
import math
import matplotlib.pyplot as plt

# Per-user rows of (username, total branches, #posts touched, max branches in a post),
# with the users who touched the most posts first.
t = sorted(
    (
        (user, sum(counts), len(counts), max(counts))
        for user, counts in userCount.items()
    ),
    key=lambda row: row[2],
    reverse=True,
)
print('(username, avg branches/post, max branches/post, #posts touched)')
for user, totalBranches, postsTouched, maxBranches in t[:10]:
    print((user, totalBranches / postsTouched, maxBranches, postsTouched))

postsAxis = [row[2] for row in t]

# average branches per post (log scale) against activity
plt.scatter(postsAxis, [math.log(row[1] / row[2]) for row in t])
plt.xlabel('# of posts commented on')
plt.ylabel('log(Average number of branches touched)')
plt.show()

# largest branch count in any single post against activity
plt.scatter(postsAxis, [row[3] for row in t])
plt.xlabel('# of posts commented on')
plt.ylabel('Max number of branches touched')
plt.show()

In [None]:
import re

# Build one record per comment written by a top-level responder (the OP is skipped).
# Record layout: (postID, commentID, user, #comments by user in post,
#                 #branches by user in post, body, has '?', parent is an OP comment)
commentCount = []
for postID, postEntry in gAll.items():
    originalPoster = postEntry['user_op']
    respondersByUser = postEntry['user_responders']
    commentsByID = postEntry['comment_data']
    # iterate 'user_toplevels' here; 'user_responders' is the other candidate population
    for u in postEntry['user_toplevels']:
        if u == originalPoster:
            continue
        userNumBranches = branch_count_tlc(u, postID)
        userCommentIDs = respondersByUser[u]
        userNumComments = len(userCommentIDs)
        for commentID in userCommentIDs:
            contents = commentsByID[commentID]['body']
            hasQuestionMark = re.search(r'\?', contents) is not None
            # True only when the OP has recorded comments in this post and this
            # comment's parent is one of them.
            respondingToOP = (
                originalPoster in respondersByUser
                and commentsByID[commentID]['parent_id'] in respondersByUser[originalPoster]
            )
            record = (
                postID,
                commentID,
                u,
                userNumComments,
                userNumBranches,
                contents,
                hasQuestionMark,
                respondingToOP,
            )
            commentCount.append(record)
# tSelect = [e for e in commentCount if e[4]>1 and e[3]>e[4]][0]
# [e for e in commentCount if e[2]==tSelect[2] and e[0]==tSelect[0]]

In [None]:
# Group the per-comment records by (post, user, #comments, #branches).
# Each group keeps three parallel lists: body lengths, question flags (0/1)
# and replied-to-OP flags (0/1).
groupedRecords = {}
for rec in commentCount:
    groupKey = (rec[0], rec[2], rec[3], rec[4])
    lengths, questionFlags, opFlags = groupedRecords.setdefault(groupKey, ([], [], []))
    lengths.append(len(rec[5]))
    questionFlags.append(1 if rec[6] else 0)
    opFlags.append(1 if rec[7] else 0)

# (#comments, #branches, list of comment lengths) per group
tComments = [
    (key[2], key[3], lens)
    for key, (lens, _, _) in groupedRecords.items()
]

# (#comments, #branches, fraction of comments containing a question mark) per group
tQuestions = [
    (key[2], key[3], sum(flags) / len(flags))
    for key, (_, flags, _) in groupedRecords.items()
]

# (#comments, #branches, fraction of comments whose parent is an OP comment) per group
tOPResponses = [
    (key[2], key[3], sum(flags) / len(flags))
    for key, (_, _, flags) in groupedRecords.items()
]

In [None]:
import math
import matplotlib.pyplot as plt

# Comment length (mean, max, min) against the user's number of comments in the post:
# one scatter plot per summary statistic.
lengthSummaries = (
    (lambda lens: sum(lens) / len(lens), 'avg len of comments in post X'),
    (max, 'max len of comments in post X'),
    (min, 'min len of comments in post X'),
)
for summarise, yLabel in lengthSummaries:
    plt.scatter(
        [row[0] for row in tComments],
        [summarise(row[2]) for row in tComments],
    )
    plt.xlabel('# of comments in post X, per user')
    plt.ylabel(yLabel)
    plt.show()

In [None]:
# For each distinct comment count, average the per-(user, post) mean comment lengths.
meanLengthsByCount = {}
for numc, numb, lens in tComments:
    meanLengthsByCount.setdefault(numc, []).append(sum(lens) / len(lens))
tLengthComments = [
    (numc, sum(meanLengthsByCount[numc]) / len(meanLengthsByCount[numc]))
    for numc in sorted(meanLengthsByCount)
]

plt.scatter(
    [pair[0] for pair in tLengthComments],
    [pair[1] for pair in tLengthComments],
)
plt.xlabel('# of comments')
plt.ylabel('average comment length overall')
plt.show()

In [None]:
# For each distinct comment count, average the per-(user, post) question fractions.
questionFracsByCount = {}
for numc, numb, frac in tQuestions:
    questionFracsByCount.setdefault(numc, []).append(frac)
tQuestionsComments = []
for numc in sorted(questionFracsByCount):
    fracs = questionFracsByCount[numc]
    tQuestionsComments.append((numc, sum(fracs) / len(fracs)))

plt.scatter(
    [pair[0] for pair in tQuestionsComments],
    [pair[1] for pair in tQuestionsComments],
)
plt.xlabel('# of comments')
plt.ylabel('fraction with Qs')
plt.show()

In [None]:
# For each distinct comment count, average the per-(user, post) replied-to-OP fractions.
opFracsByCount = {}
for numc, numb, frac in tOPResponses:
    opFracsByCount.setdefault(numc, []).append(frac)
tOPResponsesComments = [
    (numc, sum(opFracsByCount[numc]) / len(opFracsByCount[numc]))
    for numc in sorted(opFracsByCount)
]

commentCountAxis = [pair[0] for pair in tOPResponsesComments]
opFractionAxis = [pair[1] for pair in tOPResponsesComments]
plt.scatter(commentCountAxis, opFractionAxis)
plt.xlabel('# of comments')
plt.ylabel('fraction with OP responses')
plt.show()

In [None]:
import math
import matplotlib.pyplot as plt

# Comment length (mean, max, min) against the user's number of branches in the post:
# one scatter plot per summary statistic.
branchLengthSummaries = (
    (lambda lens: sum(lens) / len(lens), 'avg len of comments in post X'),
    (max, 'max len of comments in post X'),
    (min, 'min len of comments in post X'),
)
for summarise, yLabel in branchLengthSummaries:
    plt.scatter(
        [row[1] for row in tComments],
        [summarise(row[2]) for row in tComments],
    )
    plt.xlabel('# of branches in post X, per user')
    plt.ylabel(yLabel)
    plt.show()

In [None]:
# For each distinct branch count, average the per-(user, post) mean comment lengths.
meanLengthsByBranches = {}
for numc, numb, lens in tComments:
    meanLengthsByBranches.setdefault(numb, []).append(sum(lens) / len(lens))
tLengthBranches = [
    (numb, sum(meanLengthsByBranches[numb]) / len(meanLengthsByBranches[numb]))
    for numb in sorted(meanLengthsByBranches)
]

plt.scatter(
    [pair[0] for pair in tLengthBranches],
    [pair[1] for pair in tLengthBranches],
)
plt.xlabel('# of branches')
plt.ylabel('average comment length overall')
plt.show()

In [None]:
# For each distinct branch count, average the per-(user, post) question fractions.
questionFracsByBranches = {}
for numc, numb, frac in tQuestions:
    questionFracsByBranches.setdefault(numb, []).append(frac)
tQuestionsBranches = []
for numb in sorted(questionFracsByBranches):
    fracs = questionFracsByBranches[numb]
    tQuestionsBranches.append((numb, sum(fracs) / len(fracs)))

plt.scatter(
    [pair[0] for pair in tQuestionsBranches],
    [pair[1] for pair in tQuestionsBranches],
)
plt.xlabel('# of branches')
plt.ylabel('fraction with Qs')
plt.show()

In [None]:
# For each distinct branch count, average the per-(user, post) replied-to-OP fractions.
opFracsByBranches = {}
for numc, numb, frac in tOPResponses:
    opFracsByBranches.setdefault(numb, []).append(frac)
tOPResponsesBranches = [
    (numb, sum(opFracsByBranches[numb]) / len(opFracsByBranches[numb]))
    for numb in sorted(opFracsByBranches)
]

branchCountAxis = [pair[0] for pair in tOPResponsesBranches]
opFractionAxis = [pair[1] for pair in tOPResponsesBranches]
plt.scatter(branchCountAxis, opFractionAxis)
plt.xlabel('# of branches')
plt.ylabel('fraction with OP responses')
plt.show()

In [None]:
import re

# Print the noteworthy fields of the first post, one labelled section per group of fields.
firstPost = data[0]

for heading, fields in (
    ('-> title', ('title',)),
    ('-> author', ('author',)),
):
    print(heading)
    print(*[firstPost[name] for name in fields])
    print()

print('-> selftext')
# Keep only the text before the first newline followed by five underscores,
# i.e. drop the auto-added CMV post footer.
bodyWithoutFooter = re.split(r'\n\_\_\_\_\_', firstPost['selftext'])[0]
print(bodyWithoutFooter)
print()

for heading, fields in (
    ('-> created, link_flair_text, permalink', ('created', 'link_flair_text', 'permalink')),
    ('-> score, ups, downs', ('score', 'ups', 'downs')),
):
    print(heading)
    print(*[firstPost[name] for name in fields])
    print()