In [None]:
from IPython.core.interactiveshell import InteractiveShell

# Display the value of every top-level expression in a cell (not only the last one),
# so a single cell can show several frames one after another.
InteractiveShell.ast_node_interactivity = "all"

In [None]:
import pandas as pd

# Never truncate columns when displaying wide frames.
# The fully qualified option name is required: the bare pattern 'max_columns'
# also matches 'styler.render.max_columns' on newer pandas releases, which makes
# set_option raise OptionError ("Pattern matched multiple keys").
pd.set_option('display.max_columns', None)

In [None]:
# Load the raw 2018 / 2019 post and comment dumps from CSV.
# (Pickled copies with the same base names and a .pkl extension can be loaded
# with pd.read_pickle instead of pd.read_csv.)
posts_2018_df, comms_2018_df, posts_2019_df, comms_2019_df = (
    pd.read_csv(csv_name)
    for csv_name in (
        "2018_posts.csv",
        "2018_comments.csv",
        "2019_posts.csv",
        "2019_comments_partial.csv",
    )
)

# One line per year: (posts shape) (comments shape)
for posts_part, comms_part in (
    (posts_2018_df, comms_2018_df),
    (posts_2019_df, comms_2019_df),
):
    print(posts_part.shape, comms_part.shape)

In [None]:
# Stack the 2018 and 2019 dumps into a single frame each, with a fresh 0..n-1 index.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat with
# ignore_index=True is the supported equivalent of append(...).reset_index(drop=True).
posts_df = pd.concat([posts_2018_df, posts_2019_df], ignore_index=True)
comms_df = pd.concat([comms_2018_df, comms_2019_df], ignore_index=True)

print(posts_df.shape)
print(comms_df.shape)

In [None]:
# '9zgc4m' is the ID of the first bot implementation announcement; show that row
announcement_mask = posts_df['id'] == '9zgc4m'
posts_df[announcement_mask]

# Keep only the posts created strictly AFTER the announcement
cutoff_utc = posts_df[announcement_mask].iloc[0]['created_utc']
is_after_cutoff = posts_df['created_utc'] > cutoff_utc
posts_df = posts_df[is_after_cutoff].reset_index(drop=True)
print(posts_df.shape)

# Keep only the comments whose post survived the cut; link_id carries a
# three-character prefix in front of the post ID, so strip it before matching
belongs_to_kept_post = comms_df['link_id'].apply(lambda x: str(x)[3:]).isin(posts_df['id'])
comms_df = comms_df[belongs_to_kept_post].reset_index(drop=True)
print(comms_df.shape)

In [None]:
# Usernames that are always excluded from per-user and per-post counts
# (presumably automated accounts rather than human commenters — verify if the list grows).
restricted_users_list = ['AutoModerator', 'Judgement_Bot_AITA']

In [None]:
# Peek at the first two rows of each frame
posts_df.head(2)
comms_df.head(2)

In [None]:
def comment_judgement(comms_df):
    """Extract the judgement tag, if any, that each comment gives.

    Judgement tags are 'NTA', 'YTA', 'NAH', 'ESH', 'INFO' and 'SHP' (the last one
    is not formal after a certain point but is still commonly quoted).

    Parameters
    ----------
    comms_df : pandas.DataFrame
        Comments; must provide the columns 'id' and 'body'.

    Returns
    -------
    pandas.DataFrame
        One row per input comment, in input order, with the columns 'id' and
        'judgement_tag'. 'judgement_tag' holds the tag when the body contains
        exactly one of the known tags, and None when it contains none or several
        (an ambiguous comment is treated as giving no judgement). A body that is
        not a string (e.g. a missing value) never yields a tag.

    Notes
    -----
    Matching is a case-sensitive substring test, so an upper-case word that merely
    contains a tag (e.g. 'INFORMATION') counts as that tag.
    """
    potential_tags = ('NTA', 'YTA', 'NAH', 'ESH', 'INFO', 'SHP')
    ids = []
    judgement_tags = []
    # Walk the two needed columns directly: much cheaper than building a Series
    # per row with iterrows when this runs over the full comment dump.
    for comment_id, body in zip(comms_df['id'], comms_df['body']):
        if isinstance(body, str):
            found = [ptag for ptag in potential_tags if ptag in body]
        else:
            found = []
        ids.append(comment_id)
        judgement_tags.append(found[0] if len(found) == 1 else None)
    return pd.DataFrame({'id': ids, 'judgement_tag': judgement_tags})

# Smoke-test the tagger on the first five comments
comment_judgement(comms_df.head(5))
# comms_df[:5].merge(comment_judgement(comms_df[:5]), left_on='id', right_on='id')

In [None]:
# Parse all comments and pick out their judgement tags for ease of use later.
# Tagging every comment is slow, so the tagged frame is cached on disk: reuse the
# cache when it exists, otherwise compute it once and write it out. Without this
# guard the cell fails on a fresh checkout where the backup has not been created yet.
# NOTE: delete the backup file to force a re-tag after changing the upstream filtering.
from pathlib import Path

backup_comms_path = Path('backup_comms_df.csv')
if backup_comms_path.exists():
    comms_df = pd.read_csv(backup_comms_path)
else:
    comms_df = comms_df.merge(comment_judgement(comms_df), left_on='id', right_on='id')
    comms_df.to_csv(backup_comms_path, index=False)

In [None]:
def _controversialness(judgement_counts):
    """Return the controversialness score in [0, 1] for a dict of per-tag counts.

    Let X be the number of ESH/YTA ("you're an asshole") judgements and Y the
    number of NAH/NTA ("you're not an asshole") judgements. Z = X/(X+Y) is the
    fraction of decided judgements calling the OP an asshole, and the score is
    1 - 2*|Z - 0.5|: 0 means full consensus, 1 means a 50-50 split.
    With no decided judgement at all (only INFO/SHP comments) the score is 0.
    """
    asshole_votes = judgement_counts['ESH'] + judgement_counts['YTA']
    decided_votes = asshole_votes + judgement_counts['NAH'] + judgement_counts['NTA']
    if decided_votes == 0:
        # Nothing to split on; avoids the ZeroDivisionError for INFO/SHP-only posts
        return 0
    return abs(asshole_votes / decided_votes - 0.5) * -2 + 1


def tag_post_judgements(posts_df, comms_df):
    """Derive each post's final judgement and the per-user judgements left on it.

    The final judgement of a post is the tag of its top-scoring comment that
    carries a judgement tag (ties on score go to the earlier comment). Comments
    written by the post's author or by a user in the module-level
    ``restricted_users_list`` are ignored.

    Parameters
    ----------
    posts_df : pandas.DataFrame
        Posts; reads the columns 'id' (post ID without the 't3_' prefix) and 'author'.
    comms_df : pandas.DataFrame
        Comments; reads 'id', 'link_id' ('t3_' + post ID), 'author', 'score' and
        'created_utc'. If there is no 'judgement_tag' column it is computed with
        ``comment_judgement`` (which additionally needs 'body').

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        First frame, one row per post:
          * 'id' - the post ID
          * 'final_judgement' - tag of the winning comment, or None if no usable comment
          * 'controversialness' - score in [0, 1] based on the ratio of (ESH/YTA)
            to (NAH/NTA) judgements, see ``_controversialness``; 0 when there are
            no decided judgements
          * 'controversialness_distrib' - tuple of counts (NTA, YTA, NAH, ESH, INFO)
        Second frame, one row per (post, user) for users who left a judgement:
          * 'id', 'judgement_username'
          * 'judgement_decision' / 'judgement_score' - tag and score of that
            user's highest-ranked judgement comment on the post
          * 'judgement_correct' - 1 if the decision equals the final judgement, else 0
    """
    tags = {'id':[], 'final_judgement':[], 'controversialness':[], 'controversialness_distrib':[]}
    userdistrib = {'id':[], 'judgement_username':[], 'judgement_decision':[], 'judgement_score':[], 'judgement_correct':[]}

    def record_no_judgement():
        # Defaults for a post without any usable judgement comment
        tags['final_judgement'].append(None)
        tags['controversialness'].append(0)
        tags['controversialness_distrib'].append( (0, 0, 0, 0, 0) )

    for _, post in posts_df.iterrows():
        post_id = post['id']
        tags['id'].append(post_id)
        # Comments on this post, minus those from the OP and from bots
        judgements = comms_df[comms_df['link_id']==f't3_{post_id}']
        judgements = judgements[judgements['author'] != post['author']]
        judgements = judgements[~ judgements['author'].isin(restricted_users_list)]
        if len(judgements)==0:
            record_no_judgement()
            continue
        # Get the judgement tags for all relevant comments if they're not already processed
        if 'judgement_tag' not in judgements:
            judgements = judgements.merge(comment_judgement(judgements), left_on='id', right_on='id')
        # Keep only comments with a tag: missing values (None/NaN) fail this
        # comparison, while the all-upper-case tag strings pass it
        judgements = judgements[judgements['judgement_tag'].apply(lambda x: str(x).upper()==x)]
        if len(judgements)==0:
            record_no_judgement()
            continue
        # Sort comments by score, descending. If there are tied scores, prioritize the one posted earlier
        judgements = judgements.sort_values(by=['score', 'created_utc'], ascending=[False, True])
        post_final_judgement = judgements.iloc[0]['judgement_tag']
        tags['final_judgement'].append(post_final_judgement)
        # Read through each judgement and collect user mapping, indiv judgement type counts
        users_parsed = set()
        judgement_counts = {'NTA':0, 'YTA':0, 'NAH':0, 'ESH':0, 'INFO':0}
        for _, x in judgements.iterrows():
            username = x['author']
            judgement = x['judgement_tag']
            # Only a user's first (highest-ranked) judgement goes into the user mapping
            if username not in users_parsed:
                userdistrib['id'].append(post_id)
                userdistrib['judgement_username'].append(username)
                userdistrib['judgement_decision'].append(judgement)
                userdistrib['judgement_score'].append(x['score'])
                userdistrib['judgement_correct'].append(1 if (judgement==post_final_judgement) else 0)
                users_parsed.add(username)
            # Every tagged comment counts towards the distribution (SHP is not tracked)
            if judgement in judgement_counts:
                judgement_counts[judgement] += 1
        tags['controversialness'].append(_controversialness(judgement_counts))
        tags['controversialness_distrib'].append((
            judgement_counts['NTA'],
            judgement_counts['YTA'],
            judgement_counts['NAH'],
            judgement_counts['ESH'],
            judgement_counts['INFO']))
    return pd.DataFrame(tags), pd.DataFrame(userdistrib)

# Try the tagger on the first five posts and show both result frames
temp_a, temp_b = tag_post_judgements(posts_df.head(5), comms_df)
temp_a
temp_b.head(5)
# posts_df[:5].merge(tag_post_judgements(posts_df[:5], comms_df), left_on='id', right_on='id')

In [None]:
# TODO implement relationship between user and final judgement
# TODO implement sentiment difference between a pair of comments

In [None]:
import networkx as nx

# Build one directed reply graph per post, keyed by 't3_<post id>'.
# Each comment contributes an edge parent_id -> 't1_<comment id>', and the comment
# node stores its author. Alongside the graph we keep the post's author and a map
# from each commenting user to the list of comment nodes they wrote.
# (Later cells look up paths starting at the 't3_<post id>' node, so top-level
# comments are presumably expected to carry that value as parent_id.)
gAll = {}

# Index comment row positions by post once, instead of scanning the whole
# comments frame again for every single post.
comment_rows_by_post = comms_df.groupby('link_id', sort=False).indices

for post_id, post_author in zip(posts_df['id'], posts_df['author']):
    link_id = f't3_{post_id}'
    g = nx.DiGraph()
    userResponders = {}
    # Rows come back in their original frame order; a post without comments
    # yields an empty frame and therefore an empty graph
    post_comments = comms_df.iloc[comment_rows_by_post.get(link_id, [])]
    for comment_id, parentID, author in zip(
        post_comments['id'], post_comments['parent_id'], post_comments['author']
    ):
        currentID = 't1_'+comment_id
        g.add_edge(parentID, currentID)
        g.nodes[currentID]['author'] = author
        userResponders.setdefault(author, []).append(currentID)
    gAll[link_id] = {
        'post_graph': g,
        'user_op': post_author,
        'user_responders': userResponders,
    }

In [None]:
# Among the first 100 posts, find the one with the most comments
# (the earliest such post wins a tie; stays '' if no post has a positive count)
first_posts = posts_df.iloc[:100]
temp_max = ''
maxcount = 0
for post_id, commcount in zip(first_posts['id'], first_posts['num_comments']):
    if commcount > maxcount:
        maxcount = commcount
        temp_max = 't3_'+post_id
print(temp_max)

In [None]:
# test out the graph building on one hand-picked post
postID = 't3_9zigtr' # list(gAll.keys())[9999]
post_entry = gAll[postID]
postG = post_entry['post_graph']
# print(postG.nodes.data())
# Draw the reply tree with default node colours (per-author colouring is not wired up yet)
nx.draw_kamada_kawai(postG)
# Post ID, the OP's username, then the responder -> comment-node mapping
for shown in (postID, post_entry['user_op'], post_entry['user_responders']):
    print(shown)

In [None]:
def branch_count(username, postID):
    """Count how many branches of a post's comment tree a user has touched.

    Path metric: the number of unique root-to-comment paths needed for all of
    the user's comments on the post to be reached.

    Parameters
    ----------
    username : str
        A key of ``gAll[postID]['user_responders']``.
    postID : str
        A key of ``gAll`` (the 't3_'-prefixed post ID), also the root node of the graph.

    Returns
    -------
    int
        Number of paths; 0 when none of the user's comments can be reached from
        the post node.

    Raises
    ------
    KeyError
        If the post or the user is unknown.
    """
    # retrieve graph we are working with
    g = gAll[postID]['post_graph']
    # calculate path to each of the poster's comments
    commentPaths = {}
    for cID in gAll[postID]['user_responders'][username]:
        try:
            commentPaths[cID] = nx.shortest_path(g, postID, cID)
        except (nx.NetworkXNoPath, nx.NodeNotFound):
            # The comment is not connected to the post node (e.g. an ancestor
            # comment is missing from the data): skip it. Only these two errors
            # are expected; anything else should surface instead of being hidden.
            continue
    # starting from longest path, remove all comments along that path (since they're not unique)
    pathCount = 0
    while commentPaths:
        # the last entry of the ascending (stable) sort is the deepest comment
        deepestCID = sorted(commentPaths, key=lambda k: len(commentPaths[k]))[-1]
        deepestPath = commentPaths[deepestCID]
        for cID in deepestPath:
            commentPaths.pop(cID, None)
        pathCount += 1
    return pathCount

username = "Killairmanable"
postID = "t3_9zigtr"
print(branch_count(username, postID))

In [None]:
import statistics

def branchiness(username, postID, getGraph=False):
    """Measure how broadly a user's comments fan out within one post (higher = more breadth).

    Builds a reduced tree that contains only the post node and the user's
    reachable comments, where each comment hangs off its deepest already-added
    ancestor (or the post itself), then summarises the out-degrees of the nodes
    that have at least one child.

    Parameters
    ----------
    username : str
        A key of ``gAll[postID]['user_responders']``.
    postID : str
        A key of ``gAll``; also the root node of the post graph.
    getGraph : bool, default False
        When True, return the reduced tree instead of the median degree.

    Returns
    -------
    tuple
        ``(average degree, median degree)``, or ``(average degree, reduced DiGraph)``
        when ``getGraph`` is True, or ``(None, None)`` when none of the user's
        comments is reachable from the post node.
    """
    # retrieve graph we are working with
    g = gAll[postID]['post_graph']
    # calculate path to each of the poster's comments
    commentPaths = {}
    for cID in gAll[postID]['user_responders'][username]:
        try:
            commentPaths[cID] = nx.shortest_path(g, postID, cID)
        except (nx.NetworkXNoPath, nx.NodeNotFound):
            # Comment not connected to the post node (hole in the thread): skip it.
            # An explicit except replaces the former `finally: continue`, which
            # silently discarded every exception and is not valid before Python 3.8.
            continue
    # starting from shortest path, build out tree of direct dependencies
    gPath = nx.DiGraph()
    # kept deepest-first, so the first node with a path to a comment is its nearest added ancestor
    nodesAdded = [postID]
    # path lengths never change, so one stable sort gives the same visiting order
    # as re-sorting the remaining comments on every iteration
    for closestCID in sorted(commentPaths, key=lambda k: len(commentPaths[k])):
        for n in nodesAdded:
            if nx.has_path(g, n, closestCID):
                gPath.add_edge(n, closestCID)
                break
        else:
            # no added node leads to this comment: attach it directly to the post
            gPath.add_edge(postID, closestCID)
        nodesAdded.insert(0, closestCID)
    # calculate average degree over the nodes that have children
    degrees = [d for (n,d) in gPath.out_degree() if d!=0]
    if len(degrees)==0:
        # there was some hole in the graph between ALL of a user's comments and the main post...
        return None, None
    degreeAvg = sum(degrees)/len(degrees)
    degreeMed = statistics.median(degrees)
    if getGraph:
        return degreeAvg, gPath
    return degreeAvg, degreeMed

username = "Killairmanable"
postID = "t3_9zigtr"
print(branchiness(username, postID))

# Sanity check this metric: does branchiness grow with the number of comments
# a user leaves on a post?

import math
import matplotlib.pyplot as plt

# One sample per (post, responder): (number of comments by that user on the post,
# (average degree, median degree)). The OP and the restricted accounts are skipped,
# as are users whose comments are all unreachable from the post node.
t = []
for postID in gAll:
    for uname in gAll[postID]['user_responders']:
        # was `uname not in re`, an undefined name; the exclusion list is restricted_users_list
        if uname!=gAll[postID]['user_op'] and uname not in restricted_users_list:
            brtuple = branchiness(uname, postID)
            if brtuple[0] is not None:
                t.append((
                    len(gAll[postID]['user_responders'][uname]),
                    brtuple,
                ))


def _scatter_mean_by_comment_count(samples, metric_index, ylabel):
    """Scatter the mean of one branchiness metric against the per-post comment count.

    samples: list of (comment count, (avg degree, median degree)) tuples.
    metric_index: 0 for the average degree, 1 for the median degree.
    ylabel: label of the y axis.
    """
    values_by_count = {}
    for comment_count, metrics in samples:
        values_by_count.setdefault(comment_count, []).append(metrics[metric_index])
    comment_counts = list(values_by_count)
    plt.scatter(
        comment_counts,
        [sum(values_by_count[c])/len(values_by_count[c]) for c in comment_counts],
    )
    plt.xlabel('# of comments')
    plt.ylabel(ylabel)
    plt.show()


_scatter_mean_by_comment_count(t, 0, 'avg(avg degree of indiv comment tree)')
_scatter_mean_by_comment_count(t, 1, 'avg(med degree of indiv comment tree)')

In [None]:
# Collect the branch metrics of every responder on every post
distribCount = {} # map from [number of branches]: [users who have had that number]
userCount = {}    # map from [usernames]: [branch distribution they've done]
userBCount = {}   # map from [usernames]: [branchiness distribution they've done]
for postID, post_info in gAll.items():
    for responder in post_info['user_responders']:
        # the OP and the restricted accounts are not counted
        if responder == post_info['user_op'] or responder in restricted_users_list:
            continue
        n_branches = branch_count(responder, postID)
        if n_branches == 0:
            continue
        breadth = branchiness(responder, postID)
        distribCount.setdefault(n_branches, []).append(responder)
        userCount.setdefault(responder, []).append(n_branches)
        userBCount.setdefault(responder, []).append(breadth)

In [None]:
import math
import matplotlib.pyplot as plt

# (branch count, number of distinct users with that count, the users), ascending by branch count
t = []
for branch_total in sorted(distribCount):
    distinct_users = set(distribCount[branch_total])
    if len(distinct_users) != 0:
        t.append((branch_total, len(distinct_users), distinct_users))
print([(branch_total, user_total) for (branch_total, user_total, _) in t])

branch_axis = [entry[0] for entry in t]
log_user_axis = [math.log(entry[1]) for entry in t]
plt.scatter(branch_axis, log_user_axis)
plt.xlabel('# of branches (X)')
plt.ylabel('log(# of users who touched X branches)')
plt.show()

In [None]:
import math
import matplotlib.pyplot as plt

# Per user: (username, total branches, posts touched, max branches on one post),
# most active users (by posts touched) first
t = [
    (user, sum(counts), len(counts), max(counts))
    for user, counts in userCount.items()
]
t.sort(key=lambda entry: entry[2], reverse=True)
print('(username, avg branches/post, max branches/post, #posts touched)')
for user, branch_sum, post_total, branch_max in t[:10]:
    print((user, branch_sum/post_total, branch_max, post_total))

posts_touched = [entry[2] for entry in t]

plt.scatter(
    posts_touched,
    [math.log(entry[1]/entry[2]) for entry in t],
    alpha=0.05
)
plt.xlabel('# of posts commented on')
plt.ylabel('log(Average number of branches touched)')
plt.show()

plt.scatter(
    posts_touched,
    [entry[3] for entry in t],
    alpha=0.05
)
plt.xlabel('# of posts commented on')
plt.ylabel('Max number of branches touched')
plt.show()

In [None]:
import math
import matplotlib.pyplot as plt

# Per user: (username, summed average-degree branchiness, posts touched, max branchiness),
# most active users (by posts touched) first
t = []
for user, breadth_pairs in userBCount.items():
    avg_degrees = [pair[0] for pair in breadth_pairs]
    t.append((user, sum(avg_degrees), len(avg_degrees), max(avg_degrees)))
t.sort(key=lambda entry: entry[2], reverse=True)
print('(username, avg branchiness/post, max branchiness/post, #posts touched)')
for user, breadth_sum, post_total, breadth_max in t[:10]:
    print((user, breadth_sum/post_total, breadth_max, post_total))

posts_touched = [entry[2] for entry in t]

plt.scatter(
    posts_touched,
    [math.log(entry[1]/entry[2]) for entry in t],
    alpha=0.05
)
plt.xlabel('# of posts commented on')
plt.ylabel('log(Average branchiness)')
plt.show()

plt.scatter(
    posts_touched,
    [entry[3] for entry in t],
    alpha=0.05
)
plt.xlabel('# of posts commented on')
plt.ylabel('Max branchiness')
plt.show()

In [None]:
import math
import matplotlib.pyplot as plt

# Every (user, post) average-degree branchiness value, flattened into one list
t = [e[0] for k in userBCount.keys() for e in userBCount[k]]

# Three views of the same distribution: everything, then zoomed into [0,5) and (1,5).
# plt.hist puts the measured value on the x axis and the bin count on the y axis,
# so the branchiness label belongs on x and the occurrence count on y
# (the two labels were previously swapped).
for values, value_label in (
    (t, 'Branchiness (per user, per post)'),
    ([i for i in t if i<5], 'Branchiness (per user, per post) range [0,5)'),
    ([i for i in t if i<5 and i>1], 'Branchiness (per user, per post) range (1,5)'),
):
    plt.hist(values, bins=100)
    plt.xlabel(value_label)
    plt.ylabel('# of occurrences')
    plt.show()