### Load packages

In [15]:
# Create enviroment if not done before. 
#!conda create --name sna --file environment.yml

In [16]:
import numpy as np
import networkx as nx
import pandas as pd
import matplotlib.pyplot as plt

### Load an concat data

In [17]:
postings_1 = pd.read_csv('input/Postings_01052019_15052019.csv', sep=';')
postings_2 = pd.read_csv('input/Postings_16052019_31052019.csv', sep=';')
votes_1 = pd.read_csv('input/Votes_01052019_15052019.csv', sep=';')
votes_2 = pd.read_csv('input/Votes_16052019_31052019.csv', sep=';')
follow_ignore = pd.read_csv(
    "input/Following_Ignoring_Relationships_01052019_31052019.csv", sep=";")

# Merge the two datasets
postings = pd.concat([postings_1, postings_2])
votes = pd.concat([votes_1, votes_2])

Since the posts are stored in a flat format and replies only relate to their parent, we want to add a top level posting column to our data.

In [18]:
def get_top_level(parent_id):
    if np.isnan(parent_id):
        return np.nan

    parent = postings[postings["ID_Posting"] == parent_id]
    if len(parent) == 0: # no parent
        return parent_id
    else:
        parents_parent = parent["ID_Posting_Parent"].values[0]

        if np.isnan(parents_parent): # no parents parent
            return parent_id
        else:
            return get_top_level(parents_parent)


postings['ID_Posting_Top_Level_Parent'] = postings['ID_Posting_Parent'].apply(get_top_level)

![](img/vis_id_posting_top_level_parent.png)

The number of articles both users wrote a posting

In [19]:
def similarity_on_article():
    data = pd.merge(postings, postings, on=["ID_Article"])
    filtered = data[data["UserCommunityName_x"] != data["UserCommunityName_y"]]
    aggregated = filtered \
        .groupby(["UserCommunityName_x", "UserCommunityName_y"])["ID_Article"] \
        .nunique().rename("similarity")
    return aggregated

similarity_on_article()

UserCommunityName_x  UserCommunityName_y                 
##V+##               2¢                                      1
                     372981cf-896d-4aad-8c3e-3224fd13fc0c    1
                     637472817                               1
                     Abbalah                                 1
                     Aktivieren                              1
                                                            ..
überdrüssig          taps2017                                1
                     wandkalender                            1
                     zweiter sieger                          1
                     §83SPG                                  1
                     Äskulap                                 1
Name: similarity, Length: 263640, dtype: int64

Number of postings two user posted a comment

In [23]:
def similarity_on_post():
    data = pd.merge(postings, postings, on=["ID_Posting_Top_Level_Parent"])
    filtered = data[data["UserCommunityName_x"] != data["UserCommunityName_y"]]
    aggregated = filtered \
        .groupby(["UserCommunityName_x", "UserCommunityName_y"])["ID_Posting_Top_Level_Parent"] \
        .nunique().rename("similarity")
    return aggregated

similarity_on_post()

UserCommunityName_x                          UserCommunityName_y  
##V+##                                       Georg Pichler            1
                                             Secrets of Perfection    1
*****Da beißt die Maus keinen Faden ab*****  *Andreas*                0
                                             -whiteout-               0
                                             .&,                      0
                                                                     ..
österix                                      §83SPG                   0
                                             µµµµµ                    0
                                             Äolus                    0
                                             Äskulap                  0
                                             äh, und...               0
Name: similarity, Length: 2059678, dtype: int64

How many times they directly replied to each other

In [24]:
def direct_comment():
    data = pd.merge(postings, postings, left_on="ID_Posting", right_on="ID_Posting_Parent")
    filtered = data[data["UserCommunityName_x"] != data["UserCommunityName_y"]]
    filtered1 = filtered[["UserCommunityName_x", "UserCommunityName_y", "ID_Article_x"]]\
        .rename(columns={"UserCommunityName_x": "user_1", "UserCommunityName_y": "user_2", "ID_Article_x": "ID_Article"})

    filtered2 = filtered[["UserCommunityName_x", "UserCommunityName_y", "ID_Article_x"]]\
        .rename(columns={"UserCommunityName_y": "user_1", "UserCommunityName_x": "user_2", "ID_Article_x": "ID_Article"})

    return pd.concat([filtered1, filtered2]).groupby(["user_1", "user_2"])["ID_Article"].count().rename("similarity")

direct_comment()

user_1      user_2                                
##V+##      Georg Pichler                             2
*Andreas*   Bioberni31                                1
            alifant                                   1
            allergische Reaktion                      1
            el-che-vive_1 Guevara                     1
                                                     ..
µ-sam       Beobachter zweiter Ordnung                1
Æthelwulf   Shagga Son of Dolf                        1
äh, und...  Unter faulen Äpfeln hat man wenig Wahl    1
            habe keinen Namen                         1
österix     seefahrer-horst                           1
Name: similarity, Length: 1636, dtype: int64