In [2]:
import numpy as np
import networkx as nx
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
!conda create --name sna --file environment.yml

In [None]:
!sh run-neo4j.sh

In [116]:
import py2neo

# Connect to the graph database
graph = py2neo.Graph("bolt://localhost:7687", auth=("neo4j", "sna"))


In [69]:
postings_1 = pd.read_csv('input/Postings_01052019_15052019.csv', sep=';')
postings_2 = pd.read_csv('input/Postings_16052019_31052019.csv', sep=';')
votes_1 = pd.read_csv('input/Votes_01052019_15052019.csv', sep=';')
votes_2 = pd.read_csv('input/Votes_16052019_31052019.csv', sep=';')
follow_ignore = pd.read_csv("input/Following_Ignoring_Relationships_01052019_31052019.csv", sep=";")

# Merge the two datasets
postings = pd.concat([postings_1, postings_2])
votes = pd.concat([votes_1, votes_2])

votes["VoteCreatedAt"] = pd.to_datetime(votes["VoteCreatedAt"], format="%Y-%m-%d %H:%M:%S").dt.date
votes = votes.sort_values("VoteCreatedAt")
votes["num_interactions"] = votes.groupby("UserCommunityName")["VoteCreatedAt"].cumcount()

postings["PostingCreatedAt"] = pd.to_datetime(postings["PostingCreatedAt"], format="%Y-%m-%d %H:%M:%S").dt.date
postings = postings.sort_values("PostingCreatedAt")
postings["num_interactions"] = postings.groupby("UserCommunityName")["PostingCreatedAt"].cumcount()

In [163]:
from py2neo import Graph, Node, Relationship

tx = graph.begin()
for index, row in votes.iterrows():
    user =  Node("User", UserCommunityName=row['UserCommunityName'], UserGender=row["UserGender"])
    posting =  Node("Posting", ID_Posting=row['ID_Posting'])
    vote = Relationship(user, "Vote", posting, VoteCreatedAt=row['VoteCreatedAt'], VotePositive=row['VotePositive'], VoteNegative=row['VoteNegative'])
    tx.create(vote)
graph.commit(tx)


tx = graph.begin()
for index, row in postings.iterrows():
    article =  Node("Article", ArticlePublishingDate=row['ArticlePublishingDate'], ArticleTitle=row["ArticleTitle"], ArticleChannel=row["ArticleChannel"], ArticleRessortName=row["ArticleRessortName"])
    # TODO make edge for resort and channel?
    posting =  Node("Posting", ID_Posting=row['ID_Posting'], PostingCreatedAt=row['PostingCreatedAt'], PostingComment=row['PostingComment'], PostingHeadline=row['PostingHeadline'])
    posting_parent = Node("Posting", ID_Posting=row['ID_Posting_Parent'])
    replies = Relationship(posting_parent, "Reply", posting)
    posts_on_article = Relationship(posting, "PostsOnArticle", article)
    posting_author= Node("User", UserCommunityName=row['UserCommunityName'], UserGender=row["UserGender"])
    comments=Relationship(posting_author, "Comments", posting)

    tx.merge(posting, "Posting", "ID_Posting")
    tx.merge(posting_parent, "Posting", "ID_Posting")
    tx.merge(posting_author, "User", "UserCommunityName")
    tx.create(replies)
    tx.create(comments)
    tx.create(posts_on_article)

graph.commit(tx)


KeyboardInterrupt: 

Visualize the graph at http://localhost:7474

In [162]:
# delete all nodes and edges
graph.run('''MATCH (n)
OPTIONAL MATCH (n)-[r]-()
DELETE n,r''')

In [154]:
graph.run('''MATCH (a:User)-[r:Vote]->(b:Posting) RETURN a.UserCommunityName, b.ID_Posting, r.VoteCreatedAt''').to_data_frame()

Unnamed: 0,a.UserCommunityName,b.ID_Posting,r.VoteCreatedAt
0,Roky Erickson,1041078078,2019-05-01
1,Hollerbusch,1041072055,2019-05-01
2,4freedom,1041073526,2019-05-01
3,RudiSemmel,1041073507,2019-05-01
4,Dropkick1990,1041062094,2019-05-01


We don't want to have the following users
- users that interacted with only a few posts
- users that interacted with many posts, but in few days creating a skewed distribution

Therefore we set a threshold on the number of days a user has to interact minimum

In [4]:
NUM_DAYS_INTERACTED_MIN = 20

In [72]:
v = votes[["UserCommunityName","VoteCreatedAt"]].rename(columns={"VoteCreatedAt":"CreatedAt"})
p = postings[["UserCommunityName", "PostingCreatedAt"]].rename(columns={"PostingCreatedAt":"CreatedAt"})

num_days_interacted = pd.concat([v,p]).groupby(["UserCommunityName","CreatedAt"]).size().reset_index()\
                .groupby("UserCommunityName").size().reset_index()

user_subset_days_interacted = num_days_interacted[num_days_interacted[0] >= NUM_DAYS_INTERACTED_MIN].UserCommunityName.unique()

# Subsetting data

Filtering users that interacted at the middle of the interval. Possible future extension: pick time interval instead of day

In [78]:
first_contact_vote_pairs

Unnamed: 0,UserCommunityName_x,UserCommunityName_y,VoteCreatedAt,num_interactions
0,!! Melzer = Doppelspieler !!,1816/55,2019-05-30,3
1,!! Melzer = Doppelspieler !!,Hasan_Vural,2019-05-03,0
2,!! Melzer = Doppelspieler !!,Mohrdred,2019-05-08,1
3,!! Melzer = Doppelspieler !!,Wissender,2019-05-30,2
4,!!!DerAbgrund!!!,Gewerkschaftelhuber,2019-05-30,4
...,...,...,...,...
2953357,Žarko Jankovic,hoferharald,2019-05-01,0
2953358,Žarko Jankovic,moejoe187,2019-05-02,6
2953359,Žarko Jankovic,puppetmaster,2019-05-02,13
2953360,Žarko Jankovic,stndrd,2019-05-02,14


In [77]:
# Find u-u tuples with their first date of interaction by vote
first_contact_vote_pairs = (votes[["UserCommunityName", "UserCreatedAt", "ID_Posting", "VoteCreatedAt", "num_interactions"]]
 .merge(postings[["ID_Posting", "UserCommunityName", "UserCreatedAt"]], on=["ID_Posting"], how="left")
 [["UserCommunityName_x", "UserCommunityName_y", "VoteCreatedAt","num_interactions"]]
 .sort_values("VoteCreatedAt")
 .groupby(["UserCommunityName_x", "UserCommunityName_y"])
 .first()
 .reset_index())

first_contact_reply_pairs = (postings.dropna(subset=["ID_Posting_Parent"])[
 ["UserCommunityName", "ID_Posting_Parent", "PostingCreatedAt"]]
 .merge(postings[["ID_Posting", "UserCommunityName"]], left_on=["ID_Posting_Parent"], right_on=["ID_Posting"], how="left")
 [["UserCommunityName_x", "UserCommunityName_y", "PostingCreatedAt"]]
 .sort_values("PostingCreatedAt")
 .groupby(["UserCommunityName_x", "UserCommunityName_y"])
 .first()
 .reset_index())



Now we have a directed relationship: Person x contacted person y. We want to map this unidirectional.

In [106]:
def apply_bidirectionality(df, on="VoteCreatedAt"):
    """Here we account for bidirectionality of the contact pairs. The problem is that if user A comments user B's post,
     but user B previously commented user A's post we have 2 rows with (eventually) different dates. We take the minimum and leciographycally sort the usernames for enabling joining"""
    inv_df = (df.merge(df, left_on=["UserCommunityName_x", "UserCommunityName_y"], right_on=["UserCommunityName_y", "UserCommunityName_x"], suffixes=("", "_inv"), how="left"))
    inv_df[[f"{on}_inv",f"{on}"]] = inv_df[[f"{on}_inv",f"{on}"]].fillna(pd.to_datetime("2050-01-01", format="%Y-%m-%d").date())
    inv_df[f"{on}_bidirectional"] = inv_df[[f"{on}_inv",on]].min(axis=1)
    inv_df[["UserCommunityName_x", "UserCommunityName_y"]] = np.sort(inv_df[["UserCommunityName_x", "UserCommunityName_y"]], axis=1)
    return inv_df[["UserCommunityName_x", "UserCommunityName_y", f"{on}_bidirectional"]].drop_duplicates()

In [107]:
# VARIANT A: we use dates to determine first contact
first_contact_vote_pairs_bd = apply_bidirectionality(first_contact_vote_pairs, "VoteCreatedAt")
first_contact_reply_pairs_bd = apply_bidirectionality(first_contact_reply_pairs, "PostingCreatedAt")

In [9]:
fist_contact = (first_contact_reply_pairs_bd.merge(first_contact_vote_pairs_bd, on=["UserCommunityName_x", "UserCommunityName_y"], how="outer")
    .fillna(pd.to_datetime("2050-01-01", format="%Y-%m-%d").date())) # We set a date in the future to avoid problems with the min function
fist_contact["first_contact"] = fist_contact[[
    "PostingCreatedAt_bidirectional", "VoteCreatedAt_bidirectional"]].min(axis=1)


In [10]:
time_span = fist_contact["first_contact"].max() - fist_contact["first_contact"].min()
half_time_span = time_span/2
middle = (fist_contact["first_contact"].min() + half_time_span)
middle

datetime.date(2019, 5, 18)

In [11]:
subset_user_pairs = fist_contact[fist_contact["first_contact"] == middle][["UserCommunityName_x", "UserCommunityName_y", "first_contact"]]

The user pairs can now be viewed as a set

In [12]:
selected_users = pd.concat([subset_user_pairs["UserCommunityName_x"], subset_user_pairs["UserCommunityName_y"]]).drop_duplicates() # middle interval subset
selected_users = selected_users[selected_users.isin(user_subset_days_interacted)]  # filter out users with less than X days of interaction

Percentage of remaining users

In [13]:
selected_users.nunique() /  pd.concat([votes["UserCommunityName"], postings["UserCommunityName"]]).nunique()

0.16720075959883687

# Defining a notion of similarity

## 1. Voting article similarity
#### "Users are similar when they vote on comments of the same articles (or articles of the same ressort)"

Todo: aggregate postings based on author and target (e.g., article, channel, ressort). Either count or just stay binary

In [58]:
votes_with_article_id = (votes[votes["UserCommunityName"].isin(selected_users)][["UserCommunityName", "ID_Posting", "VotePositive", "VoteNegative","VoteCreatedAt"]].merge(postings[[
               "ID_Posting", "ID_Article"]], on=["ID_Posting"])).sort_values("VoteCreatedAt")


# fucks up the memory
# postings_simmilarity = (votes_with_article_id
#     .merge(votes_with_article_id[["UserCommunityName",    "ID_Article"]], on=["ID_Article"], how="inner")
#     .query("UserCommunityName_x != UserCommunityName_y")
#     .drop(columns=["ID_Article"])
#     .groupby(["UserCommunityName_x", "UserCommunityName_y"])
#     .sum()
#     .reset_index()
#     .sort_values("votes_p_n", ascending=False))

In [59]:
votes_with_article_id

Unnamed: 0,UserCommunityName,ID_Posting,VotePositive,VoteNegative,VoteCreatedAt,ID_Article
635359,Ute Putz,1041073683,1,0,2019-05-01,2000102295179
709690,Migrationstalent,1041075279,1,0,2019-05-01,2000102333198
688571,28543295-0d5c-4dc1-a702-89ad9aaca804,1041073494,1,0,2019-05-01,2000102346520
975888,Titeuf,1041066331,1,0,2019-05-01,2000102337910
975887,mauserle,1041066331,1,0,2019-05-01,2000102337910
...,...,...,...,...,...,...
1977724,Salzig,1041991992,1,0,2019-06-04,2000103797091
2461300,schrein,1042125748,1,0,2019-06-04,2000103903057
1707249,anexity,1041564226,0,1,2019-06-04,2000101983559
2668007,Die Gezeichneten,1042378727,1,0,2019-06-04,2000103620997


In [64]:
votes_with_article_id["num_interaction"]=votes_with_article_id.groupby("UserCommunityName")["VoteCreatedAt"].cumcount()

635359        0
709690        0
688571        0
975888        0
975887        0
           ... 
1977724     221
2461300    2138
1707249     238
2668007    2850
2398430     356
Length: 2721484, dtype: int64

In [17]:
graph = nx.Graph()
graph.add_nodes_from(votes_with_article_id["UserCommunityName"].unique())
graph.add_nodes_from(votes_with_article_id["ID_Article"].unique())
graph.add_edges_from(list(map(tuple, votes_with_article_id[["UserCommunityName", "ID_Article"]].values)))
graph = graph.to_undirected()

In [18]:
import itertools

data = {}
article_ids = votes_with_article_id["ID_Article"].unique()
for idx, article in enumerate(article_ids):
    print(int((idx/len(article_ids))*100), "%", end="\r")
    users_commented =list(graph.neighbors(article))
    for uu_tuple in itertools.product(users_commented, users_commented):
        if uu_tuple[0] != uu_tuple[1]:
            if uu_tuple[0] > uu_tuple[1]:
                uu_tuple = (uu_tuple[1], uu_tuple[0])
            if uu_tuple in data:
                data[uu_tuple] += 1
            else:
                data[uu_tuple] = 1


99 %%

Get the amount of common articles two users posted on

In [31]:
data[('Heckscheibenwischer', 'Heinz Fettleber')]

146

## 2. Vote similarity
#### "Users are similar when they upvote posts of the same author"
Too complex -> jupyter kernel dies TODO remove or try on jupyerlab

In [None]:
vote_author_posting_author = pd.merge(votes[["UserCommunityName", "ID_Posting", "VotePositive", "VoteNegative"]], postings[[
               "ID_Posting", "UserCommunityName",  "PostingCreatedAt"]], on=["ID_Posting"])\
                .rename(columns={"UserCommunityName_x": "vote_author", "UserCommunityName_y": "posting_author"})\
                        .groupby(["vote_author", "posting_author"]).sum().reset_index()

# same vote authors and posting authors
number_of_votes_per_author = vote_author_posting_author.assign(votes = lambda x: x.VotePositive + x.VoteNegative).groupby(["posting_author"])["votes"].sum()
influental_authors = number_of_votes_per_author[number_of_votes_per_author> 5].reset_index()["posting_author"]
print(f"Proportion of authors having more than 5 votes: {influental_authors.shape[0]/number_of_votes_per_author.shape[0]}")



CAUTION!!

In [None]:
votes_simmilarity = (vote_author_posting_author.assign(votes_p_n = lambda x: - x.VoteNegative + x.VotePositive)
    [["vote_author",    "posting_author", "votes_p_n"]]
    .merge(vote_author_posting_author[["vote_author",    "posting_author"]], on=["posting_author"], how="inner")
    .query("vote_authory_x != vote_author_y")
    .drop(columns=["ID_Posting"])
    .groupby(["vote_author_x", "vote_author_y"])
    .sum()
    .reset_index()
    .sort_values("votes_p_n", ascending=False))

## Negative same downvotes

## Positive same upvotes 

#### Users are similar when the use similar words
idk