In [1]:
import numpy as np
import networkx as nx
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
!conda create --name sna --file environment.yml

In [2]:
postings_1 = pd.read_csv('input/Postings_01052019_15052019.csv', sep=';')
postings_2 = pd.read_csv('input/Postings_16052019_31052019.csv', sep=';')
votes_1 = pd.read_csv('input/Votes_01052019_15052019.csv', sep=';')
votes_2 = pd.read_csv('input/Votes_16052019_31052019.csv', sep=';')
follow_ignore = pd.read_csv("input/Following_Ignoring_Relationships_01052019_31052019.csv", sep=";")

# Merge the two datasets
postings = pd.concat([postings_1, postings_2])
votes = pd.concat([votes_1, votes_2])

votes["VoteCreatedAt"] = pd.to_datetime(votes["VoteCreatedAt"], format="%Y-%m-%d %H:%M:%S").dt.date
postings["PostingCreatedAt"] = pd.to_datetime(postings["PostingCreatedAt"], format="%Y-%m-%d %H:%M:%S").dt.date

# Subsetting data

Filtering users that interacted at the middle of the interval. Possible future extension: pick time interval instead of day

In [3]:
# Find u-u tuples with their first date of interaction by vote
first_contact_vote_pairs = (votes[["UserCommunityName", "UserCreatedAt", "ID_Posting", "VoteCreatedAt"]]
 .merge(postings[["ID_Posting", "UserCommunityName", "UserCreatedAt"]], on=["ID_Posting"], how="left")
 [["UserCommunityName_x", "UserCommunityName_y", "VoteCreatedAt"]]
 .sort_values("VoteCreatedAt")
 .groupby(["UserCommunityName_x", "UserCommunityName_y"])
 .first()
 .reset_index())

first_contact_reply_pairs = (postings.dropna(subset=["ID_Posting_Parent"])[
 ["UserCommunityName", "ID_Posting_Parent", "PostingCreatedAt"]]
 .merge(postings[["ID_Posting", "UserCommunityName"]], left_on=["ID_Posting_Parent"], right_on=["ID_Posting"], how="left")
 [["UserCommunityName_x", "UserCommunityName_y", "PostingCreatedAt"]]
 .sort_values("PostingCreatedAt")
 .groupby(["UserCommunityName_x", "UserCommunityName_y"])
 .first()
 .reset_index())



Now we have a directed relationship: Person x contacted person y. We want to map this unidirectional.

In [4]:
def apply_bidirectionality(df, on="VoteCreatedAt"):
    """Here we account for bidirectionality of the contact pairs. The problem is that if user A comments user B's post,
     but user B previously commented user A's post we have 2 rows with (eventually) different dates. We take the minimum and leciographycally sort the usernames for enabling joining"""
    inv_df = (df.merge(df, left_on=["UserCommunityName_x", "UserCommunityName_y"], right_on=["UserCommunityName_y", "UserCommunityName_x"], suffixes=("", "_inv"), how="left")
    .fillna(pd.to_datetime("2050-01-01", format="%Y-%m-%d").date()))
    inv_df[f"{on}_bidirectional"] = inv_df[[f"{on}_inv",on]].min(axis=1)
    inv_df[["UserCommunityName_x", "UserCommunityName_y"]] = np.sort(inv_df[["UserCommunityName_x", "UserCommunityName_y"]], axis=1)
    return inv_df[["UserCommunityName_x", "UserCommunityName_y", f"{on}_bidirectional"]].drop_duplicates()

In [5]:
first_contact_vote_pairs_bd = apply_bidirectionality(first_contact_vote_pairs, "VoteCreatedAt")
first_contact_reply_pairs_bd = apply_bidirectionality(first_contact_reply_pairs, "PostingCreatedAt")

In [6]:
fist_contact = (first_contact_reply_pairs_bd.merge(first_contact_vote_pairs_bd, on=["UserCommunityName_x", "UserCommunityName_y"], how="outer")
    .fillna(pd.to_datetime("2050-01-01", format="%Y-%m-%d").date())) # We set a date in the future to avoid problems with the min function
fist_contact["first_contact"] = fist_contact[[
    "PostingCreatedAt_bidirectional", "VoteCreatedAt_bidirectional"]].min(axis=1)


In [7]:
time_span = fist_contact["first_contact"].max() - fist_contact["first_contact"].min()
half_time_span = time_span/2
middle = (fist_contact["first_contact"].min() + half_time_span)
middle

datetime.date(2019, 5, 18)

In [8]:
subset_user_pairs = fist_contact[fist_contact["first_contact"] == middle][["UserCommunityName_x", "UserCommunityName_y", "first_contact"]]

The user pairs can now be viewed as a set

In [9]:
selected_users = pd.concat([subset_user_pairs["UserCommunityName_x"], subset_user_pairs["UserCommunityName_y"]]).drop_duplicates()

1/3 of users had contact with someone for the first time in the middle of the interval

In [10]:
selected_users.nunique() /  pd.concat([votes["UserCommunityName"], postings["UserCommunityName"]]).nunique()

0.3341048009020236

# Defining a notion of similarity

## 1. Posting similarity
#### "Users are similar when they comment the same articles (or articles of the same ressort)"

Todo: aggregate postings based on author and target (e.g., article, channel, ressort). Either count or just stay binary

In [4]:
postings[["ID_Posting","PostingCreatedAt","PostingHeadline", "PostingComment", "UserCommunityName","ID_Article", "ArticleChannel", "ArticleRessortName"]]

Unnamed: 0,ID_Posting,PostingCreatedAt,PostingHeadline,PostingComment,UserCommunityName,ID_Article,ArticleChannel,ArticleRessortName
0,1041073586,2019-05-01 18:21:15.127,Das hat gestern bereits der Voggenhuber angefü...,schieder hatte dem inhaltlich nichts entgegenz...,Ravenspower,2000102330973,Inland,Parteien
1,1041073839,2019-05-01 18:28:22.040,,...und meinen Bezirk bekommst du als Erbe mit.,AlphaRomeo,2000102330973,Inland,Parteien
2,1041073872,2019-05-01 18:29:05.533,,"Nein, bei der ÖVP/FPÖ genauso passiert. Ich wo...",Hpolditsch,2000102330973,Inland,Parteien
3,1041080734,2019-05-01 22:37:56.010,Sie haben doch nichts gefordert??,sie haben nur die regierung kritisiert. das di...,Ravenspower,2000102330973,Inland,Parteien
4,1041080828,2019-05-01 22:42:06.310,Heute wäre der perfekte Tag für die SPÖ gewese...,"ihr noch nicht erfülltes versprechen, den silb...",Ravenspower,2000102330973,Inland,Parteien
...,...,...,...,...,...,...,...,...
395929,1042380731,2019-06-04 08:54:54.177,,Vermutlich gar keines...mir ist jedenfalls kei...,404 not found,2000103620997,User,Off-Topic
395930,1042381030,2019-06-04 09:04:32.037,,*winkt dankbar zur Gödelnummer* Du bist echt d...,404 not found,2000103620997,User,Off-Topic
395931,1042381528,2019-06-04 09:22:54.473,,"Die sind noch in Arbeit, aber der Surface läuf...",404 not found,2000103620997,User,Off-Topic
395932,1042381793,2019-06-04 09:31:45.077,,"Ich versteh das überhaupt nicht, warum so viel...",404 not found,2000103620997,User,Off-Topic


In [11]:
votes_with_article_id = (votes[votes["UserCommunityName"].isin(selected_users)][["UserCommunityName", "ID_Posting", "VotePositive", "VoteNegative"]].merge(postings[[
               "ID_Posting", "ID_Article",  "PostingCreatedAt"]], on=["ID_Posting"]))



postings_simmilarity = (votes_with_article_id
    .merge(votes_with_article_id[["UserCommunityName",    "ID_Article"]], on=["ID_Article"], how="inner")
    .query("UserCommunityName_x != UserCommunityName_y")
    .drop(columns=["ID_Article"])
    .groupby(["UserCommunityName_x", "UserCommunityName_y"])
    .sum()
    .reset_index()
    .sort_values("votes_p_n", ascending=False))

MemoryError: Unable to allocate 156. GiB for an array with shape (20967594396,) and data type int64

In [12]:
votes_with_article_id

Unnamed: 0,ID_CommunityIdentity,ID_Posting,VotePositive,VoteNegative,ID_Article,PostingCreatedAt
0,675862,1041076570,0,1,2000102330973,2019-05-01 20:04:07.580
1,689023,1041076570,0,1,2000102330973,2019-05-01 20:04:07.580
2,606376,1041076570,1,0,2000102330973,2019-05-01 20:04:07.580
3,24810,1041076745,1,0,2000102349577,2019-05-01 20:11:30.570
4,673781,1041076745,1,0,2000102349577,2019-05-01 20:11:30.570
...,...,...,...,...,...,...
3824979,694312,1042273551,1,0,2000103963403,2019-05-31 21:15:14.137
3824980,220003,1042275757,0,1,2000103475778,2019-05-31 22:47:05.570
3824981,220003,1042279910,0,1,2000103475778,2019-06-01 08:25:35.210
3824982,654563,1042304669,1,0,2000104167823,2019-06-01 22:32:21.463


In [13]:
graph = nx.Graph()
graph.add_nodes_from(votes_with_article_id["UserCommunityName"].unique())
graph.add_nodes_from(votes_with_article_id["ID_Article"].unique())
graph.add_edges_from(list(map(tuple, votes_with_article_id[["UserCommunityName", "ID_Article"]].values)))
graph = graph.to_undirected()

still takes ages

In [16]:
from scipy.sparse import  csr_matrix
import itertools

tuples = []
data = []
data = {}
article_ids = votes_with_article_id["ID_Article"].unique()
for idx, article in enumerate(article_ids):
    print(int((idx/len(article_ids))*100), "%", end="\r")
    users_commented =list(graph.neighbors(article))
    for uu_tuple in itertools.product(users_commented, users_commented):
        if uu_tuple[0] != uu_tuple[1]:
            if uu_tuple[0] > uu_tuple[1]:
                uu_tuple = (uu_tuple[1], uu_tuple[0])
            if uu_tuple in data:
                data[uu_tuple] += 1
            else:
                data[uu_tuple] = 1




0 %

KeyboardInterrupt: 

In [14]:
num_users = votes_with_article_id["UserCommunityName"].unique()
csr_matrix((data.values(), data.keys()), shape=(num_users,num_users))

TypeError: invalid input format

In [44]:
len(n)

729

## 2. Vote similarity
#### "Users are similar when they upvote posts of the same author"
Too complex -> jupyter kernel dies TODO remove or try on jupyerlab

In [None]:
vote_author_posting_author = pd.merge(votes[["UserCommunityName", "ID_Posting", "VotePositive", "VoteNegative"]], postings[[
               "ID_Posting", "UserCommunityName",  "PostingCreatedAt"]], on=["ID_Posting"])\
                .rename(columns={"UserCommunityName_x": "vote_author", "UserCommunityName_y": "posting_author"})\
                        .groupby(["vote_author", "posting_author"]).sum().reset_index()

# same vote authors and posting authors
number_of_votes_per_author = vote_author_posting_author.assign(votes = lambda x: x.VotePositive + x.VoteNegative).groupby(["posting_author"])["votes"].sum()
influental_authors = number_of_votes_per_author[number_of_votes_per_author> 5].reset_index()["posting_author"]
print(f"Proportion of authors having more than 5 votes: {influental_authors.shape[0]/number_of_votes_per_author.shape[0]}")



CAUTION!!

In [None]:
votes_simmilarity = (vote_author_posting_author.assign(votes_p_n = lambda x: - x.VoteNegative + x.VotePositive)
    [["vote_author",    "posting_author", "votes_p_n"]]
    .merge(vote_author_posting_author[["vote_author",    "posting_author"]], on=["posting_author"], how="inner")
    .query("vote_authory_x != vote_author_y")
    .drop(columns=["ID_Posting"])
    .groupby(["vote_author_x", "vote_author_y"])
    .sum()
    .reset_index()
    .sort_values("votes_p_n", ascending=False))

## Negative same downvotes

## Positive same upvotes 

#### Users are similar when the use similar words
idk