In [1]:
import numpy as np
import networkx as nx
import pandas as pd
import matplotlib.pyplot as plt
from scripts.load_data import load_postings, load_votes, get_first_contact_df, subset_users

# reload imports jupyter magic
%load_ext autoreload
%autoreload 2

random neo4j test: please ignore

In [None]:
!conda create --name sna --file environment.yml

In [None]:
!sh run-neo4j.sh

In [116]:
import py2neo

# Connect to the graph database
graph = py2neo.Graph("bolt://localhost:7687", auth=("neo4j", "sna"))


In [3]:
votes = load_votes("input/")
postings = load_postings("input/")
votes = votes.sort_values("VoteCreatedAt")
postings["num_interactions"] = postings.groupby("UserCommunityName")["PostingCreatedAt"].cumcount()

Votes loaded
Postings loaded


In [None]:
from py2neo import Graph, Node, Relationship

tx = graph.begin()
for index, row in votes.iterrows():
    user =  Node("User", UserCommunityName=row['UserCommunityName'], UserGender=row["UserGender"])
    posting =  Node("Posting", ID_Posting=row['ID_Posting'])
    vote = Relationship(user, "Vote", posting, VoteCreatedAt=row['VoteCreatedAt'], VotePositive=row['VotePositive'], VoteNegative=row['VoteNegative'])
    tx.create(vote)
graph.commit(tx)


tx = graph.begin()
for index, row in postings.iterrows():
    article =  Node("Article", ArticlePublishingDate=row['ArticlePublishingDate'], ArticleTitle=row["ArticleTitle"], ArticleChannel=row["ArticleChannel"], ArticleRessortName=row["ArticleRessortName"])
    # TODO make edge for resort and channel?
    posting =  Node("Posting", ID_Posting=row['ID_Posting'], PostingCreatedAt=row['PostingCreatedAt'], PostingComment=row['PostingComment'], PostingHeadline=row['PostingHeadline'])
    posting_parent = Node("Posting", ID_Posting=row['ID_Posting_Parent'])
    replies = Relationship(posting_parent, "Reply", posting)
    posts_on_article = Relationship(posting, "PostsOnArticle", article)
    posting_author= Node("User", UserCommunityName=row['UserCommunityName'], UserGender=row["UserGender"])
    comments=Relationship(posting_author, "Comments", posting)

    tx.merge(posting, "Article",[ "ArticleTitle", "ArticlePublishingDate"])
    tx.merge(article, "Posting", "ID_Posting")
    tx.merge(posting_parent, "Posting", "ID_Posting")
    tx.merge(posting_author, "User", "UserCommunityName")
    tx.create(replies)
    tx.create(comments)
    tx.create(posts_on_article)

graph.commit(tx)


In [None]:
graph.run("""
CREATE CONSTRAINT postingidunique
FOR (n:Posting) REQUIRE n.ID_Posting IS UNIQUE""")

graph.run("""
CREATE CONSTRAINT usernameunqiue
FOR (n:User) REQUIRE n.UserCommunityName IS UNIQUE
""")
#graph.run("CREATE CONSTRAINT ON (n:User) ASSERTS n.UserCommunityName IS UNIQUE")
#graph.run("CREATE INDEX ON :Posting(ID_Posting)")
#graph.run("CREATE INDEX ON :Article(ArticleTitle, ArticlePublishingDate)")

Visualize the graph at http://localhost:7474

In [168]:
# delete all nodes and edges
graph.run('''MATCH (n)
OPTIONAL MATCH (n)-[r]-()
DELETE n,r''')

In [167]:
graph.run('''MATCH (a:User)-[r:Vote]->(b:Posting) RETURN a.UserCommunityName, b.ID_Posting, r.VoteCreatedAt''').to_data_frame()

# Subsetting data

We don't want to have the following users
- users that interacted with only a few posts
- users that interacted with many posts, but in few days creating a skewed distribution

Therefore we set a threshold on the number of days a user has to interact minimum

Filtering users that interacted at the middle of the interval. Possible future extension: pick time interval instead of day

In [9]:
user_selection = subset_users(votes, postings, "both", num_days_min=30, firt_interaction_middle=True)

In [10]:
user_selection.nunique() /  pd.concat([votes["UserCommunityName"], postings["UserCommunityName"]]).nunique()

0.2630704409233873

# Defining a notion of similarity

## 1. Comment article similarity
#### "Users are similar when they comments under the same articles (or articles of the same ressort)"

Todo: aggregate postings based on author and target (e.g., article, channel, ressort). Either count or just stay binary

In [12]:
selected_postings = postings[postings["UserCommunityName"].isin(user_selection)].sort_values("PostingCreatedAt")


In [13]:
import itertools
# one can adapt this with e.g. ressort instead of article

graph = nx.Graph()
graph.add_nodes_from(selected_postings["UserCommunityName"].unique())
graph.add_nodes_from(selected_postings["ID_Article"].unique())
graph.add_edges_from(list(map(tuple, selected_postings[["UserCommunityName", "ID_Article"]].values)))
graph = graph.to_undirected()

uu_overlap = {}
article_ids = selected_postings["ID_Article"].unique()
for idx, article in enumerate(article_ids):
    print(round((idx/len(article_ids))*100), "%", end="\r")
    users_commented =list(graph.neighbors(article))
    for uu_tuple in itertools.product(users_commented, users_commented):
        if uu_tuple[0] != uu_tuple[1]:
            if uu_tuple[0] > uu_tuple[1]:
                uu_tuple = (uu_tuple[1], uu_tuple[0])
            if uu_tuple in uu_overlap:
                uu_overlap[uu_tuple] += 1
            else:
                uu_overlap[uu_tuple] = 1


100 %

In [14]:
user_num_articles = postings[["UserCommunityName", "ID_Article"]].drop_duplicates().groupby(["UserCommunityName"]).size().to_frame()
# make dict of users and the number of articles they commented on
user_num_articles = dict(zip(user_num_articles.index, user_num_articles[0]))

Get the amount of common articles two users posted on

In [15]:
overlap = uu_overlap[('Heckscheibenwischer', 'Heinz Fettleber')]
union = user_num_articles["Heinz Fettleber"] + user_num_articles["Heckscheibenwischer"]
overlap, (user_num_articles["Heinz Fettleber"], user_num_articles["Heckscheibenwischer"]) ,overlap / union

(82, (118, 461), 0.14162348877374784)

In [16]:
similarities = []
for uu_tuple in uu_overlap.keys():
    overlap = uu_overlap[uu_tuple]
    union = user_num_articles[uu_tuple[0]] + user_num_articles[uu_tuple[1]]
    similarities += [[uu_tuple[0],uu_tuple[1], overlap / union]]

In [None]:
similarity_table = pd.DataFrame(np.array(similarities), columns=["A", "B", "Similarity"])

In [None]:
similarity_table
# TODO split time 

(13111419, 3)