In [1]:
import numpy as np
import networkx as nx
import pandas as pd
import matplotlib.pyplot as plt
from scripts.load_data import load_postings, load_votes, get_first_contact_df, subset_users

# reload imports jupyter magic
%load_ext autoreload
%autoreload 2

In [2]:
votes = load_votes("input/")
postings = load_postings("input/")
votes = votes.sort_values("VoteCreatedAt")
postings["num_interactions"] = postings.groupby("UserCommunityName")["PostingCreatedAt"].cumcount()

Votes loaded
Postings loaded


# Subsetting data

We don't want to have the following users
- users that interacted with only a few posts
- users that interacted with many posts, but in few days creating a skewed distribution

Therefore we set a threshold on the number of days a user has to interact minimum

Filtering users that interacted at the middle of the interval. Possible future extension: pick time interval instead of day

In [3]:
user_selection = subset_users(votes, postings, "both", num_days_min=30, firt_interaction_middle=True)

  return umr_minimum(a, axis, None, out, keepdims, initial, where)


In [4]:
user_selection.nunique() /  pd.concat([votes["UserCommunityName"], postings["UserCommunityName"]]).nunique()

0.2630704409233873

# Defining a notion of similarity

## 1. Comment article similarity
#### "Users are similar when they comments under the same articles (or articles of the same ressort)"

Todo: aggregate postings based on author and target (e.g., article, channel, ressort). Either count or just stay binary

In [4]:
selected_postings = postings[postings["UserCommunityName"].isin(user_selection)].sort_values("PostingCreatedAt")
selected_postings["UserCommunityName"] = "user_" + selected_postings["UserCommunityName"]


In [5]:
import itertools
# one can adapt this with e.g. ressort instead of article

def create_graph(df, article_or_ressort, user="UserCommunityName"):
    graph = nx.Graph()
    graph.add_nodes_from(df[article_or_ressort].unique())
    graph.add_nodes_from(df[user].unique())
    graph.add_edges_from(list(map(tuple, df[[article_or_ressort, user]].drop_duplicates().values)))
    graph = graph.to_undirected()
    return graph

def compute_overlap(graph, df, article_or_ressort,verbose=False):
    uu_overlap = {}
    article_ids = df[article_or_ressort].unique()
    for idx, article in enumerate(article_ids):
        if verbose: print(round((idx/len(article_ids))*100), "%", end="\r")
        users_commented =list(graph.neighbors(article))
        for uu_tuple in itertools.product(users_commented, users_commented):
            if uu_tuple[0] != uu_tuple[1]:
                if uu_tuple[0] > uu_tuple[1]:
                    uu_tuple = (uu_tuple[1], uu_tuple[0])
                if uu_tuple in uu_overlap:
                    uu_overlap[uu_tuple] += 1
                else:
                    uu_overlap[uu_tuple] = 1
                    
    return uu_overlap


In [6]:
def user_lookup_df(df, article_or_ressort):
    user_num_articles = df[["UserCommunityName", article_or_ressort]].drop_duplicates()\
        .groupby(["UserCommunityName"]).size().to_frame()
    # make dict of users and the number of articles they commented on
    user_num_articles = dict(zip(user_num_articles.index, user_num_articles[0]))
    return user_num_articles

Get the amount of common articles two users posted on

In [7]:
def compute_similarity(uu_overlap, user_num_articles, chunckIdx):
    similarities = []
    for uu_tuple in uu_overlap.keys():
        overlap = uu_overlap[uu_tuple]
        try:
            union = user_num_articles[uu_tuple[0]] + user_num_articles[uu_tuple[1]]
        except:
            print(uu_tuple)
        similarities += [[uu_tuple[0],uu_tuple[1], overlap / union]]
    return pd.DataFrame(similarities, columns=["A", "B", f"Similarity_{chunckIdx}"]).set_index(["A", "B"])


In [8]:
graph_article = create_graph(selected_postings, "ID_Article", "UserCommunityName")
uu_overlap_article = compute_overlap(graph_article, selected_postings, "ID_Article")
user_num_articles = user_lookup_df(selected_postings, "ID_Article")
similarity_table_article = compute_similarity(uu_overlap_article, user_num_articles)
similarity_table_article.to_csv("output/similarity_table_article_all.csv", index=False)
del similarity_table_article, graph_article, uu_overlap_article, user_num_articles

TypeError: compute_similarity() missing 1 required positional argument: 'chunckIdx'

In [8]:
def compute_time_base_similiarities(selected_postings, article_or_ressort, num_chunks=30):
    #chunks = []
    for chunckIdx, subset_df  in enumerate(np.array_split(selected_postings,num_chunks)):
        print(round(chunckIdx/num_chunks) *100, " %", end="\r")
        graph_ressort = create_graph(subset_df, article_or_ressort, "UserCommunityName")
        uu_overlap_ressort = compute_overlap(graph_ressort, subset_df, article_or_ressort)
        user_num_article_or_ressort = user_lookup_df(subset_df, article_or_ressort)
        similarity_table_ressort = compute_similarity(uu_overlap_ressort, user_num_article_or_ressort,chunckIdx)
        #chunks += [similarity_table_ressort]
        yield similarity_table_ressort

In [9]:
time_similarity_df = compute_time_base_similiarities(selected_postings, "ID_Article",15)
time_similarity_df = pd.concat(time_similarity_df, axis=1)
time_similarity_df.to_csv("output/time_similarity_table_ressort.csv", index=False)

100  %

: 

: 