# Clustering analysis (based on indirect interactions)

In [1]:
from scipy.sparse import load_npz, save_npz, csgraph, coo_matrix
import seaborn as sns
import numpy as np
from sklearn.cluster import MiniBatchKMeans
from sklearn import metrics
import pandas as pd
import scipy.sparse.linalg as sps_linalg
import itertools as itt
import altair as alt
from collections import Counter
from matplotlib import pyplot as plt
import polars as pl

# Base directory (relative to this notebook) from which all file paths below are built.
path = "../../../data/users/summaries/combined/"
# Adjacency matrix file; loaded with load_npz in the next section.
adj_matrix_path = path + 'adj_matrix-indirects-min-100.npz'
# Per-user statistics CSV.
user_stats_path = path + 'user_stats.csv'
# Per-user results CSV. This notebook both reads it and writes it.
results_path = path + 'indirect_user_results.csv'

## Indirect user interactions

Here, we explore indirect user interactions. An indirect interaction is defined as two users being active on the same thread.

In [2]:
#load the adjacency matrix
# Converted to LIL format, presumably so the diagonal can be overwritten cheaply below.
adj_matrix = load_npz(adj_matrix_path).tolil()
adj_matrix.setdiag(0) #set diagonals to zero to remove any "self-interactions"
# Dense array with the first three rows and columns dropped.
# NOTE(review): this relies on the skip user and the two bots occupying indices 0-2
# of the stored matrix -- confirm against the code that built it.
A = adj_matrix.toarray()[3:,3:] #exclude skip user and the two bots

## Clustering

We run spectral clustering and explore how well it performs with up to 30 clusters. 

In [3]:
def spectral_clustering(A, max_clusters=10):
    """Spectral-cluster a weighted adjacency matrix and score each candidate k.

    Builds the unnormalised graph Laplacian ``L = D - A``, takes the
    eigenvectors of its smallest eigenvalues and, for every ``k`` in
    ``2..max_clusters``, runs MiniBatchKMeans on the first ``k`` eigenvectors.

    Parameters
    ----------
    A : array-like, shape (n, n)
        Adjacency matrix. Edge weights may be integers or floats.
    max_clusters : int, default 10
        Largest number of clusters to evaluate.

    Returns
    -------
    selected_vecs : ndarray
        Eigenvectors of the ``max_clusters + 1`` smallest eigenvalues,
        one eigenvector per column.
    results : list of (k, silhouette score)
        One entry per evaluated number of clusters.
    sizes : list of (k, [(rank, cluster_size), ...])
        Cluster sizes sorted from largest to smallest. ``rank`` is the
        position in that ordering, not the k-means label.
    eigvalues : list of (eigenvalue, index)
        All eigenvalues of the Laplacian in ascending order.
    """

    #calculate the Graph Laplacian manually
    D = np.diag(np.sum(A, axis=1))
    # Keep the Laplacian in floating point. Casting it to int truncates
    # fractional edge weights (e.g. a normalised adjacency matrix) and yields
    # a different matrix whose rows no longer sum to zero.
    L = np.subtract(D, A).astype(float)

    print("Computing eigenvalues..")
    # eigh returns eigenvalues in ascending order with matching eigenvector columns.
    # NOTE(review): eigh assumes a symmetric matrix and reads only one triangle,
    # so an asymmetric A is silently treated as symmetric -- confirm the inputs.
    vals, vecs = np.linalg.eigh(L)
    selected_vecs = vecs[:, :max_clusters + 1]  #vectors of the n-smallest eigenvalues
    eigvalues = list(zip(vals, itt.count()))

    print("Running K-means..")
    #Try Kmeans and report scores
    results = []
    sizes = []
    for k in range(2, max_clusters + 1):
        embedding = selected_vecs[:, :k]
        model = MiniBatchKMeans(n_clusters=k, max_iter=1000, random_state=42).fit(embedding)
        cluster_sizes = sorted(Counter(model.labels_).values(), reverse=True)
        sizes.append((k, list(enumerate(cluster_sizes))))
        results.append((k, metrics.silhouette_score(embedding, model.labels_)))

    return selected_vecs, results, sizes, eigvalues

In [4]:
# Evaluate 2..30 clusters on the raw interaction matrix A.
selected_vecs, results, sizes, eigvalues = spectral_clustering(A, max_clusters=30)

Computing eigenvalues..
Running K-means..


In [5]:
def plot_cluster_diagnostics(results, sizes, eigvalues):
    """Build a three-panel Altair chart summarising a clustering run.

    Panels, left to right: the first 30 entries of ``eigvalues`` as a line,
    the silhouette score for each number of clusters as points, and stacked
    bars of cluster sizes for each number of clusters.

    Parameters
    ----------
    results : iterable of (number of clusters, silhouette score)
    sizes : iterable of (number of clusters, [(cluster, cluster size), ...])
    eigvalues : iterable of (eigenvalue, index)

    Returns
    -------
    The three charts concatenated horizontally.
    """
    # Panel 1: how the eigenvalues evolve
    eigen_df = pd.DataFrame(eigvalues, columns=['eigenvalue', 'index']).head(30)
    eigen_chart = (
        alt.Chart(eigen_df)
        .mark_line()
        .encode(x='index', y='eigenvalue')
        .properties(title="Eigenvalue magnitude", width=300)
    )

    # Panel 2: silhouette score per number of clusters
    clustering_performance = pd.DataFrame(results, columns=["No_clusters", "Silhouette score"])
    silhouette_chart = (
        alt.Chart(clustering_performance)
        .mark_point()
        .encode(x="No_clusters", y='Silhouette score')
        .properties(title="Silhouette scores", width=300)
        .interactive()
    )

    # Panel 3: one row per (number of clusters, cluster); unpack the (cluster, size) pairs
    cl_df = pd.DataFrame(sizes, columns=['no_clusters', 'temp']).explode('temp')
    cl_df['cluster'] = cl_df['temp'].map(lambda pair: pair[0])
    cl_df['cluster_size'] = cl_df['temp'].map(lambda pair: pair[1])
    sizes_chart = (
        alt.Chart(cl_df)
        .mark_bar()
        .encode(
            x='no_clusters:O',
            y='cluster_size',
            color=alt.Color('cluster:O', scale=alt.Scale(scheme='dark2')),
        )
        .properties(title="Cluster sizes", width=300)
    )

    return (eigen_chart | silhouette_chart | sizes_chart)

In [6]:
# Render the eigenvalue, silhouette and cluster-size panels for the run above.
plot_cluster_diagnostics(results, sizes, eigvalues)

Based on the above outputs, we can see that there aren't really any strong clusters. In cases where silhouette scores are high, most users simply belong to a single cluster. We'll use 12 clusters for analysis as it is the highest cluster number that still has a good silhouette score.

In [7]:
# Number of clusters used for the rest of the analysis.
optimal_clusters = 12
# Cluster on the first `optimal_clusters` eigenvector columns, with the same
# MiniBatchKMeans settings that spectral_clustering uses.
vecs = selected_vecs[:,:optimal_clusters]
cluster_model = MiniBatchKMeans(n_clusters=optimal_clusters, max_iter=1000, random_state=42).fit(vecs)

In [8]:
# Load the per-user statistics, drop the skip user and the two bots, and derive
# per-item karma averages plus the activity window (last_date - first_date, / 3600 / 24).
users = pl.read_csv(user_stats_path) \
    .filter((pl.col("user_name") != "__SKIP__") &  (pl.col("user_name") != "AutoModerator") &  (pl.col("user_name") != "MAGIC_EYE_BOT")) \
    .with_columns([
        (pl.col("post_karma") / pl.col("no_posts")).alias("avg_post_karma"),
        (pl.col("comment_karma") / pl.col("no_comments")).alias("avg_comment_karma"),
        ((pl.col("last_date") - pl.col("first_date")) / 3600 / 24).alias("activity_window")
    ])
# Keep users with at least 100 recorded activities.
selected_users = users.filter(pl.col("total_activity") >= 100)

# NOTE(review): this immediately replaces the frame built above with the contents of
# results_path, a file that this notebook itself writes further down. On a fresh
# Restart & Run All the file must already exist -- confirm which source is intended.
selected_users = pl.read_csv(results_path)

In [9]:
def get_basic_stats(df):
    """Summarise user attributes per cluster.

    Groups ``df`` by its ``cluster`` column, computes user counts and
    mean/median statistics for posts, comments, activity window and karma per
    post/comment, keeps only clusters with more than 100 users, and returns
    the table transposed so that each statistic is a row (labelled in the
    ``Statistic`` column).
    """
    # Expressions that are aggregated more than once below
    activity_days = (pl.col("last_date") - pl.col("first_date")) / 3600 / 24
    karma_per_post = (
        pl.when(pl.col("no_posts") > 0)
        .then(pl.col('post_karma') / pl.col('no_posts'))
        .otherwise(0)
    )
    karma_per_comment = (
        pl.when(pl.col("no_comments") > 0)
        .then(pl.col('comment_karma') / pl.col('no_comments'))
        .otherwise(0)
    )

    aggregations = [
        pl.col("no_posts").count().alias("Total number of users"),
        pl.col("no_posts").mean().alias("Avg. posts per user"),
        pl.col("no_posts").median().alias("Median posts per user"),
        pl.col("no_comments").mean().alias("Avg. comments per user"),
        pl.col("no_comments").median().alias("Median comments per user"),
        activity_days.median().alias("Median activity window (days)"),
        activity_days.mean().alias("Mean activity window (days)"),
        karma_per_post.mean().alias("Mean post karma"),
        karma_per_comment.mean().alias("Mean comment karma"),
        karma_per_post.median().alias("Median post karma"),
        karma_per_comment.median().alias("Median comment karma"),
    ]

    per_cluster = df.lazy().groupby("cluster").agg(aggregations).collect()
    # Only clusters with more than 100 users are reported
    large_clusters = per_cluster.filter(pl.col("Total number of users") > 100)
    return large_clusters.transpose(include_header=True, header_name="Statistic")

In [10]:
# Attach the k-means label of every user as a new column.
# NOTE(review): this assumes the row order of selected_users matches the row order
# of the matrix that was clustered -- confirm.
selected_users['cluster'] = cluster_model.labels_
get_basic_stats(selected_users)



Statistic,column_0,column_1,column_2,column_3,column_4,column_5
str,f64,f64,f64,f64,f64,f64
"""cluster""",8.0,7.0,4.0,10.0,9.0,1.0
"""Total number of users""",475.0,295.0,1026.0,4995.0,792.0,174.0
"""Avg. posts per user""",4.058947,14.305085,3.643275,2.758158,3.296717,5.155172
"""Median posts per user""",1.0,2.0,1.0,1.0,1.0,1.0
"""Avg. comments per user""",391.233684,289.6033,177.840156,227.163964,155.959596,141.936782
"""Median comments per user""",223.0,178.0,144.0,159.0,132.0,124.5
"""Median activity window (days)""",109.0,846.833333,106.958333,127.0,105.0,73.0625
"""Mean activity window (days)""",182.5319,835.2943,168.039189,224.533475,191.182029,160.331418
"""Mean post karma""",227.515637,217.484664,662.864475,262.897179,511.721287,2583.804363
"""Mean comment karma""",20.881872,11.228098,20.8391,17.978879,17.188183,14.931293


We can see that there is considerable variation in the attributes describing the clusters. Are the clusters different enough to be meaningful? We use the Mann-Whitney U test for pairwise comparison of clusters across multiple attributes.

In [11]:
from scipy.stats import mannwhitneyu
import itertools as itt
from collections import defaultdict

# Clusters to compare and the per-user attributes to compare them on
clusters = [1, 4, 7, 8, 9, 10]
vars = ["avg_post_karma", "no_posts", "no_comments", "avg_comment_karma", "activity_window", "total_activity"]

# One Mann-Whitney U test per (attribute, unordered cluster pair)
results = []
for v, (c1, c2) in itt.product(vars, itt.combinations(clusters, 2)):
    vals1, vals2 = (
        selected_users.filter(pl.col("cluster") == c).select(v).to_numpy()
        for c in (c1, c2)
    )
    test = mannwhitneyu(vals1, vals2, nan_policy = 'omit')
    # the samples are single-column 2-D arrays, so pvalue holds one element
    results.append((v, c1, c2, test.pvalue[0]))

df = pd.DataFrame(results, columns = ['variable', 'clusterX', 'clusterY', 'p-value'])
df['significant'] = df['p-value'] < 0.0001

# For every attribute, count how many significant comparisons each cluster takes part in
differences = defaultdict(lambda: defaultdict(int))

for t in df[df['significant']].itertuples():
    for member in (t.clusterX, t.clusterY):
        differences[t.variable][member] += 1


In [12]:
# Rows are clusters, columns are attributes; each cell counts significant comparisons
df_diffs = pd.DataFrame(dict(differences))
df_diffs = df_diffs.assign(total_diffs=df_diffs.sum(axis=1))

# Clusters with the most significant differences first
df_diffs.sort_values(by="total_diffs", ascending=False)

Unnamed: 0,avg_post_karma,no_posts,no_comments,avg_comment_karma,activity_window,total_activity,total_diffs
10,2.0,4,4,2,5,5,22.0
7,2.0,4,4,3,5,4,22.0
1,2.0,1,4,5,5,4,21.0
8,2.0,2,5,3,3,4,19.0
4,,2,5,1,3,5,16.0
9,,1,4,2,3,4,14.0


Using a significance threshold of `0.0001` for the p-values, we find that the clusters are indeed different. Specifically, clusters 7 and 10 are the most different, with differences being significant in 22 out of 30 comparisons (6 features, each compared against the 5 other clusters). Cluster 7 seems the most interesting, as it represents users that have been around the longest, with highest average / median karma scores.

## Clustering based on normalized user interactions

What if the same analysis were instead normalized by total user activity? We adjust the adjacency matrix by normalizing by the total activity that each user has within the top-user network, so that the edges represent the relative weight of interactions among users rather than the absolute number of interactions.

In [13]:
# Divide every entry by the activity totals and express it as a percentage.
# np.divide(..., where=...) skips zero totals, so those entries stay 0 and no
# divide-by-zero RuntimeWarning is raised. np.nan_to_num(x, 0) is avoided because its
# second positional argument is `copy`, not the NaN replacement value.
# NOTE(review): activity_totals has shape (n,), so broadcasting divides column j by the
# total of row j. That is a per-user normalisation only if A is symmetric, and the
# result itself is not symmetric -- confirm whether np.sum(A, axis=1, keepdims=True)
# (row-wise normalisation) was intended.
activity_totals = np.sum(A, axis=1)
norm_A = np.divide(
    A,
    activity_totals,
    out=np.zeros(A.shape, dtype=float),
    where=activity_totals != 0,
) * 100

  norm_A = np.nan_to_num(A / np.sum(A, axis=1), 0) * 100


In [14]:
# Same evaluation as before, now on the normalised matrix.
# This rebinds selected_vecs, results, sizes and eigvalues.
selected_vecs, results, sizes, eigvalues = spectral_clustering(norm_A, max_clusters=30)

Computing eigenvalues..
Running K-means..


In [15]:
# Diagnostics for the clustering of the normalised matrix.
plot_cluster_diagnostics(results, sizes, eigvalues)

We can see that there are no meaningful clusters at all...

## Influencers as based on page-rank analysis
Next, we explore whether the user interaction network reveals any important players, as indicated by the PageRank algorithm.

In [3]:
import networkx as nx
# NOTE(review): norm_A is recomputed here but not used below -- the graph is built
# from the raw matrix A. Confirm which matrix was intended.
norm_A = np.nan_to_num(A / np.sum(A, axis=1), 0) * 100
# Build a graph from the adjacency matrix and compute a PageRank score per node.
# NOTE(review): from_numpy_matrix and pagerank_scipy appear to have been removed in
# NetworkX 3.0 -- confirm the pinned NetworkX version.
G = nx.from_numpy_matrix(A)
pageranks = nx.pagerank_scipy(G, max_iter=100)

  norm_A = np.nan_to_num(A / np.sum(A, axis=1), 0) * 100


In [9]:
# Attach the PageRank scores as a new column.
# NOTE(review): assumes the order of pageranks' values matches the row order of
# selected_users -- confirm.
selected_users['indirect_pg_rank'] = np.array(list(pageranks.values()))
# Days between each user's first_date and a fixed reference timestamp.
# NOTE(review): if these are Unix epoch seconds, 1641790800 is 2022-01-10 05:00 UTC,
# which is earlier than the last_date values shown in the output below -- confirm the
# intended cut-off.
selected_users = selected_users.with_column(((1641790800 - pl.col("first_date")) / 3600 / 24).alias("longevity"))

Looking at the 10 best-connected users, we can see that they have really high average post and comment karma scores. Also, the majority of them are relatively new, with most of them having participated in the subreddit for less than 100 days.

In [18]:
# Persist the enriched user table; the same file is read back near the top of the notebook.
selected_users.write_csv(results_path)

In [10]:
# Ten users with the highest PageRank score in the indirect-interaction network.
selected_users.sort(pl.col("indirect_pg_rank"), reverse=True).head(10)

user_name,no_posts,no_comments,post_karma,comment_karma,first_date,last_date,total_activity,avg_post_karma,avg_comment_karma,activity_window,pg_rank,longevity,indirect_pg_rank
str,i64,i64,i64,i64,i64,i64,i64,f64,f64,f64,f64,f64,f64
"""DirtyPenPalDoug""",27,2688,1221,47963,1611266400,1643587200,2715,45.222222,17.843378,374.083333,0.0014,353.291667,0.0014
"""BigAlTrading""",9,2475,3176,56511,1591243200,1643241600,2484,352.888889,22.832727,601.833333,0.001043,585.041667,0.001043
"""baconraygun""",0,2062,0,13668,1635292800,1643587200,2062,,6.628516,96.0,0.000982,75.208333,0.000982
"""Geminii27""",0,1707,0,18896,1401163200,1643673600,1707,,11.069713,2806.833333,0.000888,2785.041667,0.000888
"""The_Quicktrigger""",1,2653,0,66087,1632873600,1643414400,2654,0.0,24.9102,122.0,0.000867,103.208333,0.000867
"""notislant""",1,828,12,17362,1624654800,1643673600,829,12.0,20.968599,220.125,0.000857,198.333333,0.000857
"""rustys_shackled_ford""",24,1677,11031,43578,1633996800,1643673600,1701,459.625,25.985689,112.0,0.000855,90.208333,0.000855
"""YeOldeBilk""",3,862,51,49381,1618779600,1643587200,865,17.0,57.286543,287.125,0.000837,266.333333,0.000837
"""Optimal-Scientist233""",25,1317,285,14065,1635465600,1643587200,1342,11.4,10.679575,94.0,0.000835,73.208333,0.000835
"""kylorensgrandfather""",22,953,10755,14156,1634428800,1643673600,975,488.863636,14.854145,107.0,0.000764,85.208333,0.000764


This raises an interesting question - is posting quality associated with how well connected a user is? To some degree, this is expected as more popular posts/comments attract more attention. But given that we are looking just at direct interactions, it is not an obvious relationship.

In [8]:
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Log-log OLS of PageRank on activity counts, average karma, longevity and activity window.
# The small constants are added before taking logs, presumably to keep log() finite
# for zero-valued inputs.
# NOTE(review): the dependent variable is `pg_rank`, not the `indirect_pg_rank` column
# computed in this notebook -- confirm which score is meant.
# NOTE(review): avg_post_karma is missing for users without posts (see the table above)
# and log() is undefined for negative averages; such rows are presumably dropped from
# the fit -- confirm.
formula = """
np.log(pg_rank) ~ 
np.log(no_posts + 0.001) + np.log(no_comments + 0.001) + 
np.log(avg_post_karma + 0.001) +  np.log(avg_comment_karma + 0.001) + 
np.log(longevity + 0.0001) + np.log(activity_window + 0.001)"""

# This rebinds `results`, which earlier held clustering scores and test results.
results = smf.ols(formula, data=selected_users.to_pandas()).fit()
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:        np.log(pg_rank)   R-squared:                       0.264
Model:                            OLS   Adj. R-squared:                  0.263
Method:                 Least Squares   F-statistic:                     247.5
Date:                Fri, 15 Apr 2022   Prob (F-statistic):          3.81e-271
Time:                        17:42:48   Log-Likelihood:                -3934.0
No. Observations:                4149   AIC:                             7882.
Df Residuals:                    4142   BIC:                             7926.
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                                        coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------------
Interc

  result = getattr(ufunc, method)(*inputs, **kwargs)


We find that connectedness is indeed related to content quality. Higher average post and comment karma is associated with higher connectedness, even when controlling for the total number of posts and comments made. Interestingly, longevity on the subreddit does not matter, while users who have participated only during a shorter time window seem to have a higher PageRank, on average. This may indicate that users with higher connectedness are largely "one-off" wonders who contribute a few items of popular content over a short period of time and remain inactive afterwards.