# Clustering analysis (based on direct interactions)

In [1]:
from scipy.sparse import load_npz, save_npz, csgraph, coo_matrix
import seaborn as sns
import numpy as np
from sklearn.cluster import MiniBatchKMeans
from sklearn import metrics
import pandas as pd
import scipy.sparse.linalg as sps_linalg
import itertools as itt
import altair as alt
from collections import Counter
from matplotlib import pyplot as plt
import polars as pl

path = "../../../data/users/summaries/combined/"
adj_matrix_path = path + 'adj_matrix-indirects-min-100.npz'
user_stats_path = path + 'user_stats.csv'

## Indirect user interactions

Here, we explore indirect user interactions (a indirect interaction is defined as an interaction when two users are active on the same thread)

In [2]:
#load the adjacency matrix
adj_matrix = load_npz(adj_matrix_path).tolil()
adj_matrix.setdiag(0) #set diagonals to zero to remove any "self-interactions"
A = adj_matrix.toarray()[3:,3:] #exclude skip user and the two bots

## Clustering

We run spectral clustering and explore how well it performs with up to 30 clusters. 

In [3]:
def spectral_clustering(A, max_clusters=10):

    #calculate the Graph Laplacian manually
    D = np.diag(np.sum(A, axis=1))
    L = np.subtract(D, A)
    L = L.astype(int)

    print("Computing eigenvalues..")
    vals, vecs = np.linalg.eigh(L)    #retrieve eigenvalues
    selected_vecs = vecs[:, range(max_clusters + 1)] #get the vectors of n-smallest eigenvalues
    eigvalues = list(zip(vals, itt.count()))

    print("Running K-means..")
    #Try Kmeans and report scores
    results = []
    sizes = []
    for k in range(2, max_clusters + 1):
        vecs = selected_vecs[:,:k]
        clusters = MiniBatchKMeans(n_clusters=k, max_iter=1000, random_state=42).fit(vecs)
        cluster_sizes = sorted(Counter(clusters.labels_).values(), reverse=True)
        sizes.append((k, list(zip(itt.count(), cluster_sizes))))
        s = metrics.silhouette_score(vecs, clusters.labels_)    
        results.append((k, s))

    return selected_vecs, results, sizes, eigvalues

In [4]:
selected_vecs, results, sizes, eigvalues = spectral_clustering(A, max_clusters=30)

Computing eigenvalues..
Running K-means..


In [4]:
def plot_cluster_diagnostics(results, sizes, eigvalues):

    #plot eigenvalues to understand how they evolve
    c1 = alt.Chart(pd.DataFrame(eigvalues, columns=['eigenvalue', 'index']).head(30)).mark_line().encode(
        x='index',
        y='eigenvalue'
    ).properties(title="Eigenvalue magnitude", width=300)


    #plot silhouette scores
    clustering_performance = pd.DataFrame(results, columns=["No_clusters", "Silhouette score"])
    c2 = alt.Chart(clustering_performance).mark_point().encode(
        x="No_clusters",
        y='Silhouette score',
    ).properties(title="Silhouette scores", width=300).interactive()


    cl_df = pd.DataFrame(sizes, columns=['no_clusters', 'temp']).explode('temp')
    cl_df['cluster'] = cl_df['temp'].apply(lambda x: x[0])
    cl_df['cluster_size'] = cl_df['temp'].apply(lambda x: x[1])

    c3 = alt.Chart(cl_df).mark_bar().encode(
        x='no_clusters:O',
        y='cluster_size',
        color=alt.Color('cluster:O', scale=alt.Scale(scheme='dark2'))
    ).properties(title="Cluster sizes", width=300)

    return (c1 | c2 | c3)

In [6]:
plot_cluster_diagnostics(results, sizes, eigvalues)

Based on the above outputs, we can see that there aren't really any strong clusters. In cases where silhouette scores are high, most users simply belong to a single cluster. We'll use 12 clusters for analysis as it is the highest cluster number that still has a good silhouette score.

In [7]:
optimal_clusters = 12
vecs = selected_vecs[:,:optimal_clusters]
cluster_model = MiniBatchKMeans(n_clusters=optimal_clusters, max_iter=1000, random_state=42).fit(vecs)

In [8]:
users = pl.read_csv(user_stats_path) \
    .filter((pl.col("user_name") != "__SKIP__") &  (pl.col("user_name") != "AutoModerator") &  (pl.col("user_name") != "MAGIC_EYE_BOT")) \
    .with_columns([
        (pl.col("post_karma") / pl.col("no_posts")).alias("avg_post_karma"),
        (pl.col("comment_karma") / pl.col("no_comments")).alias("avg_comment_karma"),
        ((pl.col("last_date") - pl.col("first_date")) / 3600 / 24).alias("activity_window")
    ])
selected_users = users.filter(pl.col("total_activity") >= 100)

In [11]:
def get_basic_stats(df): 
    stats = df.lazy().groupby("cluster").agg([
            pl.col("no_posts").count().alias("Total number of users"),
            #pl.col('no_posts').sum().alias("Total posts"),    
            #pl.col('no_comments').sum().alias("Total comments"),
            #pl.col('post_karma').sum().alias("Total post karma"),
            #pl.col('comment_karma').sum().alias("Total comment karma"),
            pl.col("no_posts").mean().alias("Avg. posts per user"),
            pl.col("no_posts").median().alias("Median posts per user"),    
            pl.col("no_comments").mean().alias("Avg. comments per user"),
            pl.col("no_comments").median().alias("Median comments per user"),    
            ((pl.col("last_date") - pl.col("first_date")) / 3600 / 24).median().alias("Median activity window (days)"),
            ((pl.col("last_date") - pl.col("first_date")) / 3600 / 24).mean().alias("Mean activity window (days)"),
            (pl.when(pl.col("no_posts") > 0).then(pl.col('post_karma') / pl.col('no_posts')).otherwise(0)).mean().alias("Mean post karma"),
            (pl.when(pl.col("no_comments") > 0).then(pl.col('comment_karma') / pl.col('no_comments')).otherwise(0)).mean().alias("Mean comment karma"),
            (pl.when(pl.col("no_posts") > 0).then(pl.col('post_karma') / pl.col('no_posts')).otherwise(0)).median().alias("Median post karma"),
            (pl.when(pl.col("no_comments") > 0).then(pl.col('comment_karma') / pl.col('no_comments')).otherwise(0)).median().alias("Median comment karma"),
        ]).collect().filter(pl.col("Total number of users") > 100).transpose(include_header=True, header_name="Statistic")
    return stats

In [12]:
selected_users['cluster'] = cluster_model.labels_
get_basic_stats(selected_users)

Statistic,column_0,column_1,column_2,column_3,column_4,column_5
str,f64,f64,f64,f64,f64,f64
"""cluster""",8.0,7.0,4.0,10.0,9.0,1.0
"""Total number of users""",475.0,295.0,1026.0,4995.0,792.0,174.0
"""Avg. posts per user""",4.058947,14.305085,3.643275,2.758158,3.296717,5.155172
"""Median posts per user""",1.0,2.0,1.0,1.0,1.0,1.0
"""Avg. comments per user""",391.233684,289.6033,177.840156,227.163964,155.959596,141.936782
"""Median comments per user""",223.0,178.0,144.0,159.0,132.0,124.5
"""Median activity window (days)""",109.0,846.833333,106.958333,127.0,105.0,73.0625
"""Mean activity window (days)""",182.5319,835.2943,168.039189,224.533475,191.182029,160.331418
"""Mean post karma""",227.515637,217.484664,662.864475,262.897179,511.721287,2583.804363
"""Mean comment karma""",20.881872,11.228098,20.8391,17.978879,17.188183,14.931293


From the descriptive statistics, we can see that the clusters mostly represent activity levels - cluster 5 has highest avg post and comment levels, while other clusters have lower scores. A bit more interesting observations are:
 - Cluster 2 (11 individuals) who have significantly lower karma rates than others but have been in the subreddit the longest of all clusters, and have similar commenting activity levels. 
 - Cluster 4 (241 individuals) who have significantly higher post karma rates, despite making relatively few posts compared to others. This seems to be primarily driven by outliers, however, as the median post score in that cluster is zero.

## Clustering based on normalized user interactions

What if instead the same analysis was normalized to total user activity? We adjust the adjacency matrix by normalizing for total activity that each user has among in the top user network, which makes the edges represent the weight of interactions among users rather than the absolute interactions themselves.

In [5]:
norm_A = np.nan_to_num(A / np.sum(A, axis=1), 0) * 100

  norm_A = np.nan_to_num(A / np.sum(A, axis=1), 0) * 100


In [6]:
selected_vecs, results, sizes, eigvalues = spectral_clustering(norm_A, max_clusters=30)

Computing eigenvalues..


In [None]:
plot_cluster_diagnostics(results, sizes, eigvalues)

In [None]:
optimal_clusters = 11
vecs = selected_vecs[:,:optimal_clusters]
cluster_model = MiniBatchKMeans(n_clusters=optimal_clusters, max_iter=1000, random_state=42).fit(vecs)
selected_users['cluster'] = cluster_model.labels_
get_basic_stats(selected_users).to_pandas().round(1)

Unnamed: 0,Statistic,column_0,column_1,column_2,column_3,column_4,column_5,column_6,column_7,column_8
0,cluster,8.0,0.0,6.0,5.0,4.0,3.0,10.0,2.0,9.0
1,Total number of users,24.0,52.0,13.0,10.0,4.0,421.0,57.0,16.0,7208.0
2,Avg. posts per user,6.5,1.4,0.9,69.5,0.2,2.4,1.8,0.5,3.6
3,Median posts per user,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0
4,Avg. comments per user,164.2,190.4,191.5,196.4,128.0,160.1,137.2,166.8,228.6
5,Median comments per user,167.5,149.0,164.0,130.0,122.0,140.0,130.0,143.0,156.0
6,Median activity window (days),128.0,132.5,115.0,215.5,138.0,122.0,245.1,113.5,125.0
7,Mean activity window (days),231.9,252.1,203.2,449.9,182.0,209.0,282.1,214.5,234.7
8,Mean post karma,133.8,253.5,1176.9,183.3,2.5,660.8,569.1,84.7,375.5
9,Mean comment karma,7.9,13.9,11.9,4.8,13.2,19.6,14.8,6.5,18.2


The results are quite similar. We find some very small clusters of users (< 60 users) which are likely arising due to interacting on a single post thread. There are only 2 larger clusters otherwise. The main differences between them seem to be quality vs. quantity aspects (cluster 3 makes less comments and less posts but achieves higher karma averages on both).

## Influencers as based on page-rank analysis
Next, we explore if the user interaction network reveals any important players as indicated by page rank algorithm

In [15]:
import networkx as nx
G = nx.from_numpy_matrix(norm_A)
pageranks = nx.pagerank(G, max_iter=100)

In [16]:
selected_users['pg_rank'] = np.array(list(pageranks.values()))
selected_users = selected_users.with_column(((1641790800 - pl.col("first_date")) / 3600 / 24).alias("longevity"))

Looking at the top 10 best connected users, we can see that they have really high average post and comment karma scores. Also, majority of them are relatively new, with most of them having participated for less than 100 days in the subreddit.

In [17]:
selected_users.sort(pl.col("pg_rank"), reverse=True).head(10)

user_name,no_posts,no_comments,post_karma,comment_karma,first_date,last_date,total_activity,avg_post_karma,avg_comment_karma,activity_window,cluster,pg_rank,longevity
str,i64,i64,i64,i64,i64,i64,i64,f64,f64,f64,i32,f64,f64
"""nryan1985""",73,55,176793,442,1624914000,1641506400,128,2421.821918,8.036364,192.041667,9,0.000986,195.333333
"""Fragrant-Asparagus-2""",2,100,28361,14304,1638835200,1643241600,102,14180.5,143.04,51.0,9,0.000921,34.208333
"""ceanothourus""",1,103,73294,19270,1632960000,1636927200,104,73294.0,187.087379,45.916667,9,0.000817,102.208333
"""wexlers""",2,100,11352,7052,1636581600,1642197600,102,5676.0,70.52,65.0,9,0.000639,60.291667
"""Paratrooperkid""",5,122,13186,2833,1634601600,1642550400,127,2637.2,23.221311,92.0,9,0.000587,83.208333
"""joevinci""",4,96,25704,2658,1637272800,1640131200,100,6426.0,27.6875,33.083333,9,0.000556,52.291667
"""TruthToPower77""",102,89,223761,1064,1610402400,1644796800,191,2193.735294,11.955056,398.083333,9,0.000541,363.291667
"""caligalus""",124,31,95417,458,1633824000,1637452800,155,769.491935,14.774194,42.0,9,0.000513,92.208333
"""poisonivy47""",89,157,288418,1598,1634342400,1643587200,246,3240.651685,10.178344,107.0,9,0.000509,86.208333
"""jayzee312""",5,131,47178,5064,1620421200,1645056000,136,9435.6,38.656489,285.125,9,0.000506,247.333333


This raises an interesting question - is posting quality associated with how well connected a user is? To some degree, this is expected as more popular posts/comments attract more attention. But given that we are looking just at direct interactions, it is not an obvious relationship.

In [18]:
import statsmodels.api as sm
import statsmodels.formula.api as smf

formula = """
np.log(pg_rank) ~ 
np.log(no_posts + 0.001) + np.log(no_comments + 0.001) + 
np.log(avg_post_karma + 0.001) +  np.log(avg_comment_karma + 0.001) + 
np.log(longevity + 0.0001) + np.log(activity_window + 0.001)"""

results = smf.ols(formula, data=selected_users.to_pandas()).fit()
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:        np.log(pg_rank)   R-squared:                       0.386
Model:                            OLS   Adj. R-squared:                  0.385
Method:                 Least Squares   F-statistic:                     434.6
Date:                Thu, 24 Mar 2022   Prob (F-statistic):               0.00
Time:                        16:53:29   Log-Likelihood:                -287.63
No. Observations:                4149   AIC:                             589.3
Df Residuals:                    4142   BIC:                             633.6
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                                        coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------------
Interc

  result = getattr(ufunc, method)(*inputs, **kwargs)


We find that connectedness is indeed related to content quality. Higher average post and comment karma leads to higher connectedness, even when controlling for total posts and comments made. Also, interestingly enough, longevity on the subreddit does not matter, while users that only have participated during a shorter time window seem to have higher page rank, on average. This may indicate that users with higher connectedness are largely "one-off" wonders who contribute a few items of popular content over a shorter period of time and remain inactive afterwards.