# Descriptive User Statistics
This notebook contains basic user statistics analysis to be included in the report.

In [1]:
import sys
sys.path.append("../../../src")

import polars as pl
import altair as alt
import latex_tables as lx
import numpy as np
import datetime
import pandas as pd
import importlib as imp
imp.reload(lx)

path = '../../../data/users/summaries/combined/user_stats.csv'
export_path = '../../../exports/user_statistics/'

In [2]:
users = pl.read_csv(path).filter((pl.col("user_name") != "__SKIP__") &  (pl.col("user_name") != "AutoModerator") &  (pl.col("user_name") != "MAGIC_EYE_BOT")).with_columns([
            (pl.col("post_karma") / pl.col("no_posts")).alias("avg_post_karma"),
            (pl.col("comment_karma") / pl.col("no_comments")).alias("avg_comment_karma"),
            ((pl.col("last_date") - pl.col("first_date")) / 3600 / 24).alias("activity_window")
        ])
selected_users = users.filter(pl.col("total_activity") >= 100)
top5000 = selected_users.head(5000)

In [6]:
bot_users = pl.read_csv(path).filter((pl.col("user_name") == "__SKIP__") |  (pl.col("user_name") == "AutoModerator") |  (pl.col("user_name") == "MAGIC_EYE_BOT"))
bot_users

user_name,no_posts,no_comments,post_karma,comment_karma,first_date,last_date,total_activity
str,i64,i64,i64,i64,i64,i64,i64
"""__SKIP__""",61351,0,0,0,1645308000,915166800,61351
"""AutoModerator""",87,42677,4332,29374,1571803200,1645833600,42764
"""MAGIC_EYE_BOT""",0,10150,0,10424,1627862400,1643673600,10150


In [23]:
subset_sizes = [(threshold, users.filter(pl.col("total_activity") >= threshold).shape[0]) for threshold in range(5, 151)]

alt.Chart(pd.DataFrame(subset_sizes, columns=["threshold", "no_users"])).mark_line().encode(
    x="threshold",
    y="no_users"
)

## Basic stats
 - Number of users, posts, and comments
 - Average post / comment karma
 - Average length of membership

In [14]:
def get_basic_stats(df): 
    stats = df.lazy().select([
            pl.col("no_posts").count().alias("Total number of users"),
            pl.col('no_posts').sum().alias("Total posts"),    
            pl.col('no_comments').sum().alias("Total comments"),
            pl.col('post_karma').sum().alias("Total post karma"),
            pl.col('comment_karma').sum().alias("Total comment karma"),
            pl.col("no_posts").mean().alias("Avg. posts per user"),
            pl.col("no_posts").median().alias("Median posts per user"),    
            pl.col("no_comments").mean().alias("Avg. comments per user"),
            pl.col("no_comments").median().alias("Median comments per user"),    
            ((pl.col("last_date") - pl.col("first_date")) / 3600 / 24).median().alias("Median activity window (days)"),
            ((pl.col("last_date") - pl.col("first_date")) / 3600 / 24).mean().alias("Mean activity window (days)")
        ]).with_columns([
            (pl.col("Total post karma") / pl.col("Total posts")).mean().alias("Mean post karma"),
            (pl.col("Total comment karma") / pl.col("Total comments")).median().alias("Mean comment karma")
        ]).collect().transpose(include_header=True, header_name="Statistic")
    return stats

basic_stats_all = get_basic_stats(users)
basic_stats_selected = get_basic_stats(selected_users)

df1 = basic_stats_all.to_pandas().set_index("Statistic").rename({"column_0": "All Users"}, axis=1)
df2 = basic_stats_selected.to_pandas().set_index("Statistic").rename({"column_0": "Selected Users"}, axis=1)

joined_df = df1.join(df2).reset_index()

joined_df

Unnamed: 0,Statistic,All Users,Selected Users
0,Total number of users,814350.0,7807.0
1,Total posts,155184.0,28317.0
2,Total comments,5978678.0,1744481.0
3,Total post karma,76130300.0,16876910.0
4,Total comment karma,88031500.0,31238430.0
5,Avg. posts per user,0.1905618,3.627129
6,Median posts per user,0.0,1.0
7,Avg. comments per user,7.341657,223.4509
8,Median comments per user,2.0,154.0
9,Median activity window (days),0.0,125.0


In [15]:
lx.save_lx_table(
    filename=export_path + "basic_user_stats.tex",
    caption="Overall user statistics", 
    data=joined_df, headers=["Statistic", "All users", "Selected users"],
    precision= ".2f"
)

In [17]:
users.filter(pl.col("no_posts") >= 10).shape
users.filter(pl.col("no_comments") >= 10).shape

(111595, 11)

## Distributions

In [3]:
def get_histogram(df, column, bin_size):
    hist = df.select([
        (pl.col(column) / bin_size).floor().alias("bin_no"),
    ]).groupby("bin_no").count().select([
        (pl.col("bin_no") * bin_size + 1).alias("bin"),
        pl.col("count")
    ]).sort(pl.col("bin"))

    return hist.to_pandas()

def plot_histogram(hist, xlab = "", ylab="Number of users", y_log_scale=False, max_x=None):    
    
    if y_log_scale:
        y_scale = alt.Scale(type="log")
        hist['count'] = hist['count'] + 0.1        
    else:
        y_scale = alt.Scale(type="linear")

    
    if max_x is None:
        max_x = hist["bin"].max()
    chart = alt.Chart(hist).mark_area(size=2, clip=True).encode(
        x = alt.X("bin:Q", title=xlab, scale=alt.Scale(domain=(0,max_x))),
        y = alt.Y('count', title=ylab, scale=y_scale),
    )
    return chart

In [6]:
hist_posts = get_histogram(users, "no_posts", 10)
posts = plot_histogram(hist_posts, "Number of posts", y_log_scale=True, max_x=400).properties(title="Histogram of users by post count")

hist_comments = get_histogram(users, "no_comments", 5)
comments = plot_histogram(hist_comments, "Number of comments", y_log_scale=True, max_x=4000).properties(title="Histogram of users by comment count")

hist_age = get_histogram(users, "activity_window", 10)
age = plot_histogram(hist_age, "Activity window (days)", y_log_scale=True).properties(title="Histogram of users by activity window")

posts & comments & age

In [179]:
hist_post_karma = get_histogram(users.filter(pl.col("no_posts") > 0), "avg_post_karma", 5)
posts = plot_histogram(hist_post_karma, "Average post karma", y_log_scale=True, max_x=100000).properties(title="Histogram of users by post karma")

hist_comment_karma = get_histogram(users.filter(pl.col("no_comments") > 0), "avg_comment_karma", 1)
comments = plot_histogram(hist_comment_karma, "Average comment_karma", y_log_scale=True, max_x=10000).properties(title="Histogram of users by comment karma")

posts | comments

In [96]:
alt.Chart(top5000.filter((pl.col("no_posts") > 0)).to_pandas()).mark_point().encode(
    x=alt.X("no_posts:Q", scale=alt.Scale(type="log")),
    y=alt.Y("avg_post_karma", scale=alt.Scale(type="linear")),
) | alt.Chart(top5000.filter((pl.col("no_comments") > 0)).to_pandas()).mark_point().encode(
    x=alt.X("no_comments:Q", scale=alt.Scale(type="log")),
    y=alt.Y("avg_comment_karma", scale=alt.Scale(type="linear")),
)

In [97]:
alt.Chart(top5000.filter((pl.col("no_comments") > 0) & (pl.col("no_posts") > 0)).to_pandas()).mark_point().encode(
    x=alt.X("no_posts:Q", scale=alt.Scale(type="log")),
    y=alt.Y("no_comments:Q", scale=alt.Scale(type="log")),
) | alt.Chart(top5000.filter((pl.col("avg_post_karma") > 0) & (pl.col("avg_comment_karma") > 0)).to_pandas()).mark_point().encode(
    x=alt.X("avg_post_karma:Q", scale=alt.Scale(type="log")),
    y=alt.Y("avg_comment_karma:Q", scale=alt.Scale(type="log")),
)

In [129]:
sel_col_extr = users.filter((pl.col("no_comments") > 0) & (pl.col("no_posts") > 0)).select(['avg_post_karma', 'avg_comment_karma', 'no_posts', 'no_comments']).to_numpy()
corr_mat = np.corrcoef(sel_col_extr, rowvar=False)
print("Correlation between number of avg post karma and comment karma: {:.3f}".format(corr_mat[1,0]))
print("Correlation between number of posts and comments: {:.3f}".format(corr_mat[2,3]))

Correlation between number of avg post karma and comment karma: 0.189
Correlation between number of posts and comments: 0.276


## Active users
Note - this is not exactly active users as defined by tech companies which usually use a time window (e.g. 30-day active users). This uses lifetime activity instead. As a result, the last 90 days are excluded.

In [175]:
start_date = datetime.date.fromtimestamp(users['first_date'].min())
end_date = datetime.date.fromtimestamp(users['last_date'].max())

results = []

for i in range((end_date - start_date).days - 90):
    c_date = start_date + datetime.timedelta(days=i)
    c_timestamp = c_date.strftime("%s")
    no_users = users.filter((pl.col("first_date") <= c_timestamp) & (pl.col("last_date") >= c_timestamp)).shape[0]
    results.append((c_date, no_users))

In [176]:
active_users = pd.DataFrame(results, columns=['date', 'no_users'])
active_users['date'] = pd.to_datetime(active_users['date'])

alt.Chart(active_users).mark_area().encode(
    x="date",
    y="no_users"    
)

In [187]:
#90 day active users based on latest information
results = []
for i in range(30,91):
    actives = users.filter(pl.col("last_date") >= (end_date - datetime.timedelta(days=i)).strftime("%s")).shape[0]
    results.append((i, actives))


alt.Chart(pd.DataFrame(results, columns=['days', "no_users"])).mark_line().encode(
    x="days",
    y="no_users"    
)
