# Descriptive User Statistics
This notebook contains the basic user statistics analysis that is included in the report.

In [3]:
import sys
sys.path.append("../../../src")

import polars as pl
import altair as alt
import latex_tables as lx
import pandas as pd
import importlib as imp
import sqlite3 as sq
imp.reload(lx)
from pathlib import Path

# Relative path to the SQLite database file that the cells below open and update.
DB_path = '../../../data/users/users.sqlite.db'
# Directory prefix (note the trailing slash) prepended to exported file names,
# e.g. the LaTeX table of basic user statistics.
export_path = '../../../data/users/illustrations/'

## Adding basic statistics about users directly to the database

In [4]:
# Columns this notebook adds to the `users` table, mapped to their declared SQLite type.
derived_columns = {
    "avg_post_karma": "real",
    "avg_comment_karma": "real",
    "activity_window": "real",
    "is_bot": "boolean",
    "is_selected": "boolean",
}

with sq.connect(DB_path) as conn:
    cur = conn.cursor()

    # Add each missing column on its own. A single try/except around all the
    # ALTER TABLE statements would stop at the first column that already exists
    # and silently skip the remaining ones; it would also hide unrelated
    # OperationalErrors (e.g. a locked database or a missing table).
    existing_columns = {row[1] for row in cur.execute("PRAGMA table_info(users)")}
    missing_columns = [
        (name, sql_type)
        for name, sql_type in derived_columns.items()
        if name not in existing_columns
    ]
    for name, sql_type in missing_columns:
        # Identifiers cannot be bound as parameters; the names come from the
        # constant mapping above, never from user input.
        cur.execute(f"ALTER TABLE users ADD COLUMN {name} {sql_type}")
    if not missing_columns:
        print("columns already exist")

    cur.execute("CREATE INDEX IF NOT EXISTS selected_index ON users(is_selected)")
    cur.execute("CREATE INDEX IF NOT EXISTS user_name_index ON users(user_name)")

    # Per-user averages; users without posts/comments get 0 instead of a division by zero.
    cur.execute("UPDATE users SET avg_post_karma = CASE WHEN no_posts > 0 THEN post_karma / CAST(no_posts as REAL) ELSE 0 END")
    cur.execute("UPDATE users SET avg_comment_karma = CASE WHEN no_comments > 0 THEN comment_karma / CAST(no_comments as REAL) ELSE 0 END")
    # Span between first and last activity, divided by the number of seconds in a day.
    # NOTE(review): assumes first_date / last_date are Unix timestamps in seconds - confirm.
    cur.execute("UPDATE users SET activity_window = CAST(last_date - first_date as REAL) / (3600 * 24)")

columns already exist


## Marking bots

These users were identified as bots during exploratory analysis.

In [5]:
# User names flagged as bots; any other user ends up with is_bot = False.
bots = ["__SKIP__", "AutoModerator", "MAGIC_EYE_BOT"]
with sq.connect(DB_path) as conn:
    cur = conn.cursor()
    # Reset the flag for every user first so that re-running the cell with a
    # different list does not leave stale flags behind.
    cur.execute("UPDATE users SET is_bot = ?", (False,))
    # Then flag the listed accounts, one parameterised UPDATE per name.
    cur.executemany(
        "UPDATE users SET is_bot = ? WHERE user_name = ?",
        [(True, bot_name) for bot_name in bots],
    )

## Deciding on the cut-off for "top users"

Based on the curve below and our available computational resources, we decided to define the "core community" as the users who have made at least 100 posts or comments in total during their lifetime.

In [6]:
# Connection URI built from the absolute location of the database file.
conn_string = f"sqlite://{Path(DB_path).absolute()}"
# Only the activity counts of non-bot users are needed for the threshold curve.
activity_query = "SELECT total_activity FROM users WHERE is_bot = FALSE"
users = pl.read_sql(activity_query, conn_string)

In [7]:
# For every candidate cut-off, count the users whose total activity reaches it.
thresholds = range(5, 151)
subset_sizes = [
    (threshold, users.filter(pl.col("total_activity") >= threshold).shape[0])
    for threshold in thresholds
]

# Line chart: number of remaining users as a function of the activity threshold.
threshold_curve = pd.DataFrame(subset_sizes, columns=["threshold", "no_users"])
alt.Chart(threshold_curve).mark_line().encode(x="threshold", y="no_users")

In [8]:
# Minimum number of posts + comments for a user to count as "core community".
min_total_activity = 100

with sq.connect(DB_path) as conn:
    cur = conn.cursor()
    # Reset the flag for everyone so the cell can be re-run with another threshold.
    cur.execute("UPDATE users SET is_selected = ?", (False,))
    # A single UPDATE is enough: the statement does not depend on any individual
    # bot name, so there is no need to loop over the bot list (which repeated the
    # same statement and made this cell depend on `bots` being defined).
    cur.execute(
        "UPDATE users SET is_selected = ? WHERE total_activity >= ? AND NOT is_bot",
        (True, min_total_activity),
    )

## Basic stats
 - Number of users, posts, and comments
 - Average post / comment karma
 - Average length of membership

In [9]:
# Connection URI built from the absolute location of the database file.
conn_string = f"sqlite://{Path(DB_path).absolute()}"
# Load every column for the non-bot users; this rebinds `users`, which earlier
# held only the total_activity column.
all_columns_query = "SELECT * FROM users WHERE is_bot = FALSE"
users = pl.read_sql(all_columns_query, conn_string)

In [10]:
def get_basic_stats(df):
    """Compute one-number summary statistics for a frame of users.

    Parameters
    ----------
    df : polars.DataFrame
        One row per user. Must contain the columns ``no_posts``,
        ``no_comments``, ``post_karma``, ``comment_karma``,
        ``avg_post_karma``, ``avg_comment_karma`` and ``activity_window``.

    Returns
    -------
    polars.DataFrame
        The aggregates transposed into long form: a ``Statistic`` column
        holding the labels below and one value column.
    """
    aggregates = [
        pl.col("no_posts").count().alias("Total number of users"),
        pl.col("no_posts").sum().alias("Total posts"),
        pl.col("no_comments").sum().alias("Total comments"),
        pl.col("post_karma").sum().alias("Total post karma"),
        pl.col("comment_karma").sum().alias("Total comment karma"),
        pl.col("no_posts").mean().alias("Avg. posts per user"),
        pl.col("no_posts").median().alias("Median posts per user"),
        pl.col("no_comments").mean().alias("Avg. comments per user"),
        pl.col("no_comments").median().alias("Median comments per user"),
        # These two are means of the per-user averages; the labels previously
        # said "Median" although the aggregate has always been .mean().
        pl.col("avg_post_karma").mean().alias("Mean average user post karma"),
        pl.col("avg_comment_karma").mean().alias("Mean average user comment karma"),
        pl.col("activity_window").median().alias("Median activity window (days)"),
    ]
    stats = (
        df.lazy()
        .select(aggregates)
        .collect()
        .transpose(include_header=True, header_name="Statistic")
    )
    return stats

# Same statistics for the whole population and for the selected users only.
selected_users = users.filter(pl.col("is_selected") == True)
basic_stats_all = get_basic_stats(users)
basic_stats_selected = get_basic_stats(selected_users)

# Convert to pandas, key both tables by the statistic label and give the
# single value column a descriptive name.
df1 = (
    basic_stats_all.to_pandas()
    .set_index("Statistic")
    .rename(columns={"column_0": "All Users"})
)
df2 = (
    basic_stats_selected.to_pandas()
    .set_index("Statistic")
    .rename(columns={"column_0": "Selected Users"})
)

# Side-by-side table, with the statistic label back as a regular column.
joined_df = df1.join(df2).reset_index()

joined_df

Unnamed: 0,Statistic,All Users,Selected Users
0,Total number of users,814350.0,7807.0
1,Total posts,155184.0,28317.0
2,Total comments,5978678.0,1744481.0
3,Total post karma,76130300.0,16876910.0
4,Total comment karma,88031500.0,31238430.0
5,Avg. posts per user,0.1905618,3.627129
6,Median posts per user,0.0,1.0
7,Avg. comments per user,7.341657,223.4509
8,Median comments per user,2.0,154.0
9,Median average user post karma,47.72883,390.9637


In [11]:
# Write the comparison table to a LaTeX file inside the export directory.
basic_stats_tex = export_path + "basic_user_stats.tex"
lx.save_lx_table(
    filename=basic_stats_tex,
    caption="Overall user statistics",
    data=joined_df,
    headers=["Statistic", "All users", "Selected users"],
    precision=".2f",
)

## Distributions

In [12]:
def get_histogram(df, column, bin_size):
    """Count rows of ``df`` per fixed-width bin of ``column``.

    Each value is assigned to bin number ``floor(value / bin_size)``; the bin
    is then labelled ``bin_no * bin_size + 1``.

    Returns a pandas DataFrame with the columns ``bin`` and ``count``,
    sorted by ``bin``. Bins without any rows are not present in the result.
    """
    bin_index = (pl.col(column) / bin_size).floor().alias("bin_no")
    rows_per_bin = df.select([bin_index]).groupby("bin_no").count()

    bin_label = (pl.col("bin_no") * bin_size + 1).alias("bin")
    labelled = rows_per_bin.select([bin_label, pl.col("count")])

    return labelled.sort(pl.col("bin")).to_pandas()

def plot_histogram(hist, xlab = "", ylab="Number of users", y_log_scale=False, max_x=None):
    """Draw a histogram table as an Altair line chart.

    Parameters
    ----------
    hist : pandas.DataFrame
        Must contain the columns ``bin`` (x position), ``count`` (y value)
        and ``Variable`` (used for the line colour / legend entry).
        The frame is never modified.
    xlab, ylab : str
        Axis titles.
    y_log_scale : bool
        Use a logarithmic y axis. Every count is then shifted by 0.1 before
        plotting.
    max_x : number, optional
        Upper bound of the x domain; defaults to the largest ``bin`` value.
        Points beyond the domain are clipped.

    Returns
    -------
    altair.Chart
    """
    if y_log_scale:
        y_scale = alt.Scale(type="log")
        # Work on a copy: shifting the counts in place would change the caller's
        # DataFrame, so re-running a cell on the same frame would add another
        # 0.1 every time.
        hist = hist.copy()
        # NOTE(review): the 0.1 offset presumably keeps values strictly positive
        # for the log scale - confirm it is still needed.
        hist['count'] = hist['count'] + 0.1
    else:
        y_scale = alt.Scale(type="linear")

    if max_x is None:
        max_x = hist["bin"].max()
    chart = alt.Chart(hist).mark_line(size=2, clip=True).encode(
        x = alt.X("bin:Q", title=xlab, scale=alt.Scale(domain=(0,max_x)), axis=alt.Axis(grid=False)),
        y = alt.Y('count', title=ylab, scale=y_scale, axis=alt.Axis(grid=False)),
        color = alt.Color("Variable")
    )
    return chart

In [13]:
# Posts per user, in bins of 20; x axis limited to 400.
hist_posts = get_histogram(users, "no_posts", 20)
hist_posts["Variable"] = "Post count"
posts = plot_histogram(
    hist_posts,
    xlab="Number of posts",
    y_log_scale=True,
    max_x=400,
).properties(title="Histogram of users by post count")

# Comments per user, in bins of 100; x axis limited to 4000.
hist_comments = get_histogram(users, "no_comments", 100)
hist_comments["Variable"] = "Comment count"
comments = plot_histogram(
    hist_comments,
    xlab="Number of comments",
    y_log_scale=True,
    max_x=4000,
).properties(title="Histogram of users by comment count")

# Activity window in days, in bins of 100; x axis spans the full data range.
hist_age = get_histogram(users, "activity_window", 100)
hist_age["Variable"] = "Activity window"
age = plot_histogram(
    hist_age,
    xlab="Activity window (days)",
    y_log_scale=True,
).properties(
    title="Histogram of users by number of posts, comment and activity window"
)

# Layer the three line charts into a single figure.
distributions = (posts + comments + age)

In [14]:
# Display the layered chart of the post, comment and activity-window histograms.
distributions