# Creating a list of unique users with their activity statistics

This notebook scans through all collected user activity files and produces a single CSV with user-level statistics:
 - number of posts
 - number of comments
 - total post karma
 - total comment karma
 - first and last date

The file produced is stored under `/data/users/summaries/local/user_stats.csv` and is sorted by total activity (posts + comments) in descending order. This is useful when only the top x% most active users are needed for downstream tasks.

In [1]:
import polars as pl
from pathlib import Path
import datetime
import time


In [2]:
# Folder with the collected user activity CSVs, relative to this notebook.
path = "../../data/users/"

# Absolute paths of every CSV sitting directly in that folder
# (the glob is not recursive, so sub-folders are not searched).
data_dir = Path(path)
files = [csv_file.absolute() for csv_file in data_dir.glob("*.csv")]

# Where the aggregated per-user summary is written.
user_stats = path + "summaries/local/user_stats.csv"
    

In [3]:
# Only these summary columns are loaded; the large JSON columns are skipped.
cols_to_read = [
    "user_name",
    "no_posts",
    "no_comments",
    "post_karma",
    "comment_karma",
    "first_date",
    "last_date",
]


def get_file(f):
    """Load the summary columns of one user activity CSV into a DataFrame.

    `first_date` and `last_date` are parsed as ISO dates and replaced by
    integer timestamps in seconds. The conversion goes through
    `time.mktime`, so the values correspond to midnight in the local
    timezone of the machine running the notebook.
    """
    def to_epoch_seconds(iso_day):
        day = datetime.date.fromisoformat(iso_day)
        return int(time.mktime(day.timetuple()))

    date_exprs = [
        pl.col(date_col).apply(to_epoch_seconds)
        for date_col in ("first_date", "last_date")
    ]
    lazy_frame = pl.scan_csv(f).select(cols_to_read).with_columns(date_exprs)
    return lazy_frame.collect()


def aggregate_by_user(frame):
    """Collapse `frame` to one row per `user_name`.

    Counts and karma are summed; `first_date` keeps the minimum and
    `last_date` the maximum seen for the user.
    """
    return frame.lazy().groupby("user_name").agg([
        pl.col("no_posts").sum(),
        pl.col("no_comments").sum(),
        pl.col("post_karma").sum(),
        pl.col("comment_karma").sum(),
        pl.col("first_date").min(),
        pl.col("last_date").max()
    ]).collect()


# Fail with a clear message instead of an IndexError when nothing was found.
if not files:
    raise FileNotFoundError("No user activity CSV files found in {}".format(path))

# Read the files one by one and re-aggregate after each, so the running frame
# never holds more than one row per user plus the rows of the current file.
# Every file (including the first, and the only one if there is just one)
# goes through the aggregation, so duplicate user rows are always merged.
df = None
for i, f in enumerate(files, start=1):
    print("Processing file {} out of {}".format(i, len(files)))
    new_df = get_file(f)
    stacked = new_df if df is None else df.vstack(new_df)
    df = aggregate_by_user(stacked)

Processing file 0 out of 12
Processing file 1 out of 12
Processing file 2 out of 12
Processing file 3 out of 12
Processing file 4 out of 12
Processing file 5 out of 12
Processing file 6 out of 12
Processing file 7 out of 12
Processing file 8 out of 12
Processing file 9 out of 12
Processing file 10 out of 12
Processing file 11 out of 12


In [4]:
# Total activity of a user is the number of posts plus the number of comments.
total_activity_expr = (pl.col("no_posts") + pl.col("no_comments")).alias("total_activity")
df = df.with_columns([total_activity_expr])

# Write everything to a single CSV, most active users first.
# Note that only the exported frame is sorted; `df` keeps its row order.
df_sorted = df.lazy().sort("total_activity", reverse=True).collect()
df_sorted.to_csv(user_stats)

In [5]:
# Report how many users reach each activity threshold, as a count and as a
# share of all users.
activity_levels = [2, 3, 5, 10, 20, 50, 100]
total_users = df.height

for level in activity_levels:
    users_at_level = df.filter(pl.col("total_activity") >= level)
    no_users = users_at_level.height
    share = no_users / total_users * 100
    print("{} users ({:.1f}%) with at least {} activities in the dataset".format(no_users, share, level))

27878 users (51.6%) with at least 2 activities in the dataset
19390 users (35.9%) with at least 3 activities in the dataset
12230 users (22.6%) with at least 5 activities in the dataset
6225 users (11.5%) with at least 10 activities in the dataset
2946 users (5.4%) with at least 20 activities in the dataset
946 users (1.7%) with at least 50 activities in the dataset
351 users (0.6%) with at least 100 activities in the dataset
