# Combining all individual user statistics collected on multiple machines

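This notebook combines the `user_stats.csv` files collected on each machine into a single file. Rows belonging to the same user are merged: post, comment, karma and activity counts are summed, and the earliest first date and latest last date are kept. The result is written sorted by total activity, highest first.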

In [13]:
import polars as pl
from pathlib import Path

# Root folder of the per-machine summaries, and the machines that produced them.
path = '../../data/users/summaries/'
folders = ['local', 'aurimas.eu', 'vm1', 'vm2']

# Absolute location of each machine's user_stats.csv, in the same order as `folders`.
files = [
    Path(f"{path}{machine}/user_stats.csv").absolute()
    for machine in folders
]

# Destination of the combined statistics; deliberately kept as a plain string.
all_stats = f"{path}combined/user_stats.csv"

In [12]:
# Load every per-machine summary. The same user can show up in several files,
# so the rows are merged per user further down.
frames = []
for i, f in enumerate(files, start=1):
    # 1-based counter over *all* input files, including the first one.
    print("Processing file {} out of {}".format(i, len(files)))
    frames.append(pl.read_csv(f))

# Stack all files once and aggregate once. Sum/min/max are associative, so this
# gives the same per-user totals as re-aggregating after every file, without
# repeatedly regrouping the growing frame. It also deduplicates users when
# there is only a single input file.
# NOTE(review): first_date/last_date are compared with whatever dtype read_csv
# infers for them; confirm that ordering matches chronological order.
df = (
    pl.concat(frames)
    .lazy()
    .groupby("user_name")
    .agg([
        pl.col("no_posts").sum(),
        pl.col("no_comments").sum(),
        pl.col("post_karma").sum(),
        pl.col("comment_karma").sum(),
        pl.col("first_date").min(),
        pl.col("last_date").max(),
        pl.col("total_activity").sum(),
    ])
    .collect()
)

Processing file 0 out of 3
Processing file 1 out of 3
Processing file 2 out of 3


In [14]:
# The output folder is not one of the input folders, so it may not exist yet
# (e.g. on a fresh checkout); create it before writing.
Path(all_stats).parent.mkdir(parents=True, exist_ok=True)

# Write the combined statistics, most active users first.
df.sort("total_activity", reverse=True).to_csv(all_stats)

In [15]:
# Basic statistics on user activity: for each threshold, how many users (and
# what share of all users) have at least that many recorded activities.
activity_levels = [2, 3, 5, 10, 20, 50, 100]
total_users = df.height

report_line = "{} users ({:.1f}%) with at least {} activities in the dataset"
for level in activity_levels:
    reaches_level = pl.col("total_activity") >= level
    no_users = df.filter(reaches_level).height
    share_pct = no_users / total_users * 100
    print(report_line.format(no_users, share_pct, level))

446523 users (54.8%) with at least 2 activities in the dataset
321805 users (39.5%) with at least 3 activities in the dataset
211508 users (26.0%) with at least 5 activities in the dataset
114475 users (14.1%) with at least 10 activities in the dataset
57813 users (7.1%) with at least 20 activities in the dataset
19926 users (2.4%) with at least 50 activities in the dataset
7810 users (1.0%) with at least 100 activities in the dataset
