# Creating a list of unique users with their activity statistics

This notebook scans through all collected user activity files and produces user-level statistics:
 - number of posts
 - number of comments
 - total post karma
 - total comment karma
 - first and last date (stored as Unix timestamps in seconds)

The resulting statistics are stored in the SQLite database file `users.sqlite.db` under `/data/users/` (table `users`) and are used by many downstream tasks.

In [11]:
import polars as pl
from pathlib import Path
import datetime
import time
import sqlite3
from tqdm import tqdm
import pandas as pd


In [13]:
# Base folder for the user data; the input CSV files are read from its "raw" subfolder.
path = "../../../data/users/"
raw_dir = Path(path) / "raw/"
files = [csv_file.absolute() for csv_file in raw_dir.glob("*.csv")]

# SQLite database file that receives the aggregated user table.
user_stats = path + 'users.sqlite.db'

print("Total files to process: {}".format(len(files)))

Total files to process: 489


In [9]:
# Only these columns are loaded from each CSV file (skips the large json columns).
cols_to_read = [
    "user_name",
    "no_posts",
    "no_comments",
    "post_karma",
    "comment_karma",
    "first_date",
    "last_date",
]


def _iso_date_to_epoch(iso_date):
    """Turn an ISO date string into integer epoch seconds.

    The conversion goes through time.mktime, which interprets the date
    (at midnight) in the machine's local timezone.
    """
    parsed = datetime.date.fromisoformat(iso_date)
    return int(time.mktime(parsed.timetuple()))


def get_file(f):
    """Load one user activity CSV file as a polars DataFrame.

    Only the columns listed in cols_to_read are kept, and the
    "first_date" / "last_date" columns are converted from ISO date
    strings to integer epoch seconds.

    Parameters
    ----------
    f : path to the CSV file to read.

    Returns
    -------
    The collected polars DataFrame for that file.
    """
    dates_as_epoch = [
        pl.col(date_col).apply(_iso_date_to_epoch)
        for date_col in ("first_date", "last_date")
    ]
    selected = pl.scan_csv(f).select(cols_to_read)
    return selected.with_columns(dates_as_epoch).collect()

# Aggregations that collapse all rows of a user into a single row:
# counts and karma are summed, the date range is widened to the earliest
# first_date and the latest last_date.
user_aggregations = [
    pl.col("no_posts").sum(),
    pl.col("no_comments").sum(),
    pl.col("post_karma").sum(),
    pl.col("comment_karma").sum(),
    pl.col("first_date").min(),
    pl.col("last_date").max(),
]

# Fail with a clear message instead of an opaque IndexError when nothing was found.
if not files:
    raise FileNotFoundError("No user activity CSV files found to process")

# Read the files one by one and re-aggregate after each file, so the running
# result always holds one row per user. Every file (including the first one)
# goes through the same aggregation, so the result is also correct when only
# a single file is present.
df = None
for f in tqdm(files):
    new_df = get_file(f)
    if df is not None:
        # Stack the running per-user totals on top of the freshly read rows.
        new_df = df.vstack(new_df)
    df = new_df.lazy().groupby("user_name").agg(user_aggregations).collect()

100%|██████████| 488/488 [30:37<00:00,  3.77s/it]  


In [10]:
# Total activity of a user is the number of posts plus the number of comments.
activity_expr = pl.col("no_posts") + pl.col("no_comments")
df = df.with_columns([activity_expr.alias("total_activity")])

In [14]:
# Write the aggregated per-user statistics to the "users" table of the SQLite database.
# Note: to_sql runs with pandas' default if_exists="fail", so this cell raises
# if a "users" table already exists in the database file.
conn = sqlite3.connect(user_stats)
try:
    rows_written = df.to_pandas().to_sql("users", conn, index=False)
finally:
    # Always release the database file handle, even when the write fails.
    conn.close()

# Show the value returned by to_sql as the cell output.
rows_written

814353