In [49]:
import datetime
import statsmodels.formula.api as smf
import numpy as np
import pandas as pd
import polars as pl
import sqlite3

# Input locations, all relative to the shared user-data directory.
# `recent_posts` contains a `*` wildcard, so it names a set of batch files
# rather than a single CSV.
DATA_PATH = "../../data/users/"
recent_posts = f"{DATA_PATH}recent/recent-posts-batch-*.csv"
DB_PATH = f"{DATA_PATH}users.sqlite.db"

## Combining all information into a dataset for regression

In [None]:
# Read the recent-posts CSV batches into a single polars frame and copy it
# into the SQLite database as table `posts`, so it can be joined in SQL with
# the tables already stored there.
posts = pl.scan_csv(recent_posts).collect()
conn = sqlite3.connect(DB_PATH)
# if_exists="replace": the pandas default ("fail") raises ValueError when this
# cell is re-run, because the `posts` table persists in the on-disk database
# from the previous run. Replacing keeps the cell idempotent.
posts.to_pandas().to_sql("posts", conn, index=False, if_exists="replace")

In [51]:
# Assemble the regression dataset, one row per (post, matching user) pair:
#   * the inner JOIN on `users_cutoff` drops posts whose author has no row
#     in that table;
#   * the LEFT JOIN on `topics` keeps posts without a topic row, leaving
#     `topic` NULL for them;
#   * `users_cutoff.post_karma` is aliased to `total_post_karma` so it does
#     not collide with the per-post `posts.post_karma`.
sql = """
SELECT 
    posts.post_id, posts.post_karma, posts.user_name,
    users_cutoff.no_posts, users_cutoff.no_comments, users_cutoff.avg_post_karma,
    users_cutoff.post_karma as total_post_karma, users_cutoff.longevity, 
    topics.topic
    FROM posts
    JOIN users_cutoff ON users_cutoff.user_name = posts.user_name
    LEFT JOIN topics ON topics.id = posts.post_id
    """

# Run the query on the open SQLite connection and load the result as a pandas frame.
joined = pd.read_sql(sql, conn)

## Regression analysis

In [52]:
# Map the dummy-column names pandas derives from float-valued topic ids
# ("topic_0.0", "topic_1.0", ...) to integer-style names ("topic_0", ...),
# for topic ids 0 through 24.
topic_cols = {f"topic_{float(i)}": f"topic_{i}" for i in range(25)}

# Drop the identifier columns, one-hot encode `topic`, and apply the renaming.
data = (
    joined.drop(columns=["post_id", "user_name"])
    .astype({"topic": "category"})
    .pipe(pd.get_dummies)
    .rename(columns=topic_cols)
)

## Log-based specification

`'log(post_karma) ~ log(no_posts) + log(no_comments) + avg_post_karma + log(total_post_karma) + log(longevity) + topic'`

In [None]:
# Log-log specification. The log transforms are applied inside the formula
# itself; a small offset is added to most regressors before taking the log.
# NOTE(review): `longevity` uses an offset of 0.000, unlike the 0.001 used for
# the other terms. Confirm this is intentional (a longevity of 0 would give -inf).
formula = """np.log(post_karma + 0.001) ~ np.log(no_posts + 0.001) + 
    np.log(no_comments + 0.001) + avg_post_karma + 
    np.log(total_post_karma + 0.001) + np.log(longevity + 0.000)"""

# Append one additive term per topic dummy column.
formula = formula + " + " + " + ".join(topic_cols.values())

# Fit on `data`: the previous `data=log_joined` referred to a frame that is
# not built in the cells above, so the cell failed with NameError on a fresh
# kernel. No pre-logged frame is needed because the formula does the logs.
results = smf.ols(formula, data=data).fit()
print(results.summary())

## Different model specifications
### Simple linear model as per Kilgo et al.
`'post_karma ~ total_post_karma + longevity'`

In [None]:
# Baseline specification: per-post karma regressed on the author's total
# post karma and longevity, with no transforms and no topic terms.
formula = "post_karma ~ total_post_karma + longevity"

results = smf.ols(
    formula=formula,
    data=data,
).fit()
print(results.summary())