In [1]:
import datetime
import statsmodels.formula.api as smf
import numpy as np
import pandas as pd
import polars as pl
import sqlite3

# Root folder (relative path) holding all user-related input data.
DATA_PATH = "../../data/users/"
# Glob pattern matching every batch CSV of recent posts under DATA_PATH.
recent_posts = f"{DATA_PATH}recent/recent-posts-batch-*.csv"
# SQLite database file stored alongside the CSV data.
DB_PATH = f"{DATA_PATH}users.sqlite.db"

## Combining all information into a dataset for regression

In [3]:
# Open the SQLite database at DB_PATH (sqlite3 creates the file if it does not
# exist). This connection is reused by the cells below for writing and querying.
conn = sqlite3.connect(DB_PATH)

In [None]:
# Read every recent-posts batch CSV matched by the glob pattern and materialise
# the lazy scan into an in-memory polars DataFrame.
posts = pl.scan_csv(recent_posts).collect()

# Copy the posts into the SQLite database as table "posts" so they can be joined
# in SQL. if_exists="replace" keeps this cell re-runnable: with pandas' default
# ("fail"), to_sql raises ValueError once the "posts" table already exists in
# the database file, which breaks a second run of the notebook.
posts.to_pandas().to_sql("posts", conn, index=False, if_exists="replace")

In [16]:
# Query producing one row per post, combined with columns describing its author:
#   - users_cutoff (matched on user_name): no_posts, no_comments, avg_post_karma,
#     post_karma (aliased to total_post_karma) and longevity
#   - users (matched on user_name): indirect_pg
#   - topics (matched on topics.id = posts.post_id): topic
# The two plain JOINs are inner joins, so posts whose author has no row in
# users_cutoff or users are dropped. topics is LEFT JOINed, so posts without a
# topics row are kept with a NULL topic.
sql = """
SELECT 
    posts.post_id, posts.post_karma, posts.user_name,
    users_cutoff.no_posts, users_cutoff.no_comments, users_cutoff.avg_post_karma,
    users_cutoff.post_karma as total_post_karma, users_cutoff.longevity, 
    users.indirect_pg,
    topics.topic
    FROM posts
    JOIN users_cutoff ON users_cutoff.user_name = posts.user_name
    JOIN users ON users.user_name = users_cutoff.user_name
    LEFT JOIN topics ON topics.id = posts.post_id
    """

# Run the query on the open connection and load the result into a pandas DataFrame.
joined = pd.read_sql(sql, conn)

## Regression analysis

In [17]:
# Maps float-formatted dummy labels ("topic_0.0", ...) to integer-style names
# ("topic_0", ...) for topic ids 0-24, so the columns can be referenced as plain
# terms in a regression formula.
# NOTE(review): presumably topic ids load as floats because of NULL topics from
# the LEFT JOIN — confirm.
topic_cols = {f"topic_{float(i)}": f"topic_{i}" for i in range(25)}

# Build the regression frame: drop the identifier columns, mark topic as
# categorical so get_dummies one-hot encodes it, then rename the dummy columns.
data = pd.get_dummies(
    joined
    .drop(columns=["post_id", "user_name"])
    .assign(topic=lambda frame: frame["topic"].astype("category"))
).rename(columns=topic_cols)

## Log-based specification

`'log(post_karma) ~ log(no_posts) + log(no_comments) + avg_post_karma + log(total_post_karma) + log(longevity) + log(indirect_pg) + topic'`

In [18]:
# Log-based specification: the outcome and most regressors enter in logs, and
# every renamed topic dummy column is appended as an additional term.
# A 0.001 offset is added inside most logs (presumably to guard against log(0)
# — confirm).
# NOTE(review): `longevity + 0.000` adds nothing and `indirect_pg` has no offset,
# unlike the other logged terms — confirm this is intentional.
formula = (
    """np.log(post_karma + 0.001) ~ np.log(no_posts + 0.001) + 
    np.log(no_comments + 0.001) + avg_post_karma + 
    np.log(total_post_karma + 0.001) + np.log(longevity + 0.000) + np.log(indirect_pg)"""
    + " + "
    + " + ".join(topic_cols.values())
)

# Fit by ordinary least squares and print the full summary table.
results = smf.ols(formula=formula, data=data).fit()
print(results.summary())

                                OLS Regression Results                                
Dep. Variable:     np.log(post_karma + 0.001)   R-squared:                       0.186
Model:                                    OLS   Adj. R-squared:                  0.161
Method:                         Least Squares   F-statistic:                     7.418
Date:                        Wed, 27 Apr 2022   Prob (F-statistic):           1.71e-28
Time:                                02:18:48   Log-Likelihood:                -2519.8
No. Observations:                        1041   AIC:                             5104.
Df Residuals:                            1009   BIC:                             5262.
Df Model:                                  31                                         
Covariance Type:                    nonrobust                                         
                                       coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------

  result = getattr(ufunc, method)(*inputs, **kwargs)


## Different model specifications
### Simple linear model as per Kilgo et al.
`'post_karma ~ total_post_karma + longevity'`

In [8]:
# Baseline specification: post karma regressed linearly on the author's total
# post karma and longevity, with no log transforms and no topic dummies.
formula = "post_karma ~ total_post_karma + longevity"

# Ordinary least squares fit; the summary is printed as plain text.
results = smf.ols(formula=formula, data=data).fit()
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:             post_karma   R-squared:                       0.003
Model:                            OLS   Adj. R-squared:                  0.002
Method:                 Least Squares   F-statistic:                     7.551
Date:                Wed, 27 Apr 2022   Prob (F-statistic):           0.000531
Time:                        02:12:17   Log-Likelihood:                -53520.
No. Observations:                5527   AIC:                         1.070e+05
Df Residuals:                    5524   BIC:                         1.071e+05
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
Intercept          525.5993     62.384  