In [20]:
import polars as pl
import datetime
import time
import statsmodels.api as sm
import statsmodels.formula.api as smf
import numpy as np
import pandas as pd


# Input: submission ids; read below and filtered on its `created_utc` column.
sub_ids = "../../../data/ids/all_ids.csv"
# Output: the subset of `sub_ids` rows that fall inside the analysis window.
recent_posts = '../../../data/users/leaders/post_ids.csv'
# Input: the selected posts together with their scraped metadata.
recent_posts_with_metadata = '../../../data/users/leaders/all_recent_posts.csv'
# Input: per-user statistics, reprocessed up to the cut-off (see the markdown notes below).
user_metadata = '../../../data/users/summaries/combined/user_stats-cutoff.csv'

## Finding the ids of the most recent posts to be used for analysis

In [None]:
# Window bounds as integer Unix timestamps (local midnight of each date).
# `strftime("%s")` is a non-portable glibc extension and returns a *string*;
# `time.mktime` yields the same instant portably and as a number, so the
# comparison against `created_utc` (presumably numeric epoch seconds) is
# well-typed.
start_timestamp = int(time.mktime(datetime.date(2022, 1, 10).timetuple()))
end_timestamp = int(time.mktime(datetime.date(2022, 1, 17).timetuple()))


# Keep only submissions created inside the (inclusive) window.
cut_offs = pl.read_csv(sub_ids).filter(
    (pl.col("created_utc") >= start_timestamp) &
    (pl.col("created_utc") <= end_timestamp)
)

cut_offs.to_csv(recent_posts)
# Minimum of the first column divided by 20 (presumably the scraping batch size -- TODO confirm).
print("Max batch ID to be used: {}".format(int(cut_offs[:,0].min()  / 20)))

## Obtaining user information that is aligned to the cut-off period
Previously scraped user information is reprocessed so that it only includes data up to the start of the analysis period.

See notebook `collect-users-cutoff`. The combined information is stored under `data/users/summaries/combined/user_stats-cutoff.csv`.

## Obtaining post authors, scores and num comments
An additional scraping run was done to collect metadata (author, score and number of comments) for the posts selected for analysis.

The run was distributed over the 3 VMs (see `runner-post-metadata` and `GetUserNames`); the per-batch outputs are concatenated in the cell below.

In [None]:
!f='recent-posts-batch-100-*.csv'
!awk '(NR == 1) || (FNR > 1)' $f > 'all_recent_posts.csv'

## Combining all information into a dataset for regression

In [4]:
# Load the posts selected for analysis together with their scraped metadata.
posts = pl.read_csv(recent_posts_with_metadata)

In [31]:
# Topic label per post id; only these two columns are needed for the join.
topics = pl.read_csv("../../../data/submissions/raw/50th_labels.csv").select(["topic", "id"])

# Re-read the posts and attach the topic label; the left join keeps every post,
# so posts without a label end up with a null topic.
posts = (
    pl.read_csv(recent_posts_with_metadata)
    .join(topics, left_on="post_id", right_on="id", how="left")
)

In [11]:

# Reference timestamp for account longevity (presumably the start of the
# analysis window, 2022-01-10 local midnight -- TODO confirm).
CUTOFF_TS = 1641790800

# Drop the placeholder user and the two bot accounts.
is_real_user = (
    (pl.col("user_name") != "__SKIP__")
    & (pl.col("user_name") != "AutoModerator")
    & (pl.col("user_name") != "MAGIC_EYE_BOT")
)

users = (
    pl.read_csv(user_metadata)
    .filter(is_real_user)
    .with_columns([
        (pl.col("post_karma") / pl.col("no_posts")).alias("avg_post_karma"),
        (pl.col("no_posts") + pl.col("no_comments")).alias("total_activity"),
    ])
    .with_columns([
        # Days between the user's first_date and the reference timestamp.
        ((CUTOFF_TS - pl.col("first_date")) / 3600 / 24).alias("longevity"),
        pl.col("post_karma").alias("total_post_karma"),
    ])
    .select(
        ['user_name', 'no_posts', 'no_comments', 'avg_post_karma', 'total_post_karma', 'longevity']
    )
)

In [12]:
# TODO: investigate why missing some users - seems to be some sort of cut-off issue
# The left join keeps every post; authors without a row in `users` get a null longevity.
joined = posts.join(users, on="user_name", how="left")
not_joined = joined.filter(pl.col("longevity").is_null()).select("user_name")
all_users = pl.read_csv('../../../data/users/summaries/combined/user_stats.csv')

# Earliest first_date, looked up in the full user table, among those authors.
earliest_first_date = not_joined.join(all_users, on="user_name")['first_date'].min()
datetime.date.fromtimestamp(earliest_first_date)

datetime.date(2021, 12, 31)

## Regression analysis

In [32]:
# Inner join: keep only the posts whose author has a row in `users`.
joined = posts.join(users, on="user_name", how="inner")

n_observations = joined.shape[0]
n_unique_users = joined['user_name'].unique().shape[0]
print("Number of observations: {}".format(n_observations))
print("Number of unique users: {}".format(n_unique_users))
joined.head()

Number of observations: 5527
Number of unique users: 4492


post_id,post_karma,user_name,num_comments,topic,no_posts,no_comments,avg_post_karma,total_post_karma,longevity
str,i64,str,i64,i64,i64,i64,f64,i64,f64
"""s273cc""",4969,"""Sehtriom""",79,,64,2935,1203.359375,77015,1264.041667
"""s3zhnc""",1,"""DirtyPenPalDoug""",1,,24,2371,49.666667,1192,353.291667
"""s0js3e""",7,"""gregsw2000""",2,,13,2351,389.384615,5062,155.208333
"""s4ui1w""",22,"""gregsw2000""",5,,13,2351,389.384615,5062,155.208333
"""s40trg""",113,"""gregsw2000""",39,,13,2351,389.384615,5062,155.208333


In [5]:
# Pairwise correlations between the numeric columns.
# Non-numeric columns (post_id, user_name) are dropped explicitly: pandas >= 2.0
# raises on them, whereas older versions silently ignored them.
joined.to_pandas().select_dtypes(include="number").corr()

Unnamed: 0,post_karma,num_comments,no_posts,no_comments,avg_post_karma,total_post_karma,longevity
post_karma,1.0,0.753766,-0.015607,0.006115,0.044705,0.051449,0.002469
num_comments,0.753766,1.0,-0.024714,-0.002818,0.028891,0.026067,0.013726
no_posts,-0.015607,-0.024714,1.0,0.0762,-0.025056,0.348973,0.088888
no_comments,0.006115,-0.002818,0.0762,1.0,0.014659,0.127688,0.257652
avg_post_karma,0.044705,0.028891,-0.025056,0.014659,1.0,0.300644,0.007493
total_post_karma,0.051449,0.026067,0.348973,0.127688,0.300644,1.0,0.217069
longevity,0.002469,0.013726,0.088888,0.257652,0.007493,0.217069,1.0


## Different model specifications
### Simple linear model as per Kilgo et al.
`'post_karma ~ total_post_karma + longevity'`

In [42]:
# Build the regression frame: drop the identifier columns and one-hot encode the topic.
data = joined.to_pandas().drop(["post_id", "user_name"], axis=1)
data['topic'] = data['topic'].astype("category")
# Pin the dummy dtype. pandas >= 2.0 defaults to bool, which patsy treats as
# categorical (terms become e.g. `topic_0[T.True]`); older versions used uint8.
data = pd.get_dummies(data, dtype=np.uint8)
# Map float-style dummy names ("topic_0.0", ...) to integer-style names usable in
# a formula (presumably the labels become floats because of missing topics).
topic_cols = {"topic_" + str(float(i)) : "topic_" + str(i) for i in range(25)}

In [46]:
# Apply the dummy-column renaming defined in `topic_cols`.
data = data.rename(columns=topic_cols)

In [7]:
# Baseline specification: post score explained by the author's total post karma and longevity.
formula = 'post_karma ~ total_post_karma + longevity'

baseline_model = smf.ols(formula=formula, data=joined.to_pandas())
results = baseline_model.fit()
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:             post_karma   R-squared:                       0.003
Model:                            OLS   Adj. R-squared:                  0.002
Method:                 Least Squares   F-statistic:                     7.551
Date:                Thu, 24 Mar 2022   Prob (F-statistic):           0.000531
Time:                        16:45:40   Log-Likelihood:                -53520.
No. Observations:                5527   AIC:                         1.070e+05
Df Residuals:                    5524   BIC:                         1.071e+05
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
Intercept          525.5993     62.384  

### Linear model with additional features

`'post_karma ~ no_posts + no_comments + avg_post_karma + total_post_karma + longevity'`

In [8]:
# Extended specification: adds activity counts and average post karma to the baseline.
formula = 'post_karma ~ no_posts + no_comments + avg_post_karma + total_post_karma + longevity'

extended_model = smf.ols(formula=formula, data=joined.to_pandas())
results = extended_model.fit()
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:             post_karma   R-squared:                       0.006
Model:                            OLS   Adj. R-squared:                  0.005
Method:                 Least Squares   F-statistic:                     4.115
Date:                Thu, 24 Mar 2022   Prob (F-statistic):           0.000998
Time:                        16:46:25   Log-Likelihood:                -32246.
No. Observations:                3309   AIC:                         6.450e+04
Df Residuals:                    3303   BIC:                         6.454e+04
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
Intercept          591.6715     91.181  

## Log-based specification

`'log(post_karma) ~ log(no_posts) + log(no_comments) + avg_post_karma + log(total_post_karma) + log(longevity) + topic dummies'`, where each logged variable is computed as `log(x + 0.001)`.

In [48]:
columns_to_log = ['post_karma', 'no_posts', 'no_comments', 'total_post_karma', 'longevity']

# Log-transform the selected columns on a copy of `data`; the small offset
# keeps zero values finite. `data` itself is left untouched.
log_joined = data.assign(
    **{name: np.log(data[name].values + 0.001) for name in columns_to_log}
)

# Same regressors as the extended linear model, plus one dummy per topic.
base_terms = 'post_karma ~ no_posts + no_comments + avg_post_karma + total_post_karma + longevity +'
formula = base_terms + " + ".join(topic_cols.values())

log_model = smf.ols(formula=formula, data=log_joined)
results = log_model.fit()
print(results.summary())


                            OLS Regression Results                            
Dep. Variable:             post_karma   R-squared:                       0.116
Model:                            OLS   Adj. R-squared:                  0.107
Method:                 Least Squares   F-statistic:                     12.84
Date:                Sat, 16 Apr 2022   Prob (F-statistic):           5.92e-59
Time:                        21:46:30   Log-Likelihood:                -7173.5
No. Observations:                2966   AIC:                         1.441e+04
Df Residuals:                    2935   BIC:                         1.459e+04
Df Model:                          30                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
Intercept            1.3504      0.238  

  log_joined[c] = np.log(log_joined[c].values + 0.001)


In [49]:
# Display the summary of the most recently fitted model as the cell's output.
results.summary()

0,1,2,3
Dep. Variable:,post_karma,R-squared:,0.116
Model:,OLS,Adj. R-squared:,0.107
Method:,Least Squares,F-statistic:,12.84
Date:,"Sat, 16 Apr 2022",Prob (F-statistic):,5.92e-59
Time:,21:46:37,Log-Likelihood:,-7173.5
No. Observations:,2966,AIC:,14410.0
Df Residuals:,2935,BIC:,14590.0
Df Model:,30,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,1.3504,0.238,5.671,0.000,0.883,1.817
no_posts,-0.4461,0.046,-9.727,0.000,-0.536,-0.356
no_comments,0.0395,0.018,2.214,0.027,0.005,0.075
avg_post_karma,-1.925e-06,2.29e-05,-0.084,0.933,-4.69e-05,4.3e-05
total_post_karma,0.3034,0.024,12.640,0.000,0.256,0.350
longevity,0.0514,0.057,0.895,0.371,-0.061,0.164
topic_0,-1.0291,0.413,-2.490,0.013,-1.840,-0.219
topic_1,-0.3159,0.294,-1.073,0.283,-0.893,0.261
topic_2,-0.1641,0.318,-0.516,0.606,-0.788,0.459

0,1,2,3
Omnibus:,615.052,Durbin-Watson:,1.943
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2549.926
Skew:,-0.957,Prob(JB):,0.0
Kurtosis:,7.119,Cond. No.,42200.0
