In [1]:
# standard library (file system + serialisation)
import os
import pickle

# data manipulation imports
import numpy as np
import pandas as pd

# custom imports
from regression_class import LogisticRegression
from regression_class import LogisticRegression as logreg
from regression_class import TimestampClass

In [2]:
# input pickles: each is loaded below and then indexed by subreddit name
regression_infile, thread_infile = (
    'regression_thread_data.p',
    'clean_5_thread_data.p',
)

# output directory and the base name for the metrics file
outdir, metrics_outfile = 'linear_regression_test', 'regression_metrics'

In [3]:
# predictor columns, passed to the regression class through regression_params['x_cols']
X_COLS = [
    'sentiment_sign',
    'sentiment_magnitude',
    'hour',
    'num_dayofweek',
    'activity_ratio',
    'mean_author_sentiment_sign',
    'mean_author_sentiment_magnitude',
    'log_author_all_activity_count',
]

# settings handed to the regression class constructor.
# NOTE(review): the units of the two *_window values are not visible here — confirm in regression_class.
regression_params = dict(
    collection_window=7,
    validation_window=7,
    FSS=True,
    performance_scoring_method='roc_auc',
    x_cols=X_COLS,
    y_col='success',
    metrics=['roc_auc', 'aic'],
    # activity_threshold=2,  (currently disabled)
)

In [4]:
# record the run's inputs so they can be saved to a spreadsheet with the results
# (a plain dict at this point, not yet a DataFrame)
out_params = {}
out_params['regression_infile'] = regression_infile
out_params['thread_infile'] = thread_infile

# make out directory; exist_ok keeps the cell safe to re-run and avoids the
# check-then-create race of isdir() + mkdir(), and makedirs also handles nested paths
os.makedirs(outdir, exist_ok=True)


In [5]:
# read in files; context managers make sure both handles are closed after loading
# NOTE(review): pickle.load can execute arbitrary code — only load pickles from a trusted source
with open(regression_infile, 'rb') as regression_fh:
    regression_df = pickle.load(regression_fh)
with open(thread_infile, 'rb') as thread_fh:
    thread_df = pickle.load(thread_fh)

In [7]:
# pick the subreddit to model and attach its slices of the loaded data to the params dict
subreddit = 'crypto'
regression_params['name'] = subreddit
regression_params['regression_data'] = regression_df[subreddit]
regression_params['thread_data'] = thread_df[subreddit]

# Instantiate through the un-aliased class name: the original `logreg = logreg(...)`
# rebound the imported class alias to an instance, so running this cell a second
# time raised TypeError. The instance keeps the name `logreg` for the cells below.
logreg = LogisticRegression(regression_params)

In [8]:
# Set up the model object for a single period and build its regression data.
# The methods are defined in regression_class (not shown here), so the notes
# below are assumptions to confirm against that module.
logreg.calc_author_thread_counts()
logreg.period_counter = 1
date_index=0
# empty container for period 1 — presumably keyed by period_counter; TODO confirm
logreg.model_data[1] = {}
# presumably populates logreg.regression_model_data, which the next cell inspects — verify
logreg.get_regression_model_data(date_index)

In [9]:
# inspect the per-thread model data held by the regression object
logreg.regression_model_data

Unnamed: 0,thread_id,thread_size,authors,timestamp,author,score,subject_sentiment_score,sentiment_sign,sentiment_magnitude,success,activity_ratio,log_author_all_activity_count,mean_author_sentiment_sign,mean_author_sentiment_magnitude,num_dayofweek,hour
0,xye4t1,2465,461,2022-10-08 00:00:12,8ac426fd80f0ea3761bdcd7f32591b09ce1c1366e59ef15c,1,0.0000,0.0,0.0000,1,-1.000000,2.079442,0.0,0.000000,5,0
1,xye75w,1,1,2022-10-08 00:03:05,5feacb776c7511b810b777cb265908e8a581dc7c386ef863,1,0.0000,0.0,0.0000,0,1.000000,1.098612,1.0,0.065750,5,0
2,xye7jc,1,1,2022-10-08 00:03:35,e5196175bfa86393fe5afccfdde933fcda5157e5179fe999,1,-0.4215,-1.0,0.4215,0,0.166667,2.564949,1.0,0.155407,5,0
3,xye7ld,1,1,2022-10-08 00:03:40,027c79d57adefc2fbeedef8d48e8103595d5e004ef7c868f,1,0.0000,0.0,0.0000,0,0.000000,0.000000,0.0,0.000000,5,0
4,xye83c,2,2,2022-10-08 00:04:24,441b7c85ea0cabf086a5374b5a1ef0246555e72c9e583f3d,1,0.0000,0.0,0.0000,1,0.000000,0.000000,0.0,0.000000,5,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7514,ybwfba,1,1,2022-10-23 23:41:58,87cc13a23209e8bd6be34f3e2cee0bcb489f4f18e10d6c49,1,0.4588,1.0,0.4588,0,0.000000,0.000000,0.0,0.000000,6,23
7515,ybwkye,1,1,2022-10-23 23:49:20,a5af040923969a80e3e8eae621a19e68b9516c823f5c0bae,1,0.0000,0.0,0.0000,0,0.000000,0.000000,0.0,0.000000,6,23
7516,ybwm9w,7,7,2022-10-23 23:51:03,9b442484e60a79ad633aad52d556afbb4a77f8028a54360a,1,0.0000,0.0,0.0000,1,0.822222,4.912655,1.0,0.017225,6,23
7517,ybwoie,1,1,2022-10-23 23:54:05,846ec1b656f9d6fa3ccacb8a9ea22add11dcaca6493a0854,1,0.0000,0.0,0.0000,0,0.000000,0.000000,0.0,0.000000,6,23


In [10]:
# list the available columns to choose dependent and independent variables from
logreg.regression_model_data.columns

Index(['thread_id', 'thread_size', 'authors', 'timestamp', 'author', 'score',
       'subject_sentiment_score', 'sentiment_sign', 'sentiment_magnitude',
       'success', 'activity_ratio', 'log_author_all_activity_count',
       'mean_author_sentiment_sign', 'mean_author_sentiment_magnitude',
       'num_dayofweek', 'hour'],
      dtype='object')

In [11]:
# dependent variables for the linear regressions (raw counts; log-transformed later)
y_cols = [
    'thread_size',
    'authors',
]

# independent variables; same names as X_COLS but in the column order of the model data
X_cols = [
    'sentiment_sign',
    'sentiment_magnitude',
    'activity_ratio',
    'log_author_all_activity_count',
    'mean_author_sentiment_sign',
    'mean_author_sentiment_magnitude',
    'num_dayofweek',
    'hour',
]

In [13]:
# https://scikit-learn.org/stable/modules/linear_model.html
# https://realpython.com/linear-regression-in-python/

In [14]:
# Work on a copy: later cells add columns to regression_data, and without the copy
# those writes would silently modify the model object's own frame (already displayed
# above) and can trigger pandas' SettingWithCopyWarning if that frame is a slice.
regression_data = logreg.regression_model_data.copy()

In [16]:
# linear regression wants a continuous dependent variable, so store the natural
# log of each y column next to the original as 'log_<name>'
for count_col in y_cols:
    log_name = 'log_' + count_col
    regression_data[log_name] = np.log(regression_data[count_col])

In [17]:
# point y_cols at the log-transformed columns; names that already carry the prefix
# are left unchanged so re-running this cell does not yield 'log_log_*' names
y_cols = [x if x.startswith('log_') else f'log_{x}' for x in y_cols]

In [20]:
import statsmodels.api as sm

In [26]:
# check that the log_thread_size / log_authors columns were added
regression_data

Unnamed: 0,thread_id,thread_size,authors,timestamp,author,score,subject_sentiment_score,sentiment_sign,sentiment_magnitude,success,activity_ratio,log_author_all_activity_count,mean_author_sentiment_sign,mean_author_sentiment_magnitude,num_dayofweek,hour,log_thread_size,log_authors
0,xye4t1,2465,461,2022-10-08 00:00:12,8ac426fd80f0ea3761bdcd7f32591b09ce1c1366e59ef15c,1,0.0000,0.0,0.0000,1,-1.000000,2.079442,0.0,0.000000,5,0,7.809947,6.133398
1,xye75w,1,1,2022-10-08 00:03:05,5feacb776c7511b810b777cb265908e8a581dc7c386ef863,1,0.0000,0.0,0.0000,0,1.000000,1.098612,1.0,0.065750,5,0,0.000000,0.000000
2,xye7jc,1,1,2022-10-08 00:03:35,e5196175bfa86393fe5afccfdde933fcda5157e5179fe999,1,-0.4215,-1.0,0.4215,0,0.166667,2.564949,1.0,0.155407,5,0,0.000000,0.000000
3,xye7ld,1,1,2022-10-08 00:03:40,027c79d57adefc2fbeedef8d48e8103595d5e004ef7c868f,1,0.0000,0.0,0.0000,0,0.000000,0.000000,0.0,0.000000,5,0,0.000000,0.000000
4,xye83c,2,2,2022-10-08 00:04:24,441b7c85ea0cabf086a5374b5a1ef0246555e72c9e583f3d,1,0.0000,0.0,0.0000,1,0.000000,0.000000,0.0,0.000000,5,0,0.693147,0.693147
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7514,ybwfba,1,1,2022-10-23 23:41:58,87cc13a23209e8bd6be34f3e2cee0bcb489f4f18e10d6c49,1,0.4588,1.0,0.4588,0,0.000000,0.000000,0.0,0.000000,6,23,0.000000,0.000000
7515,ybwkye,1,1,2022-10-23 23:49:20,a5af040923969a80e3e8eae621a19e68b9516c823f5c0bae,1,0.0000,0.0,0.0000,0,0.000000,0.000000,0.0,0.000000,6,23,0.000000,0.000000
7516,ybwm9w,7,7,2022-10-23 23:51:03,9b442484e60a79ad633aad52d556afbb4a77f8028a54360a,1,0.0000,0.0,0.0000,1,0.822222,4.912655,1.0,0.017225,6,23,1.945910,1.945910
7517,ybwoie,1,1,2022-10-23 23:54:05,846ec1b656f9d6fa3ccacb8a9ea22add11dcaca6493a0854,1,0.0000,0.0,0.0000,0,0.000000,0.000000,0.0,0.000000,6,23,0.000000,0.000000


In [29]:
# design matrix: the predictor columns only (no constant column is added here)
X = regression_data[X_cols]

In [30]:

# one response series per dependent variable, in y_cols order
first_target, second_target = y_cols[0], y_cols[1]
y0, y1 = regression_data[first_target], regression_data[second_target]

In [31]:
# preview the first response series
y0

0       7.809947
1       0.000000
2       0.000000
3       0.000000
4       0.693147
          ...   
7514    0.000000
7515    0.000000
7516    1.945910
7517    0.000000
7518    0.000000
Name: log_thread_size, Length: 7519, dtype: float64

In [32]:
# preview the design matrix
X

Unnamed: 0,sentiment_sign,sentiment_magnitude,activity_ratio,log_author_all_activity_count,mean_author_sentiment_sign,mean_author_sentiment_magnitude,num_dayofweek,hour
0,0.0,0.0000,-1.000000,2.079442,0.0,0.000000,5,0
1,0.0,0.0000,1.000000,1.098612,1.0,0.065750,5,0
2,-1.0,0.4215,0.166667,2.564949,1.0,0.155407,5,0
3,0.0,0.0000,0.000000,0.000000,0.0,0.000000,5,0
4,0.0,0.0000,0.000000,0.000000,0.0,0.000000,5,0
...,...,...,...,...,...,...,...,...
7514,1.0,0.4588,0.000000,0.000000,0.0,0.000000,6,23
7515,0.0,0.0000,0.000000,0.000000,0.0,0.000000,6,23
7516,0.0,0.0000,0.822222,4.912655,1.0,0.017225,6,23
7517,0.0,0.0000,0.000000,0.000000,0.0,0.000000,6,23


In [35]:
# OLS of y0 on X with no intercept: sm.OLS does not add a constant itself, so this
# fit is forced through the origin and its summary reports an *uncentered* R-squared,
# which is not comparable to the R-squared of a model with an intercept
fit0 = sm.OLS(y0, X).fit()

In [41]:
# same regression with an explicit intercept ('const' column added by sm.add_constant)
fit0_with_cst = sm.OLS(y0, sm.add_constant(X)).fit()

In [39]:
# results for the no-intercept fit (R-squared shown here is uncentered)
fit0.summary()

0,1,2,3
Dep. Variable:,log_thread_size,R-squared (uncentered):,0.526
Model:,OLS,Adj. R-squared (uncentered):,0.525
Method:,Least Squares,F-statistic:,1041.0
Date:,"Wed, 21 Jun 2023",Prob (F-statistic):,0.0
Time:,14:55:45,Log-Likelihood:,-13861.0
No. Observations:,7519,AIC:,27740.0
Df Residuals:,7511,BIC:,27790.0
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
sentiment_sign,-0.0641,0.026,-2.478,0.013,-0.115,-0.013
sentiment_magnitude,0.4092,0.073,5.587,0.000,0.266,0.553
activity_ratio,0.2937,0.042,6.964,0.000,0.211,0.376
log_author_all_activity_count,0.3760,0.013,28.839,0.000,0.350,0.402
mean_author_sentiment_sign,-0.0571,0.038,-1.488,0.137,-0.132,0.018
mean_author_sentiment_magnitude,0.6590,0.177,3.724,0.000,0.312,1.006
num_dayofweek,0.0559,0.007,7.564,0.000,0.041,0.070
hour,0.0188,0.002,8.983,0.000,0.015,0.023

0,1,2,3
Omnibus:,1056.659,Durbin-Watson:,1.983
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1847.189
Skew:,0.925,Prob(JB):,0.0
Kurtosis:,4.573,Cond. No.,145.0


In [42]:
# results for the fit that includes an intercept
fit0_with_cst.summary()

0,1,2,3
Dep. Variable:,log_thread_size,R-squared:,0.241
Model:,OLS,Adj. R-squared:,0.24
Method:,Least Squares,F-statistic:,298.1
Date:,"Wed, 21 Jun 2023",Prob (F-statistic):,0.0
Time:,14:56:35,Log-Likelihood:,-13810.0
No. Observations:,7519,AIC:,27640.0
Df Residuals:,7510,BIC:,27700.0
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.5227,0.052,10.090,0.000,0.421,0.624
sentiment_sign,-0.0770,0.026,-2.994,0.003,-0.127,-0.027
sentiment_magnitude,0.2368,0.075,3.168,0.002,0.090,0.383
activity_ratio,0.3234,0.042,7.701,0.000,0.241,0.406
log_author_all_activity_count,0.3465,0.013,26.100,0.000,0.320,0.373
mean_author_sentiment_sign,-0.0556,0.038,-1.459,0.145,-0.130,0.019
mean_author_sentiment_magnitude,0.5014,0.176,2.841,0.005,0.155,0.847
num_dayofweek,0.0092,0.009,1.063,0.288,-0.008,0.026
hour,0.0007,0.003,0.262,0.793,-0.005,0.006

0,1,2,3
Omnibus:,1084.376,Durbin-Watson:,2.0
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1810.011
Skew:,0.969,Prob(JB):,0.0
Kurtosis:,4.421,Cond. No.,146.0


In [43]:
from patsy import dmatrices

In [44]:
# build the response and design matrices from an R-style formula; patsy adds the
# 'Intercept' column to the design matrix on its own
thread_size_formula = 'log_thread_size ~ log_author_all_activity_count + activity_ratio'
patsy_y, patsy_X = dmatrices(thread_size_formula, regression_data, return_type='dataframe')

In [45]:
# reduced model: only the two activity predictors named in the formula, plus the
# intercept that patsy put in patsy_X
patsy_model = sm.OLS(patsy_y, patsy_X).fit()
patsy_model.summary()

0,1,2,3
Dep. Variable:,log_thread_size,R-squared:,0.238
Model:,OLS,Adj. R-squared:,0.238
Method:,Least Squares,F-statistic:,1175.0
Date:,"Wed, 21 Jun 2023",Prob (F-statistic):,0.0
Time:,15:00:30,Log-Likelihood:,-13824.0
No. Observations:,7519,AIC:,27650.0
Df Residuals:,7516,BIC:,27670.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.6071,0.024,25.328,0.000,0.560,0.654
log_author_all_activity_count,0.3478,0.011,30.686,0.000,0.326,0.370
activity_ratio,0.3193,0.042,7.649,0.000,0.237,0.401

0,1,2,3
Omnibus:,1080.57,Durbin-Watson:,1.997
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1786.662
Skew:,0.971,Prob(JB):,0.0
Kurtosis:,4.389,Cond. No.,7.15
