In [1]:
# data manipulation imports
import numpy as np
import pandas as pd

# data saving imports
import pickle
import os

# custom imports
from regression_class import LogisticRegression as logreg
from regression_class import TimestampClass

In [2]:
# --- input pickles (loaded further down) ---
regression_infile = 'regression_thread_data.p'
thread_infile = 'clean_5_thread_data.p'

# --- output locations ---
outdir = 'linear_regression_test'
metrics_outfile = 'regression_metrics'

In [3]:
# feature columns handed to the regression class via regression_params['x_cols']
X_COLS = [
    'sentiment_sign',
    'sentiment_magnitude',
    'hour',
    'num_dayofweek',
    'activity_ratio',
    'mean_author_sentiment_sign',
    'mean_author_sentiment_magnitude',
    'log_author_all_activity_count',
]

# settings dict for the regression class; more keys are added to it before
# the model object is built
regression_params = dict(
    collection_window=7,
    validation_window=7,
    FSS=True,
    performance_scoring_method='roc_auc',
    x_cols=X_COLS,
    y_col='success',
    metrics=['roc_auc', 'aic'],
    # activity_threshold=2,  (currently disabled)
)

In [4]:
# record which input files this run used (intended to be saved to a spreadsheet)
out_params = {
    'regression_infile': regression_infile,
    'thread_infile': thread_infile,
}

# make out directory; exist_ok=True replaces the separate isdir() check, so
# there is no gap between checking and creating, and missing parent
# directories are created as well
os.makedirs(outdir, exist_ok=True)


In [5]:
# read in files
# NOTE: pickle.load can execute arbitrary code embedded in the file; only load
# pickles that were produced by this project.
# The `with` blocks close each file handle as soon as its pickle has been read.
with open(regression_infile, 'rb') as infile:
    regression_df = pickle.load(infile)
with open(thread_infile, 'rb') as infile:
    thread_df = pickle.load(infile)

In [6]:
# pick the subreddit to model and attach its slices of the loaded data
subreddit = 'crypto'
regression_params['name'] = subreddit
regression_params['regression_data'] = regression_df[subreddit]
regression_params['thread_data'] = thread_df[subreddit]

# Construct the model from the class under its real name. `logreg` is also the
# alias the class was imported under, so writing `logreg = logreg(...)` replaces
# the class with the instance and a second run of this cell would call the
# instance instead of the class. The instance keeps the name `logreg` because
# later cells refer to it by that name.
from regression_class import LogisticRegression
logreg = LogisticRegression(regression_params)

In [7]:
# Manually stage the model object for a single period so its regression data
# can be pulled out in a later cell.
# NOTE(review): the required call order is not visible from this notebook, so
# the statements are kept exactly in their original order.
logreg.calc_author_thread_counts()
logreg.period_counter = 1
date_index=0
# empty entry under key 1 — presumably keyed by period_counter; TODO confirm
logreg.model_data[1] = {}
# presumably populates logreg.regression_model_data for date_index; verify in regression_class
logreg.get_regression_model_data(date_index)

In [8]:
# dependent-variable columns
y_cols = ['thread_size', 'authors']

# explanatory-variable columns, one per line
X_cols = [
    'sentiment_sign',
    'sentiment_magnitude',
    'activity_ratio',
    'log_author_all_activity_count',
    'mean_author_sentiment_sign',
    'mean_author_sentiment_magnitude',
    'num_dayofweek',
    'hour',
]

In [9]:
# https://scikit-learn.org/stable/modules/linear_model.html
# https://realpython.com/linear-regression-in-python/

In [10]:
# NOTE: this binds the model's own data object, not a copy, so any column
# assigned to regression_data is presumably also added to
# logreg.regression_model_data (assumes the attribute is a plain DataFrame — TODO confirm).
regression_data = logreg.regression_model_data

In [11]:
# add a natural-log version of each target column (the aim is a continuous
# dependent variable for the linear regression)
for target in y_cols:
    log_name = f'log_{target}'
    regression_data[log_name] = np.log(regression_data[target])

In [12]:
# point y_cols at the log-transformed column names; names that already carry
# the prefix are left alone so re-running this cell cannot produce 'log_log_*'
y_cols = [name if name.startswith('log_') else f'log_{name}' for name in y_cols]

In [13]:
import statsmodels.api as sm

In [14]:
# single-feature design matrix (kept as a one-column DataFrame);
# the full feature set would be regression_data[X_cols]
X = regression_data.loc[:, ['log_author_all_activity_count']]

In [15]:

# response series: y0 is the first column named in y_cols, y1 the second
y0, y1 = regression_data[y_cols[0]], regression_data[y_cols[1]]

In [16]:
# OLS of y0 on X exactly as given: sm.OLS does not add an intercept by itself,
# so this fit has no constant term (regression through the origin)
fit0 = sm.OLS(y0, X).fit()

In [17]:
# same regression, with an explicit constant column added by sm.add_constant
# so that an intercept is estimated
fit0_with_cst = sm.OLS(y0, sm.add_constant(X)).fit()

In [18]:
from patsy import dmatrices

In [46]:
# build the response / design matrices from a formula; with no '- 1' term the
# design matrix includes an Intercept column
patsy_y, patsy_X = dmatrices(
    'log_thread_size ~ log_author_all_activity_count', regression_data, return_type='dataframe',
)

In [47]:
# Fit directly from the formula instead of from the shared patsy_y / patsy_X
# names: another cell in this notebook rebinds those names to the no-intercept
# matrices, so fitting from them here depends on which cell ran last. The
# formula interface builds the intercept-included design matrix itself, which
# makes this cell self-contained and safe to re-run.
patsy_model_with_cst = sm.OLS.from_formula(
    'log_thread_size ~ log_author_all_activity_count',
    data=regression_data,
).fit()


In [48]:
# same formula with '- 1' to drop the intercept, then OLS on the resulting matrices
patsy_y, patsy_X = dmatrices(
    'log_thread_size ~ log_author_all_activity_count - 1', regression_data, return_type='dataframe',
)
patsy_model_no_cst = sm.OLS(endog=patsy_y, exog=patsy_X).fit()

In [20]:
from sklearn.linear_model import LinearRegression

In [53]:
# inspect the log thread-size response series (pandas abbreviates the display)
y0

0       7.809947
1       0.000000
2       0.000000
3       0.000000
4       0.693147
          ...   
7514    0.000000
7515    0.000000
7516    1.945910
7517    0.000000
7518    0.000000
Name: log_thread_size, Length: 7519, dtype: float64

In [54]:
# scikit-learn counterparts of the two OLS fits: one estimating an intercept
# (the LinearRegression default, spelled out here) and one forced through the origin
sklearn_model_with_intercept = LinearRegression(fit_intercept=True).fit(X, y0)
sklearn_model_no_intercept = LinearRegression(fit_intercept=False).fit(X, y0)

In [21]:
# Run forward sequential selection over the full feature set, scored by R^2.
# The feature frame is passed inline rather than rebinding X: X must remain the
# single-column design matrix, because the cells that score / predict with the
# sklearn models use X and those models were fitted on one feature. Overwriting
# X with all eight columns makes those cells fail on a top-to-bottom run.
FSS_results = logreg.forward_sequential_selection(
    regression_data[X_cols], y0, scoring_method='r2', model=LinearRegression(),
)

  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  ret = ret.dtype.type(ret / rcount)


In [23]:
# per-step metrics table returned by forward_sequential_selection
FSS_results['metric_df']

Unnamed: 0,feature_idx,cv_scores,avg_score,feature_names,ci_bound,std_dev,std_err
1,"(3,)",[0.23232736025623224],0.232327,"(log_author_all_activity_count,)",,0.0,
2,"(2, 3)",[0.238257205277428],0.238257,"(activity_ratio, log_author_all_activity_count)",,0.0,
3,"(1, 2, 3)",[0.2391542461462911],0.239154,"(sentiment_magnitude, activity_ratio, log_auth...",,0.0,
4,"(0, 1, 2, 3)",[0.24003966017462175],0.24004,"(sentiment_sign, sentiment_magnitude, activity...",,0.0,
5,"(0, 1, 2, 3, 5)",[0.24070218595125026],0.240702,"(sentiment_sign, sentiment_magnitude, activity...",,0.0,
6,"(0, 1, 2, 3, 4, 5)",[0.24092014688115115],0.24092,"(sentiment_sign, sentiment_magnitude, activity...",,0.0,
7,"(0, 1, 2, 3, 4, 5, 6)",[0.24103418260071263],0.241034,"(sentiment_sign, sentiment_magnitude, activity...",,0.0,
8,"(0, 1, 2, 3, 4, 5, 6, 7)",[0.24104111258858574],0.241041,"(sentiment_sign, sentiment_magnitude, activity...",,0.0,


In [27]:
# save the text summary of the no-constant OLS fit into the output directory
summary_path = f'{outdir}/test_outfile.txt'
with open(summary_path, 'w') as handle:
    handle.write(str(fit0.summary()))

In [63]:
# report slope, intercept and R^2 for each sklearn fit (no-intercept model first)
fitted_sklearn_models = (sklearn_model_no_intercept, sklearn_model_with_intercept)
for fitted in fitted_sklearn_models:
    print(f'coef: {fitted.coef_}')
    print(f'intercept: {fitted.intercept_}')
    print(f'R square: {fitted.score(X, y0)}')

coef: [0.54884739]
intercept: 0.0
R square: 0.16941983430452046
coef: [0.40543411]
intercept: 0.5960929965751969
R square: 0.23232736025623224


In [66]:
import sklearn.metrics as sklm

In [67]:
# R^2 of the through-the-origin sklearn model, recomputed from its own predictions
y_predicted = sklearn_model_no_intercept.predict(X)
r_2 = sklm.r2_score(y_true=y0, y_pred=y_predicted)

In [68]:
# r2_score value for the no-intercept sklearn model's predictions
r_2

0.16941983430452046

In [70]:
# For a model without a constant, statsmodels reports the *uncentered* R^2
# (its summary labels it "R-squared (uncentered)"), so this value is not
# comparable with sklearn's r2_score for the same coefficient.
fit0.rsquared

0.4879597115850822

In [69]:
# show every statsmodels fit's summary followed by sklearn's r2_score of its
# in-sample predictions, so the R^2 conventions can be compared side by side
names = ['sm OLS', 'patsy OLS no cst', 'sm OLS with cst', 'sm OLS with patsy matrices with cst']
fitted_results = (fit0, patsy_model_no_cst, fit0_with_cst, patsy_model_with_cst)
for label, fitted in zip(names, fitted_results):
    print(label)
    display(fitted.summary())
    print('r squared')
    print(sklm.r2_score(y0, fitted.predict()))

sm OLS


0,1,2,3
Dep. Variable:,log_thread_size,R-squared (uncentered):,0.488
Model:,OLS,Adj. R-squared (uncentered):,0.488
Method:,Least Squares,F-statistic:,7164.0
Date:,"Mon, 26 Jun 2023",Prob (F-statistic):,0.0
Time:,18:06:34,Log-Likelihood:,-14149.0
No. Observations:,7519,AIC:,28300.0
Df Residuals:,7518,BIC:,28310.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
log_author_all_activity_count,0.5488,0.006,84.643,0.000,0.536,0.562

0,1,2,3
Omnibus:,802.455,Durbin-Watson:,1.912
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1420.279
Skew:,0.73,Prob(JB):,3.89e-309
Kurtosis:,4.549,Cond. No.,1.0


r squared
0.16941983430452046
patsy OLS no cst


0,1,2,3
Dep. Variable:,log_thread_size,R-squared (uncentered):,0.488
Model:,OLS,Adj. R-squared (uncentered):,0.488
Method:,Least Squares,F-statistic:,7164.0
Date:,"Mon, 26 Jun 2023",Prob (F-statistic):,0.0
Time:,18:06:34,Log-Likelihood:,-14149.0
No. Observations:,7519,AIC:,28300.0
Df Residuals:,7518,BIC:,28310.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
log_author_all_activity_count,0.5488,0.006,84.643,0.000,0.536,0.562

0,1,2,3
Omnibus:,802.455,Durbin-Watson:,1.912
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1420.279
Skew:,0.73,Prob(JB):,3.89e-309
Kurtosis:,4.549,Cond. No.,1.0


r squared
0.16941983430452046
sm OLS with cst


0,1,2,3
Dep. Variable:,log_thread_size,R-squared:,0.232
Model:,OLS,Adj. R-squared:,0.232
Method:,Least Squares,F-statistic:,2275.0
Date:,"Mon, 26 Jun 2023",Prob (F-statistic):,0.0
Time:,18:06:34,Log-Likelihood:,-13853.0
No. Observations:,7519,AIC:,27710.0
Df Residuals:,7517,BIC:,27720.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.5961,0.024,24.819,0.000,0.549,0.643
log_author_all_activity_count,0.4054,0.009,47.696,0.000,0.389,0.422

0,1,2,3
Omnibus:,1022.533,Durbin-Watson:,1.998
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1597.378
Skew:,0.958,Prob(JB):,0.0
Kurtosis:,4.195,Cond. No.,4.09


r squared
0.23232736025623224
sm OLS with patsy matrices with cst


0,1,2,3
Dep. Variable:,log_thread_size,R-squared:,0.232
Model:,OLS,Adj. R-squared:,0.232
Method:,Least Squares,F-statistic:,2275.0
Date:,"Mon, 26 Jun 2023",Prob (F-statistic):,0.0
Time:,18:06:34,Log-Likelihood:,-13853.0
No. Observations:,7519,AIC:,27710.0
Df Residuals:,7517,BIC:,27720.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.5961,0.024,24.819,0.000,0.549,0.643
log_author_all_activity_count,0.4054,0.009,47.696,0.000,0.389,0.422

0,1,2,3
Omnibus:,1022.533,Durbin-Watson:,1.998
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1597.378
Skew:,0.958,Prob(JB):,0.0
Kurtosis:,4.195,Cond. No.,4.09


r squared
0.23232736025623224
