In [30]:
import pickle
import pandas as pd
from regression_class import RedditRegression as RR
from regression_class import TimestampClass
import numpy as np
import statsmodels.formula.api as smf

In [2]:
# Load the pickled test fixture and split it into the two inputs used below.
# The context manager closes the file handle as soon as loading finishes.
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
with open('test_data_crypto_5_days.p', 'rb') as data_file:
    data = pickle.load(data_file)
regression_threads = data['regression_data']
thread_data = data['all_data']

In [3]:
def get_weekday(value):
    """Return the day-of-week index of ``value``.

    Args:
        value: Any object exposing a ``weekday()`` method, e.g. a
            ``datetime.date``/``datetime.datetime`` (Monday is 0, Sunday is 6).

    Returns:
        Whatever ``value.weekday()`` returns.
    """
    return value.weekday()

def get_weekend_or_weekday(value):
    """Classify ``value`` as falling on a weekday or a weekend.

    Args:
        value: Any object exposing a ``weekday()`` method, e.g. a
            ``datetime.date``/``datetime.datetime`` (Monday is 0, Sunday is 6).

    Returns:
        ``"Weekday"`` when the day index is below 5, otherwise ``"Weekend"``.
    """
    return "Weekday" if value.weekday() < 5 else "Weekend"

In [4]:
# Label each thread "Weekday"/"Weekend" from its timestamp and one-hot encode the label.
weekday_or_weekend = regression_threads.timestamp.apply(get_weekend_or_weekday)
category_map = pd.get_dummies(weekday_or_weekend)
# Drop dummy columns left behind by a previous execution of this cell before
# concatenating; otherwise re-running the cell would append a second copy of
# each dummy column to regression_threads.
regression_threads = pd.concat(
    (
        regression_threads.drop(columns=category_map.columns, errors="ignore"),
        category_map,
    ),
    axis=1,
)

In [5]:
# Candidate predictor columns: the fixed sentiment/activity features followed by
# the day-type dummy columns produced by get_dummies.
X_COLS = [
    "sentiment_sign",
    "sentiment_magnitude",
    "activity_ratio",
    "mean_author_sentiment_sign",
    "mean_author_sentiment_magnitude",
    "author_all_activity_count",
    *category_map.columns,
]

# Window settings and predictor list forwarded as keyword arguments below.
extra_params = dict(
    collection_window=2,
    model_window=2,
    validation_window=1,
    x_cols=X_COLS,
)

# 'dummy' and 'logistic' are presumably the run name and regression type
# (they match the logger name in the output) — confirm in regression_class.
param_dict = RR.create_param_dict(
    "dummy",
    "logistic",
    regression_threads,
    thread_data,
    **extra_params,
)

In [6]:
# Build the regression object from the parameter dict and run its main pipeline.
# The log output shows it running FSS and then fitting models 1 through 8.
test_regmod = RR(param_dict)
test_regmod.main()

regression_class_dummy_logistic - INFO - Running FSS
regression_class_dummy_logistic - INFO - Model 1
regression_class_dummy_logistic - INFO - Model 2
regression_class_dummy_logistic - INFO - Model 3
regression_class_dummy_logistic - INFO - Model 4
regression_class_dummy_logistic - INFO - Model 5
regression_class_dummy_logistic - INFO - Model 6
regression_class_dummy_logistic - INFO - Model 7
regression_class_dummy_logistic - INFO - Model 8


In [7]:
# Show the per-step FSS table: chosen feature indices/names and their scores.
test_regmod.FSS_metrics['metric_df']

Unnamed: 0,feature_idx,cv_scores,avg_score,feature_names,ci_bound,std_dev,std_err
1,"(3,)",[0.5414017017465294],0.541402,"(mean_author_sentiment_sign,)",,0.0,
2,"(3, 4)",[0.5607030900134349],0.560703,"(mean_author_sentiment_sign, mean_author_senti...",,0.0,
3,"(3, 4, 5)",[0.5722122704881326],0.572212,"(mean_author_sentiment_sign, mean_author_senti...",,0.0,
4,"(0, 3, 4, 5)",[0.5728168383340797],0.572817,"(sentiment_sign, mean_author_sentiment_sign, m...",,0.0,
5,"(0, 1, 3, 4, 5)",[0.5738916256157636],0.573892,"(sentiment_sign, sentiment_magnitude, mean_aut...",,0.0,
6,"(0, 1, 3, 4, 5, 6)",[0.5738916256157636],0.573892,"(sentiment_sign, sentiment_magnitude, mean_aut...",,0.0,
7,"(0, 1, 3, 4, 5, 6, 7)",[0.5738916256157636],0.573892,"(sentiment_sign, sentiment_magnitude, mean_aut...",,0.0,
8,"(0, 1, 2, 3, 4, 5, 6, 7)",[0.5678235557545902],0.567824,"(sentiment_sign, sentiment_magnitude, activity...",,0.0,


In [8]:
# Show, per model number, the fitted coefficients with p-values and confidence bounds.
test_regmod.regression_metrics['regression_params']

{1:                                param        pvalue  conf_low  conf_high
 Intercept                   1.334357  4.855103e-25  1.081324   1.587391
 mean_author_sentiment_sign  0.169025  1.738894e-01 -0.074599   0.412648,
 2:                                     param        pvalue  conf_low  conf_high
 Intercept                        1.336446  4.942529e-25  1.082975   1.589917
 mean_author_sentiment_sign       0.180275  1.505774e-01 -0.065522   0.426072
 mean_author_sentiment_magnitude -0.086209  4.888404e-01 -0.330324   0.157907,
 3:                                     param        pvalue  conf_low  conf_high
 Intercept                        1.341728  5.583851e-25  1.086966   1.596489
 mean_author_sentiment_sign       0.147027  2.582891e-01 -0.107888   0.401942
 mean_author_sentiment_magnitude -0.059266  6.420402e-01 -0.309151   0.190620
 author_all_activity_count        0.139341  3.626252e-01 -0.160649   0.439331,
 4:                                     param        pvalue  conf_l

In [9]:
# Keep the "Weekday"/"Weekend" label as a single string column so a formula can
# reference it as a categorical term, C(day_of_week), instead of the dummy columns.
regression_threads['day_of_week'] = weekday_or_weekend
regression_threads

Unnamed: 0,thread_id,thread_size,authors,timestamp,author,score,subject_sentiment_score,sentiment_sign,sentiment_magnitude,success,Weekday,Weekend,day_of_week
0,xsglev,2541,439,2022-10-01 00:00:13,8ac426fd80f0ea3761bdcd7f32591b09ce1c1366e59ef15c,1,0.0000,0.0,0.0000,1,0,1,Weekend
1,xsgqgi,1,1,2022-10-01 00:06:27,41745e3272e1a87209a6a5749a2dafbd5cd3cf893eef1f2b,1,0.7096,1.0,0.7096,0,0,1,Weekend
2,xsgqkx,1,1,2022-10-01 00:06:36,083ef865b12614c169cdba4e71069fd8be0cc39865222e24,1,0.3804,1.0,0.3804,0,0,1,Weekend
3,xsgsb3,1,1,2022-10-01 00:08:49,d003c719ed03f48fc76d427eb3056f79590f84c7351b32a8,1,0.0000,0.0,0.0000,0,0,1,Weekend
4,xsgz7t,22,16,2022-10-01 00:17:36,1230e32c76caf095a665b716ae1b0101adc0161facd5ea70,1,-0.6858,-1.0,0.6858,1,0,1,Weekend
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2412,xwq30h,1,1,2022-10-05 23:40:14,a1a76e304e6725896c737cad404f2c324eb47fc22fc9a364,1,0.0000,0.0,0.0000,0,1,0,Weekday
2413,xwq41v,1,1,2022-10-05 23:41:34,8bed7f272474ed9b9360c9d00e57f2abd9a695a78b6bf18b,1,0.0000,0.0,0.0000,0,1,0,Weekday
2414,xwq4tj,6,5,2022-10-05 23:42:26,b4dbb952a0d2ac5a587f1c9b4cb680c2c70a782fed8985a3,1,0.0000,0.0,0.0000,1,1,0,Weekday
2415,xwqcyg,1,1,2022-10-05 23:52:48,73c82df7d9caa907c1f936e3277540d5c7ccce7b8e530f7a,1,0.0000,0.0,0.0000,0,1,0,Weekday


In [10]:
# Formula for the hand-specified model: the sentiment/author features plus
# day_of_week as a categorical term. Adjacent string literals are concatenated
# into one formula string.
mod_key = (
    'success ~ sentiment_sign + sentiment_magnitude'
    ' + mean_author_sentiment_sign + mean_author_sentiment_magnitude'
    ' + author_all_activity_count + C(day_of_week)'
)

In [11]:
# Settings copied unchanged from param_dict into the formula-based parameter dict.
keys_to_use = [
    'name',
    'regression_data',
    'thread_data',
    'collection_window',
    'model_window',
    'validation_window',
    'scale',
    'regression_type',
    'metrics',
    'thresholds',
]

In [12]:
# Start from the explicit formula (stored under model key 0) and copy across
# the shared settings listed in keys_to_use.
new_param_dict = {
    "models": {0: mod_key},
    **{key: param_dict[key] for key in keys_to_use},
}

In [13]:
# Second regression object, configured with an explicit formula under "models"
# rather than an x_cols list.
new_test_regmost = RR(new_param_dict)

In [14]:
# Run the data-preparation step by hand instead of calling main().
# Presumably this builds the calibration/validation frames later read from
# __model_data__ — confirm in regression_class.
new_test_regmost.get_cal_val_data()

In [15]:
# Use the formulas supplied under the "models" parameter directly as the model strings.
new_test_regmost.sm_modstrings = new_test_regmost.regression_params["models"]

In [16]:
# Fit the models held in sm_modstrings (only model 0 here).
# NOTE(review): the log prints "Model 0" twice — possibly a duplicated log
# handler from creating a second RR instance; confirm in regression_class.
new_test_regmost.run_models()


regression_class_dummy_logistic - INFO - Model 0
regression_class_dummy_logistic - INFO - Model 0


In [24]:
# Summary row for the formula-based model: AUCs, optimizer, iterations, convergence.
new_test_regmost.regression_metrics['metrics']

Unnamed: 0,model_key,num_features,model,cal_auc,val_auc,optimizer,iterations,converged
0,0,6,success ~ sentiment_sign + sentiment_magnitude...,0.573981,0.610286,newton,6,True


In [22]:
# Coefficient table for model 0. Note that the displayed table contains no
# C(day_of_week) term even though the formula includes one.
new_test_regmost.regression_metrics['regression_params']

{0:                                     param        pvalue  conf_low  conf_high
 Intercept                        1.342241  5.615846e-25  1.087368   1.597113
 sentiment_sign                  -0.040360  7.605434e-01 -0.299917   0.219197
 sentiment_magnitude              0.015799  9.031170e-01 -0.238591   0.270189
 mean_author_sentiment_sign       0.150992  2.476894e-01 -0.105015   0.407000
 mean_author_sentiment_magnitude -0.055100  6.671896e-01 -0.306246   0.196045
 author_all_activity_count        0.133483  3.848425e-01 -0.167574   0.434540}

In [25]:
# Check that the formula stored on the object still includes C(day_of_week).
new_test_regmost.sm_modstrings

{0: 'success ~ sentiment_sign + sentiment_magnitude + mean_author_sentiment_sign + mean_author_sentiment_magnitude + author_all_activity_count + C(day_of_week)'}

In [28]:
# Inspect the calibration frame the model is fitted on.
# NOTE(review): every displayed row has day_of_week == "Weekday"; if the whole
# calibration set holds a single level, that would explain the missing
# C(day_of_week) coefficient — confirm with .day_of_week.value_counts().
new_test_regmost.__model_data__['cal']

Unnamed: 0,success,sentiment_magnitude,sentiment_sign,author_all_activity_count,mean_author_sentiment_magnitude,mean_author_sentiment_sign,day_of_week
0,1,-0.913613,-0.113338,-0.748332,-1.146116,-0.407579,Weekday
2,0,0.810539,1.186512,-0.722727,3.436224,-1.499416,Weekday
10,1,1.603523,-1.413189,-0.645913,-0.383034,-1.499416,Weekday
11,1,1.669003,1.186512,1.479292,-1.101720,0.684257,Weekday
12,1,-0.913613,-0.113338,-0.569098,1.320416,0.684257,Weekday
...,...,...,...,...,...,...,...
972,1,1.316766,1.186512,-0.748332,1.393392,0.684257,Weekday
977,1,-0.110243,1.186512,-0.697122,0.123638,0.684257,Weekday
993,1,-0.913613,-0.113338,0.275863,0.302432,0.684257,Weekday
994,1,-0.913613,-0.113338,0.685541,-0.808913,0.684257,Weekday


In [None]:
# How the class builds its statsmodels model, rewritten against this notebook's
# instance so the cell runs at top level. The pasted snippet referenced `self`
# (undefined in the notebook) and indexed sm_modstrings with the formula string
# held in `mod_key`; the formula is stored under model key 0.
smf_function_name = new_test_regmost.SMF_FUNCTIONS[
    new_test_regmost.regression_params["regression_type"]
]
smf_model = getattr(smf, smf_function_name)(
    new_test_regmost.sm_modstrings[0],
    data=new_test_regmost.__model_data__["cal"],
)

In [31]:
# Build the same logit model directly with statsmodels, bypassing the class,
# using the formula string and the calibration frame.
smf_model = smf.logit(mod_key, data=new_test_regmost.__model_data__["cal"])

In [32]:
# Fit the logit model; statsmodels prints the optimizer's termination summary.
modelfit = smf_model.fit()

Optimization terminated successfully.
         Current function value: 0.509240
         Iterations 6


In [33]:
# Pass the hand-fitted result through the class's metric extraction as model key 0
# to compare with the metrics produced by run_models().
new_test_regmost.get_regression_metrics(modelfit, 0)

{'model_key': 0,
 'num_features': 6,
 'model': 'success ~ sentiment_sign + sentiment_magnitude + mean_author_sentiment_sign + mean_author_sentiment_magnitude + author_all_activity_count + C(day_of_week)',
 'cal_auc': 0.5739811912225705,
 'val_auc': 0.6102857142857142,
 'optimizer': 'newton',
 'iterations': 6,
 'converged': True}

In [34]:
# Build the coefficient table from the hand-fitted result via the class helper.
new_test_regmost.get_model_metrics_from_smf_mod(modelfit)

Unnamed: 0,param,pvalue,conf_low,conf_high
Intercept,1.342241,5.615846e-25,1.087368,1.597113
sentiment_sign,-0.04036,0.7605434,-0.299917,0.219197
sentiment_magnitude,0.015799,0.903117,-0.238591,0.270189
mean_author_sentiment_sign,0.150992,0.2476894,-0.105015,0.407
mean_author_sentiment_magnitude,-0.0551,0.6671896,-0.306246,0.196045
author_all_activity_count,0.133483,0.3848425,-0.167574,0.43454


In [35]:
# Raw statsmodels coefficients, for comparison with the class-produced table.
modelfit.params

Intercept                          1.342241
sentiment_sign                    -0.040360
sentiment_magnitude                0.015799
mean_author_sentiment_sign         0.150992
mean_author_sentiment_magnitude   -0.055100
author_all_activity_count          0.133483
dtype: float64

In [36]:
# Raw statsmodels p-values, for comparison with the class-produced table.
modelfit.pvalues

Intercept                          5.615846e-25
sentiment_sign                     7.605434e-01
sentiment_magnitude                9.031170e-01
mean_author_sentiment_sign         2.476894e-01
mean_author_sentiment_magnitude    6.671896e-01
author_all_activity_count          3.848425e-01
dtype: float64