In [1]:
# data manipulation imports
import pandas as pd
import numpy as np

# data saving imports
import pickle

# custom imports
from regression_class import RedditRegression as RR
from regression_class import TimestampClass
from regression_class import QuantileClass as qc

# stats imports
import statsmodels.formula.api as smf
from sklearn import linear_model
from sklearn import metrics
from mlxtend.feature_selection import SequentialFeatureSelector
from sklearn.preprocessing import StandardScaler

# plotting imports
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors

In [2]:
# Load the pickled test data set and pull out the two entries used below.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# this file must come from a trusted source.
# The context manager closes the file handle as soon as loading finishes
# (the bare open() call previously left it open until garbage collection).
with open('test_data_crypto_5_days.p', 'rb') as data_file:
    data = pickle.load(data_file)
regression_threads = data['regression_data']
all_data = data['all_data']

In [3]:
# Column names supplied to the regressions under the 'x_cols' parameter.
X_COLS = [
    "sentiment_sign", "sentiment_magnitude",
    "hour", "time_in_secs", "num_dayofweek",
    "activity_ratio",
    "mean_author_sentiment_sign", "mean_author_sentiment_magnitude",
    "author_all_activity_count",
]

# Quantile cut points; only the mnlogit configuration receives these.
quantiles = [0.25, 0.5, 0.75]

# Threshold settings handed to the regressions under the 'thresholds' key.
# NOTE(review): presumably minimum values per column - confirm against
# RedditRegression.
thresholds1 = {"author_all_activity_count": 2}
# Same author threshold plus a thread-size threshold (built as a separate dict).
thresholds2 = {**thresholds1, "thread_size": 2}

# Parameters common to every regression configuration; the per-model
# settings are layered on top of a copy of this dict later in the notebook.
regression_params = dict(
    regression_data=regression_threads,
    thread_data=all_data,
    collection_window=2,
    model_window=2,
    validation_window=1,
    FSS=True,
    x_cols=X_COLS,
    scale=True,
)

In [4]:
# One regression is configured per entry of regression_types.
regression_types = ['logistic', 'linear', 'mnlogit']

# Per-regression parameter values. Every list is aligned by position with
# regression_types: element i of each list belongs to regression_types[i].
to_vary = {
    'name': [f'crypto_{x}' for x in regression_types],
    'regression_type': regression_types,
    'y_col': ['success', 'thread_size', 'thread_size'],
    'metrics': [['auc'], ['r2'], ['mnlogit_accuracy', 'mnlogit_aucs', "mnlogit_mean_auc"]],
    'thresholds': [thresholds1, thresholds2, thresholds2],
    'quantiles': [[], [], quantiles],
}

# The lists are consumed by positional index, so a list that is too long would
# be silently ignored past the end and a misaligned one would go unnoticed.
# Fail fast here instead.
_misaligned = {
    param: len(values)
    for param, values in to_vary.items()
    if len(values) != len(regression_types)
}
assert not _misaligned, (
    f"to_vary entries need exactly {len(regression_types)} values each; "
    f"got lengths {_misaligned}"
)


In [5]:
# Build one RedditRegression per regression type, keyed by the type name.
regressions = {}
for i, regression_type in enumerate(regression_types):
    # Start from a shallow copy of the shared parameters and overlay the
    # i-th value of every per-regression setting.
    in_params = {
        **regression_params,
        **{key: to_vary[key][i] for key in to_vary},
    }
    regressions[regression_type] = RR(in_params)

In [6]:
# Run every configured regression in turn, announcing each one first.
for key, regression in regressions.items():
    print(f"Running {key}")
    regression.main()

Running logistic
Running FSS


  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  ret = ret.dtype.type(ret / rcount)


Model 1
Optimization terminated successfully.
         Current function value: 0.510608
         Iterations 5
Model 2
Optimization terminated successfully.
         Current function value: 0.508381
         Iterations 5
Model 3
Optimization terminated successfully.
         Current function value: 0.505657
         Iterations 5
Model 4
Optimization terminated successfully.
         Current function value: 0.504721
         Iterations 5
Model 5
Optimization terminated successfully.
         Current function value: 0.504703
         Iterations 5
Model 6
Optimization terminated successfully.
         Current function value: 0.503623
         Iterations 6
Model 7
Optimization terminated successfully.
         Current function value: 0.503604
         Iterations 6
Model 8
Optimization terminated successfully.
         Current function value: 0.503476
         Iterations 6
Model 9
Optimization terminated successfully.
         Current function value: 0.502169
         Iterations 6
Running linear

  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  ret = ret.dtype.type(ret / rcount)


Model 9
Running mnlogit
Running FSS


  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  ret = ret.dtype.type(ret / rcount)


Model 1
Optimization terminated successfully.
         Current function value: 1.368356
         Iterations 4
Model 2
Optimization terminated successfully.
         Current function value: 1.359630
         Iterations 5
Model 3
Optimization terminated successfully.
         Current function value: 1.342283
         Iterations 5
Model 4
Optimization terminated successfully.
         Current function value: 1.328100
         Iterations 6
Model 5
Optimization terminated successfully.
         Current function value: 1.317655
         Iterations 6
Model 6
Optimization terminated successfully.
         Current function value: 1.313113
         Iterations 6
Model 7
Optimization terminated successfully.
         Current function value: 1.312710
         Iterations 6
Model 8
Optimization terminated successfully.
         Current function value: 1.305442
         Iterations 6
Model 9
Optimization terminated successfully.
         Current function value: 1.302246
         Iterations 6


In [7]:
# Show the stored results of each regression under a banner with its name.
for key, regression in regressions.items():
    print(f"\n###{key}###\n\n")

    # Table stored under FSS_metrics['metric_df'].
    print(f"\nMetric df\n")
    display(regression.FSS_metrics['metric_df'])

    # quantile_data is only displayed for the mnlogit regression.
    if key == 'mnlogit':
        print(f"\nQuantile metrics\n")
        display(regression.quantile_data)

    # Table stored under regression_metrics['metrics'].
    print(f"\nRegression metrics\n")
    display(regression.regression_metrics['metrics'])

    print(f"\n\n#######################################\n\n")


###logistic###



Metric df



Unnamed: 0,feature_idx,cv_scores,avg_score,feature_names,ci_bound,std_dev,std_err
1,"(3,)",[0.5574339453649798],0.557434,"(time_in_secs,)",,0.0,
2,"(3, 6)",[0.5737348858038513],0.573735,"(time_in_secs, mean_author_sentiment_sign)",,0.0,
3,"(3, 4, 6)",[0.5920958351992835],0.592096,"(time_in_secs, num_dayofweek, mean_author_sent...",,0.0,
4,"(3, 4, 6, 7)",[0.6015226153157188],0.601523,"(time_in_secs, num_dayofweek, mean_author_sent...",,0.0,
5,"(2, 3, 4, 6, 7)",[0.6015226153157188],0.601523,"(hour, time_in_secs, num_dayofweek, mean_autho...",,0.0,
6,"(2, 3, 4, 6, 7, 8)",[0.6014330497089119],0.601433,"(hour, time_in_secs, num_dayofweek, mean_autho...",,0.0,
7,"(1, 2, 3, 4, 6, 7, 8)",[0.5999552171965964],0.599955,"(sentiment_magnitude, hour, time_in_secs, num_...",,0.0,
8,"(0, 1, 2, 3, 4, 6, 7, 8)",[0.5993730407523512],0.599373,"(sentiment_sign, sentiment_magnitude, hour, ti...",,0.0,
9,"(0, 1, 2, 3, 4, 5, 6, 7, 8)",[0.5924764890282133],0.592476,"(sentiment_sign, sentiment_magnitude, hour, ti...",,0.0,



Regression metrics



Unnamed: 0,num_features,model,cal_auc,val_auc
1,1,success ~ time_in_secs,0.557434,0.512095
2,2,success ~ time_in_secs + mean_author_sentiment...,0.573645,0.527238
3,3,success ~ time_in_secs + num_dayofweek + mean_...,0.59223,0.526762
4,4,success ~ time_in_secs + num_dayofweek + mean_...,0.601523,0.546952
5,5,success ~ hour + time_in_secs + num_dayofweek ...,0.60197,0.547429
6,6,success ~ hour + time_in_secs + num_dayofweek ...,0.601836,0.595619
7,7,success ~ sentiment_magnitude + hour + time_in...,0.602015,0.596571
8,8,success ~ sentiment_sign + sentiment_magnitude...,0.599104,0.594857
9,9,success ~ sentiment_sign + sentiment_magnitude...,0.59391,0.56819




#######################################



###linear###



Metric df



Unnamed: 0,feature_idx,cv_scores,avg_score,feature_names,ci_bound,std_dev,std_err
1,"(5,)",[0.04322679951979269],0.043227,"(activity_ratio,)",,0.0,
2,"(3, 5)",[0.08813560189325176],0.088136,"(time_in_secs, activity_ratio)",,0.0,
3,"(2, 3, 5)",[0.09663116959482398],0.096631,"(hour, time_in_secs, activity_ratio)",,0.0,
4,"(2, 3, 5, 8)",[0.10408895325692291],0.104089,"(hour, time_in_secs, activity_ratio, author_al...",,0.0,
5,"(1, 2, 3, 5, 8)",[0.10980884945626668],0.109809,"(sentiment_magnitude, hour, time_in_secs, acti...",,0.0,
6,"(1, 2, 3, 5, 6, 8)",[0.11436792284042074],0.114368,"(sentiment_magnitude, hour, time_in_secs, acti...",,0.0,
7,"(1, 2, 3, 5, 6, 7, 8)",[0.11888753395331153],0.118888,"(sentiment_magnitude, hour, time_in_secs, acti...",,0.0,
8,"(0, 1, 2, 3, 5, 6, 7, 8)",[0.12067953360296602],0.12068,"(sentiment_sign, sentiment_magnitude, hour, ti...",,0.0,
9,"(0, 1, 2, 3, 4, 5, 6, 7, 8)",[0.12072877827631667],0.120729,"(sentiment_sign, sentiment_magnitude, hour, ti...",,0.0,



Regression metrics



Unnamed: 0,num_features,model,r2
1,1,thread_size ~ activity_ratio,0.043227
2,2,thread_size ~ time_in_secs + activity_ratio,0.088136
3,3,thread_size ~ hour + time_in_secs + activity_r...,0.096631
4,4,thread_size ~ hour + time_in_secs + activity_r...,0.104089
5,5,thread_size ~ sentiment_magnitude + hour + tim...,0.109809
6,6,thread_size ~ sentiment_magnitude + hour + tim...,0.114368
7,7,thread_size ~ sentiment_magnitude + hour + tim...,0.118888
8,8,thread_size ~ sentiment_sign + sentiment_magni...,0.12068
9,9,thread_size ~ sentiment_sign + sentiment_magni...,0.120729




#######################################



###mnlogit###



Metric df



Unnamed: 0,feature_idx,cv_scores,avg_score,feature_names,ci_bound,std_dev,std_err
1,"(3,)",[0.3137931034482759],0.313793,"(time_in_secs,)",,0.0,
2,"(3, 6)",[0.35517241379310344],0.355172,"(time_in_secs, mean_author_sentiment_sign)",,0.0,
3,"(2, 3, 6)",[0.35517241379310344],0.355172,"(hour, time_in_secs, mean_author_sentiment_sign)",,0.0,
4,"(2, 3, 6, 8)",[0.3620689655172414],0.362069,"(hour, time_in_secs, mean_author_sentiment_sig...",,0.0,
5,"(2, 3, 5, 6, 8)",[0.3793103448275862],0.37931,"(hour, time_in_secs, activity_ratio, mean_auth...",,0.0,
6,"(0, 2, 3, 5, 6, 8)",[0.3896551724137931],0.389655,"(sentiment_sign, hour, time_in_secs, activity_...",,0.0,
7,"(0, 2, 3, 4, 5, 6, 8)",[0.3896551724137931],0.389655,"(sentiment_sign, hour, time_in_secs, num_dayof...",,0.0,
8,"(0, 1, 2, 3, 4, 5, 6, 8)",[0.38620689655172413],0.386207,"(sentiment_sign, sentiment_magnitude, hour, ti...",,0.0,
9,"(0, 1, 2, 3, 4, 5, 6, 7, 8)",[0.38620689655172413],0.386207,"(sentiment_sign, sentiment_magnitude, hour, ti...",,0.0,



Quantile metrics



{'quantile_ranges': [(2, 10), (11.0, 22), (23.0, 40), (41.0, 3465)],
 'quantile_counts':               count  val_count
 range                         
 (2, 10)          64         49
 (11.0, 22)       80         53
 (23.0, 40)       71         26
 (41.0, 3465)     75         47,
 'val_quantile_ranges': [(2, 10), (11.0, 22), (23.0, 40), (41.0, 3813)]}


Regression metrics



Unnamed: 0,num_features,model,cal_mnlogit_accuracy,val_mnlogit_accuracy,cal_mnlogit_aucs,val_mnlogit_aucs,cal_mnlogit_mean_auc,val_mnlogit_mean_auc
1,1,thread_size_quantile_index ~ time_in_secs,0.313793,0.32,"[0.5192201327433629, 0.5771428571428571, 0.537...","[0.5761256883705864, 0.577018249304052, 0.5002...",0.558459,0.539763
2,2,thread_size_quantile_index ~ time_in_secs + me...,0.355172,0.308571,"[0.537195796460177, 0.6124404761904763, 0.5338...","[0.5890832523485584, 0.5604701515620167, 0.536...",0.57644,0.54996
3,3,thread_size_quantile_index ~ hour + time_in_se...,0.37931,0.262857,"[0.5417588495575222, 0.6104166666666667, 0.623...","[0.5489148040168449, 0.5578410145375813, 0.427...",0.602527,0.514377
4,4,thread_size_quantile_index ~ hour + time_in_se...,0.389655,0.297143,"[0.6143528761061946, 0.6093452380952381, 0.631...","[0.6240686750890833, 0.5583049798948345, 0.433...",0.624756,0.547372
5,5,thread_size_quantile_index ~ hour + time_in_se...,0.393103,0.308571,"[0.6238938053097346, 0.6298214285714285, 0.654...","[0.5979915775834144, 0.599443241571296, 0.4731...",0.636883,0.559935
6,6,thread_size_quantile_index ~ sentiment_sign + ...,0.413793,0.274286,"[0.6085453539823009, 0.6374404761904762, 0.653...","[0.6059280855199223, 0.582276523352923, 0.4726...",0.634896,0.556584
7,7,thread_size_quantile_index ~ sentiment_sign + ...,0.413793,0.274286,"[0.6110342920353982, 0.6366071428571428, 0.654...","[0.6059280855199223, 0.580884627281163, 0.4731...",0.635724,0.556365
8,8,thread_size_quantile_index ~ sentiment_sign + ...,0.42069,0.24,"[0.6209900442477876, 0.6422023809523809, 0.666...","[0.5770975056689343, 0.55954222084751, 0.44889...",0.643529,0.531231
9,9,thread_size_quantile_index ~ sentiment_sign + ...,0.42069,0.251429,"[0.627212389380531, 0.640357142857143, 0.66467...","[0.5785552316164562, 0.5649551500154655, 0.446...",0.645107,0.532368




#######################################


