In [13]:
#import sys
#sys.path.append('./reddit_analysis_code')
import numpy as np
import pandas as pd
from reddit_dataclass import RedditData as reddit
import pickle
import matplotlib.pyplot as plt
import scipy.stats as scpstat
import matplotlib.dates as dates
import datetime
from sklearn import metrics
import statsmodels.formula.api as smf
import statsmodels.api as sm
import sklearn.linear_model as sklin
from sklearn.metrics import classification_report, confusion_matrix
from patsy import dmatrices

In [78]:
infile = "calval_regression_data_times.p"
outfile = 'compare_sm_sklearn.csv'
remove = False
train = True

In [10]:
regression_thread_data = pickle.load(open(infile, 'rb'))
if remove:
    regression_thread_data.pop(remove)

regression_parameters = {}

In [11]:
sklearn_models={

    'books': {
    'y': 'success',
    'X_cols': ['post_activity_count', 'comment_activity_count', 'num_dayofweek']
    },

    'conspiracy': {
    'y': 'success',
    'X_cols': ['comment_activity_count']
    },

    'crypto': {
    'y': 'success',
    'X_cols': ['comment_activity_count']
    },

    'politics': {
    'y': 'success',
    'X_cols': ['comment_activity_count', 'post_activity_count']
    },
}

sm_models = {}
for key in sklearn_models:
    sm_models[key] = f"{sklearn_models[key]['y']} ~"
    for i, name in enumerate(sklearn_models[key]['X_cols']):
        if i != 0:
            sm_models[key] += ' +'

        sm_models[key] += f' {name}'

sm_models

{'books': 'success ~ post_activity_count + comment_activity_count + num_dayofweek',
 'conspiracy': 'success ~ comment_activity_count',
 'crypto': 'success ~ comment_activity_count',
 'politics': 'success ~ comment_activity_count + post_activity_count'}

In [12]:
models_df = pd.DataFrame.from_dict(sm_models, orient='index', columns=['model'])
regression_parameters['models'] = models_df

In [28]:
models_data = {}
for key in regression_thread_data:
    models_data[key] = {}
    
    calibration_data = regression_thread_data[key]['calibration']
    validation_data = regression_thread_data[key]['validation']

    models_data[key]['sm'] = {}
    smf_logistic_regression = smf.logit(sm_models[key], data=calibration_data).fit()
    models_data[key]['sm']['fit_params'] = smf_logistic_regression.params

    models_data[key]['sklearn'] = {}
    skl_X_cols = sklearn_models[key]['X_cols']
    skl_logistic_regression = sklin.LogisticRegression().fit(calibration_data[skl_X_cols], calibration_data.success)
    models_data[key]['sklearn']['fit_coefs'] = skl_logistic_regression.coef_
    models_data[key]['sklearn']['intercept'] = skl_logistic_regression.intercept_

    #models_data[key]['sklearn']['params'] = skl_logistic_regression.get_params()


    y_real = calibration_data.success

    smf_y_pred = smf_logistic_regression.predict()
    smf_auc = metrics.roc_auc_score(y_real, smf_y_pred)
    models_data[key]['sm']['calibration_auc'] = smf_auc

    models_data[key]['sklearn']['calibration_auc'] = metrics.roc_auc_score(y_real, skl_logistic_regression.predict_proba(calibration_data[skl_X_cols])[:, 1])

    y_test_real = validation_data.success
    smf_y_test_pred = smf_logistic_regression.predict(
        exog = validation_data
    )
    smf_test_auc = metrics.roc_auc_score(y_test_real, smf_y_test_pred)
    models_data[key]['sm']['validation_auc'] = smf_test_auc

    models_data[key]['sklearn']['validation_auc'] = metrics.roc_auc_score(
        y_test_real, skl_logistic_regression.predict_proba(validation_data[skl_X_cols])[:,1]
    )
    



Optimization terminated successfully.
         Current function value: 0.581954
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.414659
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.642524
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.626123
         Iterations 6


In [29]:
models_data['books']


{'sm': {'fit_params': Intercept                -0.903772
  post_activity_count      -0.051011
  comment_activity_count    0.161383
  num_dayofweek             0.006958
  dtype: float64,
  'calibration_auc': 0.7816629596232842,
  'validation_auc': 0.7895472582972582},
 'sklearn': {'fit_coefs': array([[-0.05100267,  0.16135957,  0.006955  ]]),
  'intercept': array([-0.90374562]),
  'calibration_auc': 0.7816629596232842,
  'validation_auc': 0.7895472582972582}}

In [36]:
models_data['books']['sm']['fit_params'].index.str.lower()

Index(['intercept', 'post_activity_count', 'comment_activity_count',
       'num_dayofweek'],
      dtype='object')

In [69]:
regression_results = {}

for subr in models_data:
    models_params = {}
    for key in models_data[subr]:
        print(key)
        models_params[key] = {}
        for param in models_data[subr][key]:
            print(param)
            if (param == 'fit_params'):
                coef_list = list(models_data[subr][key][param].index.str.lower())
                for i, coef in enumerate(models_data[subr][key][param]):
                    models_params[key][coef_list[i]] = coef
            elif (param == 'fit_coefs'):
                for i, coef in enumerate(models_data[subr][key][param][0]):
                    models_params[key][coef_list[i+1]] = coef
            else:
                coef = models_data[subr][key][param]
                if (isinstance(coef, list)) | (isinstance(coef, np.ndarray)):
                    coef = coef[0]
                models_params[key][param] = coef
    
    regression_results[subr] = pd.DataFrame.from_dict(models_params)
    regression_results[subr].columns = pd.MultiIndex.from_arrays(
        [[subr]*len(regression_results[subr].columns),
         regression_results[subr].columns]
        )




sm
fit_params
calibration_auc
validation_auc
sklearn
fit_coefs
intercept
calibration_auc
validation_auc
sm
fit_params
calibration_auc
validation_auc
sklearn
fit_coefs
intercept
calibration_auc
validation_auc
sm
fit_params
calibration_auc
validation_auc
sklearn
fit_coefs
intercept
calibration_auc
validation_auc
sm
fit_params
calibration_auc
validation_auc
sklearn
fit_coefs
intercept
calibration_auc
validation_auc


In [73]:
pd.concat((regression_results['books'], regression_results['crypto']), axis=1)

Unnamed: 0_level_0,books,books,crypto,crypto
Unnamed: 0_level_1,sm,sklearn,sm,sklearn
intercept,-0.903772,-0.903746,-0.448157,-0.448156
post_activity_count,-0.051011,-0.051003,,
comment_activity_count,0.161383,0.16136,0.003305,0.003305
num_dayofweek,0.006958,0.006955,,
calibration_auc,0.781663,0.781663,0.81859,0.81859
validation_auc,0.789547,0.789547,0.831021,0.831021


In [76]:
started = False
for key in regression_results:
    if not started:
        out_df = regression_results[key]
        started = True
    else:
        out_df = pd.concat((out_df, regression_results[key]), axis=1)

In [77]:
out_df

Unnamed: 0_level_0,books,books,conspiracy,conspiracy,crypto,crypto,politics,politics
Unnamed: 0_level_1,sm,sklearn,sm,sklearn,sm,sklearn,sm,sklearn
intercept,-0.903772,-0.903746,1.395165,1.395165,-0.448157,-0.448156,0.24144,0.24144
post_activity_count,-0.051011,-0.051003,,,,,0.005108,0.005108
comment_activity_count,0.161383,0.16136,0.007241,0.007241,0.003305,0.003305,0.005228,0.005228
num_dayofweek,0.006958,0.006955,,,,,,
calibration_auc,0.781663,0.781663,0.777406,0.777406,0.81859,0.81859,0.714616,0.714616
validation_auc,0.789547,0.789547,0.806135,0.806135,0.831021,0.831021,0.688734,0.688734


In [79]:
out_df.to_csv(outfile)