In [39]:
#import sys
#sys.path.append('./reddit_analysis_code')
import numpy as np
import pandas as pd
from reddit_dataclass import RedditData as reddit
import pickle
import matplotlib.pyplot as plt
import scipy.stats as scpstat
import matplotlib.dates as dates
import datetime
from sklearn import metrics
import statsmodels.formula.api as smf
import statsmodels.api as sm
import sklearn.linear_model as sklin
from patsy import dmatrices

In [72]:
# Pickle file that is loaded into `regression_thread_data`.
infile = "calval_regression_data_times.p"
# Excel workbook that receives one sheet per entry of `regression_parameters`.
outfile = 'author_activity_calibrate_validate_regressions_sklearn_test.xlsx'
# Key to drop from the loaded data; a falsy value keeps every entry.
remove = False
# True: each dataset entry is iterated as a collection of splits
# (e.g. 'calibration' / 'validation'); False: each entry is used as one DataFrame.
train = True
# Candidate predictor columns. Columns flagged 'n' enter formulas as-is and also
# get log(x + 1) and sqrt variants; any other flag wraps the column in C(...).
COLS_TO_USE = {
    'post_activity_count': 'n',
    'comment_activity_count': 'n',
    'weekday': 'C',
    'hour': 'C',
    'num_dayofweek': 'n',
    'time_in_secs': 'n'
}
# Response column placed on the left-hand side of the generated formulas.
y_col = "success"

In [23]:
# Load the regression data.
# NOTE: pickle.load can execute arbitrary code, so `infile` must be a trusted file.
with open(infile, 'rb') as pickle_file:  # context manager closes the file handle
    regression_thread_data = pickle.load(pickle_file)
# `remove` doubles as flag and key: a truthy value names the entry to drop.
if remove:
    regression_thread_data.pop(remove)


In [24]:
# Inspect the columns available in one split of one dataset.
regression_thread_data['books']['calibration'].columns

Index(['thread_id', 'thread_size', 'authors', 'timestamp', 'author', 'score',
       'subject_sentiment_score', 'sentiment_sign', 'sentiment_magnitude',
       'success', 'all_activity_count', 'post_activity_count',
       'comment_activity_count', 'hour', 'time', 'weekday', 'time_in_secs',
       'num_dayofweek'],
      dtype='object')

In [25]:
def log_plus_1(x):
    """Return log(1 + x) element-wise.

    Uses ``np.log1p`` so that values of ``x`` close to zero keep their
    precision instead of being rounded away by the intermediate ``x + 1``.
    Accepts scalars, NumPy arrays and pandas Series.
    """
    return np.log1p(x)

def boxcox_plus_1(x):
    """Apply ``scipy.stats.boxcox`` to ``x`` shifted up by one.

    The result of ``scipy.stats.boxcox`` is returned unchanged; no ``lmbda``
    is passed, so SciPy chooses it (per the SciPy docs the result is then the
    transformed data together with the fitted lambda).
    """
    shifted = x + 1
    return scpstat.boxcox(shifted)

"""
for key in regression_thread_data:
    if train:
        for type in regression_thread_data[key]:
            regression_thread_data[key][type]['log_sentiment_magnitude_plus_one'] = log_plus_1(regression_thread_data[key][type].sentiment_magnitude)
            regression_thread_data[key][type]['sqrt_sentiment_magnitude'] = np.sqrt(regression_thread_data[key][type].sentiment_magnitude)
            regression_thread_data[key][type]['log_post_activity_count'] = np.log(regression_thread_data[key][type].post_activity_count)
            regression_thread_data[key][type]['sqrt_post_activity_count'] = np.sqrt(regression_thread_data[key][type].post_activity_count)
    else:
        regression_thread_data[key]['log_sentiment_magnitude_plus_one'] = log_plus_1(regression_thread_data[key].sentiment_magnitude)
        regression_thread_data[key]['sqrt_sentiment_magnitude'] = np.sqrt(regression_thread_data[key].sentiment_magnitude)
        regression_thread_data[key]['log_post_activity_count'] = np.log(regression_thread_data[key].post_activity_count)
        regression_thread_data[key]['sqrt_post_activity_count'] = np.sqrt(regression_thread_data[key].post_activity_count)
"""
# Add log(x + 1) and sqrt variants of every numeric ('n') predictor to each frame
# and register the new columns in `used_cols`.
# The loop variable is called `split` (not `type`) so the builtin `type` is not
# rebound for the rest of the kernel session.
used_cols = COLS_TO_USE.copy()
numeric_cols = [col for col, kind in COLS_TO_USE.items() if kind == 'n']

for subr in regression_thread_data:
    if train:
        # Split layout: transform every split of the dataset.
        frames = [regression_thread_data[subr][split] for split in regression_thread_data[subr]]
    else:
        # Flat layout: the entry itself is the frame.
        frames = [regression_thread_data[subr]]

    for frame in frames:
        for col in numeric_cols:
            frame[f'log_plus_1_{col}'] = log_plus_1(frame[col])
            frame[f'sqrt_{col}'] = np.sqrt(frame[col])

# Registering once is enough; the derived names do not depend on the dataset.
for col in numeric_cols:
    used_cols[f'log_plus_1_{col}'] = 'n'
    used_cols[f'sqrt_{col}'] = 'n'


In [26]:
# Show the predictor columns (original plus derived) with their type flags.
used_cols

{'post_activity_count': 'n',
 'comment_activity_count': 'n',
 'weekday': 'C',
 'hour': 'C',
 'num_dayofweek': 'n',
 'time_in_secs': 'n',
 'log_plus_1_post_activity_count': 'n',
 'sqrt_post_activity_count': 'n',
 'log_plus_1_comment_activity_count': 'n',
 'sqrt_comment_activity_count': 'n',
 'log_plus_1_num_dayofweek': 'n',
 'sqrt_num_dayofweek': 'n',
 'log_plus_1_time_in_secs': 'n',
 'sqrt_time_in_secs': 'n'}

In [27]:
def get_skewness(df):
    """Compute the sample skewness of every numeric ('n') column in `used_cols`.

    Parameters
    ----------
    df : dict
        Maps a dataset name either to a single DataFrame (flat layout) or to a
        mapping of split name -> DataFrame (split layout). The layout is
        detected per entry rather than read from the global `train` flag.

    Returns
    -------
    pandas.DataFrame
        One column per dataset name. Rows are labelled with the column name
        for the flat layout and with '<split>_<column>' for the split layout,
        so each split keeps its own value (with a shared label only the last
        split's value would survive).
    """
    numeric_cols = [col for col, kind in used_cols.items() if kind == 'n']

    per_dataset = []
    for name, entry in df.items():
        if isinstance(entry, pd.DataFrame):
            skews = {col: scpstat.skew(entry[col]) for col in numeric_cols}
        else:
            skews = {
                f'{split}_{col}': scpstat.skew(frame[col])
                for split, frame in entry.items()
                for col in numeric_cols
            }
        per_dataset.append(pd.Series(skews, name=name, dtype=float))

    if not per_dataset:
        # Nothing to report: return the same empty placeholder frame as before.
        return pd.DataFrame({'A': []})

    # One concat for all datasets instead of growing the frame in a loop.
    return pd.concat(per_dataset, axis=1)


In [28]:
# Legacy formula sets. None of these four lists is part of the `models` list that is
# fitted below; they are only mentioned there in commented-out assignments.

# Sentiment sign / magnitude formulas, each with and without intercept ("- 1").
vanilla_models = [
    "success ~ sentiment_sign - 1",
    "success ~ sentiment_sign",
    "success ~ sentiment_magnitude - 1",
    "success ~ sentiment_magnitude",
    "success ~ sentiment_sign + sentiment_magnitude - 1",
    "success ~ sentiment_sign + sentiment_magnitude",
    "success ~ sentiment_sign*sentiment_magnitude - 1",
    "success ~ sentiment_sign*sentiment_magnitude",
    "success ~ sentiment_sign*sentiment_magnitude + sentiment_sign + sentiment_magnitude - 1",
    "success ~ sentiment_sign*sentiment_magnitude + sentiment_sign + sentiment_magnitude",
]

# NOTE(review): `log_post_activity_count` is only created by the disabled
# (string-quoted) transform code; the active transform names the column
# `log_plus_1_post_activity_count`, so these formulas need updating before reuse.
author_activity_models = [
    "success ~ post_activity_count - 1",
    "success ~ post_activity_count",
    "success ~ log_post_activity_count - 1",
    "success ~ log_post_activity_count",
    "success ~ sqrt_post_activity_count - 1",
    "success ~ sqrt_post_activity_count",
]

# NOTE(review): `log_sentiment_magnitude_plus_one` is likewise only created by the
# disabled transform code.
log_models = [
    "success ~ log_sentiment_magnitude_plus_one - 1",
    "success ~ sentiment_sign + log_sentiment_magnitude_plus_one - 1",
    "success ~ sentiment_sign*log_sentiment_magnitude_plus_one - 1",
    "success ~ sentiment_sign*log_sentiment_magnitude_plus_one + log_sentiment_magnitude_plus_one + sentiment_sign - 1",
    "success ~ log_sentiment_magnitude_plus_one",
    "success ~ sentiment_sign + log_sentiment_magnitude_plus_one",
    "success ~ sentiment_sign*log_sentiment_magnitude_plus_one",
    "success ~ sentiment_sign*log_sentiment_magnitude_plus_one + log_sentiment_magnitude_plus_one + sentiment_sign" ,
]

# NOTE(review): `sqrt_sentiment_magnitude` is likewise only created by the
# disabled transform code.
sqrt_models = [
    "success ~ sqrt_sentiment_magnitude - 1",
    "success ~ sentiment_sign + sqrt_sentiment_magnitude - 1",
    "success ~ sentiment_sign*sqrt_sentiment_magnitude - 1",
    "success ~ sentiment_sign*sqrt_sentiment_magnitude + sqrt_sentiment_magnitude + sentiment_sign - 1",
    "success ~ sqrt_sentiment_magnitude",
    "success ~ sentiment_sign + sqrt_sentiment_magnitude",
    "success ~ sentiment_sign*sqrt_sentiment_magnitude",
    "success ~ sentiment_sign*sqrt_sentiment_magnitude + sqrt_sentiment_magnitude + sentiment_sign",
]

In [29]:
# Re-display the predictor columns that the formulas below are generated from.
used_cols

{'post_activity_count': 'n',
 'comment_activity_count': 'n',
 'weekday': 'C',
 'hour': 'C',
 'num_dayofweek': 'n',
 'time_in_secs': 'n',
 'log_plus_1_post_activity_count': 'n',
 'sqrt_post_activity_count': 'n',
 'log_plus_1_comment_activity_count': 'n',
 'sqrt_comment_activity_count': 'n',
 'log_plus_1_num_dayofweek': 'n',
 'sqrt_num_dayofweek': 'n',
 'log_plus_1_time_in_secs': 'n',
 'sqrt_time_in_secs': 'n'}

In [30]:
# Two single-predictor formulas per column in `used_cols`: one with an intercept
# and one with it removed ("- 1"). Columns not flagged 'n' are wrapped in C(...).
simple_models = []
for column, kind in used_cols.items():
    term = column if kind == 'n' else f'C({column})'
    with_intercept = f"{y_col} ~" + f" {term}"
    simple_models.extend([with_intercept, with_intercept + " - 1"])

In [31]:
# Display the generated single-predictor formulas.
simple_models

['success ~ post_activity_count',
 'success ~ post_activity_count - 1',
 'success ~ comment_activity_count',
 'success ~ comment_activity_count - 1',
 'success ~ C(weekday)',
 'success ~ C(weekday) - 1',
 'success ~ C(hour)',
 'success ~ C(hour) - 1',
 'success ~ num_dayofweek',
 'success ~ num_dayofweek - 1',
 'success ~ time_in_secs',
 'success ~ time_in_secs - 1',
 'success ~ log_plus_1_post_activity_count',
 'success ~ log_plus_1_post_activity_count - 1',
 'success ~ sqrt_post_activity_count',
 'success ~ sqrt_post_activity_count - 1',
 'success ~ log_plus_1_comment_activity_count',
 'success ~ log_plus_1_comment_activity_count - 1',
 'success ~ sqrt_comment_activity_count',
 'success ~ sqrt_comment_activity_count - 1',
 'success ~ log_plus_1_num_dayofweek',
 'success ~ log_plus_1_num_dayofweek - 1',
 'success ~ sqrt_num_dayofweek',
 'success ~ sqrt_num_dayofweek - 1',
 'success ~ log_plus_1_time_in_secs',
 'success ~ log_plus_1_time_in_secs - 1',
 'success ~ sqrt_time_in_secs',
 '

In [32]:
# Multi-predictor formulas: every entry of `to_combine` lists the columns that are
# added together in one formula; each formula is emitted with and without intercept.
activity_cols = ['comment_activity_count', 'post_activity_count']
to_combine = [activity_cols, activity_cols + ['num_dayofweek']]

pair_models = []
for pairs in to_combine:
    # Columns not flagged 'n' are wrapped in C(...), as for the simple models.
    terms = [f" {key}" if used_cols[key] == 'n' else f' C({key})' for key in pairs]
    modelstr = f"{y_col} ~" + ' +'.join(terms)
    pair_models += [modelstr, modelstr + " - 1"]

In [33]:
# Display the generated multi-predictor formulas.
pair_models

['success ~ comment_activity_count + post_activity_count',
 'success ~ comment_activity_count + post_activity_count - 1',
 'success ~ comment_activity_count + post_activity_count + num_dayofweek',
 'success ~ comment_activity_count + post_activity_count + num_dayofweek - 1']

In [34]:
# Formulas to fit: all single-predictor models followed by the combined ones.
# The position in this list is the model number reported in the regression output.
models = simple_models + pair_models

In [35]:
# Alternative model sets, currently disabled:
#models = vanilla_models + log_models + sqrt_models
#models = author_activity_models
# Display the final list of formulas.
models

['success ~ post_activity_count',
 'success ~ post_activity_count - 1',
 'success ~ comment_activity_count',
 'success ~ comment_activity_count - 1',
 'success ~ C(weekday)',
 'success ~ C(weekday) - 1',
 'success ~ C(hour)',
 'success ~ C(hour) - 1',
 'success ~ num_dayofweek',
 'success ~ num_dayofweek - 1',
 'success ~ time_in_secs',
 'success ~ time_in_secs - 1',
 'success ~ log_plus_1_post_activity_count',
 'success ~ log_plus_1_post_activity_count - 1',
 'success ~ sqrt_post_activity_count',
 'success ~ sqrt_post_activity_count - 1',
 'success ~ log_plus_1_comment_activity_count',
 'success ~ log_plus_1_comment_activity_count - 1',
 'success ~ sqrt_comment_activity_count',
 'success ~ sqrt_comment_activity_count - 1',
 'success ~ log_plus_1_num_dayofweek',
 'success ~ log_plus_1_num_dayofweek - 1',
 'success ~ sqrt_num_dayofweek',
 'success ~ sqrt_num_dayofweek - 1',
 'success ~ log_plus_1_time_in_secs',
 'success ~ log_plus_1_time_in_secs - 1',
 'success ~ sqrt_time_in_secs',
 '

In [36]:
# Collects every table that is written to the Excel workbook, keyed by sheet name.
regression_parameters = {}

In [37]:
# Lookup sheet: row i holds the formula whose results are reported as model number i.
models_df = pd.DataFrame(data=models, columns=['keys'])
regression_parameters['model_key'] = models_df

In [69]:
# Per-dataset specification for the scikit-learn regressions:
# 'y' names the response column and 'X_cols' lists the predictor columns.
sklearn_models={

    'books': {
    'y': 'success',
    'X_cols': ['post_activity_count', 'comment_activity_count', 'num_dayofweek']
    },

    'conspiracy': {
    'y': 'success',
    'X_cols': ['comment_activity_count']
    },

    'crypto': {
    'y': 'success',
    'X_cols': ['comment_activity_count']
    },

    'politics': {
    'y': 'success',
    'X_cols': ['comment_activity_count', 'post_activity_count']
    },
}

In [38]:
def perform_models_regressions(dataset, models=models):
    """Fit every formula in `models` as a logit model on every frame in `dataset`.

    Parameters
    ----------
    dataset : dict
        Maps a dataset name to the DataFrame the formulas are evaluated on.
        Each frame must provide a `success` column, which is used for the AUC.
    models : list of str
        Formulas passed to ``smf.logit``; the position in the list becomes the
        model number in the result.

    Returns
    -------
    pandas.DataFrame
        Indexed by ('model', 'parameter') with one column per dataset name.
        For each model the rows are the fitted parameters plus an 'auc' row
        with the ROC AUC of the in-sample predictions. Empty when `models`
        is empty.
    """
    params_by_model = {}
    for model_number, formula in enumerate(models):
        params_by_dataset = {}
        for name, frame in dataset.items():
            fitted = smf.logit(formula, data=frame).fit()

            # Copy first so the extra 'auc' row is not written into the fitted
            # result's own parameter Series.
            params = fitted.params.copy()
            params['auc'] = metrics.roc_auc_score(frame.success, fitted.predict())
            params_by_dataset[name] = params

        params_by_model[model_number] = pd.DataFrame.from_dict(params_by_dataset)

    if not params_by_model:
        # No formulas given: return an empty table instead of failing on an
        # unassigned result variable.
        return pd.DataFrame()

    # One concat keyed by model number builds the ('model', 'parameter') MultiIndex.
    return pd.concat(params_by_model, names=['model', 'parameter'])
    

In [70]:
def perform_calibration_validation_regressions(dataset, models=models):
    """Fit each formula on the calibration split and score it on both splits.

    Parameters
    ----------
    dataset : dict
        Maps a dataset name to a mapping with 'calibration' and 'validation'
        DataFrames. Both frames must provide a `success` column.
    models : list of str
        Formulas passed to ``smf.logit``; the position in the list becomes the
        model number in the result.

    Returns
    -------
    pandas.DataFrame
        Indexed by ('model', 'parameter') with one column per dataset name.
        For each model the rows are the parameters fitted on the calibration
        split plus 'calibration_auc' and 'validation_auc'. Empty when `models`
        is empty.
    """
    params_by_model = {}
    for model_number, formula in enumerate(models):
        params_by_dataset = {}
        for name, splits in dataset.items():
            calibration = splits['calibration']
            validation = splits['validation']

            fitted = smf.logit(formula, data=calibration).fit()

            # Copy first so the AUC rows are not written into the fitted
            # result's own parameter Series.
            params = fitted.params.copy()

            # In-sample score on the data the model was fitted on.
            params['calibration_auc'] = metrics.roc_auc_score(
                calibration.success, fitted.predict()
            )
            # Out-of-sample score on the held-out split.
            params['validation_auc'] = metrics.roc_auc_score(
                validation.success, fitted.predict(exog=validation)
            )
            params_by_dataset[name] = params

        params_by_model[model_number] = pd.DataFrame.from_dict(params_by_dataset)

    if not params_by_model:
        # No formulas given: return an empty table instead of failing on an
        # unassigned result variable.
        return pd.DataFrame()

    # One concat keyed by model number builds the ('model', 'parameter') MultiIndex.
    return pd.concat(params_by_model, names=['model', 'parameter'])


def perform_sklearn_calibration_validation_regressions(dataset, models=models, fit_intercept=True):
    """Fit one scikit-learn regression per dataset and score it on both splits.

    Parameters
    ----------
    dataset : dict
        Maps a dataset name to a mapping with 'calibration' and 'validation'
        DataFrames.
    models : dict
        Maps a dataset name to ``{'y': response column, 'X_cols': predictor
        columns}``. If something other than a dict is passed (for example the
        default list of formula strings), the module-level `sklearn_models`
        is used, which is what the function always did before.
    fit_intercept : bool, default True
        Forwarded to ``LinearRegression``. Previously read from an undefined
        global name, so the function only ran if the kernel happened to hold
        such a variable.

    Returns
    -------
    pandas.DataFrame
        Indexed by ('model', 'parameter'), one column per dataset name. Each
        model contributes its fitted coefficients (one row per predictor, plus
        'Intercept' when fitted), 'calibration_auc' and 'validation_auc'.
        Empty when there are no specifications.
    """
    specs = models if isinstance(models, dict) else sklearn_models

    params_by_model = {}
    for model_number, (name, spec) in enumerate(specs.items()):
        calibration = dataset[name]['calibration']
        validation = dataset[name]['validation']
        X_cols = spec['X_cols']
        y_name = spec['y']

        X = calibration[X_cols]
        y = calibration[y_name]
        print(f"{name}   {len(X)}")

        # NOTE(review): this is ordinary least squares on the response column,
        # not a logistic regression like the statsmodels variants. The estimator
        # is kept as it was; confirm whether LogisticRegression was intended.
        linear_regression = sklin.LinearRegression(
            fit_intercept=fit_intercept
            ).fit(X, y)

        # Store the fitted coefficients. get_params() would only return the
        # estimator's constructor settings, not anything learned from the data.
        params = dict(zip(X_cols, linear_regression.coef_))
        if fit_intercept:
            params['Intercept'] = linear_regression.intercept_

        # Score against the same response column the model was fitted on.
        params['calibration_auc'] = metrics.roc_auc_score(
            y, linear_regression.predict(X)
        )
        params['validation_auc'] = metrics.roc_auc_score(
            validation[y_name], linear_regression.predict(validation[X_cols])
        )

        params_by_model[model_number] = pd.DataFrame.from_dict({name: params})

    if not params_by_model:
        # No specifications: return an empty table instead of failing on an
        # unassigned result variable.
        return pd.DataFrame()

    # One concat keyed by model number builds the ('model', 'parameter') MultiIndex.
    return pd.concat(params_by_model, names=['model', 'parameter'])

In [18]:
# Row count of every dataset before any filtering, stored under the 'all' label.
datasets_size = {}
for key, entry in regression_thread_data.items():
    if isinstance(entry, pd.DataFrame):
        n_rows = len(entry)
    else:
        # Split layout: len(entry) would count the splits, so add up their rows.
        n_rows = sum(len(entry[split]) for split in entry)
    datasets_size[key] = {'all': n_rows}

In [71]:
# ALL DATA
# Skewness of the numeric predictors and the regression fits on the unfiltered data.
skewness = {'all_data': get_skewness(regression_thread_data)}
if train:
    # Split layout: fit on the calibration split and score on both splits.
    #regression_parameters['all_data'] = perform_calibration_validation_regressions(regression_thread_data)
    regression_parameters['all_data'] = perform_sklearn_calibration_validation_regressions(
        regression_thread_data,
        models=sklearn_models,
    )
else:
    # Flat layout: fit and score on the same frame.
    regression_parameters['all_data'] = perform_models_regressions(regression_thread_data)

books   3343
conspiracy   7976
crypto   10372
politics   45740


In [42]:
# EXCLUDE NEUTRALS
# Keep only rows whose sentiment sign is non-zero, then repeat skewness and regressions.
# NOTE(review): each entry is indexed as a single DataFrame here, so this cell
# presumably only applies to the flat layout (train = False) -- confirm before
# running it with split data.
non_neutral_regression_thread_data = {}
for key, frame in regression_thread_data.items():
    is_non_neutral = frame.sentiment_sign != 0
    non_neutral_regression_thread_data[key] = frame[is_non_neutral]
    datasets_size[key]['no_neutral'] = len(non_neutral_regression_thread_data[key])

skewness['no_neutrals'] = get_skewness(non_neutral_regression_thread_data)
regression_parameters['exclude_neutrals'] = perform_models_regressions(
    non_neutral_regression_thread_data
)

Optimization terminated successfully.
         Current function value: 0.667485
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.582331
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.641644
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.683472
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.644162
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.394416
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.614413
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.635348
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.669763
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.473450
  

In [43]:
# THRESHOLD AUTHORS
# Repeat skewness and regressions on rows whose author has at least `threshold`
# rows in the dataset, for each threshold below.

post_count_thresholds = [2, 5, 10, 20]

# Rows per author in every dataset (columns: 'author', 'post_count').
author_counts = {
    key: (
        frame[['thread_id', 'author']]
        .groupby('author').count().reset_index()
        .rename(columns={'thread_id': 'post_count'})
    )
    for key, frame in regression_thread_data.items()
}

for threshold in post_count_thresholds:
    threshold_regression_data = {}
    for key, frame in regression_thread_data.items():
        counts = author_counts[key]
        authors = counts.loc[counts.post_count >= threshold, 'author']
        kept_rows = frame[frame.author.isin(authors)]
        threshold_regression_data[key] = kept_rows
        datasets_size[key][f'threshold_{threshold}'] = len(kept_rows)

    subset_label = f'author_threshold_{threshold}'
    skewness[subset_label] = get_skewness(threshold_regression_data)
    regression_parameters[subset_label] = perform_models_regressions(threshold_regression_data)



Optimization terminated successfully.
         Current function value: 0.646081
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.558590
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.637340
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.682482
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.581473
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.388584
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.620109
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.583462
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.608185
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.421979
  

In [44]:
# Table of subset sizes: one row per subset label, one column per dataset.
size_columns = [
    pd.DataFrame.from_dict(sizes, orient='index').rename(columns={0: key})
    for key, sizes in datasets_size.items()
]
if size_columns:
    size_df = pd.concat(size_columns, axis=1)
else:
    # No datasets recorded: keep the empty placeholder frame.
    size_df = pd.DataFrame({'A' : []})

In [45]:
# Register the subset-size table as its own sheet.
regression_parameters['data_sizes'] = size_df

In [42]:
# SKEWNESS SHEET
# add data subset to multiindex
for subset in skewness:
    skewness[subset].set_index(
        [[subset]*len(skewness[subset]),
        skewness[subset].index],
        inplace=True 
        )
    skewness[subset].index.set_names(['subset', 'parameter'], inplace=True)

# combine to form one df for all subsets run
all_subsets_skewness = pd.DataFrame({'A': []})
for key in skewness:
    if all_subsets_skewness.empty:
        all_subsets_skewness = skewness[key]
    else:
        all_subsets_skewness = pd.concat((all_subsets_skewness, skewness[key]))

In [43]:
# Register the combined skewness table as its own sheet.
regression_parameters['skewness'] = all_subsets_skewness

In [48]:
# Express every subset size as a fraction of its dataset's 'all' row.
# The ratios are computed from `size_df` (the raw counts) rather than from the
# 'data_sizes' sheet, so re-running this cell after the ratio rows were appended
# to that sheet does not produce ratios of ratios.
dummy_sizes = size_df.div(size_df.loc['all'], axis='columns')
dummy_sizes.index = [f"{index}_ratio" for index in dummy_sizes.index]
dummy_sizes

Unnamed: 0,books,conspiracy,crypto,politics
all_ratio,1.0,1.0,1.0,1.0
no_neutral_ratio,0.484925,0.622817,0.495681,0.659382
threshold_2_ratio,0.33124,0.705397,0.702187,0.801341
threshold_5_ratio,0.095687,0.483458,0.431637,0.636044
threshold_10_ratio,0.044389,0.351645,0.32582,0.554306
threshold_20_ratio,0.029523,0.247389,0.202524,0.462314


In [49]:
# Sheet content: raw sizes followed by the ratio rows. Building it from `size_df`
# instead of the sheet itself keeps the cell idempotent -- a re-run does not
# stack another copy of the ratio rows.
regression_parameters['data_sizes'] = pd.concat((size_df, dummy_sizes))

In [73]:
# Write every collected table to its own sheet of the output workbook.
with pd.ExcelWriter(outfile, engine='xlsxwriter') as writer:
    for sheet_name, table in regression_parameters.items():
        table.to_excel(writer, sheet_name=sheet_name)