In [1]:
import numpy as np
import pandas as pd
from reddit_dataclass import RedditData as reddit
import pickle
import matplotlib.pyplot as plt
import scipy.stats as scpstat
import matplotlib.dates as dates
import datetime
from sklearn import metrics
import statsmodels.formula.api as smf
import statsmodels.api as sm
from patsy import dmatrices

In [2]:
infile = "calval_regression_data.p"
outfile = 'author_activity_calibrate_validate_regressions.xlsx'
remove = 'thedonald'
train = True

In [5]:
def log_plus_1(x):
    return np.log(x+1)

def get_skewness(df):
    skew_dict = {}
    for key in df:
        dummy_dict = {}
        if train:
            for type in df[key]:
                dummy_dict[f'{type}_sentiment_magnitude'] = scpstat.skew(df[key][type].sentiment_magnitude)
                dummy_dict[f'{type}_log(mag+1)'] = scpstat.skew(df[key][type].log_sentiment_magnitude_plus_one)
                dummy_dict[f'{type}_sqrt(mag)'] = scpstat.skew(df[key][type].sqrt_sentiment_magnitude)
                dummy_dict[f'{type}_post_activity_count'] = scpstat.skew(df[key][type].post_activity_count)
                dummy_dict[f'{type}_log(post_activity)'] = scpstat.skew(df[key][type].log_post_activity_count)
                dummy_dict[f'{type}_sqrt(post_activity)'] = scpstat.skew(df[key][type].sqrt_post_activity_count)

        else:
            dummy_dict['sentiment_magnitude'] = scpstat.skew(df[key].sentiment_magnitude)
            dummy_dict['log(mag+1)'] = scpstat.skew(df[key].log_sentiment_magnitude_plus_one)
            dummy_dict['sqrt(mag)'] = scpstat.skew(df[key].sqrt_sentiment_magnitude)
            dummy_dict['post_activity_count'] = scpstat.skew(df[key].post_activity_count)
            dummy_dict['log(post_activity)'] = scpstat.skew(df[key].log_post_activity_count)
            dummy_dict['sqrt(post_activity)'] = scpstat.skew(df[key].sqrt_post_activity_count)
        skew_dict[key] = pd.DataFrame.from_dict(dummy_dict, orient='index').rename(columns={0: key})

    skew_df = pd.DataFrame({'A' : []})

    for key in skew_dict:
        if skew_df.empty:
            skew_df = skew_dict[key]
        else:
            skew_df = pd.concat((skew_df, skew_dict[key]), axis=1)
    
    return skew_df

def perform_models_regressions(dataset, models):

    # perform regressions
    regression_parameters_by_model = {}
    for i, model in enumerate(models):
        regression_parameters_dict = {}
        for key in dataset:
            logistic_regression = smf.logit(
                model,
                data=dataset[key]
            ).fit()
            # fit parameter dictionary
            regression_parameters_dict[key] = logistic_regression.params

            # calculate AUC
            y_predicted = logistic_regression.predict()
            y_real = dataset[key].success
            auc = metrics.roc_auc_score(y_real,  y_predicted)

            regression_parameters_dict[key]['auc'] = auc

        regression_parameters_by_model[i] = pd.DataFrame.from_dict(regression_parameters_dict)
    
    # add model number to multiindex
    for model_number in regression_parameters_by_model:
        regression_parameters_by_model[model_number].set_index(
            [[model_number]*len(regression_parameters_by_model[model_number]),
            regression_parameters_by_model[model_number].index],
            inplace=True 
            )
        regression_parameters_by_model[model_number].index.set_names(['model', 'parameter'], inplace=True)
    
    # combine to form one df for all models run
    for key in regression_parameters_by_model:
        if key == 0:
            all_regression_params = regression_parameters_by_model[key]
        else:
            all_regression_params = pd.concat((all_regression_params, regression_parameters_by_model[key]))
        
    return all_regression_params

def perform_calibration_validation_regressions(dataset, models):

    # perform regressions
    regression_parameters_by_model = {}
    for i, model in enumerate(models):
        regression_parameters_dict = {}
        for key in dataset:
            logistic_regression = smf.logit(
                model,
                data=dataset[key]['calibration']
            ).fit()
            # fit parameter dictionary
            regression_parameters_dict[key] = logistic_regression.params

            # calculate calibration AUC
            y_predicted = logistic_regression.predict()
            y_real = dataset[key]['calibration'].success
            auc = metrics.roc_auc_score(y_real,  y_predicted)

            regression_parameters_dict[key]['calibration_auc'] = auc

            # calculate validation AUC
            y_predicted = logistic_regression.predict(
                exog = dataset[key]['validation']
            )
            y_real = dataset[key]['validation'].success
            auc = metrics.roc_auc_score(y_real,  y_predicted)
            regression_parameters_dict[key]['validation_auc'] = auc

        regression_parameters_by_model[i] = pd.DataFrame.from_dict(regression_parameters_dict)
    
    # add model number to multiindex
    for model_number in regression_parameters_by_model:
        regression_parameters_by_model[model_number].set_index(
            [[model_number]*len(regression_parameters_by_model[model_number]),
            regression_parameters_by_model[model_number].index],
            inplace=True 
            )
        regression_parameters_by_model[model_number].index.set_names(['model', 'parameter'], inplace=True)
    
    # combine to form one df for all models run
    for key in regression_parameters_by_model:
        if key == 0:
            all_regression_params = regression_parameters_by_model[key]
        else:
            all_regression_params = pd.concat((all_regression_params, regression_parameters_by_model[key]))
        
    return all_regression_params

In [6]:
regression_thread_data = pickle.load(open(infile, 'rb'))
if remove:
    regression_thread_data.pop(remove)

for key in regression_thread_data:
    if train:
        for type in regression_thread_data[key]:
            regression_thread_data[key][type]['log_sentiment_magnitude_plus_one'] = log_plus_1(regression_thread_data[key][type].sentiment_magnitude)
            regression_thread_data[key][type]['sqrt_sentiment_magnitude'] = np.sqrt(regression_thread_data[key][type].sentiment_magnitude)
            regression_thread_data[key][type]['log_post_activity_count'] = np.log(regression_thread_data[key][type].post_activity_count)
            regression_thread_data[key][type]['sqrt_post_activity_count'] = np.sqrt(regression_thread_data[key][type].post_activity_count)
    else:
        regression_thread_data[key]['log_sentiment_magnitude_plus_one'] = log_plus_1(regression_thread_data[key].sentiment_magnitude)
        regression_thread_data[key]['sqrt_sentiment_magnitude'] = np.sqrt(regression_thread_data[key].sentiment_magnitude)
        regression_thread_data[key]['log_post_activity_count'] = np.log(regression_thread_data[key].post_activity_count)
        regression_thread_data[key]['sqrt_post_activity_count'] = np.sqrt(regression_thread_data[key].post_activity_count)

In [11]:
regression_thread_data['books']['calibration'].columns

Index(['thread_id', 'thread_size', 'authors', 'timestamp', 'author', 'score',
       'subject_sentiment_score', 'sentiment_sign', 'sentiment_magnitude',
       'success', 'all_activity_count', 'post_activity_count',
       'comment_activity_count', 'log_sentiment_magnitude_plus_one',
       'sqrt_sentiment_magnitude', 'log_post_activity_count',
       'sqrt_post_activity_count'],
      dtype='object')

In [20]:
# adapted from https://www.datasklr.com/ols-least-squares-regression/variable-selection
#RECURSIVE FEATURE ELIMINATION
#Feature ranking with recursive feature elimination and cross-validated selection of the best number of features

from sklearn.feature_selection import RFE
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression


X = regression_thread_data['crypto']['calibration'][[
    'sentiment_sign', 'sentiment_magnitude', 'post_activity_count',
    'comment_activity_count', 'log_sentiment_magnitude_plus_one',
    'sqrt_sentiment_magnitude', 'log_post_activity_count',
    'sqrt_post_activity_count'
    ]]
y = regression_thread_data['crypto']['calibration']['success']


names=pd.DataFrame(X.columns)

#use logistic regression as the model
log_reg = LogisticRegression()

#This is to select 5 variables: can be changed and checked in model for accuracy
rfecv_mod = RFECV(log_reg, step=1, cv=5) 
myvalues=rfecv_mod.fit(X,y) #to fit
myvalues.support_#The mask of selected features.
myvalues.ranking_ #The feature ranking, such that ranking_[i] corresponds to the ranking position of the i-th feature. Selected (i.e., estimated best) features are assigned rank 1.

rankings=pd.DataFrame(myvalues.ranking_) #Make it into data frame
rankings

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Unnamed: 0,0
0,1
1,1
2,1
3,1
4,1
5,1
6,1
7,1


In [21]:
ranked=pd.concat([names,rankings], axis=1)
ranked.columns = ["Feature", "Rank"]
ranked

#Select most important (Only 1's)
most_important = ranked.loc[ranked['Rank'] ==1] 
print(most_important)

most_important['Rank'].count()

                            Feature  Rank
0                    sentiment_sign     1
1               sentiment_magnitude     1
2               post_activity_count     1
3            comment_activity_count     1
4  log_sentiment_magnitude_plus_one     1
5          sqrt_sentiment_magnitude     1
6           log_post_activity_count     1
7          sqrt_post_activity_count     1


8