In [1]:
import sys
sys.path.append('./reddit_analysis_code')
import numpy as np
import pandas as pd
from reddit_dataclass import RedditData as reddit
import pickle
import matplotlib.pyplot as plt
import scipy.stats as scpstat
import matplotlib.dates as dates
import datetime
from sklearn import metrics
import statsmodels.formula.api as smf
import statsmodels.api as sm
from patsy import dmatrices

In [2]:
# Load the pickled input; the cells below use it as a mapping from a dataset
# label to a pandas DataFrame.
# NOTE(review): pickle can execute arbitrary code while loading — only open
# files that were produced by a trusted source.
with open("regression_thread_data.p", 'rb') as pickle_file:
    # The context manager closes the handle, which the bare open() call never did.
    regression_thread_data = pickle.load(pickle_file)

In [3]:
def log_plus_1(x):
    """Return log(1 + x), element-wise for array-like input.

    Uses ``np.log1p`` rather than ``np.log(x + 1)`` so that very small ``x``
    is not lost when it is added to 1 before the logarithm is taken.
    """
    return np.log1p(x)


# Attach two transformed copies of the magnitude column to every frame.
for thread_frame in regression_thread_data.values():
    magnitude = thread_frame.sentiment_magnitude
    thread_frame['log_sentiment_magnitude_plus_one'] = log_plus_1(magnitude)
    thread_frame['sqrt_sentiment_magnitude'] = np.sqrt(magnitude)

In [4]:
def get_skewness(df):
    """Tabulate the skewness of the magnitude column and its two transforms.

    Parameters
    ----------
    df : dict
        Despite the name, a mapping from a dataset label to a DataFrame that
        has the columns ``sentiment_magnitude``,
        ``log_sentiment_magnitude_plus_one`` and ``sqrt_sentiment_magnitude``.

    Returns
    -------
    pandas.DataFrame
        One column per dataset label and three rows labelled
        ``'sentiment_magnitude'``, ``'log(mag+1)'`` and ``'sqrt(mag)'``.
        An empty mapping yields an empty DataFrame.
    """
    skew_columns = []
    for key in df:
        frame = df[key]
        skew_columns.append(
            pd.Series(
                {
                    'sentiment_magnitude': scpstat.skew(frame.sentiment_magnitude),
                    'log(mag+1)': scpstat.skew(frame.log_sentiment_magnitude_plus_one),
                    'sqrt(mag)': scpstat.skew(frame.sqrt_sentiment_magnitude),
                },
                name=key,
            )
        )

    if not skew_columns:
        # Nothing to tabulate; avoid handing pd.concat an empty list.
        return pd.DataFrame()

    # A single concat replaces growing the table one column at a time.
    return pd.concat(skew_columns, axis=1)

In [5]:
# Logit formulas (patsy syntax) that use the untransformed sentiment magnitude.
# A trailing "- 1" removes the intercept; every specification is listed both
# with and without it.
# NOTE(review): in patsy "a*b" is shorthand for "a + b + a:b", so the last two
# formulas look equivalent to the two plain "sentiment_sign*sentiment_magnitude"
# formulas above them — confirm whether the duplication is intended.
vanilla_models = [
    "success ~ sentiment_sign - 1",
    "success ~ sentiment_sign",
    "success ~ sentiment_magnitude - 1",
    "success ~ sentiment_magnitude",
    "success ~ sentiment_sign + sentiment_magnitude - 1",
    "success ~ sentiment_sign + sentiment_magnitude",
    "success ~ sentiment_sign*sentiment_magnitude - 1",
    "success ~ sentiment_sign*sentiment_magnitude",
    "success ~ sentiment_sign*sentiment_magnitude + sentiment_sign + sentiment_magnitude - 1",
    "success ~ sentiment_sign*sentiment_magnitude + sentiment_sign + sentiment_magnitude",
]

# Logit formulas that use the log(magnitude + 1) column.
# The first four omit the intercept ("- 1"); the last four are the same
# specifications with the intercept kept.
# NOTE(review): in patsy "a*b" already expands to "a + b + a:b", so the fourth
# and eighth formulas look equivalent to the third and seventh — confirm
# whether the duplication is intended.
log_models = [
    "success ~ log_sentiment_magnitude_plus_one - 1",
    "success ~ sentiment_sign + log_sentiment_magnitude_plus_one - 1",
    "success ~ sentiment_sign*log_sentiment_magnitude_plus_one - 1",
    "success ~ sentiment_sign*log_sentiment_magnitude_plus_one + log_sentiment_magnitude_plus_one + sentiment_sign - 1",
    "success ~ log_sentiment_magnitude_plus_one",
    "success ~ sentiment_sign + log_sentiment_magnitude_plus_one",
    "success ~ sentiment_sign*log_sentiment_magnitude_plus_one",
    "success ~ sentiment_sign*log_sentiment_magnitude_plus_one + log_sentiment_magnitude_plus_one + sentiment_sign" ,
]

# Logit formulas that use the sqrt(magnitude) column.
# The first four omit the intercept ("- 1"); the last four are the same
# specifications with the intercept kept.
# NOTE(review): in patsy "a*b" already expands to "a + b + a:b", so the fourth
# and eighth formulas look equivalent to the third and seventh — confirm
# whether the duplication is intended.
sqrt_models = [
    "success ~ sqrt_sentiment_magnitude - 1",
    "success ~ sentiment_sign + sqrt_sentiment_magnitude - 1",
    "success ~ sentiment_sign*sqrt_sentiment_magnitude - 1",
    "success ~ sentiment_sign*sqrt_sentiment_magnitude + sqrt_sentiment_magnitude + sentiment_sign - 1",
    "success ~ sqrt_sentiment_magnitude",
    "success ~ sentiment_sign + sqrt_sentiment_magnitude",
    "success ~ sentiment_sign*sqrt_sentiment_magnitude",
    "success ~ sentiment_sign*sqrt_sentiment_magnitude + sqrt_sentiment_magnitude + sentiment_sign",
]

In [6]:
# Full candidate list: untransformed, then log, then sqrt formulas.
# A formula's position in this list is the model number used in the results.
models = [*vanilla_models, *log_models, *sqrt_models]
models

['success ~ sentiment_sign - 1',
 'success ~ sentiment_sign',
 'success ~ sentiment_magnitude - 1',
 'success ~ sentiment_magnitude',
 'success ~ sentiment_sign + sentiment_magnitude - 1',
 'success ~ sentiment_sign + sentiment_magnitude',
 'success ~ sentiment_sign*sentiment_magnitude - 1',
 'success ~ sentiment_sign*sentiment_magnitude',
 'success ~ sentiment_sign*sentiment_magnitude + sentiment_sign + sentiment_magnitude - 1',
 'success ~ sentiment_sign*sentiment_magnitude + sentiment_sign + sentiment_magnitude',
 'success ~ log_sentiment_magnitude_plus_one - 1',
 'success ~ sentiment_sign + log_sentiment_magnitude_plus_one - 1',
 'success ~ sentiment_sign*log_sentiment_magnitude_plus_one - 1',
 'success ~ sentiment_sign*log_sentiment_magnitude_plus_one + log_sentiment_magnitude_plus_one + sentiment_sign - 1',
 'success ~ log_sentiment_magnitude_plus_one',
 'success ~ sentiment_sign + log_sentiment_magnitude_plus_one',
 'success ~ sentiment_sign*log_sentiment_magnitude_plus_one',
 '

In [7]:
# Collects every table that is exported at the end of the notebook, keyed by
# the name of the Excel sheet it is written to.
regression_parameters = dict()

In [8]:
# Lookup sheet: row i holds the formula that the results refer to as model i.
models_df = pd.DataFrame({'keys': models})
regression_parameters['model_key'] = models_df

In [9]:
def perform_models_regressions(dataset, models=models):
    """Fit every logit formula on every frame and tabulate coefficients and AUC.

    Parameters
    ----------
    dataset : dict
        Maps a dataset label to a DataFrame containing the columns named in
        the formulas (including ``success``).
    models : sequence of str
        Formulas passed to ``smf.logit``. Defaults to the module-level
        ``models`` list that exists when this function is defined.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``(model, parameter)``, where ``model`` is the position of
        the formula in ``models``, with one column per dataset label. Each
        model contributes its fitted parameters plus an ``'auc'`` row holding
        the in-sample ROC AUC.

    Raises
    ------
    ValueError
        If ``models`` is empty (previously this surfaced as an
        UnboundLocalError at the return statement).
    """
    models = list(models)
    if not models:
        raise ValueError("models must contain at least one formula")

    per_model_frames = []
    for model in models:
        columns_by_dataset = {}
        for key in dataset:
            logistic_regression = smf.logit(
                model,
                data=dataset[key]
            ).fit()

            # Copy so that adding the 'auc' entry does not modify the Series
            # owned by the fitted results object.
            fitted = logistic_regression.params.copy()

            # Score against the response values the model was actually fitted
            # on, so labels and in-sample predictions always have the same
            # length even if rows were dropped while building the design matrix.
            y_real = logistic_regression.model.endog
            y_predicted = logistic_regression.predict()
            fitted['auc'] = metrics.roc_auc_score(y_real, y_predicted)

            columns_by_dataset[key] = fitted

        per_model_frames.append(pd.DataFrame.from_dict(columns_by_dataset))

    # Stack all models at once; `keys` supplies the model number as the outer
    # index level and `names` labels both levels.
    return pd.concat(
        per_model_frames,
        keys=range(len(per_model_frames)),
        names=['model', 'parameter'],
    )
    

In [10]:
# Skewness tables keyed by data-subset name; the unfiltered data goes in first.
skewness = {'all_data': get_skewness(regression_thread_data)}

In [11]:
# Row counts per dataset label; later cells add one entry per filtered subset.
datasets_size = {
    key: {'all': len(frame)}
    for key, frame in regression_thread_data.items()
}

In [12]:
# ALL DATA
# Fit every candidate formula on the unfiltered frames.
regression_parameters.update(
    all_data=perform_models_regressions(dataset=regression_thread_data)
)

Optimization terminated successfully.
         Current function value: 0.686096
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.684862
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.689223
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.691434
         Iterations 3
Optimization terminated successfully.
         Current function value: 0.691581
         Iterations 3
Optimization terminated successfully.
         Current function value: 0.630533
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.424192
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.686714
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.637998
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.645007
  

In [13]:
# EXCLUDE NEUTRALS
# Keep only the rows whose sentiment_sign is non-zero, record how many remain,
# then repeat the skewness and regression steps on the filtered frames.
non_neutral_regression_thread_data = {}
for key, frame in regression_thread_data.items():
    non_neutral_rows = frame[frame.sentiment_sign != 0]
    non_neutral_regression_thread_data[key] = non_neutral_rows
    datasets_size[key]['no_neutral'] = len(non_neutral_rows)

skewness['no_neutrals'] = get_skewness(non_neutral_regression_thread_data)
regression_parameters['exclude_neutrals'] = perform_models_regressions(non_neutral_regression_thread_data)

Optimization terminated successfully.
         Current function value: 0.678606
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.679845
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.685231
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.690227
         Iterations 3
Optimization terminated successfully.
         Current function value: 0.690773
         Iterations 3
Optimization terminated successfully.
         Current function value: 0.644205
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.395411
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.683691
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.643008
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.634551
  

In [14]:
# THRESHOLD AUTHORS
# Repeat the analysis on rows written by authors with at least N rows in the
# same dataset, for each N listed below.

post_count_thresholds = [2, 5, 10, 20]

# Per dataset: one row per author with the number of thread_id entries.
author_counts = {}
for key, frame in regression_thread_data.items():
    per_author = frame[['thread_id', 'author']].groupby('author').count()
    author_counts[key] = (
        per_author.reset_index()
        .rename(columns={'thread_id': 'post_count'})
    )

for threshold in post_count_thresholds:
    threshold_regression_data = {}
    for key, frame in regression_thread_data.items():
        counts = author_counts[key]
        authors = counts.loc[counts.post_count >= threshold, 'author']
        kept_rows = frame[frame.author.isin(authors)]
        threshold_regression_data[key] = kept_rows
        datasets_size[key][f'threshold_{threshold}'] = len(kept_rows)
    skewness[f'author_threshold_{threshold}'] = get_skewness(threshold_regression_data)
    regression_parameters[f'author_threshold_{threshold}'] = perform_models_regressions(threshold_regression_data)



Optimization terminated successfully.
         Current function value: 0.688590
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.684808
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.691690
         Iterations 3
Optimization terminated successfully.
         Current function value: 0.684679
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.691340
         Iterations 3
Optimization terminated successfully.
         Current function value: 0.582520
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.389199
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.685627
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.623881
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.587595
  

In [15]:
# One column per dataset label, one row per subset name.
# Built with a single concat instead of growing a placeholder frame in a loop.
size_columns = [
    pd.Series(sizes, name=key) for key, sizes in datasets_size.items()
]
# pd.concat rejects an empty list, so fall back to an empty frame.
size_df = pd.concat(size_columns, axis=1) if size_columns else pd.DataFrame()

In [16]:
# Register the row-count table for export.
regression_parameters.update(data_sizes=size_df)

In [18]:
# SKEWNESS SHEET
# Stack the per-subset tables under a (subset, parameter) MultiIndex.
# The frames stored in `skewness` are not modified, so this cell gives the
# same result every time it is run.
subset_names = list(skewness)
if subset_names:
    all_subsets_skewness = pd.concat(
        [skewness[subset] for subset in subset_names],
        keys=subset_names,               # outer index level: subset name
        names=['subset', 'parameter'],
    )
else:
    # pd.concat rejects an empty list, so fall back to an empty frame.
    all_subsets_skewness = pd.DataFrame()

In [20]:
# Register the combined skewness table for export.
regression_parameters.update(skewness=all_subsets_skewness)

In [41]:
# Express every subset size as a fraction of the same dataset's 'all' row.
# Reads size_df (the raw counts) rather than regression_parameters['data_sizes'],
# because a later cell replaces that entry with counts plus ratio rows; using
# the raw counts keeps this cell correct when it is run more than once.
dummy_sizes = size_df.div(size_df.loc['all'], axis='columns')
dummy_sizes.index = [f"{index}_ratio" for index in dummy_sizes.index]
dummy_sizes

Unnamed: 0,books,conspiracy,crypto,thedonald,politics
all_ratio,1.0,1.0,1.0,1.0,1.0
no_neutral_ratio,0.484925,0.622817,0.495681,0.586558,0.659382
threshold_2_ratio,0.33124,0.705397,0.702187,0.476578,0.801341
threshold_5_ratio,0.095687,0.483458,0.431637,0.364562,0.636044
threshold_10_ratio,0.044389,0.351645,0.32582,0.319756,0.554306
threshold_20_ratio,0.029523,0.247389,0.202524,0.295316,0.462314


In [43]:
# Export sheet = raw counts followed by the ratio rows.
# Built from size_df rather than from the entry being overwritten, so running
# this cell again does not append a second copy of the ratio rows.
regression_parameters['data_sizes'] = pd.concat((size_df, dummy_sizes))

In [45]:
# Write each collected table to its own worksheet, named after its dictionary key.
with pd.ExcelWriter('logit_regression_params_v2.xlsx', engine='xlsxwriter') as writer:
    for sheet_name, table in regression_parameters.items():
        table.to_excel(writer, sheet_name=sheet_name)