In [1]:
import pickle
import pandas as pd
import numpy as np
from regression_class import RedditRegression as RR
import logging
import os
import gc
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import matplotlib.patches as mpatches

In [2]:
DATE = "09_04_2024"
RESULTS_DIR_PREFIX = f"regression_outputs/{DATE}_"
RESULTS_DIR_SUFFIX = "/results"
OUT_DIR_SUFFIX = "/outputs"
RUN_NAMES = ["c7_m7", "c7_m14", "c14_m7"]
LOGFILE = f"{RESULTS_DIR_PREFIX}_31052024"


OUT_DIR_COMBINED = f"regression_outputs/{DATE}_outputs"

In [3]:
all_filepaths = []
for run_name in RUN_NAMES:
    results_dir = RESULTS_DIR_PREFIX + run_name + RESULTS_DIR_SUFFIX

    filepaths = [f"{results_dir}/{x}" for x in os.listdir(results_dir) if 'books' not in x]
    all_filepaths += filepaths

In [4]:
mnlogit_filepaths = [x for x in all_filepaths if 'mnlogit' in x]
linear_filepaths = [x for x in all_filepaths if 'linear' in x]

In [5]:
minmax_aucs_dict = {
    'subreddit': [],
    'collection window': [],
    'calibration window': [],
    'min cal auc': [],
    'min cal num features': [],
    'max cal auc': [],
    'max cal num features': [],
    'mean cal auc': [],
    'min val auc': [],
    'min val num features': [],
    'max val auc': [],
    'max val num features': [],
    'mean val auc': [],
}

minmax_accuracy_dict = {
    'subreddit': [],
    'collection window': [],
    'calibration window': [],
    'min cal accuracy': [],
    'min cal num features': [],
    'max cal accuracy': [],
    'max cal num features': [],
    'mean cal accuracy': [],
    'min val accuracy': [],
    'min val num features': [],
    'max val accuracy': [],
    'max val num features': [],
    'mean val accuracy': [],
}

minmax_r2_dict = {
    'subreddit': [],
    'collection window': [],
    'calibration window': [],
    'min cal r2': [],
    'min cal num features': [],
    'max cal r2': [],
    'max cal num features': [],
    'mean cal r2': [],
    'min val r2': [],
    'min val num features': [],
    'max val r2': [],
    'max val num features': [],
    'mean val r2': [],
}

params_lookup_dict = {
    'subreddit': 'name',
    'collection window': 'collection_window',
    'calibration window': 'model_window',
}

auc_lookup_dict = {
    'min cal auc': ('cal_mnlogit_mean_auc', 'min'),
    'min cal num features': ('cal_mnlogit_mean_auc', 'idxmin'),
    'max cal auc': ('cal_mnlogit_mean_auc',  'max'),
    'max cal num features': ('cal_mnlogit_mean_auc', 'idxmax'),
    'mean cal auc': ('cal_mnlogit_mean_auc', 'mean'),
    'min val auc': ('val_mnlogit_mean_auc', 'min'),
    'min val num features': ('val_mnlogit_mean_auc', 'idxmin'),
    'max val auc': ('val_mnlogit_mean_auc', 'max'),
    'max val num features': ('val_mnlogit_mean_auc', 'idxmax'),
    'mean val auc': ('val_mnlogit_mean_auc', 'mean')
}

accuracy_lookup_dict = {
    'min cal accuracy': ('cal_mnlogit_accuracy', 'min'),
    'min cal num features': ('cal_mnlogit_accuracy', 'idxmin'),
    'max cal accuracy': ('cal_mnlogit_accuracy',  'max'),
    'max cal num features': ('cal_mnlogit_accuracy', 'idxmax'),
    'mean cal accuracy': ('cal_mnlogit_accuracy', 'mean'),
    'min val accuracy': ('val_mnlogit_accuracy', 'min'),
    'min val num features': ('val_mnlogit_accuracy', 'idxmin'),
    'max val accuracy': ('val_mnlogit_accuracy', 'max'),
    'max val num features': ('val_mnlogit_accuracy', 'idxmax'),
    'mean val accuracy': ('val_mnlogit_accuracy', 'mean'),
}

r2_lookup_dict = {
    'min cal r2': ('cal_r2', 'min'),
    'min cal num features': ('cal_r2', 'idxmin'),
    'max cal r2': ('cal_r2',  'max'),
    'max cal num features': ('cal_r2', 'idxmax'),
    'mean cal r2': ('cal_r2', 'mean'),
    'min val r2': ('val_r2', 'min'),
    'min val num features': ('val_r2', 'idxmin'),
    'max val r2': ('val_r2', 'max'),
    'max val num features': ('val_r2', 'idxmax'),
    'mean val r2': ('val_r2', 'mean'),
}

In [22]:
for i, filename in enumerate(mnlogit_filepaths + linear_filepaths):
    print(f"{i+1} of {len(mnlogit_filepaths + linear_filepaths)}")
    print(filename)
    redreg = pickle.load(open(filename, 'rb'))
    regparams = redreg.regression_params
    if filename in linear_filepaths:
        active_dicts = [minmax_r2_dict]
        lookup_dicts = [r2_lookup_dict]
    else:
        active_dicts = [minmax_aucs_dict, minmax_accuracy_dict]
        lookup_dicts = [auc_lookup_dict, accuracy_lookup_dict]
    
    print(active_dicts)
    for active_dict in active_dicts:
        for param in params_lookup_dict:
            active_dict[param].append(regparams[params_lookup_dict[param]])
    regmetrics = redreg.regression_metrics['metrics']
    for j, active_dict in enumerate(active_dicts):
        lookup_dict = lookup_dicts[j]
        for metric in [x for x in active_dict if x not in params_lookup_dict]:
            val_tuple = lookup_dict[metric]
            active_dict[metric].append(
            getattr(regmetrics[val_tuple[0]], val_tuple[1])()
        )


1 of 18
regression_outputs/09_04_2024_c7_m7/results/conspiracy_mnlogit.p
[{'subreddit': [], 'collection window': [], 'calibration window': [], 'min cal auc': [], 'min cal num features': [], 'max cal auc': [], 'max cal num features': [], 'mean cal auc': [], 'min val auc': [], 'min val num features': [], 'max val auc': [], 'max val num features': [], 'mean val auc': []}, {'subreddit': [], 'collection window': [], 'calibration window': [], 'min cal accuracy': [], 'min cal num features': [], 'max cal accuracy': [], 'max cal num features': [], 'mean cal accuracy': [], 'min val accuracy': [], 'min val num features': [], 'max val accuracy': [], 'max val num features': [], 'mean val accuracy': []}]
min cal auc ('cal_mnlogit_mean_auc', 'min')
min cal num features ('cal_mnlogit_mean_auc', 'idxmin')
max cal auc ('cal_mnlogit_mean_auc', 'max')
max cal num features ('cal_mnlogit_mean_auc', 'idxmax')
mean cal auc ('cal_mnlogit_mean_auc', 'mean')
min val auc ('val_mnlogit_mean_auc', 'min')
min val nu

In [27]:
dfs  = []
for results_dict in [minmax_aucs_dict, minmax_accuracy_dict, minmax_r2_dict]:
    df = pd.DataFrame.from_dict(results_dict, orient='index').T
    df = df.sort_values(by='subreddit')
    dfs.append(df)


In [38]:
with pd.ExcelWriter('regression_outputs/09_04_2024_results/linear_mnlogit_r2_aucs.xlsx') as writer:
    for df in dfs:
        val = df.columns[3].split()[2]
        if (val == 'auc') | (val == 'accuracy'):
            val = 'mnlogit_' + val
        else:
            val = 'linear_' + val
        df.to_excel(writer, index=False, sheet_name=val)

In [10]:
OUTDIR = 'regression_outputs/09_04_2024_results/'

In [6]:
def get_feature_list(feature_names:pd.Series):

    feature_list = []
    for i, feat_tuple in enumerate(feature_names):

        if i == 0:
            feature_list.append(feat_tuple[0])
        else:
            new_feature = [x for x in feat_tuple if x not in feature_list]
            feature_list += new_feature


    return feature_list

In [12]:
features_dict = {}

In [13]:
filepaths = {
    'linear': linear_filepaths,
    'mnlogit': mnlogit_filepaths
}

features_dict = {'linear': {'conspiracy': {}, 'crypto': {}, 'politics': {}}, 'mnlogit': {'conspiracy': {}, 'crypto': {}, 'politics': {}}}

for filetype in filepaths:
    print(filetype)
    for i, filename in enumerate(filepaths[filetype]):
        print(f"{i+1} of {len(filepaths[filetype])}")
        print(filename)
        redreg = pickle.load(open(filename, 'rb'))
        regparams = redreg.regression_params
        name = regparams['name']
        col_case = f"c{regparams['collection_window']}_m{regparams['model_window']}"
        print(redreg.FSS_metrics['metric_df'].feature_names)
        features_dict[filetype][name][col_case] = get_feature_list(redreg.FSS_metrics['metric_df'].feature_names)

linear
1 of 9
regression_outputs/09_04_2024_c7_m7/results/conspiracy_linear.p
1                                       (domain_count,)
2                       (domain_pagerank, domain_count)
3     (mean_author_sentiment_sign, domain_pagerank, ...
4     (time_in_secs, mean_author_sentiment_sign, dom...
5     (time_in_secs, mean_author_sentiment_sign, mea...
6     (time_in_secs, activity_ratio, mean_author_sen...
7     (sentiment_sign, time_in_secs, activity_ratio,...
8     (sentiment_sign, time_in_secs, num_dayofweek, ...
9     (sentiment_sign, sentiment_magnitude, time_in_...
10    (sentiment_sign, sentiment_magnitude, time_in_...
Name: feature_names, dtype: object
2 of 9
regression_outputs/09_04_2024_c7_m7/results/crypto_linear.p
1                                     (activity_ratio,)
2                        (activity_ratio, domain_count)
3          (time_in_secs, activity_ratio, domain_count)
4     (time_in_secs, activity_ratio, mean_author_sen...
5     (time_in_secs, activity_ratio,

In [15]:
features_dict

{'linear': {'conspiracy': {'c7_m7': ['domain_count',
    'domain_pagerank',
    'mean_author_sentiment_sign',
    'time_in_secs',
    'mean_author_sentiment_magnitude',
    'activity_ratio',
    'sentiment_sign',
    'num_dayofweek',
    'sentiment_magnitude',
    'author_all_activity_count'],
   'c7_m14': ['author_all_activity_count',
    'time_in_secs',
    'activity_ratio',
    'domain_pagerank',
    'domain_count',
    'sentiment_magnitude',
    'num_dayofweek',
    'mean_author_sentiment_sign',
    'mean_author_sentiment_magnitude',
    'sentiment_sign'],
   'c14_m7': ['mean_author_sentiment_magnitude',
    'time_in_secs',
    'author_all_activity_count',
    'domain_pagerank',
    'domain_count',
    'sentiment_magnitude',
    'activity_ratio',
    'mean_author_sentiment_sign',
    'sentiment_sign',
    'num_dayofweek']},
  'crypto': {'c7_m7': ['activity_ratio',
    'domain_count',
    'time_in_secs',
    'mean_author_sentiment_magnitude',
    'author_all_activity_count',
    'se

In [16]:
with pd.ExcelWriter(f'{OUTDIR}/mnlogit_linear_features.xlsx') as writer:
    for filetype in features_dict:
        for subreddit in features_dict[filetype]:
            df = pd.DataFrame.from_dict(features_dict[filetype][subreddit])
            df = df.reset_index(names='feature')
            df['feature'] = df.feature + 1
            df.to_excel(writer, sheet_name=f'{subreddit}_{filetype}', index=False)