In [1]:
import pickle
import pandas as pd
import numpy as np
from regression_class import RedditRegression as RR
import logging
import os
import gc
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors

In [2]:
RESULTS_DIR = 'regression_outputs/18_12_2024_c14_m7/results'

In [3]:
filenames = os.listdir(RESULTS_DIR)
path_list = [f'{RESULTS_DIR}/{x}' for x in filenames if x.endswith('.p')]

In [4]:
modelled_thread_counts_dict = {
    'subreddit': [],
    'model window': [],
    'collection window': [],
    'thread size threshold': [],
    'cal modelled threads': [],
    'val modelled threads': [],
    'cal author threshold removed threads': [],
    'val author threshold removed threads': [],
    'cal thread size removed threads': [],
    'val thread size removed threads': [],
}

lookup_dict = {
    'cal modelled threads': ('cal', 'modelled_threads'),
    'val modelled threads': ('val', 'modelled_threads'),
    'cal author threshold removed threads': ('cal', 'author_all_activity_count_removed_threads'),
    'val author threshold removed threads': ('val', 'author_all_activity_count_removed_threads'),
    'cal thread size removed threads': ('cal', 'thread_size_removed_threads'),
    'val thread size removed threads': ('val', 'thread_size_removed_threads'),
}


def get_modelled_thread_counts(regres):
    if regres.regression_params['regression_type'] != 'mnlogit':
        modelled_thread_counts_dict['subreddit'].append(regres.regression_params['name'])
        modelled_thread_counts_dict['model window'].append(regres.regression_params['model_window'])
        modelled_thread_counts_dict['collection window'].append(regres.regression_params['collection_window'])
        if 'thread_size' in regres.regression_params['thresholds']:
            thread_size_threshold = True
            modelled_thread_counts_dict['thread size threshold'].append(regres.regression_params['thresholds']['thread_size'])
        else:
            thread_size_threshold = False
            modelled_thread_counts_dict['thread size threshold'].append(0)
        regres.get_num_threads_modelled()
        for key in lookup_dict:
            i = lookup_dict[key][0]
            j = lookup_dict[key][1]
            if (j == 'thread_size_removed_threads') & (thread_size_threshold == False):
                modelled_thread_counts_dict[key].append(0)
            else:
                modelled_thread_counts_dict[key].append(regres.num_threads_modelled.loc[i,j])


In [5]:
def find_metrics_for_plotting(result_pickle):
    param_metrics = result_pickle.regression_params["metrics"]
    metric_cols = result_pickle.regression_metrics["metrics"].columns
    return [x for x in param_metrics if x != "mnlogit_aucs"], metric_cols

In [6]:
import matplotlib.pyplot as plt

SMALL_SIZE = 16
MEDIUM_SIZE = 20
BIGGER_SIZE = 28

plt.rc('font', size=SMALL_SIZE)          # controls default text sizes
plt.rc('axes', titlesize=BIGGER_SIZE)     # fontsize of the axes title
plt.rc('axes', labelsize=MEDIUM_SIZE)    # fontsize of the x and y labels
plt.rc('xtick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
plt.rc('ytick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
plt.rc('legend', fontsize=SMALL_SIZE)    # legend fontsize
plt.rc('figure', titlesize=BIGGER_SIZE)  # fontsize of the figure title

plt.rc('axes', titlelocation='left')

In [7]:
title_map = {
    'crypto': '(b) CryptoCurrency',
    'conspiracy': '(a) Conspiracy',
    'politics': '(c) politics'
}
ylabel_map = {
    'r2': 'R-squared',
    'auc': 'AUC',
    'mnlogit_accuracy': 'Accuracy'
}

In [8]:
def get_feature_list(feature_names:pd.Series):

    feature_list = []
    for i, feat_tuple in enumerate(feature_names):

        if i == 0:
            feature_list.append(feat_tuple[0])
        else:
            new_feature = [x for x in feat_tuple if x not in feature_list]
            feature_list += new_feature


    return feature_list

In [10]:
# load pickles
dates = {}
legend_fig = True
features_dict = {'conspiracy': {}, 'crypto': {}, 'politics': {}}
for filename in path_list:
    print(f'## {filename}')
    key = filename.split('/')[-1].removesuffix('.p')
    print(" Reading in")
    regres = pickle.load(open(f"{filename}", 'rb'))
    print(" Outputting to excel")
    regres.output_to_excel(f"{RESULTS_DIR}/{key}.xlsx")
    print(" Getting dates")
    date_array = regres.date_array
    dates[key] = {'start': date_array.loc[0], 'end': date_array.iloc[-1], 'days': len(date_array)}

    # get model threads count
    get_modelled_thread_counts(regres)

    # get FSS feature lists
    name = regres.regression_params['name']
    regtype = regres.regression_params['regression_type']
    features_dict[name][f'{regtype}_feat'] = get_feature_list(
        regres.FSS_metrics['metric_df'].feature_names
    )
    for colname in [
        x for x in regres.regression_metrics['metrics'].columns if (
            ('cal' in x) or ('val' in x)
        )]:
        features_dict[name][f"{regtype}_{colname}"] = list(regres.regression_metrics['metrics'][colname])
    
    # plot metrics
    metric_list, metric_cols = find_metrics_for_plotting(regres)
    for metric in metric_list:
        metrics_to_plot = [x for x in metric_cols if metric in x]
        if metric in ylabel_map:
            ylabel = ylabel_map[metric]
        else:
            ylabel = metric
        regres.plot_metrics_vs_features(
            metrics_to_plot,
            ylabel,
            name=f"{key}",
            outfile=f"{RESULTS_DIR}/{key}_{metric}.png",
            show=False,
            legend=False,
            title = title_map[name],
        )

        if not legend_fig:
            regres.plot_metrics_vs_features(
                metrics_to_plot, 
                metric,
                labels=['calibration', 'validation'],
                name=f"{key}",
                outfile=f"{RESULTS_DIR}/{key}_{metric}_with_legend.png",
                show=False,
                legend_loc=(0.7,0.6)
            )
            legend_fig=True



## regression_outputs/18_12_2024_c14_m7/results/conspiracy_mnlogit.p
 Reading in
 Outputting to excel
 Getting dates
## regression_outputs/18_12_2024_c14_m7/results/politics_mnlogit.p
 Reading in
 Outputting to excel
 Getting dates
## regression_outputs/18_12_2024_c14_m7/results/conspiracy_logistic.p
 Reading in
 Outputting to excel
 Getting dates
## regression_outputs/18_12_2024_c14_m7/results/politics_linear.p
 Reading in
 Outputting to excel
 Getting dates
## regression_outputs/18_12_2024_c14_m7/results/crypto_logistic.p
 Reading in
 Outputting to excel
 Getting dates
## regression_outputs/18_12_2024_c14_m7/results/crypto_mnlogit.p
 Reading in
 Outputting to excel
 Getting dates
## regression_outputs/18_12_2024_c14_m7/results/crypto_linear.p
 Reading in
 Outputting to excel
 Getting dates
## regression_outputs/18_12_2024_c14_m7/results/politics_logistic.p
 Reading in
 Outputting to excel
 Getting dates
## regression_outputs/18_12_2024_c14_m7/results/conspiracy_linear.p
 Reading in
 

<Figure size 640x480 with 0 Axes>

In [11]:
feature_name_lookup = {
    'domain_count': 'Post domain count',
    'author_all_activity_count': 'Author activity count',
    'mean_author_sentiment_magnitude': 'Author mean sentiment magnitude',
    'mean_author_sentiment_sign': 'Author mean sentiment sign',
    'mean_author_score': 'Author mean score',
    'domain_pagerank': 'Post domain PageRank',
    'activity_ratio': 'Author activity ratio',
    'time_in_secs': 'Time of day',
    'sentiment_sign': 'Post sentiment sign',
    'sentiment_magnitude': 'Post sentiment magnitude',
    'num_dayofweek': 'Day of week',
    'weekday': 'Weekend',
    'time_of_day': 'Time of Day',
    'external_domain': 'External domain'
}

In [12]:
print("Saving dates to csv")
# output dates
dates_df = pd.DataFrame.from_dict(dates, orient='index')

# output thread counts
thread_count_df = (pd.DataFrame.from_dict(modelled_thread_counts_dict, orient='index').T).sort_values(by=['subreddit', 'model window', 'collection window', 'thread size threshold'])


with pd.ExcelWriter(f'{RESULTS_DIR}/dataset_info.xlsx') as writer:
    dates_df.to_excel(writer, sheet_name='dates')
    thread_count_df.to_excel(writer, sheet_name='sizes', index=False)
    for subreddit in features_dict:
        df = pd.DataFrame.from_dict(features_dict[subreddit])
        df.replace(feature_name_lookup, inplace=True)
        df = df.reset_index(names='feature')
        df['feature'] = df.feature + 1
        df.to_excel(writer, sheet_name=f'{subreddit}_feats', index=False)

Saving dates to csv
