In [1]:
import pickle
import pandas as pd
import numpy as np
from regression_class import RedditRegression as RR
import logging
import os
import gc
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors

In [2]:
# Date stamp shared by every directory name belonging to this batch of runs.
DATE = "09_04_2024"

# Names of the individual runs whose result files are gathered below.
RUN_NAMES = [
    "c7_m7",
    "c7_m14",
    "c14_m7",
]

# A run's results directory is RESULTS_DIR_PREFIX + <run name> + RESULTS_DIR_SUFFIX.
# OUT_DIR_SUFFIX is presumably combined the same way for per-run outputs
# (it is not used in this notebook) -- TODO confirm.
RESULTS_DIR_PREFIX = f"regression_outputs/{DATE}_"
RESULTS_DIR_SUFFIX = "/results"
OUT_DIR_SUFFIX = "/outputs"

# Output directory that is not tied to a single run name.
OUT_DIR_COMBINED = f"regression_outputs/{DATE}_outputs"

# NOTE(review): RESULTS_DIR_PREFIX already ends in "_", so this name contains a
# double underscore ("..._2024__communal_processing") -- confirm that is intended.
LOGFILE = f"{RESULTS_DIR_PREFIX}_communal_processing"

In [3]:
# Gather the path of every saved result file across all runs.
#
# os.listdir returns names in arbitrary order and also lists sub-directories
# (for example a Jupyter ".ipynb_checkpoints" folder). Entries are therefore
# sorted so the processing order is reproducible, and anything that is not a
# regular file is skipped because it could not be opened for unpickling later.
all_filepaths = []
for run_name in RUN_NAMES:
    results_dir = RESULTS_DIR_PREFIX + run_name + RESULTS_DIR_SUFFIX

    for entry in sorted(os.listdir(results_dir)):
        filepath = f"{results_dir}/{entry}"
        if os.path.isfile(filepath):
            all_filepaths.append(filepath)

In [4]:
# Maps each thread-count column of the summary table to the (row label,
# column label) pair used to read its value from a regression object's
# `num_threads_modelled` table.
lookup_dict = {
    'cal modelled threads': (
        'cal', 'modelled_threads',
    ),
    'val modelled threads': (
        'val', 'modelled_threads',
    ),
    'cal author threshold removed threads': (
        'cal', 'author_all_activity_count_removed_threads',
    ),
    'val author threshold removed threads': (
        'val', 'author_all_activity_count_removed_threads',
    ),
    'cal thread size removed threads': (
        'cal', 'thread_size_removed_threads',
    ),
    'val thread size removed threads': (
        'val', 'thread_size_removed_threads',
    ),
}

# Column-oriented accumulator for the summary table: four descriptive columns
# followed by one column per lookup_dict entry. Every column starts as its own
# empty list and receives one value per processed regression.
modelled_thread_counts_dict = {
    column: []
    for column in (
        'subreddit',
        'model window',
        'collection window',
        'thread size threshold',
        *lookup_dict,
    )
}



    


In [5]:
# Fill modelled_thread_counts_dict with one row per saved regression, skipping
# regressions whose type is 'mnlogit'.

# Start from empty columns so that re-running this cell does not append a
# second copy of every row.
for column_values in modelled_thread_counts_dict.values():
    column_values.clear()

for filepath in all_filepaths:
    # NOTE: pickle.load can execute arbitrary code -- only load result files
    # that were produced by this project's own regression runs.
    # The context manager closes each file as soon as it has been read.
    with open(filepath, 'rb') as results_file:
        regres = pickle.load(results_file)

    params = regres.regression_params
    if params['regression_type'] != 'mnlogit':
        modelled_thread_counts_dict['subreddit'].append(params['name'])
        modelled_thread_counts_dict['model window'].append(params['model_window'])
        modelled_thread_counts_dict['collection window'].append(params['collection_window'])

        # A regression without a thread-size threshold is recorded as 0.
        thresholds = params['thresholds']
        thread_size_threshold = 'thread_size' in thresholds
        modelled_thread_counts_dict['thread size threshold'].append(
            thresholds['thread_size'] if thread_size_threshold else 0
        )

        # Populates regres.num_threads_modelled, which is read below.
        regres.get_num_threads_modelled()
        for key, (row_label, column_label) in lookup_dict.items():
            if column_label == 'thread_size_removed_threads' and not thread_size_threshold:
                # No thread-size threshold was set, so record 0 removed threads
                # instead of looking the value up (presumably that column is
                # absent from num_threads_modelled in this case -- TODO confirm).
                modelled_thread_counts_dict[key].append(0)
            else:
                modelled_thread_counts_dict[key].append(
                    regres.num_threads_modelled.loc[row_label, column_label]
                )

    # Drop the loaded regression before the next file is read to keep peak
    # memory down.
    del regres, params
    gc.collect()

In [6]:
# Build the summary table directly from the column lists.
# Every column receives exactly one value per processed regression, so the
# lists are equal-length. Constructing the frame this way keeps numeric dtypes
# (the previous from_dict(orient='index').T round trip turned every column into
# `object`) and raises a ValueError if the columns ever become ragged, instead
# of silently padding the short ones at the tail and misaligning rows.
thread_count_df = pd.DataFrame(modelled_thread_counts_dict)

In [7]:
# Order the rows by subreddit, then by the window and threshold settings, so
# the configurations of one subreddit sit next to each other in the export.
sort_keys = ['subreddit', 'model window', 'collection window', 'thread size threshold']
to_output = thread_count_df.sort_values(by=sort_keys)

In [8]:
# Write the sorted table to an Excel workbook, without the row index.
# The date stamp in the file name is derived from DATE (underscores removed)
# rather than typed out again, so a new batch date cannot silently overwrite
# the workbook of an earlier batch. For DATE = "09_04_2024" this is the same
# path as before: regression_outputs/09042024_data_sizes.xlsx
data_sizes_path = f"regression_outputs/{DATE.replace('_', '')}_data_sizes.xlsx"
with pd.ExcelWriter(data_sizes_path) as writer:
    to_output.to_excel(writer, index=False)