In [2]:
# standard imports
import gc

# multiprocessing
from multiprocessing import Pool

# data manipulation imports
import numpy as np
import pandas as pd

# data saving imports
import pickle
import os

# plotting imports
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import matplotlib

# custom imports
from regression_class import RedditRegression as RR
from regression_class import TimestampClass

In [3]:
# Prerequisite: the other "26" file (.py) must be run first so that all the
# required regressions exist before this notebook is executed.
# NOTE(review): "26" presumably matches the 26102023 date in working_dir — confirm.

# outfiles
# base name for the regression-metrics output
metrics_outfile = "regression_metrics"

# infile parent dir
# its sub-directories (one per activity threshold) hold the regression pickles
working_dir = 'logistic_regression/logregs_26102023'

In [4]:
def nice_format_xticks(x_vals_series):
    """Turn underscore-separated feature names into (at most) two-line tick labels.

    Parameters
    ----------
    x_vals_series : iterable of iterables of str
        Collections of feature names, e.g. a pandas Series whose values are
        lists of names. Names may repeat within and across the collections.

    Returns
    -------
    list of str
        One label per distinct name, in order of first appearance:
        * a name without underscores is returned unchanged;
        * otherwise the underscore-separated words are split into two lines
          after the first ``len(words) // 2`` words, and the words on each
          line are joined by single spaces.
    """
    # order-preserving de-duplication over every inner collection
    unique_names = list(
        dict.fromkeys(name for names in x_vals_series for name in names)
    )

    labels = []
    for name in unique_names:
        words = name.split('_')
        if len(words) == 1:
            labels.append(words[0])
            continue

        # Break by position rather than by comparing word values, so a word
        # that occurs twice in one name cannot trigger a second line break and
        # no trailing space is left after the final word.
        split_at = len(words) // 2
        first_line = ' '.join(words[:split_at])
        second_line = ' '.join(words[split_at:])
        labels.append(f'{first_line}\n{second_line}')
    return labels

In [5]:
def get_regression_params_from_pickles(infile_path):
    """Load one regression pickle and collect per-subreddit plotting metrics.

    Parameters
    ----------
    infile_path : str
        Path to a pickle file whose name ends in ``_<collection window>``
        followed by an extension (e.g. ``..._c_14.p``). The pickle must hold a
        mapping with a ``'logregs'`` entry that maps each subreddit to a
        regression object exposing ``regression_metrics`` and
        ``get_x_vals_from_modstring``.

    Returns
    -------
    dict
        ``{subreddit: {'collection_window', 'index', 'auc', 'x_names'}}`` for
        the subreddits found in this file.

    Raises
    ------
    ValueError
        If the last underscore-separated token of the file name is not an int.
    """
    # Parse the window from the file stem; splitext removes the real extension
    # instead of stripping the characters '.' and 'p' from both ends.
    file_stem = os.path.splitext(os.path.basename(infile_path))[0]
    collection_window_size = int(file_stem.split('_')[-1])
    print(f"\n  Collection window: {collection_window_size}")
    print(f"\n  reading in {infile_path}")

    # NOTE: pickle.load can execute arbitrary code — only load trusted files.
    with open(infile_path, 'rb') as pickle_in:
        regression_infile = pickle.load(pickle_in)

    # Local result dict: the function previously wrote to a `plotting_metrics`
    # name it never created, which raises NameError unless a global happens to
    # exist (and a global would not be shared across Pool workers anyway).
    plotting_metrics = {}
    for subreddit in regression_infile['logregs']:
        logreg = regression_infile['logregs'][subreddit]
        # NOTE(review): entry 1 of regression_metrics is used throughout;
        # what that key selects is not visible here — confirm.
        metrics = logreg.regression_metrics[1]["metrics"]
        plotting_metrics[subreddit] = {
            'collection_window': collection_window_size,
            'index': metrics.index,
            'auc': metrics.loc[:, 'auc'],
            'x_names': nice_format_xticks(
                metrics.model.apply(logreg.get_x_vals_from_modstring)
            ),
        }

    # release the (potentially large) unpickled object before returning
    del regression_infile
    gc.collect()
    return plotting_metrics

In [None]:
# Write a "lite_" copy of every regression pickle with the bulky raw data
# removed, so later cells can load the results with far less memory.

# get infile dirs
infile_dirs = [f"{working_dir}/{x}" for x in os.listdir(working_dir) if os.path.isdir(f"{working_dir}/{x}")]

for infile_dir in infile_dirs:
    # read every trailing digit of the directory name (not just the last
    # character) so thresholds of 10 or more are parsed correctly
    activity_threshold = int(infile_dir[len(infile_dir.rstrip('0123456789')):])
    print(f"\n\nActivity threshold: {activity_threshold}")
    # plain files only, skipping "lite" copies written by an earlier run
    infiles = [
        x for x in os.listdir(infile_dir)
        if not os.path.isdir(f"{infile_dir}/{x}") and not x.startswith('lite')
    ]
    for infile in infiles:
        # last underscore-separated token of the file stem is the window size
        collection_window_size = int(os.path.splitext(infile)[0].split('_')[-1])
        print(f"\n  Collection window: {collection_window_size}")
        print(f"\n  reading in {infile}")
        # NOTE: pickle.load can execute arbitrary code — only load trusted files.
        with open(f"{infile_dir}/{infile}", 'rb') as pickle_in:
            regression_infile = pickle.load(pickle_in)

        # delete unnecessary data to save memory and write out "lite" infiles
        for key in ['regression_data', 'thread_data']:
            del regression_infile['regression_params'][key]

        with open(f"{infile_dir}/lite_{infile}", 'wb') as pickle_out:
            pickle.dump(regression_infile, pickle_out)
        del regression_infile
        gc.collect()



Activity threshold: 0

  Collection window: 1

  reading in logregs_a_0_c_1.p

  Collection window: 14

  reading in logregs_a_0_c_14.p

  Collection window: 3

  reading in logregs_a_0_c_3.p

  Collection window: 7

  reading in logregs_a_0_c_7.p


Activity threshold: 1

  Collection window: 1

  reading in logregs_a_1_c_1.p

  Collection window: 14

  reading in logregs_a_1_c_14.p

  Collection window: 3

  reading in logregs_a_1_c_3.p

  Collection window: 7

  reading in logregs_a_1_c_7.p


Activity threshold: 2

  Collection window: 1

  reading in logregs_a_2_c_1.p

  Collection window: 14

  reading in logregs_a_2_c_14.p

  Collection window: 3

  reading in logregs_a_2_c_3.p

  Collection window: 7

  reading in logregs_a_2_c_7.p


Activity threshold: 5

  Collection window: 14

  reading in logregs_a_5_c_14.p

  Collection window: 3

  reading in logregs_a_5_c_3.p

  Collection window: 7

  reading in logregs_a_5_c_7.p


In [6]:
# Load the plotting metrics from every "lite" pickle, one worker task per file.

# get infile dirs
infile_dirs = [f"{working_dir}/{x}" for x in os.listdir(working_dir) if os.path.isdir(f"{working_dir}/{x}")]

# iterate through all the infile directories
# regressions maps activity threshold -> list of per-file results from pool.map
regressions = {}

for infile_dir in infile_dirs:
    # read every trailing digit of the directory name (not just the last
    # character) so thresholds of 10 or more get distinct, correct keys
    activity_threshold = int(infile_dir[len(infile_dir.rstrip('0123456789')):])
    print(f"\n\nActivity threshold: {activity_threshold}")
    # full paths of the "lite" pickles only
    infiles = [
        f"{infile_dir}/{x}" for x in os.listdir(infile_dir)
        if not os.path.isdir(f"{infile_dir}/{x}") and x.startswith('lite')
    ]

    print('Starting Pool')
    # NOTE(review): the mapped function is defined in this notebook; the
    # multiprocessing docs warn that Pool may not work with functions defined
    # in an interactive session — if this cell hangs, move the function into
    # an importable .py module.
    with Pool() as pool:
        regressions[activity_threshold] = pool.map(get_regression_params_from_pickles, infiles)






Activity threshold: 0
Starting Pool
