In [7]:
# standard imports
import gc

# multiprocessing
from multiprocessing import Pool

# data manipulation imports
import numpy as np
import pandas as pd

# data saving imports
import pickle
import os

# plotting imports
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import matplotlib

# custom imports
from regression_class import RedditRegression as RR
from regression_class import TimestampClass

In [8]:
# NOTE: the companion .py script (the other "26" file) must be run first,
# otherwise not all of the required regressions will have been run.
# NOTE(review): "26" presumably refers to a file-name prefix -- confirm.

# name of the output file for the regression metrics
metrics_outfile = 'regression_metrics'

# parent directory that holds the regression input files
working_dir = "logistic_regression/logregs_26102023"

In [10]:
def get_regression_params_from_pickles(infile_path):
    """Load one pickled regression result from disk and return it.

    The collection window size is parsed from the file name (the integer after
    the last underscore, before the extension) and printed together with the
    path for progress logging.

    Parameters
    ----------
    infile_path : str
        Path to a pickle file whose name ends in ``_<int>.<ext>``,
        e.g. ``.../lite_logregs_a_0_c_14.p``.

    Returns
    -------
    object
        Whatever object was stored in the pickle file.

    Raises
    ------
    ValueError
        If the file name does not end in an integer collection window size.
    OSError
        If the file cannot be opened.
    """
    # Parse the window size from the file stem only, so that underscores in
    # parent directory names and the file extension cannot affect the result.
    file_stem = os.path.splitext(os.path.basename(infile_path))[0]
    collection_window_size = int(file_stem.rsplit('_', 1)[-1])
    print(f"\n  Collection window: {collection_window_size}")
    print(f"\n  reading in {infile_path}")

    # SECURITY: pickle.load can execute arbitrary code; only use this on
    # pickle files produced by a trusted pipeline.
    with open(infile_path, 'rb') as infile_handle:
        regression_infile = pickle.load(infile_handle)

    return regression_infile

In [9]:
# get infile dirs: every sub-directory of the working directory
infile_dirs = [f"{working_dir}/{x}" for x in os.listdir(working_dir) if os.path.isdir(f"{working_dir}/{x}")]

for infile_dir in infile_dirs:
    # NOTE(review): only the last character of the directory name is parsed,
    # so this assumes single-digit activity thresholds -- confirm against the
    # directory naming scheme.
    activity_threshold = int(infile_dir[-1])
    print(f"\n\nActivity threshold: {activity_threshold}")

    # full-size pickles only: skip sub-directories and already written "lite" files
    infiles = [
        x for x in os.listdir(infile_dir)
        if not os.path.isdir(f"{infile_dir}/{x}") and not x.startswith('lite')
    ]
    for infile in infiles:
        # the window size is the integer after the last underscore of the file stem
        collection_window_size = int(os.path.splitext(infile)[0].split('_')[-1])
        print(f"\n  Collection window: {collection_window_size}")
        print(f"\n  reading in {infile}")

        # SECURITY: pickle.load can execute arbitrary code; only use this on
        # pickle files produced by a trusted pipeline.
        with open(f"{infile_dir}/{infile}", 'rb') as infile_handle:
            regression_infile = pickle.load(infile_handle)

        # delete unnecessary data to save memory and write out "lite" infiles
        for key in ['regression_data', 'thread_data']:
            del regression_infile['regression_params'][key]

        # the context manager guarantees the lite pickle is flushed and closed
        with open(f"{infile_dir}/lite_{infile}", 'wb') as outfile_handle:
            pickle.dump(regression_infile, outfile_handle)

        # release the (potentially large) object before loading the next file
        del regression_infile
        gc.collect()



Activity threshold: 0

  Collection window: 1

  reading in logregs_a_0_c_1.p

  Collection window: 14

  reading in logregs_a_0_c_14.p

  Collection window: 3

  reading in logregs_a_0_c_3.p

  Collection window: 7

  reading in logregs_a_0_c_7.p


Activity threshold: 1

  Collection window: 1

  reading in logregs_a_1_c_1.p

  Collection window: 14

  reading in logregs_a_1_c_14.p

  Collection window: 3

  reading in logregs_a_1_c_3.p

  Collection window: 7

  reading in logregs_a_1_c_7.p


Activity threshold: 2

  Collection window: 1

  reading in logregs_a_2_c_1.p

  Collection window: 14

  reading in logregs_a_2_c_14.p

  Collection window: 3

  reading in logregs_a_2_c_3.p

  Collection window: 7

  reading in logregs_a_2_c_7.p


Activity threshold: 5

  Collection window: 14

  reading in logregs_a_5_c_14.p

  Collection window: 3

  reading in logregs_a_5_c_3.p

  Collection window: 7

  reading in logregs_a_5_c_7.p


In [11]:
# Load the "lite" pickles of every infile directory, keyed by activity threshold.
regressions = {}

for infile_dir in infile_dirs:
    activity_threshold = int(infile_dir[-1])
    print(f"\n\nActivity threshold: {activity_threshold}")

    # full paths of the "lite" files (sub-directories excluded)
    infiles = [
        f"{infile_dir}/{fname}"
        for fname in os.listdir(infile_dir)
        if not os.path.isdir(f"{infile_dir}/{fname}") and fname.startswith('lite')
    ]

    # one worker pool per directory; results come back in the order of `infiles`
    # NOTE(review): with the "spawn" start method, Pool workers may be unable to
    # import a function defined in a notebook cell -- confirm on the target platform.
    with Pool() as pool:
        regressions[activity_threshold] = pool.map(get_regression_params_from_pickles, infiles)






Activity threshold: 0
