In [1]:
# standard library
import copy

# data saving imports
import os
import pickle

# time tracking
from datetime import datetime as dt

# custom imports
from regression_class import RedditRegression as RR

In [2]:
# TESTING
# Load the test pickle; it is expected to be a dict holding both the
# regression data and the full thread data (keyed per subreddit further down).
TEST_INFILE = "test_data_4_days.p"
# NOTE: pickle.load can execute arbitrary code - only load files produced by this project.
with open(TEST_INFILE, "rb") as infile:  # context manager guarantees the handle is closed
    test_data = pickle.load(infile)
regression_df = test_data["regression_data"]
thread_df = test_data["all_data"]

# REGRESSION_INFILE = "regression_thread_data.p"
# THREAD_INFILE = "clean_5_thread_data.p"
# regression_df = pickle.load(open(REGRESSION_INFILE, "rb"))
# thread_df = pickle.load(open(THREAD_INFILE, "rb"))



In [3]:
# subreddits to look at
subreddits = [
    "books",
    "crypto",
    "conspiracy",
    "politics",
]

# regression types to run; the order matters because the per-type
# parameter lists further down are indexed by position in this list
regression_types = [
    "logistic",
    "linear",
    "mnlogit",
]

In [4]:
# single hand-built parameter dict: logistic regression on the "books" data
params = RR.create_param_dict(
    "books",
    "logistic",
    regression_df["books"],
    thread_df["books"],
    model_window=2,
    collection_window=1,
    validation_window=1,
)

In [5]:
# Build the regression object from `params` and run it. Per the captured log
# output, main() reports "Running FSS" followed by "Model 1" ... "Model 8".
rreg = RR(params)
rreg.main()

regression_class_books_logistic - INFO - Running FSS
regression_class_books_logistic - INFO - Model 1
regression_class_books_logistic - INFO - Model 2
regression_class_books_logistic - INFO - Model 3
regression_class_books_logistic - INFO - Model 4
regression_class_books_logistic - INFO - Model 5
regression_class_books_logistic - INFO - Model 6
regression_class_books_logistic - INFO - Model 7
regression_class_books_logistic - INFO - Model 8


In [11]:
# Inspect the `regression_model_data` attribute after the run.
# NOTE(review): this displays the whole object; if it is a large DataFrame,
# consider showing only `.head()` to keep the notebook output small.
rreg.regression_model_data

Unnamed: 0,thread_id,thread_size,authors,timestamp,author,score,subject_sentiment_score,sentiment_sign,sentiment_magnitude,success,author_all_activity_count,activity_ratio,mean_author_sentiment_sign,mean_author_sentiment_magnitude,time_in_secs,num_dayofweek


In [4]:
print("Creating parameter dictionaries")
# fixed regression params: shared by every regression type
X_COLS = [
    "sentiment_sign",
    "sentiment_magnitude",
    "hour",
    "time_in_secs",
    "num_dayofweek",
    "activity_ratio",
    "mean_author_sentiment_sign",
    "mean_author_sentiment_magnitude",
    "author_all_activity_count",
]

fixed_regression_params = {
    "collection_window": 1,
    "model_window": 2,
    "validation_window": 1,
    "FSS": True,
    "x_cols": X_COLS,
    "scale": True,
}

# variable regression params: differ per regression type
quantiles = [0.25, 0.5, 0.75]
thresholds2 = {
    "author_all_activity_count": 2,
    "thread_size": 2,
}
thresholds1 = {
    "author_all_activity_count": 2,
}

# Every list in `to_vary` is positionally aligned with `regression_types`
# (index 0 -> "logistic", 1 -> "linear", 2 -> "mnlogit"). Reordering or
# filtering `regression_types` without updating these lists would silently
# pair a regression type with the wrong settings.
to_vary = {
    "regression_type": regression_types,
    "y_col": ["success", "thread_size", "thread_size"],
    "metrics": [
        ["auc"],
        ["r2"],
        ["mnlogit_accuracy", "mnlogit_aucs", "mnlogit_mean_auc"],
    ],
    "thresholds": [thresholds1, thresholds2, thresholds2],
    "quantiles": [[], [], quantiles],
}

# Build one independent parameter dict per regression type.
regression_params = {}
for i, regtype in enumerate(regression_types):
    # Deep copy: a shallow .copy() would leave every regression type pointing
    # at the same `x_cols` list, so an in-place change made for one type
    # would leak into all the others.
    regression_params[regtype] = copy.deepcopy(fixed_regression_params)
    for key, values in to_vary.items():
        # Copy the per-type value as well: "linear" and "mnlogit" would
        # otherwise share the single `thresholds2` dict object.
        regression_params[regtype][key] = copy.deepcopy(values[i])

Creating parameter dictionaries


In [7]:
# sanity check: show the assembled parameter dict for the linear regression
regression_params["linear"]

{'collection_window': 1,
 'model_window': 2,
 'validation_window': 1,
 'FSS': True,
 'x_cols': ['sentiment_sign',
  'sentiment_magnitude',
  'hour',
  'time_in_secs',
  'num_dayofweek',
  'activity_ratio',
  'mean_author_sentiment_sign',
  'mean_author_sentiment_magnitude',
  'author_all_activity_count'],
 'scale': True,
 'regression_type': 'linear',
 'y_col': 'thread_size',
 'metrics': ['r2'],
 'thresholds': {'author_all_activity_count': 2, 'thread_size': 2},
 'quantiles': []}

In [8]:
# Per-subreddit parameter dicts, keyed as [subreddit][regression type].
subreddit_regression_params = {}
for subreddit in subreddits:
    print(f"####### {subreddit} #######")
    subreddit_regression_params[subreddit] = {}
    for regtype, base_params in regression_params.items():
        # Deep-copy the template BEFORE attaching the data, so that:
        #  - nested lists/dicts (x_cols, thresholds, ...) are not shared
        #    between subreddits, as they would be with a shallow .copy();
        #  - the data objects themselves are attached by reference, not copied.
        run_params = copy.deepcopy(base_params)
        run_params["name"] = subreddit
        run_params["regression_data"] = regression_df[subreddit]
        run_params["thread_data"] = thread_df[subreddit]
        subreddit_regression_params[subreddit][regtype] = run_params

####### books #######
####### crypto #######
####### conspiracy #######
####### politics #######


In [9]:
# Persist the assembled input parameters so the regression runs can reload them.
OUTDIR = "regression_test_outputs"
# Create the output directory if it is missing; otherwise open() below
# raises FileNotFoundError on a fresh checkout.
os.makedirs(OUTDIR, exist_ok=True)
outfile = f"{OUTDIR}/input_params.p"
# The context manager flushes and closes the file even if pickling fails part-way.
with open(outfile, "wb") as out_handle:
    pickle.dump(subreddit_regression_params, out_handle)