In [1]:
# data manipulation imports
import pandas as pd
import numpy as np

# data saving imports
import pickle

# custom imports
from regression_class import RedditRegression as RR
from regression_class import TimestampClass
from regression_class import QuantileClass as qc

# stats imports
import statsmodels.formula.api as smf
from sklearn import linear_model
from sklearn import metrics
from mlxtend.feature_selection import SequentialFeatureSelector
from sklearn.preprocessing import StandardScaler

# plotting imports
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors

In [17]:
# Input pickles; both are indexed by subreddit name in the cells below.
REGRESSION_INFILE = "regression_thread_data.p"
THREAD_INFILE = "clean_5_thread_data.p"

# NOTE(security): pickle.load can execute arbitrary code while unpickling,
# so only point these paths at files from a trusted source.
# The context managers make sure each file handle is closed after loading.
with open(REGRESSION_INFILE, "rb") as regression_file:
    regression_df = pickle.load(regression_file)
with open(THREAD_INFILE, "rb") as thread_file:
    thread_df = pickle.load(thread_file)

# Drop 'thedonald' from the regression data. Later cells iterate over the keys
# of regression_df, so this excludes that subreddit from everything downstream
# (thread_df itself is left untouched).
del regression_df['thedonald']

In [18]:
# For every subreddit, report how many threads exist and how many have
# thread_size > 1 (labelled "threads with comments > 0" in the printout).
for subreddit in regression_df:
    subreddit_threads = regression_df[subreddit]
    n_all = len(subreddit_threads)
    n_started = len(subreddit_threads.loc[subreddit_threads.thread_size > 1])
    pct_started = (n_started / n_all) * 100
    summary_lines = [
        f"{subreddit}",
        f"    all threads: {n_all}",
        f"    threads with comments > 0: {n_started}",
        f"    % started: {pct_started: .1f}%",
    ]
    print("\n".join(summary_lines))

books
    all threads: 4776
    threads with comments > 0: 1557
    % started:  32.6%
conspiracy
    all threads: 11395
    threads with comments > 0: 9675
    % started:  84.9%
crypto
    all threads: 14818
    threads with comments > 0: 6770
    % started:  45.7%
politics
    all threads: 65343
    threads with comments > 0: 42653
    % started:  65.3%


Assuming the collection window is 7 days, let's see how thresholding threads by comment count before feeding the data into the regression class impacts the first collection window. To do this, let's look at the model data sizes when two datasets are run through the Reddit regression class: one thresholded beforehand, the other not.

In [22]:
# Name of the response column, passed to the regression class as 'y_col'.
y_col = "thread_size"

# Names of the predictor columns, passed to the regression class as 'x_cols'.
X_COLS = [
    "sentiment_sign",
    "sentiment_magnitude",
    "hour",
    "num_dayofweek",
    "activity_ratio",
    "mean_author_sentiment_sign",
    "mean_author_sentiment_magnitude",
    "log_author_all_activity_count",
]

# Base configuration handed to RedditRegression. The per-model entries
# ('name', 'regression_data', 'thread_data') are supplied when the models
# are built in a later cell.
regression_params = dict(
    regression_type="linear",
    collection_window=7,
    model_window=14,
    validation_window=7,
    FSS=True,
    performance_scoring_method="r2",
    x_cols=X_COLS,
    y_col=y_col,
    metrics=["r2", "aic"],
    activity_threshold=2,
)

In [27]:
# Per-subreddit subsets restricted to threads with thread_size > 1
# (presumably threads that received at least one comment — confirm against
# how thread_size is computed).
started_regression_df = {}
started_threads = {}

for subreddit in regression_df:
    # Regression rows for threads above the size threshold.
    is_started = regression_df[subreddit].thread_size > 1
    started_regression_df[subreddit] = regression_df[subreddit].loc[is_started]

    # Thread rows whose thread_id belongs to one of those threads.
    started_ids = started_regression_df[subreddit].thread_id
    in_started_thread = thread_df[subreddit].thread_id.isin(started_ids)
    started_threads[subreddit] = thread_df[subreddit].loc[in_started_thread]

In [28]:
# Maps a dataset label to [regression data, thread data]; both entries are
# dicts keyed by subreddit. Position 0 is used as 'regression_data' and
# position 1 as 'thread_data' when the models are built.
regression_types = dict(
    all=[regression_df, thread_df],
    started=[started_regression_df, started_threads],
)

In [31]:
# Build one RedditRegression object per (subreddit, dataset type) pair.
#
# Each model receives its OWN parameter dict: the shared base config is merged
# with the per-model entries instead of being mutated in place. Previously the
# same `regression_params` object was updated and re-passed on every iteration,
# so any model holding a reference to it would end up seeing the values of the
# last iteration, and the base config was left carrying stale data.
rr_objects = {}
for subreddit in regression_df:
    rr_objects[subreddit] = {}
    for data_type in regression_types:
        # regression_types[data_type] is [regression data, thread data],
        # each keyed by subreddit.
        regression_by_subreddit, threads_by_subreddit = regression_types[data_type]
        model_params = {
            **regression_params,
            'name': subreddit,
            'regression_data': regression_by_subreddit[subreddit],
            'thread_data': threads_by_subreddit[subreddit],
        }
        rr_objects[subreddit][data_type] = RR(model_params)

In [32]:
# Run calc_author_thread_counts() on every model (each subreddit, each
# dataset type).
for subreddit, models_by_type in rr_objects.items():
    for data_type, rr_model in models_by_type.items():
        rr_model.calc_author_thread_counts()

In [33]:
# Inspect the thread data of one model. The keys are spelled out explicitly
# instead of relying on loop variables leaked from earlier cells; these are the
# values those variables held (last subreddit, last dataset type).
rr_objects['politics']['started'].thread_data

Unnamed: 0,thread_id,id,timestamp,author,sentiment_score,author_all_activity_count,author_post_count,author_comment_count,activity_ratio,mean_author_sentiment,log_author_all_activity_count,mean_author_sentiment_sign,mean_author_sentiment_magnitude
0,j2wrx2,j2wrx2,2020-09-30 23:00:16,368049617b2f6233fda8e4dd508ef6885e039c24a5ef82a5,0.00000,0,0,0,0.0,0.000000,0.000000,0.0,0.000000
1,j2wrzn,j2wrzn,2020-09-30 23:00:22,04363c665bd3ee24460807537900330781798124089dc228,-0.68080,0,0,0,0.0,0.000000,0.000000,0.0,0.000000
3,j2wtrf,j2wtrf,2020-09-30 23:03:12,72ccbce357e4f02414777fdd06453264a16bb840a0852876,-0.19130,0,0,0,0.0,0.000000,0.000000,0.0,0.000000
7,j2ww0y,j2ww0y,2020-09-30 23:06:54,6f1a7c85dc4cfd102c6330ff75fe535b0e46fc8a53c133c5,-0.49390,0,0,0,0.0,0.000000,0.000000,0.0,0.000000
10,j2wx6r,j2wx6r,2020-09-30 23:08:49,36ad53750f1448e9ff37d5ad548ef99c0cb8204a13cdf0fa,0.00000,0,0,0,0.0,0.000000,0.000000,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6428325,jx3v1z,gcw06zq,2020-11-19 23:59:56,887403a40d5c8ca0fad132e615b249035e2c7920f66b2eb2,-0.55740,19,0,19,1.0,0.018476,2.995732,1.0,0.018476
6428326,jxe5u4,gcw06zx,2020-11-19 23:59:56,e4c0d2e9a4dd04243f794e8b2d582266e4adbf3b59c86b51,-0.24486,12,0,12,1.0,0.103779,2.564949,1.0,0.103779
6428327,jxcsim,gcw074d,2020-11-19 23:59:58,5a6e239d067b5f96602f611e8ef04d604410034ce78e7614,-0.39278,112,0,112,1.0,0.016228,4.727388,1.0,0.016228
6428328,jx8mgg,gcw0761,2020-11-19 23:59:58,31e0735567a426e1207c6111713568fb32e6e967c4a10766,0.01157,153,0,153,1.0,-0.058739,5.036953,-1.0,0.058739


In [36]:
# Print the number of thread_data rows held by each model, grouped by
# subreddit.
for subreddit, models_by_type in rr_objects.items():
    print(f"{subreddit}")
    for data_type, rr_model in models_by_type.items():
        n_rows = len(rr_model.thread_data)
        print(f"    {data_type}: {n_rows}")

books
    all: 92122
    started: 88903
conspiracy
    all: 377385
    started: 375665
crypto
    all: 407489
    started: 399441
politics
    all: 3639790
    started: 3617100


In [57]:
# Compare two orders of thread thresholding for each subreddit:
#   "before" - the 'started' model, whose input was filtered before it was
#              passed to the regression class;
#   "after"  - rows of the 'all' model whose thread_id also appears in the
#              'started' model's thread data.
for subreddit, models_by_type in rr_objects.items():
    print(subreddit)
    thread_data = models_by_type['all'].thread_data
    started_thread_data = models_by_type['started'].thread_data

    in_started = thread_data.thread_id.isin(started_thread_data.thread_id)
    thread_data_started_overlap = thread_data.loc[in_started]
    print(f"    Thresholded before: {len(started_thread_data)} Thresholded after: {len(thread_data_started_overlap)}")
    n_thresholded = len(started_thread_data)

    # performing author thresholding: keep rows with author_all_activity_count >= 2
    overlap_is_active = thread_data_started_overlap.author_all_activity_count >= 2
    started_is_active = started_thread_data.author_all_activity_count >= 2
    n_thresholded_after = len(thread_data_started_overlap.loc[overlap_is_active])
    n_thresholded_before = len(started_thread_data.loc[started_is_active])
    print("    After author thresholding")
    print(f"    (before): {n_thresholded_before} (after): {n_thresholded_after}")

    # Rows that survive in the "after" variant but not in the "before" variant,
    # also expressed as a percentage of the 'started' row count.
    n_row_gap = n_thresholded_after - n_thresholded_before
    pct_row_gap = n_row_gap * 100 / n_thresholded
    print(f"    loss: {n_row_gap} ({pct_row_gap:.2f}%)")


books
    Thresholded before: 88903 Thresholded after: 88903
    After author thresholding
    (before): 13076 (after): 13192
    loss: 116 (0.13%)
conspiracy
    Thresholded before: 375665 Thresholded after: 375665
    After author thresholding
    (before): 211090 (after): 211179
    loss: 89 (0.02%)
crypto
    Thresholded before: 399441 Thresholded after: 399441
    After author thresholding
    (before): 254341 (after): 254709
    loss: 368 (0.09%)
politics
    Thresholded before: 3617100 Thresholded after: 3617100
    After author thresholding
    (before): 2213997 (after): 2215814
    loss: 1817 (0.05%)


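Basically, this means that thresholding threads before building the regression objects does cause a slight data loss once author thresholding is applied (0.02%–0.13% of rows across these subreddits), though it does not look significant.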