In [1]:
import pickle
import pandas as pd
from regression_class import RedditRegression as RR
from regression_class import TimestampClass
import numpy as np

In [2]:
# Pickled input files (relative paths, so resolved against the working directory).
# REGRESSION_INFILE is unpickled into regression_df, THREAD_INFILE into thread_df.
REGRESSION_INFILE = "regression_thread_data_april_2024.p"
THREAD_INFILE = "clean_5_thread_data.p"

In [3]:
# Run selection: `subreddit` is the key used to pick this subreddit's entry out
# of both loaded data objects; `regression_type` is passed to
# RR.create_param_dict.
subreddit = 'books'
regression_type = 'logistic'

In [4]:
# Window sizes forwarded to the regression.
# NOTE(review): the collection window is used as a number of date_array
# entries further down; presumably both windows are measured in days - confirm.
COLLECTION_WINDOW = 7
MODEL_WINDOW = 7

# Candidate regressors, grouped by what they describe.
_POST_FEATURES = [
    "sentiment_sign",
    "sentiment_magnitude",
    "time_in_secs",
    "num_dayofweek",
]
_AUTHOR_FEATURES = [
    "activity_ratio",
    "mean_author_sentiment_sign",
    "mean_author_sentiment_magnitude",
    "log_author_all_activity_count",
]
_DOMAIN_FEATURES = [
    "domain_pagerank",
    "log_domain_count",
]
X_COLS = _POST_FEATURES + _AUTHOR_FEATURES + _DOMAIN_FEATURES

# Keyword arguments passed on to RR.create_param_dict in addition to the data.
extra_params = dict(
    collection_window=COLLECTION_WINDOW,
    model_window=MODEL_WINDOW,
    x_cols=X_COLS,
)

In [5]:
# Load the two pickled inputs. Each file is opened in a `with` block so the
# handle is closed as soon as the object has been read (the bare
# `pickle.load(open(...))` form leaves the handle open until garbage collection).
# Both loaded objects are later indexed by subreddit name.
# NOTE: pickle.load can execute arbitrary code - only load files from a
# trusted source.
with open(REGRESSION_INFILE, 'rb') as regression_file:
    regression_df = pickle.load(regression_file)
with open(THREAD_INFILE, 'rb') as thread_file:
    thread_df = pickle.load(thread_file)

In [6]:
# Assemble the RedditRegression parameter dictionary for the chosen subreddit:
# its entries from the regression and thread data, plus the window sizes and
# x_cols packed into extra_params.
param_dict = RR.create_param_dict(subreddit, regression_type, regression_df[subreddit], thread_df[subreddit], **extra_params)

In [7]:
# Instantiate the regression object (RR is the RedditRegression class) from
# the parameter dictionary.
regmod = RR(param_dict)

In [8]:
# NOTE(review): presumably fills regmod.thread_data with the collection-window
# author/domain counts that a later cell re-derives step by step - confirm
# against regression_class.
regmod.calc_collection_counts()

In [8]:
# Run the regression; the log output below reports "Running FSS" followed by
# Model 1 through Model 10.
regmod.main()

regression_class_books_logistic - INFO - Running FSS
regression_class_books_logistic - INFO - Model 1
regression_class_books_logistic - INFO - Model 2
regression_class_books_logistic - INFO - Model 3
regression_class_books_logistic - INFO - Model 4
regression_class_books_logistic - INFO - Model 5
regression_class_books_logistic - INFO - Model 6
regression_class_books_logistic - INFO - Model 7
regression_class_books_logistic - INFO - Model 8
regression_class_books_logistic - INFO - Model 9
regression_class_books_logistic - INFO - Model 10


In [17]:
# Inspect entry 8 of the feature_names column in the FSS metric table
# (the output below lists eight feature names).
regmod.FSS_metrics['metric_df'].feature_names[8]

('sentiment_sign',
 'sentiment_magnitude',
 'time_in_secs',
 'activity_ratio',
 'mean_author_sentiment_sign',
 'mean_author_sentiment_magnitude',
 'domain_pagerank',
 'log_domain_count')

In [11]:
# Show the ten thread_data rows with the largest domain_count.
regmod.thread_data.sort_values(by='domain_count', ascending=False).head(10)

Unnamed: 0,thread_id,id,timestamp,author,sentiment_score,domain,author_all_activity_count,author_post_count,author_comment_count,activity_ratio,mean_author_sentiment,domain_count,log_domain_count,mean_author_sentiment_sign,mean_author_sentiment_magnitude
3093,y82761,y82761,2022-10-19 13:03:55,394ae74b5a1d408eac86de3df5311df3378212f1a2299782,-0.5423,self.books,0,0,0,0.0,0.0,1003,6.911747,0.0,0.0
3109,y85pj8,y85pj8,2022-10-19 15:28:14,079ea617989a2defcec1f8e898ee80c476aa7fa4e75cdf30,0.0516,self.books,0,0,0,0.0,0.0,1003,6.911747,0.0,0.0
3127,y8a1ib,y8a1ib,2022-10-19 18:18:21,bbca1b2f2d085a060ea5da2768ad437dc073c5a2ad19787d,0.0,self.books,0,0,0,0.0,0.0,1003,6.911747,0.0,0.0
3126,y89yco,y89yco,2022-10-19 18:15:02,2492d896deb49ea9224f0f6acc9a7ab3295ac521d54ce843,-0.5719,self.books,0,0,0,0.0,0.0,1003,6.911747,0.0,0.0
3125,y89rvi,y89rvi,2022-10-19 18:08:01,aba3bcb5fb96a4e38afd7b7d513c76c5395db6f0be46e215,0.3612,self.books,0,0,0,0.0,0.0,1003,6.911747,0.0,0.0
3123,y88x8d,y88x8d,2022-10-19 17:34:19,48ee43acacac37e05350f955c2a5810a01622e24749a7af4,0.0,self.books,0,0,0,0.0,0.0,1003,6.911747,0.0,0.0
3120,y88pp5,y88pp5,2022-10-19 17:26:07,6a014b7dba9c58de959f06e548a6a478523b40a98789bc9b,-0.2808,self.books,0,0,0,0.0,0.0,1003,6.911747,0.0,0.0
3119,y88048,y88048,2022-10-19 16:58:37,eff5a85b7112e9682e38a633325a65049273e53a10f4e6a0,0.0,self.books,0,0,0,0.0,0.0,1003,6.911747,0.0,0.0
3118,y87yke,y87yke,2022-10-19 16:56:52,92b2a8bcf69fd8f3749ca93c48dc291ff55a5fa5408df6cd,0.0,self.books,0,0,0,0.0,0.0,1003,6.911747,0.0,0.0
3117,y87g9f,y87g9f,2022-10-19 16:36:34,c529743b0c31971d7d889b08c828a5538005563c1756e495,0.4215,self.books,0,0,0,0.0,0.0,1003,6.911747,0.0,0.0


In [10]:
# Inspect the domain_count column (the captured output below shows NaN entries
# at both ends of the frame).
regmod.thread_data.domain_count

0        NaN
1        NaN
2        NaN
3        NaN
4        NaN
          ..
111046   NaN
111047   NaN
111049   NaN
111050   NaN
111051   NaN
Name: domain_count, Length: 92122, dtype: float64

In [15]:
# Recompute the collection-window features on regmod.thread_data.
#
# For every day after the initial collection window, the rows from the
# preceding `collection_window` entries of regmod.date_array are aggregated
# per author (activity counts, activity ratio, mean sentiment) and, when a
# domain-count feature is requested, per domain. The aggregates are then
# mapped onto that day's rows and written back into regmod.thread_data.
# Authors/domains with no activity in the window get 0.

# author columns that are reset to 0 and then filled day by day
new_cols = [
    "author_all_activity_count",
    "author_post_count",
    "author_comment_count",
    "activity_ratio",
    "mean_author_sentiment",
]

# domain data is only needed when a domain-count feature is requested
# (plain booleans, so use `or` rather than the bitwise `|`)
domain_condition = ("domain_count" in regmod.regression_params["x_cols"]) or (
    "log_domain_count" in regmod.regression_params["x_cols"]
)

# NOTE is creating these regardless of desired x_cols inefficient??

# silence the chained-assignment warning when creating new cols; this is a
# global pandas option and stays in effect for later cells
pd.options.mode.chained_assignment = None
for new_col in new_cols:
    regmod.thread_data[new_col] = 0

# domain_count needs the same zero start as the author columns: rows that fall
# inside the first collection window are never visited by the loop below and
# would otherwise keep NaN (or stale values from an earlier run)
if domain_condition:
    regmod.thread_data["domain_count"] = 0

window_size = regmod.regression_params["collection_window"]

# The date of each row does not change inside the loop (only the feature
# columns are written), so convert the timestamps once instead of twice per day.
thread_dates = regmod.thread_data.timestamp.apply(TimestampClass.get_date)

# need to go through each day after initial collection window up to end of data
for day in range(window_size, len(regmod.date_array)):

    # collection window is current day - collection window, up to (and
    # excluding) current day
    collection_window = regmod.date_array[day - window_size : day]

    # thread data inside the collection window
    collection_thread_data = regmod.thread_data[thread_dates.isin(collection_window)]

    # thread data on the current day; explicit copy because new columns are
    # assigned on it before being written back to regmod.thread_data
    day_thread_data = regmod.thread_data[
        thread_dates == regmod.date_array[day]
    ].copy()

    # only need to look at collection data for authors that were active today
    # if looking at domains, then also need rows for today's domains
    collection_condition = collection_thread_data.author.isin(
        day_thread_data.author
    )
    if domain_condition:
        collection_condition = (
            collection_condition
            | collection_thread_data.domain.isin(day_thread_data.domain)
        )

    collection_thread_data = collection_thread_data[collection_condition]

    # separate by activity: a row is a post when its id equals its thread_id,
    # otherwise it is a comment
    thread_activity = {
        "all_activity": collection_thread_data,
        "post": collection_thread_data[
            collection_thread_data.thread_id == collection_thread_data.id
        ],
        "comment": collection_thread_data[
            collection_thread_data.thread_id != collection_thread_data.id
        ],
    }

    # one count column per activity type, aligned on author; authors missing
    # from the post or comment subset get a count of 0
    author_activity = (
        pd.concat(
            [
                activity_rows[["author", "id"]]
                .groupby("author")
                .count()
                .rename(columns={"id": f"author_{key}_count"})
                for key, activity_rows in thread_activity.items()
            ],
            axis=1,
        )
        .fillna(0)
        .astype(int)
    )

    # activity ratio: (comments - posts) / all activity, so it lies in [-1, 1];
    # every author present has at least one row, so the denominator is >= 1
    author_activity["activity_ratio"] = (
        author_activity.author_comment_count - author_activity.author_post_count
    ) / author_activity.author_all_activity_count

    # mean sentiment score per author over the collection window
    author_mean_sentiment = (
        thread_activity["all_activity"][["author", "sentiment_score"]]
        .groupby("author")
        .mean()
        .rename(columns={"sentiment_score": "mean_author_sentiment"})
    )

    # combine to form author info df
    author_info = pd.concat((author_activity, author_mean_sentiment), axis=1)

    # convert to dict of mapping dicts ({column: {author: value}}) to add to
    # thread data from day
    author_info_maps = author_info.to_dict()

    if domain_condition:
        # number of collection-window rows per domain
        domain_count = (
            collection_thread_data[["id", "domain"]]
            .groupby("domain")
            .count()
            .rename(columns={"id": "domain_count"})
        )
        domain_maps = domain_count.to_dict()["domain_count"]
        day_thread_data["domain_count"] = day_thread_data.domain.map(
            domain_maps
        ).fillna(0)
        regmod.thread_data.loc[
            day_thread_data.index, "domain_count"
        ] = day_thread_data["domain_count"]

    # map author info to authors in today's thread data and update thread data
    # with today's author collection data
    for new_col in author_info_maps:
        day_thread_data[new_col] = day_thread_data.author.map(
            author_info_maps[new_col]
        ).fillna(0)
        regmod.thread_data.loc[day_thread_data.index, new_col] = day_thread_data[
            new_col
        ]



In [17]:
# List the distinct domain_count values present in thread_data.
regmod.thread_data.domain_count.unique()

array([   0,  899,   13,   12,    6,    1,    4,    2,    9,  929,   10,
          7,    5,    3,   16,  945,  922,   21,   11,  917,   22,   14,
        955,    8,   23,  964,   25,  941,  939,   19,  970, 1003,  997,
        989], dtype=int64)

In [None]:
# Add the log-transformed count columns that were requested in x_cols.
# Each "log_<name>" column is log(<name> + 1) of the raw count column; the +1
# keeps zero counts finite. The source must be the raw column, not the log
# column itself: reading `col` would raise KeyError on a fresh frame and would
# log an already-logged column on a re-run.
log_prefix = "log_"
cols = ["log_author_all_activity_count", "log_domain_count"]
for col in [x for x in cols if x in regmod.regression_params["x_cols"]]:
    source_col = col[len(log_prefix):]
    regmod.thread_data[col] = np.log(regmod.thread_data.loc[:, source_col] + 1)

# separate mean author sentiment into sign and magnitude columns
col = "mean_author_sentiment"
if col in regmod.thread_data.columns:
    (
        regmod.thread_data[f"{col}_sign"],
        regmod.thread_data[f"{col}_magnitude"],
    ) = regmod.separate_float_into_sign_mag(regmod.thread_data[col])