In [1]:
import numpy as np
import pandas as pd
from reddit_dataclass import RedditData as reddit
import pickle
import matplotlib.pyplot as plt
import scipy.stats as scpstat
import matplotlib.dates as dates
import datetime
from sklearn import metrics
import statsmodels.formula.api as smf
import statsmodels.api as sm
from patsy import dmatrices
from itertools import groupby

In [2]:
# Pickled inputs read by the loading cell.
regression_infile = 'regression_thread_data.p'
thread_infile = 'clean_5_thread_data.p'

# Pickle written at the end of the notebook with the weekly regression frames.
regression_outfile = 'weekly_regression_thread_data.p'

# Key dropped from the loaded regression data; use a falsy value to keep everything.
remove = 'thedonald'


In [3]:
# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# only point these paths at pickles you produced yourself.
# Context managers make sure both file handles are closed after reading.
with open(regression_infile, 'rb') as regression_file:
    regression_data = pickle.load(regression_file)
with open(thread_infile, 'rb') as thread_file:
    thread_data = pickle.load(thread_file)

# Drop the excluded key from the regression data only. thread_data keeps its
# entry, which is harmless because later cells iterate over regression_data's
# keys. pop() is left strict on purpose: a misspelt `remove` raises KeyError
# instead of silently keeping the data.
if remove:
    regression_data.pop(remove)

In [130]:
def divide_into_weeks(df):
    """Split a timestamped frame into consecutive blocks of seven active days.

    A "week" here is seven consecutive *distinct calendar dates that occur in
    the data* (days without any rows are skipped), counted from the earliest
    timestamp. It is not an ISO/calendar week.

    Boundaries are computed on full calendar dates rather than on the
    day-of-month number, so data spanning more than one month (where the same
    day number occurs several times) is split correctly.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a datetime-like ``timestamp`` column. The frame is not
        modified; the function works on a sorted copy.

    Returns
    -------
    dict[int, pandas.DataFrame]
        Week number (0, 1, 2, ...) mapped to the rows of that week, in
        timestamp order. Row labels are positions in the timestamp-sorted
        frame (0..n-1). Columns named ``month`` or ``day`` are never present
        in the output.

    Notes
    -----
    Trailing days are discarded: a block of seven days is kept only when at
    least two further active days follow it (the ``len(...) - 8`` bound).
    NOTE(review): this drops a complete final week when fewer than two days
    follow it -- confirm whether that is intended before changing the bound.
    """
    ordered = df.sort_values('timestamp', ignore_index=True)

    # Midnight-normalised timestamps identify a calendar date unambiguously,
    # unlike the bare day-of-month number.
    dates = ordered['timestamp'].dt.normalize()
    active_days = dates.dropna().drop_duplicates().tolist()

    weeks = {}
    for weeknum, first in enumerate(range(0, len(active_days) - 8, 7)):
        week_days = active_days[first:first + 7]
        in_week = dates.between(week_days[0], week_days[-1])
        # errors='ignore': the helper columns are no longer added here, but the
        # output schema stays free of 'month'/'day' as it always was.
        weeks[weeknum] = ordered.loc[in_week].drop(
            columns=['month', 'day'], errors='ignore'
        )

    return weeks



In [131]:
# Build the weekly splits for each entry of regression_data and for the
# thread_data entry stored under the same key.
weekly_regression_data, weekly_thread_data = {}, {}

for name, regression_frame in regression_data.items():
    weekly_regression_data[name] = divide_into_weeks(regression_frame)
    weekly_thread_data[name] = divide_into_weeks(thread_data[name])

In [132]:


# Columns this cell adds to every weekly regression frame.
_ACTIVITY_COLUMNS = ['all_activity', 'comments', 'posts', 'activity_ratio']


def _count_activity_by_author(all_activity):
    """Count each author's rows, comments and posts in one week of thread data.

    Rows whose ``thread_id`` equals their ``id`` are treated as posts and all
    other rows as comments. Returns an integer frame indexed by ``author``
    with columns ``all_activity``, ``comments`` and ``posts``; authors with
    no rows in a subset get 0 there.
    """
    subsets = {
        'all_activity': all_activity,
        'comments': all_activity[all_activity.thread_id != all_activity.id],
        'posts': all_activity[all_activity.thread_id == all_activity.id],
    }
    counts = [
        subset[['author', 'id']].groupby('author').count().rename(columns={'id': name})
        for name, subset in subsets.items()
    ]
    # One outer-aligned concat; authors missing from a subset become 0.
    return pd.concat(counts, axis=1).fillna(0).astype(int)


# Attach per-author activity counts and the derived activity ratio to every
# weekly regression frame. The merge is an inner join on 'author', so authors
# without any thread activity in that week are dropped from the frame.
for subr, regression_weeks in weekly_regression_data.items():
    # list(): the dict's values are replaced while looping over its keys.
    for week_num in list(regression_weeks):
        activity_count = _count_activity_by_author(weekly_thread_data[subr][week_num])

        merged = (
            regression_weeks[week_num]
            # Dropping results of a previous run keeps the cell idempotent;
            # otherwise a re-run yields '_x'/'_y' suffixed duplicates and the
            # attribute access below fails.
            .drop(columns=_ACTIVITY_COLUMNS, errors='ignore')
            .merge(activity_count.reset_index(), on='author')
        )
        # Positive when the author commented more than they posted.
        merged['activity_ratio'] = (merged.comments - merged.posts) / merged.all_activity

        regression_weeks[week_num] = merged




In [135]:
# Persist the weekly regression frames. The context manager guarantees the
# file is flushed and closed even if pickling raises part-way through.
with open(regression_outfile, 'wb') as outfile:
    pickle.dump(weekly_regression_data, outfile)