In [4]:
import numpy as np
import pandas as pd
from reddit_dataclass import RedditData as reddit
import pickle
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import scipy.stats as scpstat
import matplotlib.dates as dates
import datetime
from sklearn import metrics
import statsmodels.formula.api as smf
import statsmodels.api as sm
from patsy import dmatrices
from itertools import groupby
import os


# for feature selection
from sklearn import linear_model
from mlxtend.feature_selection import SequentialFeatureSelector
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs

In [1]:
# Candidate predictor columns for the regression models (stored in params['x_cols']).
X_COLS = [
    'activity_ratio',
    'mean_author_sentiment_magnitude',
    'mean_author_sentiment_sign',
    'hour',
    'sentiment_sign',
    'sentiment_magnitude',
    'num_dayofweek',
]

In [2]:
# Run configuration. Several function defaults in this notebook are read from
# this dict at definition time, so this cell must run before those cells.
params = {
    # pickled inputs, opened relative to the working directory
    'regression_infile': 'regression_thread_data.p',
    'thread_infile': 'clean_5_thread_data.p',
    # window sizes and stride, counted in entries of the per-dataset date array
    'collection window': 7,
    'model window': 7,
    'step': 7,
    'performance scoring method': 'roc_auc',
    # predictor columns and response column
    'x_cols': X_COLS,
    'y_col': 'success',
}

# Output directory and output file names.
outdir = 'feature_selection_1'
regression_params_outfile = 'regression_params.xlsx'
regression_metrics_outfile = 'regression_metrics.xlsx'
pickle_outfile = 'regression_data.p'

In [5]:
# Create the output directory if it is missing. exist_ok=True removes the
# check-then-create race of isdir()+mkdir(), and makedirs also creates any
# missing parent directories should outdir ever become a nested path.
os.makedirs(outdir, exist_ok=True)

In [6]:
# Tabulate the run configuration: one row per parameter (index named 'param'),
# with the configured value in a single 'input' column.
params_df = (
    pd.DataFrame.from_dict(params, orient='index')
    .rename(columns={0: 'input'})
    .rename_axis('param')
)

In [9]:
def get_date(timestamp):
    """Return the calendar-date part of ``timestamp`` (the result of its ``.date()``)."""
    day = timestamp.date()
    return day

def get_score(row):
    """Pick the sentiment score that applies to one row.

    A row whose ``thread_id`` equals its own ``id`` yields
    ``subject_sentiment_score``; any other row yields ``body_sentiment_score``.
    """
    return row.subject_sentiment_score if row.thread_id == row.id else row.body_sentiment_score
    
def float_seconds(time_row):
    """Convert a time-like value to the number of seconds since midnight.

    Reads the ``hour``, ``minute`` and ``second`` attributes of ``time_row``.
    """
    whole_minutes = time_row.hour * 60 + time_row.minute
    return whole_minutes * 60 + time_row.second

def get_dayofweek(timestamp):
    """Return the ``dayofweek`` attribute of ``timestamp``."""
    weekday_number = timestamp.dayofweek
    return weekday_number

def get_hour(timestamp):
    """Return the ``hour`` attribute of ``timestamp``."""
    hour_of_day = timestamp.hour
    return hour_of_day

# Derived column name -> function applied to each row's timestamp to build it.
column_functions = dict(
    time_in_secs=float_seconds,
    num_dayofweek=get_dayofweek,
    hour=get_hour,
)

def get_thread_collection_data(thread_data, date_array, date_index, collection_window=params['collection window']):
    """Select the thread rows of one collection window and split them by activity type.

    Parameters
    ----------
    thread_data : frame with ``timestamp``, ``thread_id`` and ``id`` columns.
    date_array : sliceable sequence of dates.
    date_index : position in ``date_array`` where the collection window starts.
    collection_window : number of ``date_array`` entries in the window.

    Returns
    -------
    dict with keys ``'all_activity'`` (every row in the window), ``'post'``
    (rows whose ``thread_id`` equals their ``id``) and ``'comment'`` (the rest).
    """
    window_dates = date_array[date_index:date_index + collection_window]

    # keep rows whose calendar date falls inside the window
    in_window = thread_data.timestamp.apply(get_date).isin(window_dates)
    window_rows = thread_data[in_window]

    is_post = window_rows.thread_id == window_rows.id
    is_comment = window_rows.thread_id != window_rows.id

    return {
        'all_activity': window_rows,
        'post': window_rows[is_post],
        'comment': window_rows[is_comment],
    }

def get_regression_model_data(regression_data,
                              date_array, date_index,
                              collection_window=params['collection window'],
                              model_window=params['model window']):
    """Return the regression rows dated inside the model window.

    The model window is the ``model_window`` entries of ``date_array`` that
    immediately follow the collection window starting at ``date_index``.
    Rows are matched on the calendar date of their ``timestamp`` column.
    """
    first_model_day = date_index + collection_window
    model_dates = date_array[first_model_day:first_model_day + model_window]

    row_dates = regression_data.timestamp.apply(get_date)
    return regression_data[row_dates.isin(model_dates)]



def get_sm_models(x_list_dict, y=params['y_col']):
    """Build one formula string per feature set.

    Parameters
    ----------
    x_list_dict : mapping of key -> sequence of feature names.
    y : name of the response variable placed on the left of ``~``.

    Returns
    -------
    dict with the same keys, each value a string of the form
    ``"<y> ~ <feat1> + <feat2> ..."``. An empty feature sequence yields ``"<y> ~"``.
    """
    return {
        feat_num: f"{y} ~" + ' +'.join(f' {feat_name}' for feat_name in x_list_dict[feat_num])
        for feat_num in x_list_dict
    }

def get_author_activity_counts_and_sentiment_means(collection_thread_data_dict):
    """Summarise each author's activity and mean sentiment over a collection window.

    Parameters
    ----------
    collection_thread_data_dict : dict of frames keyed by activity type. The
        keys ``'all_activity'``, ``'post'`` and ``'comment'`` are required, and
        each frame needs ``author`` and ``id`` columns; the ``'all_activity'``
        frame also needs ``sentiment_score``.

    Returns
    -------
    Frame indexed by author with columns ``author_all_activity_count``,
    ``activity_ratio`` ((comments - posts) / all activity) and
    ``mean_author_sentiment``.
    """
    # one "rows per author" count frame for every activity type
    per_key_counts = [
        frame[['author', 'id']]
        .groupby('author')
        .count()
        .rename(columns={'id': f'author_{key}_count'})
        for key, frame in collection_thread_data_dict.items()
    ]

    # join the count frames side by side one at a time; an author missing from
    # an activity type gets a count of 0
    author_activity = per_key_counts[0]
    for extra_counts in per_key_counts[1:]:
        author_activity = pd.concat((author_activity, extra_counts), axis=1).fillna(0).astype(int)

    comments_minus_posts = author_activity.author_comment_count - author_activity.author_post_count
    author_activity['activity_ratio'] = comments_minus_posts / author_activity.author_all_activity_count

    all_rows = collection_thread_data_dict['all_activity']
    author_mean_sentiment = (
        all_rows[['author', 'sentiment_score']]
        .groupby('author')
        .mean()
        .rename(columns={'sentiment_score': 'mean_author_sentiment'})
    )

    # combine to form author info df
    wanted_activity = author_activity[['author_all_activity_count', 'activity_ratio']]
    return pd.concat((wanted_activity, author_mean_sentiment), axis=1)



In [None]:
def get_regression_data_by_time_period(relevant_thread_data, subreddit_regression_data, subreddit_FSS_metrics, date_array, date_index, period_counter):
    """Assemble the model frame and formula strings for one time period.

    Author-level features are computed from the collection window that starts
    at ``date_index`` and are inner-joined (on ``author``) onto the regression
    rows of the model window that follows it.

    Returns
    -------
    dict with ``'model_strings'`` (formula strings from ``get_sm_models``) and
    ``'data'`` (the merged frame with the derived columns added).
    """
    # author features from the collection window
    activity_frames = get_thread_collection_data(relevant_thread_data, date_array, date_index)
    author_features = get_author_activity_counts_and_sentiment_means(activity_frames).reset_index()

    # rows to model, enriched with the author features
    model_rows = get_regression_model_data(subreddit_regression_data, date_array, date_index)
    model_frame = model_rows.merge(author_features, on='author')

    # separate mean author sentiment into sign and magnitude
    mean_sentiment = model_frame['mean_author_sentiment']
    model_frame['mean_author_sentiment_sign'] = np.sign(mean_sentiment)
    model_frame['mean_author_sentiment_magnitude'] = np.absolute(mean_sentiment)

    # timestamp-derived columns, limited to those listed in X_COLS
    derived_names = [name for name in column_functions if name in X_COLS]
    for name in derived_names:
        model_frame[name] = model_frame.timestamp.apply(column_functions[name])

    # feature sets chosen for this period, turned into formula strings
    feature_sets = get_x_feats_to_run(subreddit_FSS_metrics, period_counter)
    formula_strings = get_sm_models(feature_sets)

    return {
        'model_strings': formula_strings,
        'data': model_frame,
    }

In [10]:
def run_regressions(relevant_thread_data, subreddit_regression_data, subreddit_FSS_metrics, date_array, date_index, period_counter):
    """Fit one logit model per formula string for a single time period.

    The model frame and formula strings come from
    ``get_regression_data_by_time_period``; all arguments are forwarded to it.

    Returns
    -------
    dict with
        ``'regression_params'``: dict of key -> one-column frame of fitted
            coefficients (column named after the key);
        ``'metrics'``: frame with one row per key and columns ``num_features``,
            ``model``, ``aic``, ``bic`` and ``auc``.
    """
    regression_dict = get_regression_data_by_time_period(
        relevant_thread_data, subreddit_regression_data,
        subreddit_FSS_metrics, date_array, date_index, period_counter)
    model_strings = regression_dict['model_strings']
    model_data = regression_dict['data']

    model_results = {}
    param_dict = {}
    for key, formula in model_strings.items():
        print(f"Model {key}")
        logit_mod = smf.logit(formula, data=model_data).fit()

        param_dict[key] = pd.DataFrame(logit_mod.params).rename(columns={0: key})

        # Score against the response rows the model was actually fitted on.
        # The formula interface drops rows with missing values, so predict()
        # can be shorter than the input frame; comparing it with the frame's
        # full 'success' column would then fail on mismatched lengths.
        fitted_response = logit_mod.model.endog

        model_results[key] = {
            'num_features': key,
            'model': formula,
            'aic': logit_mod.aic,
            'bic': logit_mod.bic,
            'auc': metrics.roc_auc_score(fitted_response, logit_mod.predict()),
        }

    model_results = pd.DataFrame.from_dict(model_results, orient='index')

    out_dict = {
        'regression_params': param_dict,
        'metrics': model_results
    }

    return out_dict

In [11]:
def get_AIC_BIC_by_time_period(subreddit_thread_data,
                                subreddit_regression_data,
                                collection_window=params['collection window'],
                                model_window=params['model window'],
                                step=params['step'],
                                subreddit_FSS_metrics=None
                                ):
    """Run the regressions for each sliding time window of one subreddit.

    Parameters
    ----------
    subreddit_thread_data : frame with ``thread_id``, ``id``, ``timestamp``,
        ``author`` and the two sentiment score columns read by ``get_score``.
        NOTE: a ``sentiment_score`` column is added to this frame in place.
    subreddit_regression_data : frame passed through to ``run_regressions``.
    collection_window, model_window, step : window sizes and stride, counted in
        entries of the array of unique dates. They only control which start
        indices are visited here.
        NOTE(review): they are not forwarded to ``run_regressions``, so the
        helpers below use their own defaults -- confirm before passing
        non-default values.
    subreddit_FSS_metrics : feature-selection results handed to
        ``run_regressions``. When omitted, a module-level variable of the same
        name is used, which is what this function always did before the
        parameter existed.

    Returns
    -------
    dict mapping period number (starting at 1) to the ``run_regressions`` output.

    Raises
    ------
    NameError
        If at least one window has to be processed, ``subreddit_FSS_metrics``
        was not passed, and no module-level variable of that name exists.
    """
    # get one col for sentiment score
    subreddit_thread_data['sentiment_score'] = subreddit_thread_data.apply(get_score, axis = 1)

    # only use wanted cols
    relevant_thread_data = subreddit_thread_data[['thread_id', 'id', 'timestamp', 'author', 'sentiment_score']]

    # get array of dates in dataset
    date_array = relevant_thread_data.timestamp.apply(get_date).unique()

    # NOTE(review): this stop value skips a final window that would end exactly
    # on the last date -- confirm whether that is intended.
    window_starts = range(0, len(date_array) - (collection_window + model_window), step)

    # Resolve the feature-selection metrics only if there is work to do, so
    # datasets too short for any window still return an empty dict.
    if window_starts and subreddit_FSS_metrics is None:
        if 'subreddit_FSS_metrics' not in globals():
            raise NameError(
                "subreddit_FSS_metrics was not passed and no module-level "
                "'subreddit_FSS_metrics' is defined"
            )
        subreddit_FSS_metrics = globals()['subreddit_FSS_metrics']

    # get dict for info collection
    regression_metrics = {}

    # period numbering starts on 1 as period 0 is collection data not modelled
    for period_counter, date_index in enumerate(window_starts, start=1):
        print(f"Period {period_counter}")

        regression_metrics[period_counter] = run_regressions(
            relevant_thread_data, subreddit_regression_data, subreddit_FSS_metrics, date_array, date_index, period_counter
            )

    return regression_metrics



In [12]:
# Load the pickled inputs. Context managers make sure both file handles are
# closed once loading finishes (or fails).
# NOTE: pickle can execute arbitrary code while loading -- only use files from
# a trusted source.
with open(params['regression_infile'], 'rb') as infile:
    regression_df = pickle.load(infile)
with open(params['thread_infile'], 'rb') as infile:
    thread_df = pickle.load(infile)

In [None]:
# Run the per-period regressions for every key of regression_df; thread_df is
# indexed with the same keys. Results are collected per subreddit.
regression_metrics = dict()
for subreddit in regression_df:
    print(f"{subreddit}")
    regression_metrics[subreddit] = get_AIC_BIC_by_time_period(thread_df[subreddit], regression_df[subreddit])

In [None]:
def get_x_feats_to_run(subreddit_FSS_metrics, period):
    """Collect the feature lists recorded for one period.

    Parameters
    ----------
    subreddit_FSS_metrics : frame indexed by period with a ``feature_names``
        column whose entries are iterables of feature names.
    period : index value selecting the rows to use.

    Returns
    -------
    dict numbering the selected rows from 1, in row order, each mapped to its
    feature names as a list. Empty when no row matches ``period``.
    """
    matches_period = subreddit_FSS_metrics.index == period
    period_metrics = subreddit_FSS_metrics[matches_period].reset_index()
    return {
        position: list(feature_tuple)
        for position, feature_tuple in enumerate(period_metrics.feature_names, start=1)
    }