In [67]:
# data manipulation imports
import pandas as pd
import numpy as np

# data saving imports
import pickle

# custom imports
from regression_class import RedditRegression as RR
from regression_class import TimestampClass

# stats imports
import statsmodels.formula.api as smf
from sklearn import linear_model
from sklearn import metrics

In [2]:
data = pickle.load(open('test_data_crypto_5_days.p', 'rb'))

In [3]:
regression_threads = data['regression_data']
all_data = data['all_data']

In [4]:
threads_not_started = regression_threads[regression_threads.thread_size == 1]

In [5]:
threads_started = regression_threads[regression_threads.thread_size > 1]

In [6]:
quantiles = [0.25, 0.5, 0.75]
quant_values = [threads_started.thread_size.min()]
for i in quantiles:
    quant_values.append(threads_started.thread_size.quantile(q=i))
quant_values.append(threads_started.thread_size.max())

In [7]:
def get_vals_in_range(series, upper, lower):
    return series[(series <= upper) & (series >= lower)]

In [8]:
quantiles = []
for i in range(len(quant_values)-1):
    lower = quant_values[i]
    upper = quant_values[i+1]
    #if i != 0:
        #lower += 1
    if i+1 != len(quant_values) - 1:
        upper -= 1
    num_values = threads_started[(threads_started.thread_size <= upper) & (threads_started.thread_size >= lower)]
    print(f"{i}: [{lower}, {upper}] {len(num_values)}")
    quantiles.append((int(lower), int(upper)))

0: [2, 7.0] 287
1: [8.0, 18.0] 286
2: [19.0, 44.0] 295
3: [45.0, 3813] 292


0: [2, 8.0] 315
1: [9.0, 19.0] 275
2: [20.0, 45.0] 286
3: [46.0, 3813] 284

0: [2, 7.0] 287
1: [8.0, 18.0] 286
2: [19.0, 44.0] 295
3: [45.0, 3813] 292


In [9]:
def find_quantile(value, quantile_ranges):
    for i in quantile_ranges:
        if i[0] <= value <= i[1]:
            return i

In [10]:
threads_started['thread_size_bin'] = threads_started.thread_size.apply(find_quantile, quantile_ranges = quantiles)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  threads_started['thread_size_bin'] = threads_started.thread_size.apply(find_quantile, quantile_ranges = quantiles)


In [11]:
started_threads_all_data = all_data[all_data.thread_id.isin(threads_started.thread_id)]

In [12]:
threads_started.columns

Index(['thread_id', 'thread_size', 'authors', 'timestamp', 'author', 'score',
       'subject_sentiment_score', 'sentiment_sign', 'sentiment_magnitude',
       'success', 'thread_size_bin'],
      dtype='object')

In [13]:
started_threads_all_data.columns

Index(['thread_id', 'id', 'timestamp', 'author', 'domain', 'parent', 'score',
       'subject_sentiment_score', 'body_sentiment_score', 'date', 'level',
       'parent_comment'],
      dtype='object')

In [14]:
X_COLS = [
    "sentiment_sign",
    "sentiment_magnitude",
    "hour",
    "num_dayofweek",
    "activity_ratio",
    "mean_author_sentiment_sign",
    "mean_author_sentiment_magnitude",
    "log_author_all_activity_count",
]

In [15]:
regression_params = {
    'name': 'crypto',
    'regression_data': threads_started,
    'thread_data': started_threads_all_data,
    'regression_type': 'mnlogit',
    'collection_window': 2,
    'model_window': 2,
    'validation_window': 1,
    'FSS': True,
    'performance_scoring_method': 'roc_auc',
    'x_cols': X_COLS,
    'y_col': 'thread_size_bin',
    'metrics': 'AUC',
    'activity_threshold': 2,
}

In [16]:
test_regression = RR(regression_params)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  regression_params["thread_data"]["sentiment_score"] = regression_params[


In [17]:
test_regression.calc_author_thread_counts()

In [34]:
test_regression.period_counter = 1

date_index = 0

#print(f"# Period {test_regression.period_counter} #")
if "activity_threshold" in test_regression.regression_params:
    test_regression.removed_threads[test_regression.period_counter] = {}
test_regression.model_data[test_regression.period_counter] = {}

# get model data for this period
test_regression.get_regression_model_data(date_index)

# get validation data for this period if validation
if "validation_window" in test_regression.regression_params:
    test_regression.get_regression_model_data(date_index, calval="val")

In [37]:
test_regression.regression_model_data.columns

Index(['thread_id', 'thread_size', 'authors', 'timestamp', 'author', 'score',
       'subject_sentiment_score', 'sentiment_sign', 'sentiment_magnitude',
       'success', 'thread_size_bin', 'activity_ratio',
       'log_author_all_activity_count', 'mean_author_sentiment_sign',
       'mean_author_sentiment_magnitude', 'num_dayofweek', 'hour'],
      dtype='object')

In [40]:
test_regression.sm_modstrings = {
    0: "thread_size_bin ~ log_author_all_activity_count",
    1: "thread_size_bin ~ activity_ratio",
    2: "thread_size_bin ~ mean_author_sentiment_sign",
    3: "thread_size_bin ~ mean_author_sentiment_magnitude",
    4:"thread_size_bin ~ num_dayofweek",
    5:"thread_size_bin ~ sentiment_sign",
    6:"thread_size_bin ~ sentiment_magnitude",
    7:"thread_size_bin ~ hour",
}

In [41]:
for mod_key in test_regression.sm_modstrings:
    print(f"Model {mod_key}")
    regression_out_dict = test_regression.run_regression(mod_key)
    model_results[mod_key] = regression_out_dict["model_metrics"]
    param_dict[mod_key] = regression_out_dict["regression_params"]

Model 0


ValueError: endog has evaluated to an array with multiple columns that has shape (287, 4). This occurs when the variable converted to endog is non-numeric (e.g., bool or str).

In [None]:
linear_mode

In [None]:
linear_model.LinearRegression()

In [51]:
help(linear_model.LinearRegression)

Help on class LinearRegression in module sklearn.linear_model._base:

class LinearRegression(sklearn.base.MultiOutputMixin, sklearn.base.RegressorMixin, LinearModel)
 |  LinearRegression(*, fit_intercept=True, copy_X=True, n_jobs=None, positive=False)
 |  
 |  Ordinary least squares Linear Regression.
 |  
 |  LinearRegression fits a linear model with coefficients w = (w1, ..., wp)
 |  to minimize the residual sum of squares between the observed targets in
 |  the dataset, and the targets predicted by the linear approximation.
 |  
 |  Parameters
 |  ----------
 |  fit_intercept : bool, default=True
 |      Whether to calculate the intercept for this model. If set
 |      to False, no intercept will be used in calculations
 |      (i.e. data is expected to be centered).
 |  
 |  copy_X : bool, default=True
 |      If True, X will be copied; else, it may be overwritten.
 |  
 |  n_jobs : int, default=None
 |      The number of jobs to use for the computation. This will only provide
 |  

In [53]:
help(smf.mnlogit)

Help on method from_formula in module statsmodels.base.model:

from_formula(formula, data, subset=None, drop_cols=None, *args, **kwargs) method of builtins.type instance
    Create a Model from a formula and dataframe.
    
    Parameters
    ----------
    formula : str or generic Formula object
        The formula specifying the model.
    data : array_like
        The data for the model. See Notes.
    subset : array_like
        An array-like object of booleans, integers, or index values that
        indicate the subset of df to use in the model. Assumes df is a
        `pandas.DataFrame`.
    drop_cols : array_like
        Columns to drop from the design matrix.  Cannot be used to
        drop terms involving categoricals.
    *args
        Additional positional argument that are passed to the model.
    **kwargs
        These are passed to the model with one exception. The
        ``eval_env`` keyword is passed to patsy. It can be either a
        :class:`patsy:patsy.EvalEnvironm

In [93]:
smf.mnlogit("C(thread_size_bin) ~ log_author_all_activity_count", data=test_regression.regression_model_data[['thread_size_bin', 'log_author_all_activity_count']])

ValueError: endog has evaluated to an array with multiple columns that has shape (287, 4). This occurs when the variable converted to endog is non-numeric (e.g., bool or str).

In [58]:
quantiles.index((2,7))

0

In [60]:
def find_quantile(tuple_value):
    return quantiles.index(tuple_value)

In [59]:
regression_model_data = test_regression.regression_model_data[['thread_size_bin', 'log_author_all_activity_count']]

In [61]:
regression_model_data['thread_size_quantile'] = regression_model_data.thread_size_bin.apply(find_quantile)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  regression_model_data['thread_size_quantile'] = regression_model_data.thread_size_bin.apply(find_quantile)


In [62]:
regression_model_data

Unnamed: 0,thread_size_bin,log_author_all_activity_count,thread_size_quantile
0,"(45, 3813)",1.098612,3
2,"(2, 7)",1.945910,0
3,"(45, 3813)",4.499810,3
4,"(45, 3813)",2.302585,3
5,"(2, 7)",4.382027,0
...,...,...,...
475,"(8, 18)",1.609438,1
477,"(8, 18)",1.098612,1
479,"(8, 18)",1.609438,1
487,"(2, 7)",3.713572,0


In [65]:
mnlogit_fit = smf.mnlogit("thread_size_quantile ~ log_author_all_activity_count", data=regression_model_data).fit()

Optimization terminated successfully.
         Current function value: 1.327737
         Iterations 5


In [70]:
help(mnlogit_fit)

Help on MultinomialResultsWrapper in module statsmodels.discrete.discrete_model:

<statsmodels.discrete.discrete_model.MultinomialResultsWrapper object>
    A results class for multinomial data
    
    Parameters
    ----------
    model : A DiscreteModel instance
    params : array_like
        The parameters of a fitted model.
    hessian : array_like
        The hessian of the fitted model.
    scale : float
        A scale parameter for the covariance matrix.
    
    Attributes
    ----------
    df_resid : float
        See model definition.
    df_model : float
        See model definition.
    llf : float
        Value of the loglikelihood



In [83]:
predicted_success_probabilities = pd.DataFrame(mnlogit_fit.predict())

In [81]:
def assign_success_from_quartile(value, quartile_index):
    if value == quartile_index:
        return 1
    else:
        return 0

In [84]:
def get_success_prediction_for_quartile(df, quartile_index):
    return df.loc[:, quartile_index]

In [87]:
predictions = pd.DataFrame(mnlogit_fit.predict())
success_data = []
success_prediction = []
auc_vals = []
for i in range(0,4):
    success_data.append(regression_model_data.thread_size_quantile.apply(assign_success_from_quartile, quartile_index=i))
    success_prediction.append(get_success_prediction_for_quartile(predictions, i))
    auc_vals.append(
        metrics.roc_auc_score(success_data[i], success_prediction[i])
    )

In [91]:
mean_auc = np.array(auc_vals).mean()

In [92]:
mean_auc

0.5674747194697777