In [1]:
# data manipulation imports
import pandas as pd
import numpy as np

# data saving imports
import pickle

# custom imports
from regression_class import RedditRegression as RR
from regression_class import TimestampClass

# stats imports
import statsmodels.formula.api as smf
from sklearn import linear_model
from sklearn import metrics
from mlxtend.feature_selection import SequentialFeatureSelector

# plotting imports
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors

In [2]:
def mnlogit_score(estimator, X, y):
    probabilities = pd.DataFrame(estimator.predict_proba(X))
    y_pred = probabilities.idxmax(axis=1)
    value_counts = (y_pred == y).value_counts()
    return value_counts.loc[True]/value_counts.sum()

def find_quantile(value, quantile_ranges):
    for i in quantile_ranges:
        if i[0] <= value <= i[1]:
            return i
        
def find_quantile_index(tuple_value):
    return quantiles.index(tuple_value)

In [3]:
data = pickle.load(open('test_data_crypto_5_days.p', 'rb'))
regression_threads = data['regression_data']
all_data = data['all_data']
threads_started = regression_threads[regression_threads.thread_size > 1]

quantiles = [0.25, 0.5, 0.75]
quant_values = [threads_started.thread_size.min()]
for i in quantiles:
    quant_values.append(threads_started.thread_size.quantile(q=i))
quant_values.append(threads_started.thread_size.max())

quantiles = []
for i in range(len(quant_values)-1):
    lower = quant_values[i]
    upper = quant_values[i+1]
    #if i != 0:
        #lower += 1
    if i+1 != len(quant_values) - 1:
        upper -= 1
    num_values = threads_started[(threads_started.thread_size <= upper) & (threads_started.thread_size >= lower)]
    print(f"{i}: [{lower}, {upper}] {len(num_values)}")
    quantiles.append((int(lower), int(upper)))

threads_started['thread_size_bin'] = threads_started.thread_size.apply(find_quantile, quantile_ranges = quantiles)
threads_started['thread_size_bin_index'] = threads_started.thread_size_bin.apply(find_quantile_index)
started_threads_all_data = all_data[all_data.thread_id.isin(threads_started.thread_id)]



0: [2, 7.0] 287
1: [8.0, 18.0] 286
2: [19.0, 44.0] 295
3: [45.0, 3813] 292


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  threads_started['thread_size_bin'] = threads_started.thread_size.apply(find_quantile, quantile_ranges = quantiles)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  threads_started['thread_size_bin_index'] = threads_started.thread_size_bin.apply(find_quantile_index)


In [4]:
X_COLS = [
    "sentiment_sign",
    "sentiment_magnitude",
    "hour",
    "num_dayofweek",
    "activity_ratio",
    "mean_author_sentiment_sign",
    "mean_author_sentiment_magnitude",
    "log_author_all_activity_count",
]
y_col = "thread_size_bin_index"

regression_params = {
    'name': 'crypto',
    'regression_data': threads_started,
    'thread_data': started_threads_all_data,
    'regression_type': 'mnlogit',
    'collection_window': 2,
    'model_window': 2,
    'validation_window': 1,
    'FSS': True,
    'performance_scoring_method': "mnlogit",
    'x_cols': X_COLS,
    'y_col': y_col,
    'metrics': ['roc_auc'],
    'activity_threshold': 2,
}

In [5]:
test_mnlogit = RR(regression_params)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  regression_params["thread_data"]["sentiment_score"] = regression_params[


In [6]:

test_mnlogit.calc_author_thread_counts()

In [7]:
test_mnlogit.get_regression_model_data()

In [8]:
test_mnlogit.get_regression_model_data(calval="val")

In [9]:
x_data = test_mnlogit.regression_model_data[X_COLS]
y_data = test_mnlogit.regression_model_data[y_col]
model = linear_model.LogisticRegression(multi_class='multinomial')
max_k = len(x_data.columns)
k = (1, max_k)

In [10]:
skl_mod = linear_model.LogisticRegression(multi_class='multinomial').fit(x_data, y_data)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [11]:
probs = pd.DataFrame(skl_mod.predict_proba(x_data))
probs.idxmax(axis=1)

0      1
1      3
2      3
3      3
4      3
      ..
282    1
283    1
284    1
285    2
286    2
Length: 287, dtype: int64

In [12]:
pd.DataFrame(probs)

Unnamed: 0,0,1,2,3
0,0.187748,0.347647,0.117518,0.347086
1,0.230426,0.182765,0.248376,0.338434
2,0.213636,0.066865,0.229010,0.490490
3,0.190339,0.135383,0.289508,0.384770
4,0.163977,0.066818,0.348871,0.420334
...,...,...,...,...
282,0.125453,0.485911,0.242080,0.146557
283,0.098125,0.634922,0.144710,0.122242
284,0.101533,0.586906,0.184710,0.126851
285,0.085675,0.346288,0.411456,0.156581


In [13]:
sfs = SequentialFeatureSelector(model, k_features=k, forward=True, scoring=mnlogit_score)

In [14]:
selected_features = sfs.fit(x_data, y_data.ravel())

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [15]:
metric_df = pd.DataFrame.from_dict(
            selected_features.get_metric_dict(), orient="index"
        )

In [16]:
metric_df

Unnamed: 0,feature_idx,cv_scores,avg_score,feature_names,ci_bound,std_dev,std_err
1,"(4,)","[0.3620689655172414, 0.4482758620689655, 0.403...",0.414701,"(activity_ratio,)",0.044021,0.03425,0.017125
2,"(0, 4)","[0.3620689655172414, 0.4482758620689655, 0.403...",0.414701,"(sentiment_sign, activity_ratio)",0.044021,0.03425,0.017125
3,"(0, 4, 6)","[0.3620689655172414, 0.4482758620689655, 0.403...",0.411192,"(sentiment_sign, activity_ratio, mean_author_s...",0.046356,0.036067,0.018033
4,"(0, 3, 4, 6)","[0.3620689655172414, 0.41379310344827586, 0.40...",0.407804,"(sentiment_sign, num_dayofweek, activity_ratio...",0.047964,0.037318,0.018659
5,"(0, 3, 4, 6, 7)","[0.3620689655172414, 0.43103448275862066, 0.38...",0.400726,"(sentiment_sign, num_dayofweek, activity_ratio...",0.071246,0.055432,0.027716
6,"(0, 1, 3, 4, 6, 7)","[0.3793103448275862, 0.3620689655172414, 0.368...",0.386872,"(sentiment_sign, sentiment_magnitude, num_dayo...",0.045082,0.035075,0.017538
7,"(0, 1, 3, 4, 5, 6, 7)","[0.3793103448275862, 0.3448275862068966, 0.368...",0.376407,"(sentiment_sign, sentiment_magnitude, num_dayo...",0.055379,0.043086,0.021543
8,"(0, 1, 2, 3, 4, 5, 6, 7)","[0.3103448275862069, 0.3620689655172414, 0.175...",0.320448,"(sentiment_sign, sentiment_magnitude, hour, nu...",0.11033,0.08584,0.04292


In [17]:
from sklearn import preprocessing

In [18]:
scaler = preprocessing.StandardScaler().fit(x_data)

In [19]:
scaler.mean_

array([ 0.06968641,  0.20360844, 12.89547038,  0.50174216,  0.66187927,
        0.38675958,  0.09739325,  2.8443881 ])

In [20]:
scaler.scale_

array([0.77550988, 0.22165123, 5.92074959, 0.49999696, 0.49299055,
       0.910775  , 0.0821311 , 1.1966896 ])

In [21]:
x_scaled = scaler.transform(x_data)

In [22]:
x_scaled.mean(axis=0)

array([-3.09469833e-18, -1.05219743e-16, -9.90303464e-17,  2.97091039e-16,
        6.03466174e-17,  9.28409498e-18, -1.17598536e-16,  1.88776598e-16])

In [23]:
x_scaled.std(axis=0)

array([1., 1., 1., 1., 1., 1., 1., 1.])

In [25]:
test_mnlogit.regression_model_data

Unnamed: 0,thread_id,thread_size,authors,timestamp,author,score,subject_sentiment_score,sentiment_sign,sentiment_magnitude,success,thread_size_bin,thread_size_bin_index,activity_ratio,log_author_all_activity_count,mean_author_sentiment_sign,mean_author_sentiment_magnitude,num_dayofweek,hour
0,xu3vcu,2745,519,2022-10-03 00:00:10,8ac426fd80f0ea3761bdcd7f32591b09ce1c1366e59ef15c,1,0.0000,0.0,0.0000,1,"(45, 3813)",3,-1.000000,1.098612,0.0,0.000000,0,0
2,xu5c0j,2,2,2022-10-03 01:09:45,46831fba4c613ab7822de473fe94e34b3ce758b1ac1138a5,1,-0.5574,-1.0,0.5574,1,"(2, 7)",0,0.333333,1.945910,-1.0,0.066167,0,1
3,xu5i3m,346,193,2022-10-03 01:17:32,c542a05eb22ffca30f65e2cf5772dcb0e196f02614fd1193,1,0.5719,1.0,0.5719,1,"(45, 3813)",3,1.000000,4.499810,1.0,0.003850,0,1
4,xu5pwv,134,81,2022-10-03 01:27:51,a1108b7c47e9c7227007ffec746ba5b8506514d18c9e2bbb,1,0.0000,0.0,0.0000,1,"(45, 3813)",3,0.555556,2.302585,1.0,0.213872,0,1
5,xu64qp,3,3,2022-10-03 01:47:45,53b0c05f4ead4193ec3ae4223ccc9e34c48453733769703c,1,0.0000,0.0,0.0000,1,"(2, 7)",0,1.000000,4.382027,1.0,0.052669,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
475,xvs0gh,13,13,2022-10-04 21:58:36,e0c497e66ac028c5ac161bb04db35791c7dec3fa0530d66e,1,0.3818,1.0,0.3818,1,"(8, 18)",1,0.500000,1.609438,1.0,0.225475,1,21
477,xvs2cs,16,15,2022-10-04 22:00:33,5e306417ae4e1bf0f1f89a9cefdcb11cf27539d545344e15,1,0.4939,1.0,0.4939,1,"(8, 18)",1,0.000000,1.098612,1.0,0.220200,1,22
479,xvs7fg,11,11,2022-10-04 22:06:21,33a56d77b0eb7cd989ec54a751314bc1f2474b535ca30e38,1,0.1779,1.0,0.1779,1,"(8, 18)",1,0.000000,1.609438,1.0,0.110100,1,22
487,xvts3n,6,6,2022-10-04 23:14:09,8a69e054671a755129658fef3eb595917fc712be55097460,1,0.0000,0.0,0.0000,1,"(2, 7)",0,0.700000,3.713572,1.0,0.113473,1,23


In [29]:
x_scaled = pd.DataFrame(x_scaled, columns=x_data.columns, index=x_data.index)

In [30]:
pd.concat((test_mnlogit.regression_model_data[[y_col]],x_scaled), axis=1)

Unnamed: 0,thread_size_bin_index,sentiment_sign,sentiment_magnitude,hour,num_dayofweek,activity_ratio,mean_author_sentiment_sign,mean_author_sentiment_magnitude,log_author_all_activity_count
0,3,-0.089859,-0.918598,-2.178013,-1.003490,-3.371016,-0.424649,-1.185827,-1.458838
2,0,-1.379333,1.596163,-2.009116,-1.003490,-0.666435,-1.522615,-0.380204,-0.750803
3,3,1.199615,1.661581,-2.009116,-1.003490,0.685856,0.673317,-1.138955,1.383334
4,3,-0.089859,-0.918598,-2.009116,-1.003490,-0.215671,0.673317,1.418208,-0.452751
5,0,-0.089859,-0.918598,-2.009116,-1.003490,0.685856,0.673317,-0.544553,1.284910
...,...,...,...,...,...,...,...,...,...
475,1,1.199615,0.803928,1.368835,0.996522,-0.328362,0.673317,1.559479,-1.031972
477,1,1.199615,1.309677,1.537733,0.996522,-1.342580,0.673317,1.495253,-1.458838
479,1,1.199615,-0.115986,1.537733,0.996522,-1.342580,0.673317,0.154713,-1.031972
487,0,-0.089859,-0.918598,1.706630,0.996522,0.077325,0.673317,0.195786,0.726324


In [52]:
selected_features = sfs.fit(x_scaled, y_data.ravel())

In [53]:
metric_df = pd.DataFrame.from_dict(
            selected_features.get_metric_dict(), orient="index"
        )

In [54]:
metric_df

Unnamed: 0,feature_idx,cv_scores,avg_score,feature_names,ci_bound,std_dev,std_err
1,"(4,)","[0.3620689655172414, 0.4482758620689655, 0.403...",0.414701,"(4,)",0.044021,0.03425,0.017125
2,"(0, 4)","[0.3620689655172414, 0.4482758620689655, 0.403...",0.414701,"(0, 4)",0.044021,0.03425,0.017125
3,"(0, 3, 4)","[0.3620689655172414, 0.41379310344827586, 0.40...",0.407804,"(0, 3, 4)",0.047964,0.037318,0.018659
4,"(0, 3, 4, 6)","[0.3620689655172414, 0.41379310344827586, 0.40...",0.411313,"(0, 3, 4, 6)",0.056086,0.043637,0.021819
5,"(0, 3, 4, 6, 7)","[0.3620689655172414, 0.43103448275862066, 0.38...",0.404235,"(0, 3, 4, 6, 7)",0.066151,0.051468,0.025734
6,"(0, 1, 3, 4, 6, 7)","[0.3275862068965517, 0.3620689655172414, 0.333...",0.376528,"(0, 1, 3, 4, 6, 7)",0.076455,0.059484,0.029742
7,"(0, 1, 3, 4, 5, 6, 7)","[0.3620689655172414, 0.3448275862068966, 0.350...",0.376467,"(0, 1, 3, 4, 5, 6, 7)",0.062885,0.048927,0.024463
8,"(0, 1, 2, 3, 4, 5, 6, 7)","[0.3103448275862069, 0.3448275862068966, 0.175...",0.327526,"(0, 1, 2, 3, 4, 5, 6, 7)",0.116121,0.090346,0.045173


In [55]:
from sklearn.pipeline import make_pipeline

In [63]:
pipe = make_pipeline(preprocessing.StandardScaler(), sfs)

In [65]:
out = pipe.fit(x_data, y_data.ravel())

In [69]:
out.get_params()

{'memory': None,
 'steps': [('standardscaler', StandardScaler()),
  ('sequentialfeatureselector',
   SequentialFeatureSelector(estimator=LogisticRegression(multi_class='multinomial'),
                             k_features=(1, 8),
                             scoring=<function mnlogit_score at 0x000001CF5D404940>))],
 'verbose': False,
 'standardscaler': StandardScaler(),
 'sequentialfeatureselector': SequentialFeatureSelector(estimator=LogisticRegression(multi_class='multinomial'),
                           k_features=(1, 8),
                           scoring=<function mnlogit_score at 0x000001CF5D404940>),
 'standardscaler__copy': True,
 'standardscaler__with_mean': True,
 'standardscaler__with_std': True,
 'sequentialfeatureselector__clone_estimator': True,
 'sequentialfeatureselector__cv': 5,
 'sequentialfeatureselector__estimator__C': 1.0,
 'sequentialfeatureselector__estimator__class_weight': None,
 'sequentialfeatureselector__estimator__dual': False,
 'sequentialfeatureselect

In [71]:
'name' in regression_params

True

In [72]:
 windows = ["collection_window", "model_window", "validation_window"]

In [74]:
regression_params.keys()

dict_keys(['name', 'regression_data', 'thread_data', 'regression_type', 'collection_window', 'model_window', 'validation_window', 'FSS', 'performance_scoring_method', 'x_cols', 'y_col', 'metrics', 'activity_threshold'])

In [78]:
(set(windows) & set(regression_params.keys()))

{'collection_window', 'model_window', 'validation_window'}

In [90]:
regression_params[(set(windows) - set(["model_window"]))]

TypeError: unhashable type: 'set'

In [89]:
if (set(windows) - {'collection_window', 'model_window', 'validation_window', 'buff'}):
    print('True')

In [92]:
a = [1,2,3,4]
sum(a)

10

In [95]:
np.prod(a, 10)

AxisError: axis 10 is out of bounds for array of dimension 1

In [45]:
test_mnlogit.regression_model_data.columns

Index(['thread_id', 'thread_size', 'authors', 'timestamp', 'author', 'score',
       'subject_sentiment_score', 'sentiment_sign', 'sentiment_magnitude',
       'success', 'thread_size_bin', 'thread_size_bin_index', 'activity_ratio',
       'log_author_all_activity_count', 'mean_author_sentiment_sign',
       'mean_author_sentiment_magnitude', 'num_dayofweek', 'hour'],
      dtype='object')

In [47]:
smf_mnlogit = smf.mnlogit("thread_size_bin_index ~ mean_author_sentiment_sign", data = test_mnlogit.regression_model_data).fit()

Optimization terminated successfully.
         Current function value: 1.341604
         Iterations 5


In [49]:
smf_mnlogit.predict()

array([[0.16519673, 0.26379348, 0.35448191, 0.21652788],
       [0.18828717, 0.28480729, 0.35219476, 0.17471078],
       [0.14287964, 0.2408603 , 0.35171711, 0.26454294],
       ...,
       [0.14287964, 0.2408603 , 0.35171711, 0.26454294],
       [0.14287964, 0.2408603 , 0.35171711, 0.26454294],
       [0.14287964, 0.2408603 , 0.35171711, 0.26454294]])