In [113]:
import pickle
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, make_scorer
from scipy.stats import pearsonr
import numpy as np

In [151]:
FILE_PATH = '/data/avirinchipur/EMI/datadicts/ds4ud_drinks_ans_avg_words_roba64_todaydrinks_rangel3u14_10folds.pkl'

In [152]:
dataDict = pickle.load(open(FILE_PATH, 'rb'))

In [153]:
print (dataDict.keys())
print (dataDict['train_data'].keys())
print (len(dataDict['train_data']['embeddings']))
print (len(dataDict['test_data']['embeddings']))

dict_keys(['train_data', 'val_data', 'test_data'])
dict_keys(['seq_idx', 'time_ids', 'embeddings', 'labels', 'query_ids', 'folds'])
602
207


In [154]:
def extract_embs_drinks_label(embeddings, labels, seq_idx, time_ids, query_ids=None, folds=None):
    op_seq_ids, op_folds = [], []
    op_embeddings, op_drinks, op_labels, op_day_ids = [], [], [], []
    for example_idx in range(len(embeddings)):
        op_seq_ids.append(seq_idx[example_idx])
        op_day_ids.append(time_ids[example_idx])
        op_embeddings.append(np.vstack(np.array(embs[example_idx])[0, :, 0]))
        op_drinks.append(np.vstack(np.array(embs[example_idx])[0, :, 1]))
        op_labels.append(labels[example_idx][0])
        if folds: op_folds.append(folds[example_idx])
    
    if folds: return op_seq_ids, op_embeddings, op_drinks, op_labels, op_day_ids, op_folds
    return op_seq_ids, op_embeddings, op_drinks, op_labels, op_day_ids

In [155]:
seq_id_tr, embs_tr, drinks_tr, labels_tr, day_ids_tr, folds_tr = extract_embs_drinks_label(**dataDict['train_data'])
print (len(seq_id_tr), len(embs_tr), len(drinks_tr), len(labels_tr), len(day_ids_tr), len(folds_tr))

  op_embeddings.append(np.vstack(np.array(embs[example_idx])[0, :, 0]))
  op_drinks.append(np.vstack(np.array(embs[example_idx])[0, :, 1]))


602 602 602 602 602 602


In [156]:
seq_id_te, embs_te, drinks_te, labels_te, day_ids_te = extract_embs_drinks_label(**dataDict['test_data'])
print (len(seq_id_te), len(embs_te), len(drinks_te), len(labels_te), len(day_ids_te))

  op_embeddings.append(np.vstack(np.array(embs[example_idx])[0, :, 0]))
  op_drinks.append(np.vstack(np.array(embs[example_idx])[0, :, 1]))


207 207 207 207 207


In [157]:
print ("SHAPES")
print ("-------------------")
print ("Train shapes (X/y): {}/{}".format(X_tr.shape, y_tr.shape))
print ("Test shapes (X/y): {}/{}".format(X_te.shape, y_te.shape))

SHAPES
-------------------
Train shapes (X/y): (602, 768)/(602,)
Test shapes (X/y): (207, 768)/(207,)


In [158]:
def smape(y_true, y_pred):
    return np.mean(np.abs((y_true - y_pred) / (y_true + y_pred + 1e-10))) * 100

## Average Train drinks baseline

In [159]:
y_tr, y_te = np.array(labels_tr), np.array(labels_te)

In [160]:
avg_y_tr = np.mean(y_tr)
y_pred = np.ones(y_te.shape) * avg_y_tr

avg_baseline_mse = round(mean_squared_error(y_te, y_pred), 3)
avg_baseline_mae = round(mean_absolute_error(y_te, y_pred), 3)
avg_baseline_smape = round(smape(y_te, y_pred), 3)
avg_baseline_r2 = round(r2_score(y_te, y_pred), 3)
avg_baseline_corr = round(pearsonr(y_te, y_pred)[0], 3)

print ("Train Average Baseline")
print ("-------------------")
print ("MSE: {}".format(avg_baseline_mse))
print ("MAE: {}".format(avg_baseline_mae))
print ("SMAPE: {}".format(avg_baseline_smape))
print ("R2: {}".format(avg_baseline_r2))
print ("Correlation: {}".format(avg_baseline_corr))

Train Average Baseline
-------------------
MSE: 0.369
MAE: 0.491
SMAPE: 20.147
R2: -0.005
Correlation: nan




## Average embeddings of messages predicting avg number of drinks of a wave

In [161]:
emb_agg_fn = lambda embs: list(map(lambda z: np.mean(z, axis=0), embs))
X_tr = np.array(emb_agg_fn(embs_tr))
X_te = np.array(emb_agg_fn(embs_te))

In [162]:
## Std Scaling
scaler = StandardScaler()
X_tr = scaler.fit_transform(X_tr)
X_te = scaler.transform(X_te)

In [163]:
np.unique(folds_tr)

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [164]:
## Hyperaparameter tuning of ridge alpha using 10 fold cv on train data
alphas = [1, 10, 100, 1000, 10000]
folds=[]
for i in range(len(set(folds_tr))):
    tr_folds_idx = np.argwhere(np.array(folds_tr) != i).flatten()
    te_folds_idx = np.argwhere(np.array(folds_tr) == i).flatten()
    folds.append((tr_folds_idx, te_folds_idx))

parameters = {'alpha': alphas}
ridge = Ridge()
cv_model = GridSearchCV(ridge, parameters, cv=folds, scoring='neg_mean_squared_error')
cv_model.fit(X_tr, y_tr)

In [165]:
cv_model.best_score_, cv_model.best_params_

(-0.4428745682503311, {'alpha': 10000})

In [166]:
cv_model.cv_results_

{'mean_fit_time': array([0.18206921, 0.16528349, 0.15390224, 0.17432692, 0.15933332]),
 'std_fit_time': array([0.04964804, 0.04167398, 0.04336594, 0.02325151, 0.03024906]),
 'mean_score_time': array([0.00804152, 0.01112082, 0.00737715, 0.00973001, 0.00496943]),
 'std_score_time': array([0.00628542, 0.00976563, 0.00672941, 0.009526  , 0.00611655]),
 'param_alpha': masked_array(data=[1, 10, 100, 1000, 10000],
              mask=[False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'alpha': 1},
  {'alpha': 10},
  {'alpha': 100},
  {'alpha': 1000},
  {'alpha': 10000}],
 'split0_test_score': array([-1.79279342, -0.82658292, -0.52865746, -0.43302921, -0.41381746]),
 'split1_test_score': array([-2.29103902, -0.98993721, -0.64185872, -0.59299385, -0.54972117]),
 'split2_test_score': array([-1.88359136, -0.86839133, -0.55131728, -0.46499043, -0.46258695]),
 'split3_test_score': array([-1.4334305 , -0.7130085 , -0.44981228, -0.33484401, -0.28802444

In [167]:
model = Ridge(alpha=cv_model.best_params_['alpha'])
model.fit(X_tr, y_tr)
y_pred = model.predict(X_te)

ridge_mse = round(mean_squared_error(y_te, y_pred), 3)
ridge_mae = round(mean_absolute_error(y_te, y_pred), 3)
ridge_smape = round(smape(y_te, y_pred), 3)
ridge_r2 = round(r2_score(y_te, y_pred), 3)
ridge_pearson_r = round(pearsonr(y_te, y_pred)[0], 3)

print ("Ridge Regression")
print ("-------------------")
print ("MSE: {}".format(ridge_mse))
print ("MAE: {}".format(ridge_mae))
print ("SMAPE: {}".format(ridge_smape))
print ("R2: {}".format(ridge_r2))
print ("Pearson R: {}".format(ridge_pearson_r))

Ridge Regression
-------------------
MSE: 0.378
MAE: 0.496
SMAPE: 20.312
R2: -0.028
Pearson R: -0.059


## Moving average of drinks predicting avg number of drinks of a wave