In [216]:
import sys; sys.path.insert(0, '../../')
from definitions import *
from src.data.dataset import TimeSeriesDataset
import torch
from src.data import dicts
from src.features.signatures.compute import RollingSignature
from src.features.rolling import RollingStatistic
from src.data.functions import torch_ffill
from src.features.derived_features import shock_index, partial_sofa, bun_cr

In [217]:
# Load the dataset
dataset = TimeSeriesDataset().load(DATA_DIR + '/raw/data.tsd')

In [218]:
dataset.size()

torch.Size([6273, 336, 41])

In [219]:

# First get counts of the laboratory values
count_variables = dicts.feature_types['laboratory'] + ['Temp']
counts = RollingStatistic(statistic='count', window_length=8).transform(dataset[count_variables])
dataset.add_features(counts)

<function RollingStatistic.transform at 0x0000024483532950> 
  996.01 ms


In [220]:
# Apply a forward fill
dataset.data = torch_ffill(dataset.data)

In [221]:
# Add on some additional features
dataset['ShockIndex'] = shock_index(dataset)
dataset['PartialSOFA'] = partial_sofa(dataset)
dataset['BUN/CR'] = bun_cr(dataset)

In [222]:
# Now moments
changing_vars = dicts.feature_types['vitals']
dataset.add_features(RollingStatistic(statistic='moments', window_length=7).transform(dataset[changing_vars]))

<function RollingStatistic.transform at 0x0000024483532950> 
  2610.22 ms


In [223]:
# Now generate some rolling window features
max_vals = RollingStatistic(statistic='max', window_length=6).transform(dataset[dicts.feature_types['vitals']])
min_vals = RollingStatistic(statistic='min', window_length=6).transform(dataset[dicts.feature_types['vitals']])
dataset.add_features(torch.cat((max_vals, min_vals), 2))

<function RollingStatistic.transform at 0x0000024483532950> 
  355.46 ms
<function RollingStatistic.transform at 0x0000024483532950> 
  367.46 ms


In [224]:
# Now some rolling signatures
roller = RollingSignature(window=7, depth=3, aug_list=['leadlag'], logsig=True)
for vbl in ['BUN/CR', 'PartialSOFA', 'MAP', 'HR', 'SBP']:
    signatures = roller.transform(dataset[vbl])
    dataset.add_features(signatures)

<function RollingSignature.transform at 0x00000244835322F0> 
  1031.78 ms
<function RollingSignature.transform at 0x00000244835322F0> 
  830.37 ms
<function RollingSignature.transform at 0x00000244835322F0> 
  837.96 ms
<function RollingSignature.transform at 0x00000244835322F0> 
  901.65 ms
<function RollingSignature.transform at 0x00000244835322F0> 
  1018.18 ms


In [225]:
# Extract machine learning data
data = dataset.to_ml()

In [226]:
labels = load_pickle(DATA_DIR + '/processed/labels/utility_scores.pickle')

In [227]:
X = data[121669:]
y = labels[121669:]

In [228]:
# Setup cv
from sklearn.model_selection import cross_val_predict, StratifiedKFold, KFold
#cv, cv_id = stratified_kfold_cv(X, y, n_splits=5, seed=5)
# Choose cross val method
#cv = list(StratifiedKFold(n_splits=5,random_state=1, shuffle=True).split(X, y))
# Choose cross val method
cv = list(KFold(5).split(X))

In [229]:
'''
cv = []
for i, fold in enumerate(list(id_cv)):
    train_idxs = [id_idxs[i] for i in fold[0]]
    test_idxs = [id_idxs[i] for i in fold[1]]

    if not return_as_list:
        train_idxs = np.concatenate([id_idxs[i] for i in fold[0]])
        test_idxs = np.concatenate([id_idxs[i] for i in fold[1]])

    cv.append([train_idxs, test_idxs])
'''

'\ncv = []\nfor i, fold in enumerate(list(id_cv)):\n    train_idxs = [id_idxs[i] for i in fold[0]]\n    test_idxs = [id_idxs[i] for i in fold[1]]\n\n    if not return_as_list:\n        train_idxs = np.concatenate([id_idxs[i] for i in fold[0]])\n        test_idxs = np.concatenate([id_idxs[i] for i in fold[1]])\n\n    cv.append([train_idxs, test_idxs])\n'

In [230]:
# Regressor
from lightgbm import LGBMRegressor
print('Training model...')
#clf = LGBMRegressor().set_params(**lgbm_params)
clf = LGBMRegressor()


Training model...


In [231]:
print("Training...")
clf.fit(X,y)

Training...


LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              importance_type='split', learning_rate=0.1, max_depth=-1,
              min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
              random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent='warn',
              subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [232]:
scores = load_pickle(DATA_DIR + '/processed/labels/full_scores.pickle').values

In [233]:
len(scores[121669:])

120492

In [234]:
from src.model.optimizer import CVThresholdOptimizer
import numpy as np
predictions = cross_val_predict(clf, X, y, cv=cv, n_jobs=-1)
# Evaluation
print('Thresholding...')
score = CVThresholdOptimizer(y, predictions, scores= scores[121669:]).optimize(cv, parallel=True)

print('Average: {:.3f}'.format(np.mean(score)))

Thresholding...
Average: 0.268


In [235]:
from sklearn.metrics import mean_squared_error

MSE = mean_squared_error (y, predictions)
print('Train loss: {:.3f}'.format(MSE))

Train loss: 0.043


In [236]:
len(predictions)

120492

In [237]:
scores[121669:]

array([[ 0.  , -0.05, -0.05],
       [ 0.  , -0.05, -0.05],
       [ 0.  , -0.05, -0.05],
       ...,
       [ 0.  , -0.05, -0.05],
       [ 0.  , -0.05, -0.05],
       [ 0.  , -0.05, -0.05]])

In [238]:
len(scores[121669:])==len(predictions)

True

In [239]:
from src.model.optimizer import optimize_utility_threshold, compute_utility_from_indexes
predictions = torch.from_numpy(predictions)
tfm_np = lambda x: x.cpu().numpy()
predictions = tfm_np(predictions)
len(predictions)

thresh = optimize_utility_threshold(predictions, scores= scores[121669:])


In [240]:
train_utility = compute_utility_from_indexes(predictions, thresh, scores = scores[121669:])
print('Train utility score: {:.3f}'.format(train_utility))

Train utility score: 0.282


In [241]:
expected_y  = labels[:120491]
predicted_y = clf.predict(data[:120491])



In [242]:
MSE = mean_squared_error (expected_y, predicted_y)
print('Test loss: {:.3f}'.format(MSE))

Test loss: 0.100


In [245]:
predictions1 = torch.from_numpy(predicted_y)
tfm_np = lambda x: x.cpu().numpy()
predictions1 = tfm_np(predictions1)
thresh = optimize_utility_threshold(predictions, scores= scores[:120492])

In [246]:
train_utility = compute_utility_from_indexes(predictions1, thresh, scores = scores[:120491])
print('Test utility score: {:.3f}'.format(train_utility))

Test utility score: 0.186
