In [19]:
import numpy as np
import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import (GridSearchCV,
    cross_val_score, KFold)
from sklearn.multiclass import OneVsRestClassifier
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import sklearn.metrics as metrics

train_data = pd.read_csv('train_features.csv')
labels = pd.read_csv('train_labels.csv')
test_data = pd.read_csv('test_features.csv')

In [17]:
def features_engineering(data, n):
    x = []
    features = [np.nanmedian, np.nanmean, np.nanvar, np.nanmin,
           np.nanmax]
    for index in range(int(data.shape[0] / n)):
        assert data[n * index, 0] == data[n * (index + 1) - 1, 0], \
        'Ids are {}, {}'.format(data[n * index, 0], data[n * (index + 1) - 1, 0])
        patient_data = data[n * index: n * (index + 1), 2:]
        feature_values = np.empty((len(features), data[:, 2:].shape[1]))
        for i, feature in enumerate(features):
            feature_values[i] = feature(patient_data, axis=0)
        x.append(feature_values.ravel())
    return np.array(x)

In [18]:
x_train = features_engineering(train_data.to_numpy(), 12)
x_test = features_engineering(test_data.to_numpy(), 12)

  r, k = function_base._ureduce(a, func=_nanmedian, axis=axis, out=out,
  feature_values[i] = feature(patient_data, axis=0)
  feature_values[i] = feature(patient_data, axis=0)
  feature_values[i] = feature(patient_data, axis=0)


In [21]:
task1_labels = ['LABEL_BaseExcess', 'LABEL_Fibrinogen', 'LABEL_AST',
         'LABEL_Alkalinephos', 'LABEL_Bilirubin_total', 
         'LABEL_Lactate', 'LABEL_TroponinI', 'LABEL_SaO2',
         'LABEL_Bilirubin_direct', 'LABEL_EtCO2']
y_train = labels[task1_labels].to_numpy()

In [25]:
for i, label in enumerate(task1_labels):
    pipeline = make_pipeline(
                        SimpleImputer(strategy='median'),
                        StandardScaler(),
                        HistGradientBoostingClassifier())
    scores = cross_val_score(pipeline, x_train, y_train[:, i],
                                cv=5,
                                scoring='roc_auc',
                                verbose=True)
    print("Cross-validation score is {score:.3f},"
          " standard deviation is {err:.3f}"
          .format(score = scores.mean(), err = scores.std()))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   23.5s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Cross-validation score is 0.927, standard deviation is 0.004


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   15.4s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Cross-validation score is 0.801, standard deviation is 0.009


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   17.7s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Cross-validation score is 0.741, standard deviation is 0.002


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   16.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Cross-validation score is 0.748, standard deviation is 0.005


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   17.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Cross-validation score is 0.744, standard deviation is 0.002


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   20.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Cross-validation score is 0.806, standard deviation is 0.008


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   17.7s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Cross-validation score is 0.895, standard deviation is 0.005


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   19.2s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Cross-validation score is 0.830, standard deviation is 0.006


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   12.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Cross-validation score is 0.754, standard deviation is 0.024
Cross-validation score is 0.934, standard deviation is 0.005


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   18.4s finished


In [26]:
df = pd.DataFrame({'pid': test_data.iloc[0::12, 0].values})
for i, label in enumerate(subtask1_labels_ids):
    pipeline = pipeline.fit(x_train, y_train[:, i].ravel())
    print("Training score:", metrics.roc_auc_score(y_train[:, i], pipeline.predict_proba(x_train)[:, 1]))
    predictions = pipeline.predict_proba(x_test)[:, 1]
    df[label] = predictions

Training score: 0.9637626539433453
Training score: 0.9357917427840701
Training score: 0.880399619161237
Training score: 0.8652150310282765
Training score: 0.8584641225079545
Training score: 0.9060053058081553
Training score: 0.967234797636131
Training score: 0.9289863646445873
Training score: 0.9740217585922002
Training score: 0.9898372443398978


In [27]:
subtask2_labels_ids = ['LABEL_Sepsis']
y_train = labels[subtask2_labels_ids].to_numpy().ravel()

In [28]:
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import (GridSearchCV,
    cross_val_score, KFold)
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import sklearn.metrics as metrics

# TODO (yarden):
# feature selection.
# parameters tuning (subsample, learning rate).
pipeline = make_pipeline(
                    SimpleImputer(strategy='median'),
                    StandardScaler(),
                    HistGradientBoostingClassifier())

scores = cross_val_score(pipeline, x_train, y_train,
                            cv=5,
                            scoring='roc_auc',
                            verbose=True)
print("Cross-validation score is {score:.3f},"
      " standard deviation is {err:.3f}"
      .format(score = scores.mean(), err = scores.std()))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Cross-validation score is 0.712, standard deviation is 0.033


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   17.5s finished


In [29]:
pipeline = pipeline.fit(x_train, y_train)
predictions = pipeline.predict_proba(x_test)[:, 1]
print("Training score:", metrics.roc_auc_score(y_train, pipeline.predict_proba(x_train)[:, 1]))
df[subtask2_labels_ids[0]] = predictions

Training score: 0.9002571291542248


In [30]:
subtask3_labels_ids = ['LABEL_RRate', 'LABEL_ABPm', 'LABEL_SpO2',
                      'LABEL_Heartrate']
y_train = labels[subtask3_labels_ids].to_numpy()

In [31]:
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import (GridSearchCV,
    cross_val_score, KFold)
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import sklearn.metrics as metrics

for i, label in enumerate(subtask3_labels_ids):
    pipeline = make_pipeline(
                        SimpleImputer(strategy='median'),
                        HistGradientBoostingRegressor(max_depth=3))
    scores = cross_val_score(pipeline, x_train, y_train[:, i],
                            cv=5,
                            scoring='r2',
                            verbose=True)
    print("Cross-validation score is {score:.3f},"
          " standard deviation is {err:.3f}"
          .format(score = scores.mean(), err = scores.std()))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    9.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Cross-validation score is 0.414, standard deviation is 0.007


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    9.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Cross-validation score is 0.615, standard deviation is 0.015


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    7.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Cross-validation score is 0.380, standard deviation is 0.021
Cross-validation score is 0.637, standard deviation is 0.013


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    8.9s finished


In [32]:
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingRegressor
for i, label in enumerate(subtask3_labels_ids):
    pipeline = make_pipeline(
                        SimpleImputer(strategy='median'),
                        StandardScaler(),
                        HistGradientBoostingRegressor(max_depth=3))
    pipeline = pipeline.fit(x_train, y_train[:, i])
    predictions = pipeline.predict(x_test)
    print("Training score:", metrics.r2_score(y_train[:, i], pipeline.predict(x_train)))
    df[label] = predictions

Training score: 0.45203575652879213
Training score: 0.6433588163969686
Training score: 0.41574585447206946
Training score: 0.6621374142984637


In [33]:
df.to_csv('prediction.csv', index=False, float_format='%.4f')