In [106]:
import pandas as pd
import numpy as np

In [107]:
# 4131 patients admitted to hospital without shock
# Features are those collected over the initial 48 hours of admission


df = pd.read_excel(r'./OUCRU_dengue_shock.xlsx')

In [108]:
'''
Data dictionary:
---
day_of_illness: day of illness/fever onset starting from day 0
age: age in years
sex: 1=male, 0=female
weight: weight in kg
hctmin/median/max: haematocrit % summarised over first 48 hours of hospital admission
pltmin/median/max: platelet count x 10^6.L
Shock: dengue shock syndrome (WHO 2009 definitions)
'''

X_cat = ['sex']
X_num = ['day_of_illness',
            'hctmedian',
            'hctmax',
            'hctmin',
            'pltmax',
            'pltmedian',
            'pltmin',
            'age',
            'weight'           
            ]

In [109]:
X = df.iloc[:,:-2]
y = df.iloc[:,-2]

In [110]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

categorical_features =  X_cat
categorical_transformer = Pipeline([
    ('imputer_cat', SimpleImputer(strategy = 'most_frequent', fill_value = 'missing'))
])

numeric_features = X_num
numeric_transformer = Pipeline([
        ('imputer_num', SimpleImputer()),
        ('scaler', StandardScaler())
])

preprocessor = ColumnTransformer([
        ('categoricals', categorical_transformer, categorical_features),
        ('numericals', numeric_transformer, numeric_features)], 
        remainder = 'drop')

In [111]:
X_transformed = preprocessor.fit_transform(X)
X_transformed = pd.DataFrame(X_transformed)
X_transformed.columns = X.columns

In [112]:
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

In [113]:
#Optimised model parameters

xgb = XGBClassifier(eta=0.01,gamma=0.1,max_depth= 4,min_child_weight=0.005, n_estimators= 250, eval_metric='logloss')

ann = MLPClassifier(activation='relu', alpha=0.1, batch_size='auto', beta_1=0.9,
            beta_2=0.999, early_stopping=False, epsilon=1e-08,
            hidden_layer_sizes=(100, 100), learning_rate='constant',
            learning_rate_init=0.001, max_fun=15000, max_iter=50,
            momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
            power_t=0.5, random_state=None, shuffle=True, solver='adam',
            tol=0.0001, validation_fraction=0.1, verbose=False,
            warm_start=False)

rfc = RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                    criterion='gini', max_depth=5, max_features='auto',
                    max_leaf_nodes=None, max_samples=None,
                    min_impurity_decrease=0.0, 
                    min_samples_leaf=1, min_samples_split=3,
                    min_weight_fraction_leaf=0.0, n_estimators=500,
                    n_jobs=None, oob_score=False, random_state=42, verbose=0,
                    warm_start=False)

models = [(xgb,'xgboost'),(ann,'neural networks'),(rfc,'random forest')]

In [114]:
#Performance over 10-fold CV

from sklearn.model_selection import cross_validate
from sklearn.model_selection import RepeatedStratifiedKFold
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=1, random_state=1)

auc = []

for i,j in models:
    scores = cross_validate(i,X_transformed,y,scoring='roc_auc',cv=cv)
    results = pd.DataFrame(scores)['test_score']
    auc.append(results)

results = pd.DataFrame(auc).T
results.columns = [j for i,j in models]

display(results)



Unnamed: 0,xgboost,neural networks,random forest
0,0.786111,0.774491,0.791949
1,0.760479,0.734114,0.724415
2,0.892002,0.906882,0.894908
3,0.850616,0.875262,0.857359
4,0.874332,0.852476,0.886771
5,0.812253,0.835968,0.810044
6,0.769647,0.839688,0.835038
7,0.756801,0.757382,0.733899
8,0.872762,0.857824,0.85143
9,0.819926,0.887236,0.882237


In [115]:
XGBoost = xgb.fit(X_transformed,y)
ANN = ann.fit(X_transformed,y)
RandomF = rfc.fit(X_transformed,y)



In [116]:
import pickle

pkl_filename = 'xgb.pkl'
with open(pkl_filename, 'wb') as file:
    pickle.dump(XGBoost, file)

pkl_filename = 'ann.pkl'
with open(pkl_filename, 'wb') as file:
    pickle.dump(ANN, file)

pkl_filename = 'rfc.pkl'
with open(pkl_filename, 'wb') as file:
    pickle.dump(RandomF, file)