## Importin libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
np.random.seed(130298) 

In [2]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer 
from sklearn.compose import ColumnTransformer
from sklearn.metrics import auc, f1_score, recall_score, precision_score,roc_curve, mean_squared_error

In [3]:
from fancyimpute import IterativeImputer, KNN

In [4]:
from sklearn.ensemble import RandomForestClassifier

In [5]:
def compute_metrics(Y, final_pred):
    fpr, tpr, _ = roc_curve(Y,final_pred)
    AUC = auc(fpr, tpr)
    f1 = f1_score(Y,final_pred)
    rec = recall_score(Y,final_pred)
    prec = precision_score(Y,final_pred)
    #print(f'AUC: {AUC}, F1: {f1}, Recall: {rec}, Precision: {prec}')
    return [AUC, f1, rec, prec]

In [6]:
%pwd

'/Users/annachiararossi/Documents/Healthcare data/project_local'

## Datasets definition

In [7]:
outcome_name = 're.admission.within.6.months'

# take the training set
X_train = pd.read_csv('train_data_drugs.csv')
X_train.set_index('inpatient.number', inplace = True)

# separate the outcome
Y_train = X_train[outcome_name].copy()
X_train.drop(columns = outcome_name, inplace = True)

# take the test set
X_test = pd.read_csv('test_data_drugs.csv')
X_test.set_index('inpatient.number', inplace = True)

# separate the outcome
Y_test = X_test[outcome_name].copy()
X_test.drop(columns = outcome_name, inplace = True)


In [8]:
# now we should adjust the lists of variables 

cat_columns = ['DestinationDischarge','admission.ward','admission.way','discharge.department',
                       'type.of.heart.failure', 'NYHA.cardiac.function.classification', 'Killip.grade',
                       'consciousness', 'ageCat']

ordinal_columns = ['CCI.score', 'eye.opening','verbal.response', 'movement', 'GCS']

not_continuous = cat_columns.copy()

binary_columns = ['gender', 'diabetes', 'moderate.to.severe.chronic.kidney.disease',
                  'return.to.emergency.department.within.6.months', 'diuretics',
                  'hypertension', 'heart_failure', 'angina_etal', 'cholesterol_drug']

not_continuous.extend(binary_columns)
not_continuous.extend(ordinal_columns)

In [9]:
for cat in cat_columns:
    X_train[cat] = cat + '_' + X_train[cat]
    X_test[cat] = cat + '_' + X_test[cat]

In [10]:
cont_columns = [col_name for col_name in X_train.columns if col_name not in not_continuous]

In [11]:
var_to_log = ['creatinine.enzymatic.method', 'urea', 'glomerular.filtration.rate', 
              'cystatin', 'lymphocyte.count', 'neutrophil.count',
              'activated.partial.thromboplastin.time', 'prothrombin.time.ratio',
              'glutamyltranspeptidase','indirect.bilirubin','alkaline.phosphatase',
              'globulin','direct.bilirubin','cholesterol',
              'low.density.lipoprotein.cholesterol','triglyceride']

## Pipeline definition

In [12]:
def log_transf(x):
    return np.log(x)

log_transformer = FunctionTransformer(log_transf)

In [13]:
a = np.array([1,2,3,4])
np.log(a)
b = pd.DataFrame(a)
np.log(b)

Unnamed: 0,0
0,0.0
1,0.693147
2,1.098612
3,1.386294


In [14]:
full_pipeline = ColumnTransformer([
        #("log", log_transformer, var_to_log),
        ("num", StandardScaler(), cont_columns),     
        ("cat", OneHotEncoder(), cat_columns)],
        remainder = 'passthrough')

#full_pipeline = Pipeline([('stationarity', starting_pipeline)])

#('imputation',KNN())
X_train_prepared = full_pipeline.fit_transform(X_train)
X_test_prepared = full_pipeline.transform(X_test) 

In [15]:
knn_imputer = KNN()
X_train_prepared = knn_imputer.fit_transform(X_train_prepared)
X_test_prepared = knn_imputer.fit_transform(X_test_prepared)

Imputing row 1/1572 with 0 missing, elapsed time: 0.743
Imputing row 101/1572 with 1 missing, elapsed time: 0.747
Imputing row 201/1572 with 1 missing, elapsed time: 0.751
Imputing row 301/1572 with 1 missing, elapsed time: 0.756
Imputing row 401/1572 with 1 missing, elapsed time: 0.760
Imputing row 501/1572 with 0 missing, elapsed time: 0.764
Imputing row 601/1572 with 0 missing, elapsed time: 0.770
Imputing row 701/1572 with 0 missing, elapsed time: 0.775
Imputing row 801/1572 with 3 missing, elapsed time: 0.779
Imputing row 901/1572 with 4 missing, elapsed time: 0.784
Imputing row 1001/1572 with 4 missing, elapsed time: 0.788
Imputing row 1101/1572 with 4 missing, elapsed time: 0.792
Imputing row 1201/1572 with 0 missing, elapsed time: 0.796
Imputing row 1301/1572 with 0 missing, elapsed time: 0.800
Imputing row 1401/1572 with 1 missing, elapsed time: 0.803
Imputing row 1501/1572 with 0 missing, elapsed time: 0.807
Imputing row 1/398 with 1 missing, elapsed time: 0.049
Imputing row 

In [16]:
X_train_prepared.shape

(1572, 109)

In [18]:
# associate columns 
features_names = ['body.temperature',
 'pulse',
 'respiration',
 'systolic.blood.pressure',
 'diastolic.blood.pressure',
 'weight',
 'BMI',
 'fio2',
 'left.ventricular.end.diastolic.diameter.LV',
 'creatinine.enzymatic.method',
 'urea',
 'uric.acid',
 'glomerular.filtration.rate',
 'cystatin',
 'monocyte.count',
 'red.blood.cell',
 'coefficient.of.variation.of.red.blood.cell.distribution.width',
 'standard.deviation.of.red.blood.cell.distribution.width',
 'mean.corpuscular.volume',
 'lymphocyte.count',
 'mean.hemoglobin.concentration',
 'mean.platelet.volume',
 'eosinophil.count',
 'hemoglobin',
 'platelet',
 'platelet.distribution.width',
 'neutrophil.count',
 'D.dimer',
 'activated.partial.thromboplastin.time',
 'thrombin.time',
 'prothrombin.activity',
 'prothrombin.time.ratio',
 'fibrinogen',
 'high.sensitivity.troponin',
 'carbon.dioxide.binding.capacity',
 'potassium',
 'chloride',
 'sodium',
 'glutamic.oxaloacetic.transaminase',
 'creatine.kinase',
 'creatine.kinase.isoenzyme',
 'lactate.dehydrogenase',
 'brain.natriuretic.peptide',
 'nucleotidase',
 'fucosidase',
 'albumin',
 'white.globulin.ratio',
 'glutamyltranspeptidase',
 'glutamic.pyruvic.transaminase',
 'indirect.bilirubin',
 'alkaline.phosphatase',
 'globulin',
 'direct.bilirubin',
 'total.bile.acid',
 'total.protein',
 'cholesterol',
 'low.density.lipoprotein.cholesterol',
 'triglyceride',
 'high.density.lipoprotein.cholesterol',
 'dischargeDay',
 'gender',
 'diabetes',
 'moderate.to.severe.chronic.kidney.disease',
 'return.to.emergency.department.within.6.months',
 'diuretics',
 'hypertension',
 'heart_failure',
 'angina_etal',
 'cholesterol_drug',
 'CCI.score',
 'eye.opening',
 'verbal.response',
 'movement',
 'GCS',
 'DestinationDischarge_HealthcareFacility',
 'DestinationDischarge_Home',
 'admission.ward_Cardiology',
 'admission.ward_GeneralWard',
 'admission.ward_ICU',
 'admission.ward_Others',
 'admission.way_Emergency',
 'admission.way_NonEmergency',
 'discharge.department_Cardiology',
 'discharge.department_GeneralWard',
 'discharge.department_ICU',
 'discharge.department_Others',
 'type.of.heart.failure_Both',
 'type.of.heart.failure_Left',
 'type.of.heart.failure_Right',
 'NYHA.cardiac.function.classification_II',
 'NYHA.cardiac.function.classification_III',
 'NYHA.cardiac.function.classification_IV',
 'Killip.grade_I',
 'Killip.grade_II',
 'Killip.grade_III',
 'Killip.grade_IV',
 'consciousness_Clear',
 'consciousness_Nonresponsive',
 'consciousness_ResponsiveToPain',
 'consciousness_ResponsiveToSound',
 'ageCat_(21,29]',
 'ageCat_(29,39]',
 'ageCat_(39,49]',
 'ageCat_(49,59]',
 'ageCat_(59,69]',
 'ageCat_(69,79]',
 'ageCat_(79,89]',
 'ageCat_(89,110]']

In [19]:
for idx,elem in enumerate(features_names):
    if elem == 'nan':
        print(idx)

In [20]:
X_train_prepared = np.delete(X_train_prepared, idx, axis = 1)
X_test_prepared = np.delete(X_test_prepared, idx, axis = 1)

## Random Forest classifier

In [22]:
from sklearn.model_selection import GridSearchCV

param_grid = {'n_estimators': [10, 30, 50, 100], 
     'max_features': list(np.arange(10,X_train_prepared.shape[1],10)) + [X_train_prepared.shape[1]]} 
    # try 12 (3×4) combinations of hyperparameters
    

forest_clf = RandomForestClassifier(random_state=42)

# train across 5 folds, that's a total of (12+6)*5=90 rounds of training 
grid_search = GridSearchCV(forest_clf, param_grid, cv=10,
                           scoring='neg_mean_squared_error',
                           return_train_score=True)
grid_search.fit(X_train_prepared, Y_train)

GridSearchCV(cv=10, estimator=RandomForestClassifier(random_state=42),
             param_grid={'max_features': [10, 20, 30, 40, 50, 60, 70, 80, 90,
                                          100, 108],
                         'n_estimators': [10, 30, 50, 100]},
             return_train_score=True, scoring='neg_mean_squared_error')

In [23]:
final_model = grid_search.best_estimator_
final_model

RandomForestClassifier(max_features=10, n_estimators=50, random_state=42)

### Fetaure importance

In [27]:
feature_importances = final_model.feature_importances_
feature_importances

array([4.01736422e-03, 5.59695687e-03, 3.42447110e-03, 8.29503038e-03,
       6.66907065e-03, 5.75969146e-03, 7.96067920e-03, 1.28637772e-03,
       1.07150687e-02, 1.38992782e-02, 9.11589929e-03, 1.24801654e-02,
       1.00528382e-02, 8.18844555e-03, 6.87669671e-03, 9.95123356e-03,
       6.01036339e-03, 6.63607008e-03, 8.10603670e-03, 9.21369990e-03,
       6.59754177e-03, 6.37227920e-03, 8.06196421e-03, 6.34964380e-03,
       8.86789955e-03, 5.66973956e-03, 6.66411103e-03, 1.35264062e-02,
       9.90739359e-03, 5.92736310e-03, 8.78063737e-03, 7.17062798e-03,
       8.34100346e-03, 1.01472214e-02, 7.19487422e-03, 9.94121224e-03,
       6.92062875e-03, 8.52541047e-03, 7.92628143e-03, 7.26485695e-03,
       7.52404403e-03, 6.89093562e-03, 7.33013643e-03, 6.20840647e-03,
       5.11268582e-03, 9.08589004e-03, 4.64346121e-03, 6.54250925e-03,
       4.96979142e-03, 8.04322930e-03, 7.78795048e-03, 7.29773563e-03,
       7.03792763e-03, 9.91046371e-03, 8.62642853e-03, 6.13473624e-03,
      

In [25]:
#cat_list_one_hot = full_pipeline.named_transformers_["cat"].categories_

In [28]:
#cat_features = []
#for elem in cat_list_one_hot:
#    cat_features.extend(elem)
#cat_features

In [29]:
sorted(zip(feature_importances,features_names), reverse=True)

[(0.49679140125342436, 'ageCat_(39,49]'),
 (0.013899278166925768, 'creatinine.enzymatic.method'),
 (0.013526406219388006, 'D.dimer'),
 (0.012579608295233934, 'dischargeDay'),
 (0.012480165350159722, 'uric.acid'),
 (0.010715068712037914, 'left.ventricular.end.diastolic.diameter.LV'),
 (0.010147221411030104, 'high.sensitivity.troponin'),
 (0.01005283820741374, 'glomerular.filtration.rate'),
 (0.009951233562655086, 'red.blood.cell'),
 (0.009941212237275267, 'potassium'),
 (0.009910463706014205, 'total.bile.acid'),
 (0.009907393587352563, 'activated.partial.thromboplastin.time'),
 (0.00921369990113698, 'lymphocyte.count'),
 (0.009115899294779626, 'urea'),
 (0.009085890038299906, 'albumin'),
 (0.008867899550259269, 'platelet'),
 (0.008780637370477297, 'prothrombin.activity'),
 (0.008754595263897437, 'triglyceride'),
 (0.008626428532870656, 'total.protein'),
 (0.00852541047090357, 'sodium'),
 (0.008341003461741677, 'fibrinogen'),
 (0.008295030377260742, 'systolic.blood.pressure'),
 (0.008188

### Predict on test set

In [30]:
final_predictions = final_model.predict(X_test_prepared)
final_mse = mean_squared_error(Y_test, final_predictions)
final_rmse = np.sqrt(final_mse)
final_rmse

0.1002509414234171