## Importin libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
np.random.seed(130298) 

In [21]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer 
from sklearn.compose import ColumnTransformer

In [14]:
from fancyimpute import IterativeImputer, KNN

In [40]:
from sklearn.ensemble import RandomForestClassifier

In [5]:
def compute_metrics(Y, final_pred):
    fpr, tpr, _ = roc_curve(Y,final_pred)
    AUC = auc(fpr, tpr)
    f1 = f1_score(Y,final_pred)
    rec = recall_score(Y,final_pred)
    prec = precision_score(Y,final_pred)
    #print(f'AUC: {AUC}, F1: {f1}, Recall: {rec}, Precision: {prec}')
    return [AUC, f1, rec, prec]

In [6]:
%pwd

'/Users/annachiararossi/Documents/Healthcare data/project_local'

## Datasets definition

In [7]:
outcome_name = 're.admission.within.6.months'

# take the training set
X_train = pd.read_csv('train_data_drugs.csv')
X_train.set_index('inpatient.number', inplace = True)

# separate the outcome
Y_train = X_train[outcome_name].copy()
X_train.drop(columns = outcome_name, inplace = True)

# take the test set
X_test = pd.read_csv('test_data_drugs.csv')
X_test.set_index('inpatient.number', inplace = True)

# separate the outcome
Y_test = X_test[outcome_name].copy()
X_test.drop(columns = outcome_name, inplace = True)


In [8]:
# now we should adjust the lists of variables 

cat_columns = ['DestinationDischarge','admission.ward','admission.way','discharge.department',
                       'type.of.heart.failure', 'NYHA.cardiac.function.classification', 'Killip.grade',
                       'consciousness', 'ageCat']

ordinal_columns = ['CCI.score', 'eye.opening','verbal.response', 'movement', 'GCS']

not_continuous = cat_columns.copy()

binary_columns = ['gender', 'diabetes', 'moderate.to.severe.chronic.kidney.disease',
                  're.admission.within.28.days', 're.admission.within.3.months', 
                  'return.to.emergency.department.within.6.months', 'diuretics',
                  'hypertension', 'heart_failure', 'angina_etal', 'cholesterol_drug']

not_continuous.extend(binary_columns)
not_continuous.extend(ordinal_columns)

In [10]:
cont_columns = [col_name for col_name in X_train.columns if col_name not in not_continuous]

In [23]:
var_to_log = ['creatinine.enzymatic.method', 'urea', 'glomerular.filtration.rate', 
              'cystatin', 'lymphocyte.count', 'neutrophil.count',
              'activated.partial.thromboplastin.time', 'prothrombin.time.ratio',
              'glutamyltranspeptidase','indirect.bilirubin','alkaline.phosphatase',
              'globulin','direct.bilirubin','cholesterol',
              'low.density.lipoprotein.cholesterol','triglyceride']

## Pipeline definition

In [46]:
def log_transf(x):
    return np.log(x)

log_transformer = FunctionTransformer(log_transf)
#knn_transformer = FunctionTransformer(KNN)

In [53]:
starting_pipeline = ColumnTransformer([
        ("log", log_transformer, var_to_log),
        ("num", StandardScaler(), cont_columns),     
        ("cat", OneHotEncoder(), cat_columns)],
        remainder = 'passthrough')

full_pipeline = Pipeline([('stationarity', starting_pipeline),
                          ('imputation',KNN())])

X_train_prepared = full_pipeline.fit_transform(X_train)

Imputing row 1/1572 with 0 missing, elapsed time: 0.837
Imputing row 101/1572 with 1 missing, elapsed time: 0.841
Imputing row 201/1572 with 1 missing, elapsed time: 0.847
Imputing row 301/1572 with 1 missing, elapsed time: 0.852
Imputing row 401/1572 with 1 missing, elapsed time: 0.857
Imputing row 501/1572 with 0 missing, elapsed time: 0.861
Imputing row 601/1572 with 0 missing, elapsed time: 0.867
Imputing row 701/1572 with 0 missing, elapsed time: 0.873
Imputing row 801/1572 with 3 missing, elapsed time: 0.878
Imputing row 901/1572 with 4 missing, elapsed time: 0.883
Imputing row 1001/1572 with 7 missing, elapsed time: 0.888
Imputing row 1101/1572 with 4 missing, elapsed time: 0.892
Imputing row 1201/1572 with 0 missing, elapsed time: 0.896
Imputing row 1301/1572 with 0 missing, elapsed time: 0.901
Imputing row 1401/1572 with 1 missing, elapsed time: 0.905
Imputing row 1501/1572 with 0 missing, elapsed time: 0.909


In [54]:
X_test_prepared = full_pipeline.transform(X_test) 

ValueError: KNN.transform not implemented! This imputation algorithm likely doesn't support inductive mode. Only KNN.fit_transform is supported at this time.

In [55]:
from sklearn.model_selection import GridSearchCV

param_grid = [
    
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]}, 
    # try 12 (3×4) combinations of hyperparameters
    
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]}, 
    # then try 6 (2×3) combinations with bootstrap set as False
  ]

forest_clf = RandomForestClassifier(random_state=42)

# train across 5 folds, that's a total of (12+6)*5=90 rounds of training 
grid_search = GridSearchCV(forest_clf, param_grid, cv=5,
                           scoring='neg_mean_squared_error',
                           return_train_score=True)
grid_search.fit(X_train_prepared, Y_train)



GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=42),
             param_grid=[{'max_features': [2, 4, 6, 8],
                          'n_estimators': [3, 10, 30]},
                         {'bootstrap': [False], 'max_features': [2, 3, 4],
                          'n_estimators': [3, 10]}],
             return_train_score=True, scoring='neg_mean_squared_error')

In [42]:
final_model = grid_search.best_estimator_
final_model

RandomForestClassifier(max_features=8, n_estimators=30, random_state=42)

In [None]:
final_predictions = final_model.predict(X_test_prepared)
    metrics = compute_metrics(Y = Y_test[1:], final_pred = final_predictions)