## Importin libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
np.random.seed(130298) 

In [2]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer 
from sklearn.compose import ColumnTransformer
from sklearn.metrics import auc, f1_score, recall_score, precision_score,roc_curve, mean_squared_error

In [3]:
from fancyimpute import IterativeImputer, KNN

In [4]:
from sklearn.ensemble import RandomForestClassifier

In [5]:
def compute_metrics(Y, final_pred):
    fpr, tpr, _ = roc_curve(Y,final_pred)
    AUC = auc(fpr, tpr)
    f1 = f1_score(Y,final_pred)
    rec = recall_score(Y,final_pred)
    prec = precision_score(Y,final_pred)
    #print(f'AUC: {AUC}, F1: {f1}, Recall: {rec}, Precision: {prec}')
    return [AUC, f1, rec, prec]

In [6]:
%pwd

'/Users/annachiararossi/Documents/Healthcare data/heart-failure-project'

## Datasets definition

In [7]:
outcome_name = 're.admission.within.6.months'

# take the training set
X_train = pd.read_csv('train_data_drugs.csv')
X_train.set_index('inpatient.number', inplace = True)

# separate the outcome
Y_train = X_train[outcome_name].copy()
X_train.drop(columns = outcome_name, inplace = True)

# take the test set
X_test = pd.read_csv('test_data_drugs.csv')
X_test.set_index('inpatient.number', inplace = True)

# separate the outcome
Y_test = X_test[outcome_name].copy()
X_test.drop(columns = outcome_name, inplace = True)


In [9]:
# now we should adjust the lists of variables 

cat_columns = ['DestinationDischarge','admission.ward','admission.way','discharge.department',
                       'type.of.heart.failure', 'NYHA.cardiac.function.classification', 'Killip.grade',
                       'consciousness', 'ageCat']

ordinal_columns = ['CCI.score', 'eye.opening','verbal.response', 'movement', 'GCS']

not_continuous = cat_columns.copy()

binary_columns = ['gender', 'diabetes', 'moderate.to.severe.chronic.kidney.disease',
                  're.admission.within.28.days', 're.admission.within.3.months', 
                  'return.to.emergency.department.within.6.months', 'diuretics',
                  'hypertension', 'heart_failure', 'angina_etal', 'cholesterol_drug']

not_continuous.extend(binary_columns)
not_continuous.extend(ordinal_columns)

In [10]:
for cat in cat_columns:
    X_train[cat] = cat + '_' + X_train[cat]
    X_test[cat] = cat + '_' + X_test[cat]

In [11]:
cont_columns = [col_name for col_name in X_train.columns if col_name not in not_continuous]

In [12]:
var_to_log = ['creatinine.enzymatic.method', 'urea', 'glomerular.filtration.rate', 
              'cystatin', 'lymphocyte.count', 'neutrophil.count',
              'activated.partial.thromboplastin.time', 'prothrombin.time.ratio',
              'glutamyltranspeptidase','indirect.bilirubin','alkaline.phosphatase',
              'globulin','direct.bilirubin','cholesterol',
              'low.density.lipoprotein.cholesterol','triglyceride']

## Pipeline definition

In [13]:
def log_transf(x):
    return np.log(x)

log_transformer = FunctionTransformer(log_transf)
#knn_transformer = FunctionTransformer(KNN)

In [14]:
full_pipeline = ColumnTransformer([
        ("log", log_transformer, var_to_log),
        ("num", StandardScaler(), cont_columns),     
        ("cat", OneHotEncoder(), cat_columns)],
        remainder = 'passthrough')

#full_pipeline = Pipeline([('stationarity', starting_pipeline)])

#('imputation',KNN())
X_train_prepared = full_pipeline.fit_transform(X_train)
X_test_prepared = full_pipeline.transform(X_test) 

In [17]:
knn_imputer = KNN()
X_train_prepared = knn_imputer.fit_transform(X_train_prepared)
X_test_prepared = knn_imputer.fit_transform(X_test_prepared)

Imputing row 1/1572 with 0 missing, elapsed time: 0.855
Imputing row 101/1572 with 1 missing, elapsed time: 0.860
Imputing row 201/1572 with 1 missing, elapsed time: 0.865
Imputing row 301/1572 with 1 missing, elapsed time: 0.871
Imputing row 401/1572 with 1 missing, elapsed time: 0.876
Imputing row 501/1572 with 0 missing, elapsed time: 0.880
Imputing row 601/1572 with 0 missing, elapsed time: 0.887
Imputing row 701/1572 with 0 missing, elapsed time: 0.893
Imputing row 801/1572 with 3 missing, elapsed time: 0.897
Imputing row 901/1572 with 4 missing, elapsed time: 0.903
Imputing row 1001/1572 with 7 missing, elapsed time: 0.908
Imputing row 1101/1572 with 4 missing, elapsed time: 0.912
Imputing row 1201/1572 with 0 missing, elapsed time: 0.917
Imputing row 1301/1572 with 0 missing, elapsed time: 0.922
Imputing row 1401/1572 with 1 missing, elapsed time: 0.925
Imputing row 1501/1572 with 0 missing, elapsed time: 0.930
Imputing row 1/398 with 1 missing, elapsed time: 0.053
Imputing row 

## Random Forest classifier

In [18]:
from sklearn.model_selection import GridSearchCV

param_grid = {'n_estimators': [10, 30, 50, 100], 
     'max_features': list(np.arange(10,X_train_prepared.shape[1],10)) + [X_train_prepared.shape[1]]} 
    # try 12 (3×4) combinations of hyperparameters
    

forest_clf = RandomForestClassifier(random_state=42)

# train across 5 folds, that's a total of (12+6)*5=90 rounds of training 
grid_search = GridSearchCV(forest_clf, param_grid, cv=10,
                           scoring='neg_mean_squared_error',
                           return_train_score=True)
grid_search.fit(X_train_prepared, Y_train)



GridSearchCV(cv=10, estimator=RandomForestClassifier(random_state=42),
             param_grid={'max_features': [10, 20, 30, 40, 50, 60, 70, 80, 90,
                                          100, 110, 120, 127],
                         'n_estimators': [10, 30, 50, 100]},
             return_train_score=True, scoring='neg_mean_squared_error')

In [19]:
final_model = grid_search.best_estimator_
final_model

RandomForestClassifier(max_features=20, n_estimators=30, random_state=42)

### Fetaure importance

In [20]:
feature_importances = final_model.feature_importances_
feature_importances

array([2.70209535e-03, 1.35649970e-03, 3.66477111e-03, 1.91013630e-03,
       1.64117525e-03, 1.88582659e-03, 2.85664935e-03, 3.16411005e-03,
       1.75968980e-03, 2.20460570e-03, 9.39320733e-04, 1.78962388e-03,
       1.54671042e-03, 2.22948060e-03, 2.65780512e-03, 1.49985101e-03,
       1.49042096e-03, 2.79230759e-03, 2.95509426e-04, 1.02118626e-03,
       9.78632007e-04, 1.58097546e-03, 1.61292933e-03, 1.07459426e-03,
       6.56984621e-03, 3.39531828e-03, 2.32187529e-03, 5.92540077e-03,
       5.52226493e-03, 1.22847152e-03, 7.68631458e-04, 1.63306394e-03,
       1.10504072e-03, 3.82747617e-03, 3.76777252e-03, 2.30061964e-03,
       3.21114995e-03, 8.93413650e-04, 6.54375249e-03, 1.17210802e-03,
       2.75897651e-03, 2.19183064e-03, 2.74615564e-03, 3.27287143e-03,
       3.34052895e-03, 2.54178064e-03, 3.67043485e-03, 8.09525553e-04,
       2.28592355e-03, 3.02553874e-03, 2.90831900e-03, 4.25440843e-03,
       3.21218259e-03, 2.48979068e-03, 2.51960743e-03, 2.02283119e-03,
      

In [22]:
cat_list_one_hot = full_pipeline.named_transformers_["cat"].categories_

In [32]:
cat_features = []
for elem in cat_list_one_hot:
    cat_features.extend(elem)
cat_features

['DestinationDischarge_HealthcareFacility',
 'DestinationDischarge_Home',
 nan,
 'admission.ward_Cardiology',
 'admission.ward_GeneralWard',
 'admission.ward_ICU',
 'admission.ward_Others',
 'admission.way_Emergency',
 'admission.way_NonEmergency',
 'discharge.department_Cardiology',
 'discharge.department_GeneralWard',
 'discharge.department_ICU',
 'discharge.department_Others',
 'type.of.heart.failure_Both',
 'type.of.heart.failure_Left',
 'type.of.heart.failure_Right',
 'NYHA.cardiac.function.classification_II',
 'NYHA.cardiac.function.classification_III',
 'NYHA.cardiac.function.classification_IV',
 'Killip.grade_I',
 'Killip.grade_II',
 'Killip.grade_III',
 'Killip.grade_IV',
 'consciousness_Clear',
 'consciousness_Nonresponsive',
 'consciousness_ResponsiveToPain',
 'consciousness_ResponsiveToSound',
 'ageCat_(21,29]',
 'ageCat_(29,39]',
 'ageCat_(39,49]',
 'ageCat_(49,59]',
 'ageCat_(59,69]',
 'ageCat_(69,79]',
 'ageCat_(79,89]',
 'ageCat_(89,110]']

In [None]:
# associate columns 
features_names = 

In [29]:
final_predictions = final_model.predict(X_test_prepared)
final_mse = mean_squared_error(Y_test, final_predictions)
final_rmse = np.sqrt(final_mse)
final_rmse

In [None]:
import seaborn as sns
fig, ax = plt.subplots(1, 1, figsize=(12, 7))
sns.barplot(x="feature",
            y="value",
            data=df.head(20),
           palette=df.head(20)["colors"])
ax.set_xticklabels(ax.get_xticklabels(), rotation=90, fontsize=20)
ax.set_title("Top 20 Features", fontsize=25)
ax.set_ylabel("Coef", fontsize=22)
ax.set_xlabel("Feature Name", fontsize=22)