## Importing necessary libraries

In [1]:
%pwd

'/Users/annachiararossi/Documents/Healthcare data/heart-failure-project'

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
np.random.seed(130298) 

In [3]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer 
from fancyimpute import IterativeImputer, KNN

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import auc, f1_score, recall_score, precision_score, roc_curve, mean_squared_error

In [5]:
from sklearn.model_selection import GridSearchCV

In [6]:
def compute_metrics(Y, final_pred):
    fpr, tpr, _ = roc_curve(Y,final_pred)
    AUC = auc(fpr, tpr)
    f1 = f1_score(Y,final_pred)
    rec = recall_score(Y,final_pred)
    prec = precision_score(Y,final_pred)
    print(f'AUC: {AUC}, F1: {f1}, Recall: {rec}, Precision: {prec}')
    return [AUC, f1, rec, prec]

## Datasets definition

In [7]:
outcome_name = 're.admission.within.6.months'

# take the training set
X_train = pd.read_csv('train_data_drugs.csv')
X_train.set_index('inpatient.number', inplace = True)

# separate the outcome
Y_train = X_train[outcome_name].copy()
X_train.drop(columns = outcome_name, inplace = True)

# take the test set
X_test = pd.read_csv('test_data_drugs.csv')
X_test.set_index('inpatient.number', inplace = True)

# separate the outcome
Y_test = X_test[outcome_name].copy()
X_test.drop(columns = outcome_name, inplace = True)


In [8]:
print(f'Size of traing set: {X_train.shape} and of test set: {X_test.shape}')

Size of traing set: (1573, 82) and of test set: (397, 82)


In [9]:
# Log transformations
var_to_log = ['creatinine.enzymatic.method', 'urea', 'glomerular.filtration.rate', 
              'cystatin', 'lymphocyte.count', 'neutrophil.count',
              'activated.partial.thromboplastin.time', 'prothrombin.time.ratio',
              'glutamyltranspeptidase','indirect.bilirubin','alkaline.phosphatase',
              'globulin','direct.bilirubin', 'low.density.lipoprotein.cholesterol', 
              'triglyceride']
X_train[var_to_log] = np.log(X_train[var_to_log])
X_test[var_to_log] = np.log(X_test[var_to_log])

In [10]:
# take the lists of variables by type

cat_columns = ['DestinationDischarge','admission.ward','admission.way','discharge.department',
                       'type.of.heart.failure', 'NYHA.cardiac.function.classification', 'Killip.grade',
                       'consciousness', 'ageCat']

ordinal_columns = ['CCI.score', 'eye.opening','verbal.response', 'movement', 'GCS']

not_continuous = cat_columns.copy()

binary_columns = ['gender', 'diabetes', 'moderate.to.severe.chronic.kidney.disease',
                  'return.to.emergency.department.within.6.months', 'diuretics',
                  'hypertension', 'heart_failure', 'angina_etal', 'cholesterol']

not_continuous.extend(binary_columns)
not_continuous.extend(ordinal_columns)

In [11]:
cont_columns = [col_name for col_name in X_train.columns if col_name not in not_continuous]

In [12]:
# for each category we add its specification, needed to retreive columns after OneHotEcoding
#for cat in cat_columns:
#    X_train[cat] = cat + '_' + X_train[cat]
#    X_test[cat] = cat + '_' + X_test[cat]

## Pipeline definition

In [13]:
#def log_transf_stand(x):
#    "Transform to log and standardize"
#    x = np.log(x)
#    return (x - x.mean())/x.std()

#log_transformer = FunctionTransformer(log_transf_stand)

In [14]:
#full_pipeline = ColumnTransformer([
#        ("log", log_transformer, var_to_log),
#        ("num", StandardScaler(), list(set(cont_columns) - set(var_to_log))),     
#        ("cat", OneHotEncoder(), cat_columns)],
#       remainder = 'passthrough')

In [15]:
full_pipeline = ColumnTransformer([
                ("num", StandardScaler(),cont_columns),
                ("cat", OneHotEncoder(), cat_columns)],
                remainder = 'passthrough') # do not modify columns not listed

X_train_prepared = full_pipeline.fit_transform(X_train)
X_test_prepared = full_pipeline.transform(X_test) 

In [20]:
full_pipeline

ColumnTransformer(remainder='passthrough',
                  transformers=[('num', StandardScaler(),
                                 ['body.temperature', 'pulse', 'respiration',
                                  'systolic.blood.pressure',
                                  'diastolic.blood.pressure', 'weight', 'BMI',
                                  'fio2',
                                  'left.ventricular.end.diastolic.diameter.LV',
                                  'creatinine.enzymatic.method', 'urea',
                                  'uric.acid', 'glomerular.filtration.rate',
                                  'cystatin', 'monocyte.count',
                                  'r...
                                  'hemoglobin', 'platelet',
                                  'platelet.distribution.width',
                                  'neutrophil.count', 'D.dimer',
                                  'activated.partial.thromboplastin.time',
                                  'throm

In [16]:
column_prepared = full_pipeline.get_feature_names_out(input_features=X_train.columns)

AttributeError: 'ColumnTransformer' object has no attribute 'get_feature_names_out'

In [None]:
X_train_prepared=pd.DataFrame(X_train_prepared, columns=column_prepared, index=X_train.index)
X_train_prepared.head()

In [None]:
X_test_prepared=pd.DataFrame(X_test_prepared, columns=column_prepared, index=X_test.index)
X_test_prepared.head()

In [None]:
X_train_prepared.drop(columns = "cat__DestinationDischarge_nan",inplace=True)
X_test_prepared.drop(columns = "cat__DestinationDischarge_nan",inplace=True)
column_prepared.remove("cat__DestinationDischarge_nan")

In [None]:
print(f'Size of traing set: {X_train_prepared.shape} and of test set: {X_test_prepared.shape}')

In [None]:
# imputation cannot be done inside the Pipeline because the method 'transform' is missing

knn_imputer = KNN()
X_train_prepared = knn_imputer.fit_transform(X_train_prepared)
X_test_prepared = knn_imputer.fit_transform(X_test_prepared)

In [None]:
print(f'Size of traing set: {X_train_prepared.shape} and of test set: {X_test_prepared.shape}')

In [None]:
#for idx,elem in enumerate(features_names):
#    if elem == 'nan':
#        print(idx)

In [None]:
#X_train_prepared = np.delete(X_train_prepared, idx, axis = 1)
#X_test_prepared = np.delete(X_test_prepared, idx, axis = 1)

# ADJUST THIS!!!

## Logistic Regression with Elasticnet penalty
Since we have many columns, of which we doubt some might still be collinear, we use an Elastic net penalty, which takes into account both the L2 norm and the L1 norm, which induces sparsity.

Moreover, we add the “balanced” mode, which uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as n_samples / (n_classes * np.bincount(y)).

In [None]:
log_reg_clf = LogisticRegression(penalty = 'elasticnet', solver = 'saga', 
                                 class_weight = 'balanced', l1_ratio = 0.5, 
                                 max_iter = 700, random_state=42)
# this method takes many iterations to converge
log_reg_clf.fit(X_train_prepared, Y_train)

In [None]:
pred = log_reg_clf.predict(X_test_prepared)
final_mse = mean_squared_error(Y_test, pred)
final_rmse = np.sqrt(final_mse)
final_rmse

In [None]:
compute_metrics(Y_test, pred)

In [None]:
from sklearn.feature_selection import SelectFromModel

model = SelectFromModel(log_reg_clf, prefit=True)
X_new = model.transform(X_train_prepared)
X_new.shape

In [None]:
significant_features=model.get_feature_names_out(input_features=column_prepared)
significant_features

In [None]:
# TO GET THE NAMES OF THE IMPORTANT FEATURES, WE HAVE TO PASS THE MATRIX WITH THE COLUMN NAMES

In [None]:
# AFTER THIS, FIT THE RANDOM FOREST ONLY WITH THE GOOD FEATURES!!

In [None]:
#log_reg_clf = LogisticRegression(penalty = 'elasticnet', solver = 'saga', class_weight = 'balanced', max_iter = 1000, random_state=42)

# C and l1_ratio can be tuned
#'C': np.linspace(0.1,5,100),
#param_grid = {'l1_ratio': np.linspace(0,1,101)}
#grid_search = GridSearchCV(log_reg_clf, param_grid, cv=10,
#                           scoring='neg_mean_squared_error',
#                           return_train_score=True)
#grid_search.fit(X_train_prepared, Y_train)

## Random Forest classifier

In [None]:
param_grid = {'n_estimators': [10, 30, 50, 100], 
     'max_features': list(np.arange(10,X_train_prepared.shape[1],10)) + [X_train_prepared.shape[1]]} 
    # try 12 (3×4) combinations of hyperparameters
    

forest_clf = RandomForestClassifier(random_state=42)

# train across 5 folds, that's a total of (12+6)*5=90 rounds of training 
grid_search = GridSearchCV(forest_clf, param_grid, cv=10,
                           scoring='neg_mean_squared_error',
                           return_train_score=True)
grid_search.fit(X_train_prepared, Y_train)

In [None]:
final_model = grid_search.best_estimator_
final_model

### Fetaure importance

In [None]:
feature_importances = final_model.feature_importances_
feature_importances

In [None]:
#cat_list_one_hot = full_pipeline.named_transformers_["cat"].categories_

In [None]:
#cat_features = []
#for elem in cat_list_one_hot:
#    cat_features.extend(elem)
#cat_features

In [None]:
sorted(zip(feature_importances,features_names), reverse=True)

### Predict on test set

In [None]:
final_predictions = final_model.predict(X_test_prepared)
final_mse = mean_squared_error(Y_test, final_predictions)
final_rmse = np.sqrt(final_mse)
final_rmse