# Parameter tuning
Performs nested cross-validation on each training set to select the best hyperparameters for each model.

Training sets:

1.   MIMIC
2.   HEPAR




## MIMIC

In [None]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive

from google.colab import drive
# Mount Google Drive at /content/drive, then point ROOT_PATH at the project
# folder inside it; every data file below is read relative to ROOT_PATH.
ROOT_PATH = '/content/drive'
drive.mount(ROOT_PATH)
ROOT_PATH = '/'.join([ROOT_PATH, 'My Drive', 'XAI'])

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV, cross_val_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

In [None]:

# Base estimators to tune, keyed by the short name that is also used to look
# up each model's hyperparameter grid.
clf_dict = {
    'rf': RandomForestClassifier(random_state=1, n_jobs=-1),
    'nn': MLPClassifier(random_state=1, hidden_layer_sizes=(100, 50, 20), max_iter=1000),
    'lr': LogisticRegression(random_state=1, max_iter=500),
    'gb': GradientBoostingClassifier(random_state=1),
}


# Hyperparameter grids, keyed by the same short names as the estimators.
# Keys are plain estimator parameter names (no pipeline-step prefix).
#
# max_features: the former "auto" option was deprecated in scikit-learn 1.1
# and removed in 1.3. For both RandomForestClassifier and
# GradientBoostingClassifier it meant sqrt(n_features), i.e. exactly "sqrt",
# so listing both only fitted every candidate twice. A single "sqrt" entry
# searches the same models and is accepted by old and new releases alike.
clf_param = {
    'lr': {"penalty": ["l2", "l1"],
           "C": [0.001, 0.5, 1],
           "solver": ["liblinear"]},  # liblinear supports both l1 and l2
    'rf': {"n_estimators": [100, 500],
           "max_features": ["sqrt"],
           "max_depth": [10, 50, 100]},
    'gb': {"n_estimators": [100, 500],
           "min_samples_leaf": [1, 4],
           "max_features": ["sqrt"]},
    'nn': {"activation": ["logistic", "relu"],
           "solver": ["adam", "sgd"],
           "learning_rate": ["constant", "adaptive"]},
}



# cross val function
def run_nested_cv(X, y, clf_dict, clf_param, n_splits=5, scoring='f1'):
  """Tune every classifier with a grid search and report its nested-CV score.

  For each entry of ``clf_dict`` that also has a grid in ``clf_param``:
    1. the estimator is wrapped in a ``MinMaxScaler -> classifier`` pipeline;
    2. an outer stratified CV evaluates the whole grid search (nested CV);
    3. the grid search is refit on all of ``X, y`` to pick the final
       hyperparameters.

  Parameters
  ----------
  X, y : training features and labels, forwarded to scikit-learn as-is.
  clf_dict : dict mapping a short model name to an unfitted estimator.
  clf_param : dict mapping a short model name to a parameter grid whose keys
      are plain estimator parameter names (the ``classifier__`` prefix is
      added here).
  n_splits : number of folds used by BOTH the inner and the outer splitter.
  scoring : scoring passed to ``GridSearchCV``.

  Returns
  -------
  dict mapping model name to the fitted ``GridSearchCV``. Each search object
  additionally carries ``nested_scores_``, the per-fold outer-loop scores.
  Models without an entry in ``clf_param`` are skipped.
  """
  r = {}
  # Both splitters honour n_splits (it used to be accepted but ignored).
  cv_inner = StratifiedKFold(n_splits=n_splits, random_state=1, shuffle=True)
  cv_outer = StratifiedKFold(n_splits=n_splits, random_state=1, shuffle=True)
  for cl_name, clf in clf_dict.items():
    print(cl_name)
    if cl_name not in clf_param:
      print(cl_name + ' has no param defined')
      continue
    pipe = Pipeline([('scaler', MinMaxScaler()), ('classifier', clf)])
    # Route every grid entry to the 'classifier' step of the pipeline.
    pgrid_new = {'classifier__' + k: v for k, v in clf_param[cl_name].items()}
    print(pgrid_new)
    clf_search = GridSearchCV(pipe, param_grid=pgrid_new, scoring=scoring, cv=cv_inner)
    # Outer loop: no scoring is passed here, so each fold is scored through
    # the grid search's own score method.
    nested_score = cross_val_score(clf_search, X=X, y=y, cv=cv_outer)
    print('nested CV score: %.3f +/- %.3f' % (np.mean(nested_score), np.std(nested_score)))
    # Final fit on the full training set to select the parameters to keep.
    clf_search.fit(X, y)
    # Keep the outer-loop scores with the result instead of discarding them.
    clf_search.nested_scores_ = nested_score
    r[cl_name] = clf_search
  return r

In [None]:
#MIMIC PREPROCESSING
df_raw = pd.read_csv(ROOT_PATH + '/Full_Dataset.csv')

# Column filter: dropna(thresh=k, axis=1) keeps a column only when it holds at
# least k non-NaN values. With k = 90% of the rows, every feature with more
# than ~10% missing values is dropped.
# NOTE(review): this step was previously described as removing features with
# "more than 90% of NaN"; the threshold as coded is much stricter - confirm
# which behaviour is intended.
original_shape = df_raw.shape
print("DataFrame shape before NaN removal:", original_shape)
df = df_raw.dropna(axis=1, thresh=round(original_shape[0] * 0.90))

# Row filter: if any NaN survives the column filter, drop every row that
# contains at least one and report how many rows were lost.
if df.isnull().values.any():
    old_row_size = df_raw.shape[0]
    df = df.dropna(how='any', axis=0)
    print("DataFrame shape after NaN removal:", df.shape)
    new_row_size = df.shape[0]
    print("Relative percentage of removed rows (wrt the old row size): %.2f" % (
            (old_row_size - new_row_size) / old_row_size * 100))

# Sanity check: this should print "False" now.
print("Is there any NaN after NaN removal?", df.isnull().values.any())

# 'recordid' is removed so that it is not used as a feature downstream.
df = df.drop(columns=["recordid"])

DataFrame shape before NaN removal: (6000, 121)
DataFrame shape after NaN removal: (5248, 48)
Relative percentage of removed rows (wrt the old row size): 12.53
Is there any NaN after NaN removal? False


In [None]:
#DATA PREPARATION
pred_col = 'In-hospital_death' #edit this line when using another dataset

X = df
y = X[pred_col]
X_feat = X.drop(columns=[pred_col])
feature_names = X_feat.columns

# List the features with fewer than 10 distinct values: these are the likely
# categorical candidates to choose `categorical_feat` from.
print(feature_names[(X_feat.nunique(dropna=False) < 10).to_numpy()])

# 'CSRU' was misspelled 'CSU', which silently flagged that column as
# non-categorical in `iscat`.
# NOTE(review): 'Gender' is also low-cardinality but is not listed here -
# confirm whether that is intentional.
categorical_feat = ['CCU', 'CSRU', 'SICU'] # edit this line when using another dataset
missing_cat = [c for c in categorical_feat if c not in feature_names]
assert not missing_cat, "categorical_feat names not found in the data: %s" % missing_cat
iscat = [x in categorical_feat for x in feature_names]

# 70/30 hold-out split; only the training part is used for parameter tuning.
X_train, X_test, y_train, y_test = train_test_split(X_feat, y, test_size=0.30, random_state=42)

Index(['Gender', 'CCU', 'CSRU', 'SICU'], dtype='object')


In [None]:
# Tune every model on the MIMIC training split (default 5 folds, F1 scoring).
r_mimic_ = run_nested_cv(X=X_train, y=y_train, clf_dict=clf_dict, clf_param=clf_param)

rf
{'classifier__n_estimators': [100, 500], 'classifier__max_features': ['auto', 'sqrt'], 'classifier__max_depth': [10, 50, 100]}
nn
{'classifier__activation': ['logistic', 'relu'], 'classifier__solver': ['adam', 'sgd'], 'classifier__learning_rate': ['constant', 'adaptive']}




lr
{'classifier__penalty': ['l2', 'l1'], 'classifier__C': [0.001, 0.5, 1], 'classifier__solver': ['liblinear']}
gb
{'classifier__n_estimators': [100, 500], 'classifier__min_samples_leaf': [1, 4], 'classifier__max_features': ['auto', 'sqrt']}


In [None]:
def save_best_param(fpath, r):
    """Write the best hyperparameters of each tuned model to a text file.

    Parameters
    ----------
    fpath : path of the file to create; an existing file is overwritten.
    r : dict mapping a model name to an object exposing ``best_params_``
        (a dict of parameter name -> selected value).

    The file starts with a fixed header line, followed by one line per model
    in the form ``NAME:param1=value1|param2=value2``. Only the leading
    ``<step>__`` prefix is stripped from each parameter name, so nested names
    such as ``classifier__base__depth`` keep their remainder (``base__depth``)
    and names without any prefix are written unchanged.
    """
    with open(fpath, 'w') as o:
        o.write('##### BEST PARAM AFTER 5-FOLD NESTED CV\n')

        for clname, vparam in r.items():
            s = []
            for k, v in vparam.best_params_.items():
                # partition splits on the FIRST '__' only; `sep` is empty when
                # the name carries no step prefix at all.
                _, sep, param_name = k.partition('__')
                s.append((param_name if sep else k) + '=' + str(v))
            o.write(clname.upper() + ':' + '|'.join(s) + '\n')

In [None]:
# Persist the selected MIMIC hyperparameters in the project folder.
save_best_param(fpath=ROOT_PATH + '/mimic_best_param.txt', r=r_mimic_)

In [None]:
# Load the simulated HEPAR patients and discard the "Unnamed: 0" column
# (presumably a row index saved along with the CSV - verify).
df_raw_iid = pd.read_csv(ROOT_PATH + '/HEPAR_simulated_patients.csv')
df_raw_iid = df_raw_iid.drop(columns=["Unnamed: 0"])

def preproc_wrapper(df):
  """Encode the HEPAR string-valued dataframe as integers and split off the target.

  Encoding rules:
    * ordinal ("nominal") features: each label is parsed as
      ``int(label.split('_')[1])`` and replaced by the 1-based rank of that
      number among the labels present in ``df``;
    * 'sex', 'ChHepatitis', 'Cirrhosis': fixed lookup tables (see below);
    * every other column, including the target 'hospital':
      'absent' -> 0, 'present' -> 1.

  The ordinal mapping is derived from the dataframe passed in, so a level
  missing from ``df`` shifts the ranks of the remaining ones. Do not assume
  two different dataframes are encoded identically without checking.

  Parameters
  ----------
  df : pandas.DataFrame with string-valued columns; it is not modified.

  Returns
  -------
  X_feat : encoded dataframe without the 'hospital' column.
  y : encoded 'hospital' column.
  feature_names : column index of ``X_feat``.
  iscat : list of bool, True for the absent/present (binary) features.

  Raises
  ------
  KeyError if a column contains a label missing from its lookup table.
  """
  nominal_feat = ['age', 'triglycerides', 'bilirubin', 'phosphatase', 'proteins', 'platelet',
                  'inr', 'urea', 'ESR', 'alt', 'ast','amylase', 'ggtp', 'cholesterol', 'albumin']
  special_feat = ['ChHepatitis', 'sex', 'Cirrhosis']
  pred_col = 'hospital'

  dict_chhepa = {'absent':0, 'active':1, 'persistent':2}
  dict_sex = {'female':1, 'male':2}
  dict_cirr = {'absent':0, 'decompensate':1, 'compensate':2}
  dict_cat = {'absent':0, 'present':1}

  # Ordinal features: label -> parsed number -> dense 1-based rank.
  nominal_dict = {}
  for n in nominal_feat:
    val2num = {val: int(val.split('_')[1]) for val in df[n].unique().tolist()}
    # set() keeps ranks contiguous even if two labels parse to the same number
    # (previously the shared number consumed two ranks).
    ordered_nums = sorted(set(val2num.values()))
    num2cat = {num: rank for rank, num in enumerate(ordered_nums, start=1)}
    nominal_dict[n] = {val: num2cat[num] for val, num in val2num.items()}

  # Work on a copy so the caller's dataframe keeps its original labels.
  df_num = df.copy()

  df_num['sex'] = [dict_sex[x] for x in df['sex']]
  df_num['ChHepatitis'] = [dict_chhepa[x] for x in df['ChHepatitis']]
  df_num['Cirrhosis'] = [dict_cirr[x] for x in df['Cirrhosis']]

  # Everything that is neither ordinal nor special is an absent/present flag.
  non_binary = set(nominal_feat + special_feat)
  categorical_feat = [x for x in df.columns if x not in non_binary]
  for c in categorical_feat:
    df_num[c] = [dict_cat[x] for x in df[c]]

  for n in nominal_feat:
    df_num[n] = [nominal_dict[n][x] for x in df[n]]

  # Nothing below draws random numbers; the global seed is still set so that
  # the NumPy RNG state after this call is the same as it has always been.
  np.random.seed(1)

  y = df_num[pred_col]
  X_feat = df_num.drop(columns=[pred_col])
  feature_names = X_feat.columns
  iscat = [x in categorical_feat for x in feature_names]

  return X_feat, y, feature_names, iscat

# Encode the HEPAR data, then carve out the 70/30 hold-out split used for tuning.
X_iid, y_iid, feat_names_iid, iscat_iid = preproc_wrapper(df_raw_iid)

# From here on the notebook-level `feature_names` / `iscat` describe HEPAR.
feature_names, iscat = feat_names_iid, iscat_iid

X_train_hepar, X_test_hepar, y_train_hepar, y_test_hepar = train_test_split(
    X_iid, y_iid, test_size=0.3, random_state=6)

In [None]:
# Tune every model on the HEPAR training split (default 5 folds, F1 scoring).
r_hepar = run_nested_cv(X=X_train_hepar, y=y_train_hepar, clf_dict=clf_dict, clf_param=clf_param)

In [None]:
# Persist the selected HEPAR hyperparameters in the project folder.
save_best_param(fpath=ROOT_PATH + '/hepar_best_param.txt', r=r_hepar)