#Requirements

!pip install pandas==1.1.5 

!pip install numpy==1.22.4 

!pip install scipy==1.7.3 

!pip install scikit-learn==1.1.1 

!pip install joblib==1.1.0 

!pip install pyyaml==5.4.1 

!pip install shap==0.38.1 

!pip install matplotlib==3.5.1 

!pip install xgboost==1.6.1 

!pip install joblib==1.1.0


In [None]:
import helpers_mi as helper
import pandas as pd
import numpy as np
from scipy import stats
import joblib

import yaml
from yaml.loader import SafeLoader

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline, FunctionTransformer

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, SimpleImputer


from sklearn.model_selection import StratifiedKFold

from sklearn.metrics import (
    precision_score,
    recall_score,
    f1_score,
    accuracy_score,
    roc_auc_score,
)

import pickle


from sklearn.ensemble import (
    RandomForestClassifier,
    GradientBoostingClassifier,
    ExtraTreesClassifier,
)

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LinearRegression

import shap 

import matplotlib.pyplot as plt
%matplotlib agg

In [None]:
# ---------------------------------------------------------------------------
# Run configuration. Every name below is read by the later cells.
# ---------------------------------------------------------------------------

# Seed NumPy's global random generator for reproducibility.
np.random.seed(0)

# Dataset file, INCLUDING the '.csv' extension.
# If you have multiple imputed datasets DO NOT add the dataset number here:
# the validation loop strips the last four characters ('.csv') and appends
# '<number>.csv' itself. Example: for files cluster_mi1.csv, cluster_mi2.csv,
# ... use 'cluster_mi.csv'.
data_file = 'data/mi_data/ndipass_swe_mi.csv'

# Number of datasets your MICE procedure created.
n_datasets = 50

# Set to True if the output (target) needs imputing; forwarded to
# helper.get_data.
impute_output = False

# Set to True if you have predetermined clusters for cross validation;
# forwarded to helper.get_data.
extint_val = False

# Name of the output variable you want to predict.
target_variable = 'ndipass'

# File the final results table is written to.
output_file       = 'data/results/validation/results_models_validation.csv'

# Probability cut-off handed to helper.get_prediction.
threshold = 0.50

# Settings files
settings_file     = 'settings/settings_val.yml'      # models, clusters, scoring
variable_names    = 'settings/variable_names.yml'    # variable names/labels for graphs
sample_file       = 'settings/sample_patients.yml'   # example patients

# Folder prefix the fitted models are loaded from
# (model_dump + model_name + '_cv.joblib').
model_dump = 'savedmodels/'

In [None]:
# Run the path check from helpers_mi before doing any work
# (presumably it fails early or creates folders when something is missing — TODO confirm in helpers_mi).
helper.check_paths() #Check that all the folders needed are there

In [None]:
# Parse the three YAML configuration files with PyYAML's safe loader.

# Models and their hyperparameters
with open(settings_file, "r") as settings_stream:
    settings = yaml.safe_load(settings_stream)

# Variable names and labels used in the graphs
with open(variable_names, "r") as names_stream:
    var_names = yaml.safe_load(names_stream)

# Example patients used to check variable influence
with open(sample_file, 'r') as samples_stream:
    samples = yaml.safe_load(samples_stream)

In [None]:
# Create the result containers from the settings. The validation loop fills
# test_results, indexing it as test_results[model_name][cluster_name][key].
train_results, test_results = helper.get_results_dicts(settings)

In [None]:
# ---------------------------------------------------------------------------
# Validation loop.
#
# For every imputed dataset, every saved model is applied to every cluster
# listed in settings['clusters']. Per (model, cluster) the loop appends to
# test_results: the raw test features ('shap_df'), the scores with their SE and
# CI, a per-row probability table ('Proba_df'), plus whatever the ROC,
# calibration and SHAP helpers store. Per (model, dataset) a CSV with the
# stacked probability tables is written.
#
# NOTE: the local names used below (y_test, y_pred_test, y_proba_test, x_test,
# model, attr, ...) are deliberately kept as they are: the scoring expression
# returned by helper.get_scoring is evaluated with eval() in this scope and
# presumably refers to some of them by name — verify against helpers_mi before
# renaming anything.
# ---------------------------------------------------------------------------
for d in range(n_datasets):

    # Getting the dataset for this run. With several imputed datasets the
    # 1-based dataset number is inserted in front of the 4-character '.csv'
    # extension of data_file.
    if n_datasets > 1:
        dataset = f'{data_file[:-4]}{d+1}.csv'
    else:
        dataset = data_file
    df, x, y, clusters_idx = helper.get_data(dataset, target_variable, impute_output = impute_output, extint_val = extint_val)

    # Getting the sample patients: one row per entry of the samples file.
    # NOTE(review): DataFrame.append was deprecated in pandas 1.4 and removed
    # in 2.0. It is valid for the pandas==1.1.5 pin of this notebook; switch to
    # pd.concat (and re-check the resulting dtypes) before upgrading pandas.
    sample_x = pd.DataFrame(columns = x.columns)

    for values in samples.values():
        sample_x = sample_x.append(values, ignore_index = True)


    for model_name, attr in settings['models'].items():

        print(model_name)
        # The saved object is indexed by 'preprocessor' (the fitted
        # transformation pipeline) and 'predictor' (the fitted classifier).
        full_model = joblib.load(model_dump + model_name + '_cv.joblib')
        pipe = full_model['preprocessor']
        model = full_model['predictor']
        proba_df_test = pd.DataFrame(columns = ['cluster','Test', 'Predicted', 'Proba'])

        numerical_features, categorical_features = helper.feature_discrimination(x,settings)

        sample_m = sample_x[x.columns].copy()

        #Changing the order of features to match it to the ones after the pipeline - This is done for SHAP purposes
        x_ = x.loc[:, numerical_features.to_list()+categorical_features.to_list()].copy()
        sample_m = sample_m.loc[:, numerical_features.to_list()+categorical_features.to_list()]

        #List for example patients (one prediction vector is added per cluster)
        y_sample_test = []

        #Divide the datasets into train-test with respect to pre-defined clusters
        for cluster_name in settings['clusters']:
            n_cluster = settings['clusters'][cluster_name]
            print(f'Testing Cluster N{n_cluster}, {cluster_name}')

            x_test  = x_[clusters_idx == n_cluster].copy()
            y_test  =  y[clusters_idx == n_cluster].copy()

            # Keep the untransformed test features; merge_results averages them later.
            test_results[model_name][cluster_name]['shap_df'].append(x_test)

            x_test  = pipe.transform(x_test)
            sample_model = pipe.transform(sample_m)

            # Rebuild the column labels of the transformed matrix: numerical
            # features first, then the one-hot encoded categorical features.
            features = list(numerical_features) + list(pipe['preprocessor'].transformers_[1][1]['one-hot'].get_feature_names_out(categorical_features))
            x_test = pd.DataFrame(x_test, columns = features)
            sample_model = pd.DataFrame(sample_model, columns = features)

            # Probability of the positive class (second predict_proba column).
            y_proba_test  = model.predict_proba(x_test)[:,1]

            y_sample_test.append(model.predict_proba(sample_model)[:,1])


            y_pred_test  = helper.get_prediction(y_test, y_proba_test, threshold)

            #Compute SHAP if needed - Only for first dataset
            if settings['models'][model_name]['shap'] == True and d == 0:
                print("Getting SHAP values")
                shap_type = settings['models'][model_name]['shap_type']
                test_results[model_name][cluster_name] = helper.compute_SHAP(model,
                                                                             x_test,
                                                                             test_results[model_name][cluster_name],
                                                                             categorical_features,
                                                                             shap_type
                                                                            )

            test_results[model_name][cluster_name] = helper.get_ROC_cluster_mi(model,
                                                                               x_test,
                                                                               y_test,
                                                                               test_results,
                                                                               model_name,
                                                                               cluster_name)

            test_results[model_name][cluster_name] = helper.get_calibration(y_test,
                                                                            y_proba_test,
                                                                            test_results,
                                                                            model_name,
                                                                            cluster_name)

            for score_name, attr in settings['scoring_test'].items():
                # SECURITY: the expression built from the settings file is run
                # with eval(); only use settings files you trust.
                score = helper.get_scoring(score_name, attr)
                score_value = eval(score)
                test_results[model_name][cluster_name][score_name].append(score_value)

                SE = helper.get_SE(score_value, y_test)
                test_results[model_name][cluster_name][score_name + '_SE'].append(SE)

                CI = helper.get_CI(SE, y_test.shape[0])
                test_results[model_name][cluster_name][score_name + '_CI'].append(CI)


            # Per-row table of observed outcome, predicted label and probability.
            # NOTE(review): 'cluster' is assigned from the full clusters_idx, not
            # the filtered one; this only yields this cluster's rows if
            # clusters_idx is index-aligned with y_test — confirm in helper.get_data.
            proba_df = pd.DataFrame(columns = ['cluster','Test', 'Predicted', 'Proba'])
            proba_df['Test']      = y_test
            proba_df['Predicted'] = y_pred_test
            proba_df['Proba']     = y_proba_test
            proba_df['cluster']   = clusters_idx
            test_results[model_name][cluster_name]['Proba_df'].append(proba_df)

            proba_df_test = pd.concat([proba_df_test, proba_df])


        # Average of the sample-patient probabilities collected over the clusters.
        sample_x[f"{model_name} "] = np.mean(y_sample_test, axis = 0)

        proba_df_test.to_csv(f'data/results/validation/{model_name}_cluster_mi{d+1}_Proba_Test.csv', sep=';', index=False)

# NOTE(review): sample_x is rebuilt for every dataset, so this file only holds
# the sample-patient predictions of the LAST dataset — confirm this is intended.
sample_x.to_csv('data/results/validation/Samples.csv', sep=';')

In [None]:
def merge_results(tr_res,te_res, settings_):
    """Merge the per-dataset test results into one result per model and cluster.

    For every model in ``settings_['models']`` and every cluster in
    ``settings_['clusters']``:

    * each test score, plus ``'Intercept'`` and ``'Slope'``, is combined with
      its ``_SE`` and ``_CI`` lists through ``helper.apply_rubin_rule``;
    * the ``'Proba_df'`` frames are averaged element-wise (index and columns
      are taken from the first frame);
    * if the model has ``shap == True``, the ``.values`` of the stored SHAP
      objects and the ``'shap_df'`` frames are averaged element-wise, and the
      first stored ``'shap_explainer'`` is carried over.

    Parameters
    ----------
    tr_res : dict
        Train results. Accepted for interface compatibility; not read here.
    te_res : dict
        Test results, indexed as ``te_res[model_name][cluster_name][key]``,
        where each value is a list with one entry per dataset.
    settings_ : dict
        Parsed settings with at least ``'models'``, ``'clusters'`` and a
        scoring section.

    Returns
    -------
    tuple
        ``(tr_results_merged, te_results_merged)`` as created by
        ``helper.get_results_dicts``. Only the test container is filled; the
        train container is returned as created.
    """
    tr_results_merged, te_results_merged = helper.get_results_dicts(settings_)

    # The test results are filled with the 'scoring_test' score names, so those
    # are the names to merge. Fall back to 'scoring_train' (the section this
    # function used before) for settings that have no 'scoring_test'.
    if 'scoring_test' in settings_:
        score_names = list(settings_['scoring_test'])
    else:
        score_names = list(settings_['scoring_train'])

    # Calibration intercept/slope are pooled exactly like the scores.
    pooled_keys = score_names + ['Intercept', 'Slope']

    for model_name in settings_['models']:
        for cluster_name in settings_['clusters']:
            per_dataset = te_res[model_name][cluster_name]
            merged = te_results_merged[model_name][cluster_name]

            for key in pooled_keys:
                merged_score, merged_SE, CI = helper.apply_rubin_rule(
                    per_dataset[key],
                    per_dataset[key + '_SE'],
                    per_dataset[key + '_CI'],
                )

                merged[key].append(merged_score)
                merged[key + '_SE'].append(merged_SE)
                merged[key + '_CI'].append(CI)

            # Element-wise mean of the per-dataset probability tables.
            proba_frames = per_dataset['Proba_df']
            merged['Proba_df'].append(pd.DataFrame(np.mean(proba_frames, axis=0),
                                                   index = proba_frames[0].index,
                                                   columns = proba_frames[0].columns))

            if settings_['models'][model_name]['shap']==True:
                shap_list = [shap_result.values for shap_result in per_dataset['shap_values']]
                merged['shap_values'].append(np.mean(shap_list, axis=0))

                # Element-wise mean of the stored (untransformed) test features.
                shap_frames = per_dataset['shap_df']
                merged['shap_df'].append(pd.DataFrame(np.mean(shap_frames, axis=0),
                                                      index = shap_frames[0].index,
                                                      columns = shap_frames[0].columns))
                merged['shap_explainer'].append(per_dataset['shap_explainer'][0])
    return tr_results_merged, te_results_merged

In [None]:
# Merge the results collected over the n_datasets imputed datasets into one
# result per model and cluster.
train_results_merged, test_results_merged = merge_results(train_results, test_results, settings)


In [None]:
# Flatten {model: {region: results}} into {(model, region): results}; the keys
# become the (Model, Region) row index of the summary tables.
reform = {
    (outerKey, innerKey): values
    for outerKey, innerDict in test_results_merged.items()
    for innerKey, values in innerDict.items()
}
rows = pd.MultiIndex.from_tuples(list(reform), names = ['Model','Region'])

# Metrics reported in the table; each one gets a value, a '_CI' and a '_SE' column.
to_keep = ['roc_auc','Intercept','Slope','precision','NPV']
columns = [item + suffix for item in to_keep for suffix in ('', '_CI', '_SE')]

train_results_df = pd.DataFrame(columns = columns, index = rows)
test_results_df = pd.DataFrame(columns = columns, index = rows)

for model_name in settings['models']:
    for cluster_name in settings['clusters']:
        merged_cluster = test_results_merged[model_name][cluster_name]
        for item in to_keep:
            # First (only) merged entry of the metric, its SE and its CI,
            # rounded to four decimals.
            for suffix in ('', '_SE', '_CI'):
                test_results_df.loc[(model_name,cluster_name),[item + suffix]] = round(merged_cluster[item + suffix][0],4)

    # 'Overall' row: column-wise mean over this model's rows.
    test_results_df.loc[(model_name,'Overall'),:] = round(test_results_df.loc[model_name].mean(),4)
            


In [None]:
# Produce the plots for every configured model.
for model_name, attr in settings['models'].items():
    print(model_name)

    # SHAP plots, only for models flagged with shap in the settings.
    # Comment this branch out to avoid SHAP.
    if attr['shap'] == True:
        print('Getting SHAP Plots')
        test_results[model_name] = helper.get_SHAP_plot_validation(
            test_results, model_name, var_names,settings, cols = 3
        )

    # ROC plots are produced for every model.
    print('Getting ROC Plots')
    test_results_merged[model_name] = helper.get_ROC_plot_validation(
        test_results_merged, test_results, model_name, settings
    )
    

In [None]:
# Stack the train table on top of the test table, labelling the new outer index
# level 'Train' / 'Test'. (train_results_df is left unfilled by the cells shown
# in this notebook, so its rows contain no values.)
results_df = pd.concat([train_results_df, test_results_df], keys = ['Train', 'Test'], axis = 0)

# Move the region level (third index level) into the first column and label it.
# The label is set with rename() rather than by writing into
# results_df.columns.values: an Index is meant to be immutable and mutating its
# backing array can leave label lookups out of sync.
results_df = results_df.reset_index(level = [2])
results_df = results_df.rename(columns = {results_df.columns[0]: 'Region'})

# Same for the model level (second index level), which becomes the first column.
results_df = results_df.reset_index(level = [1])
results_df = results_df.rename(columns = {results_df.columns[0]: 'Model'})
results_df

In [None]:
# Write the final results table to output_file as a ';'-separated CSV.
results_df.to_csv(output_file, sep =';')
