In [None]:
#Import necessary modules 

import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt

import statistics
from statistics import mean
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score 
from sklearn.metrics import make_scorer
from sklearn.metrics import f1_score

from sklearn import svm

from sklearn import metrics 
from sklearn.metrics import multilabel_confusion_matrix 
from sklearn.metrics import plot_confusion_matrix 
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV 

from sklearn.metrics import cohen_kappa_score 
from statsmodels.stats.inter_rater import fleiss_kappa 

In [None]:
import warnings
# Silence every warning for the rest of the session (no category or module filter is given),
# so warnings raised by later cells will not be shown.
warnings.filterwarnings('ignore')

In [None]:
#Connect to HiRID database

import os
from getpass import getpass

import psycopg2
from psycopg2 import Error

#Connect to HiRID
#Connection settings are read from environment variables so that the password is not stored
#in the notebook. Non-secret settings fall back to the values previously hard-coded here;
#the password is prompted for when HIRID_DB_PASSWORD is not set.
conn = psycopg2.connect(user=os.environ.get("HIRID_DB_USER", "mimicuser"),
                        password=os.environ.get("HIRID_DB_PASSWORD") or getpass("HiRID database password: "),
                        host=os.environ.get("HIRID_DB_HOST", "172.17.0.1"),
                        port=os.environ.get("HIRID_DB_PORT", "5433"),
                        database=os.environ.get("HIRID_DB_NAME", "HiRID"))

#Cursor 
cur = conn.cursor()

## 1. Import Training Datasets

Note: All annotated datasets were provided by the data controller (Prof. Malcolm Sim) as Excel files. In this section, all datasets are imported in their raw format.

In [None]:
#Define function to add numeric label columns to all 11 QEUH annotated datasets

def num_labels(df):
    """Append four numeric label columns derived from the letter in ``df['Annotation']``.

    The frame is modified in place and the same object is returned. The columns
    are written in this order:

    * ``Annotation_Num`` - multiclass code: A=0, B=1, C=2, D=3, E=4
    * ``Ann_Bin_A``      - binary: A=0, B/C/D/E=1
    * ``Ann_Bin_B``      - binary: A/B=0, C/D/E=1
    * ``Ann_Bin_C``      - binary: A/B/C=0, D/E=1

    Any value other than the exact strings 'A'-'E' (missing values, lower-case
    letters, ...) is coded 0 in every column.
    """
    #One lookup table per output column; dict order fixes the column order
    encodings = {
        'Annotation_Num': {'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4},
        'Ann_Bin_A':      {'A': 0, 'B': 1, 'C': 1, 'D': 1, 'E': 1},
        'Ann_Bin_B':      {'A': 0, 'B': 0, 'C': 1, 'D': 1, 'E': 1},
        'Ann_Bin_C':      {'A': 0, 'B': 0, 'C': 0, 'D': 1, 'E': 1},
    }

    letters = df['Annotation']
    for column, lookup in encodings.items():
        #Unmapped letters come back as NaN from map(); they take the default code 0
        df[column] = letters.map(lookup).fillna(0).astype('int64')

    return df

In [None]:
#Import Consultant no.1 dataset

#Columns to discard after loading
cols = ['Dobutamine','Time','Bckgrnd','PseudoID','Line of Selected Timepoint']

#Load, order rows by PseudoID, discard the listed columns and rename Mean to MAP
c1 = (
    pd.read_excel('./p01.xlsx')
    .sort_values(by=['PseudoID'], ascending=[True])
    .drop(columns=cols)
    .rename(columns={'Mean': 'MAP'})
)

#Replace null with 0 in drug fields (as blank value indicates value=0, as confirmed by Prof Sim)
for drug in ['Adrenaline', 'Noradrenaline']:
    c1[drug] = c1[drug].replace(np.nan, 0)

#Append the numeric label columns
c1 = num_labels(c1)

print(c1.shape)
c1.head()

In [None]:
#Import Consultant no.2 dataset

#Columns to discard after loading
cols = ['Dobutamine','Time','Bckgrnd','PseudoID','Line of Selected Timepoint']

#Load, order rows by PseudoID, discard the listed columns and rename Mean to MAP
c2 = (
    pd.read_csv('./p02.csv')
    .sort_values(by=['PseudoID'], ascending=[True])
    .drop(columns=cols)
    .rename(columns={'Mean': 'MAP'})
)

#Replace null with 0 in drug fields (as blank value indicates value=0, as confirmed by Prof Sim)
for drug in ['Adrenaline', 'Noradrenaline']:
    c2[drug] = c2[drug].replace(np.nan, 0)

#Append the numeric label columns
c2 = num_labels(c2)

print(c2.shape)
c2.head()

In [None]:
#Import Consultant no.3 dataset

#Columns to discard after loading
cols = ['Dobutamine','Time','Bckgrnd','PseudoID','Line of Selected Timepoint']

#Load, order rows by PseudoID, discard the listed columns and rename Mean to MAP
c3 = (
    pd.read_csv('./p03.csv')
    .sort_values(by=['PseudoID'], ascending=[True])
    .drop(columns=cols)
    .rename(columns={'Mean': 'MAP'})
)

#Replace null with 0 in drug fields (as blank value indicates value=0, as confirmed by Prof Sim)
for drug in ['Adrenaline', 'Noradrenaline']:
    c3[drug] = c3[drug].replace(np.nan, 0)

#Append the numeric label columns
c3 = num_labels(c3)

print(c3.shape)
c3.head()

In [None]:
#Import Consultant no.4 dataset

#Columns to discard after loading
cols = ['Dobutamine','Time','Bckgrnd','PseudoID','Line of Selected Timepoint']

#Load, order rows by PseudoID, discard the listed columns and rename Mean to MAP
c4 = (
    pd.read_excel('./p04.xlsx')
    .sort_values(by=['PseudoID'], ascending=[True])
    .drop(columns=cols)
    .rename(columns={'Mean': 'MAP'})
)

#Replace null with 0 in drug fields (as blank value indicates value=0, as confirmed by Prof Sim)
for drug in ['Adrenaline', 'Noradrenaline']:
    c4[drug] = c4[drug].replace(np.nan, 0)

#Append the numeric label columns
c4 = num_labels(c4)

print(c4.shape)
c4.head()

In [None]:
#Import Consultant no.5 dataset

#Columns to discard after loading
cols = ['Dobutamine','Time','Bckgrnd','PseudoID','Line of Selected Timepoint']

#Load, order rows by PseudoID, discard the listed columns and rename Mean to MAP
c5 = (
    pd.read_csv('./p05.csv')
    .sort_values(by=['PseudoID'], ascending=[True])
    .drop(columns=cols)
    .rename(columns={'Mean': 'MAP'})
)

#Replace null with 0 in drug fields (as blank value indicates value=0, as confirmed by Prof Sim)
for drug in ['Adrenaline', 'Noradrenaline']:
    c5[drug] = c5[drug].replace(np.nan, 0)

#Append the numeric label columns
c5 = num_labels(c5)

print(c5.shape)
c5.head()

In [None]:
#Import Consultant no.6 dataset

#Columns to discard after loading
cols = ['Dobutamine','Time','Bckgrnd','PseudoID','Line of Selected Timepoint']

#Load, order rows by PseudoID, discard the listed columns and rename Mean to MAP
c6 = (
    pd.read_excel('./p06.xlsx')
    .sort_values(by=['PseudoID'], ascending=[True])
    .drop(columns=cols)
    .rename(columns={'Mean': 'MAP'})
)

#Replace null with 0 in drug fields (as blank value indicates value=0, as confirmed by Prof Sim)
for drug in ['Adrenaline', 'Noradrenaline']:
    c6[drug] = c6[drug].replace(np.nan, 0)

#Append the numeric label columns
c6 = num_labels(c6)

print(c6.shape)
c6.head()

In [None]:
#Import Consultant no.7 dataset

#Columns to discard after loading
cols = ['Dobutamine','Time','Bckgrnd','PseudoID','Line of Selected Timepoint']

#Load, order rows by PseudoID, discard the listed columns and rename Mean to MAP
c7 = (
    pd.read_csv('./p07.csv')
    .sort_values(by=['PseudoID'], ascending=[True])
    .drop(columns=cols)
    .rename(columns={'Mean': 'MAP'})
)

#Replace null with 0 in drug fields (as blank value indicates value=0, as confirmed by Prof Sim)
for drug in ['Adrenaline', 'Noradrenaline']:
    c7[drug] = c7[drug].replace(np.nan, 0)

#Append the numeric label columns
c7 = num_labels(c7)

print(c7.shape)
c7.head()

In [None]:
#Import Consultant no.8 dataset

#Columns to discard after loading
cols = ['Dobutamine','Time','Bckgrnd','PseudoID','Line of Selected Timepoint']

#Load, order rows by PseudoID, discard the listed columns and rename Mean to MAP
c8 = (
    pd.read_csv('./p08.csv')
    .sort_values(by=['PseudoID'], ascending=[True])
    .drop(columns=cols)
    .rename(columns={'Mean': 'MAP'})
)

#Replace null with 0 in drug fields (as blank value indicates value=0, as confirmed by Prof Sim)
for drug in ['Adrenaline', 'Noradrenaline']:
    c8[drug] = c8[drug].replace(np.nan, 0)

#Append the numeric label columns
c8 = num_labels(c8)

print(c8.shape)
c8.head()

In [None]:
#Import Consultant no.9 dataset

#Columns to discard after loading
cols = ['Dobutamine','Time','Bckgrnd','PseudoID','Line of Selected Timepoint']

#Load, order rows by PseudoID, discard the listed columns and rename Mean to MAP
c9 = (
    pd.read_csv('./p09.csv')
    .sort_values(by=['PseudoID'], ascending=[True])
    .drop(columns=cols)
    .rename(columns={'Mean': 'MAP'})
)

#Replace null with 0 in drug fields (as blank value indicates value=0, as confirmed by Prof Sim)
for drug in ['Adrenaline', 'Noradrenaline']:
    c9[drug] = c9[drug].replace(np.nan, 0)

#Append the numeric label columns
c9 = num_labels(c9)

print(c9.shape)
c9.head()

In [None]:
#Import Consultant no.10 dataset

#Columns to discard after loading
cols = ['Dobutamine','Time','Bckgrnd','PseudoID','Line of Selected Timepoint']

#Load, order rows by PseudoID, discard the listed columns and rename Mean to MAP
c10 = (
    pd.read_csv('./p10.csv')
    .sort_values(by=['PseudoID'], ascending=[True])
    .drop(columns=cols)
    .rename(columns={'Mean': 'MAP'})
)

#Replace null with 0 in drug fields (as blank value indicates value=0, as confirmed by Prof Sim)
for drug in ['Adrenaline', 'Noradrenaline']:
    c10[drug] = c10[drug].replace(np.nan, 0)

#Append the numeric label columns
c10 = num_labels(c10)

print(c10.shape)
c10.head()

In [None]:
#Import Consultant no.11 dataset

#Columns to discard after loading
cols = ['Dobutamine','Time','Bckgrnd','PseudoID','Line of Selected Timepoint']

#Load, order rows by PseudoID, discard the listed columns and rename Mean to MAP
c11 = (
    pd.read_excel('./p11.xlsx')
    .sort_values(by=['PseudoID'], ascending=[True])
    .drop(columns=cols)
    .rename(columns={'Mean': 'MAP'})
)

#Replace null with 0 in drug fields (as blank value indicates value=0, as confirmed by Prof Sim)
for drug in ['Adrenaline', 'Noradrenaline']:
    c11[drug] = c11[drug].replace(np.nan, 0)

#Upper-case the annotation letters before encoding (num_labels only matches 'A'-'E' exactly)
c11['Annotation'] = c11['Annotation'].str.upper()

#Append the numeric label columns
c11 = num_labels(c11)

print(c11.shape)
c11.head()

In [None]:
#Import Majority MV Consensus Dataset
##See jupyter notebook 'npjDM-MV_Consensus_Dataset' for steps to create this Majority MV Consensus Dataset

#Load, discard the saved index column and rename Mean to MAP
mv = (
    pd.read_csv('MV-Consensus-Dataset.csv')
    .drop('Unnamed: 0', axis=1)
    .rename(columns={'Mean': 'MAP'})
)

#Replace null with 0 in drug fields (as blank value indicates value=0, as confirmed by Prof Sim)
for drug in ['Adrenaline', 'Noradrenaline']:
    mv[drug] = mv[drug].replace(np.nan, 0)

#Append the numeric label columns
mv = num_labels(mv)

print(mv.shape)
mv.head()

In [None]:
#TMV
##Create a TMV dataset by taking the majority-vote labels across only the expert annotated datasets which generate models that have high internal validation performance (i.e., where internal F1 >= 0.7).
##See jupyter notebook 'npjDM-IntVal-Top_Models' for steps to find top performing models
##Top performing models within internal validation: C2, C4, C8

#Reload the three top annotators' files without the numeric label columns
cols = ['Dobutamine','Time','Bckgrnd','PseudoID','Line of Selected Timepoint']

c2_ann = (
    pd.read_csv('./p02.csv')
    .sort_values(by=['PseudoID'], ascending=[True])
    .drop(columns=cols)
    .rename(columns={'Mean': 'MAP'})
)
c4_ann = (
    pd.read_excel('./p04.xlsx')
    .sort_values(by=['PseudoID'], ascending=[True])
    .drop(columns=cols)
    .rename(columns={'Mean': 'MAP'})
)
c8_ann = (
    pd.read_csv('./p08.csv')
    .sort_values(by=['PseudoID'], ascending=[True])
    .drop(columns=cols)
    .rename(columns={'Mean': 'MAP'})
)

#Join the three frames on the six shared measurement columns; the result is relabelled
#by position, so the three remaining columns become one vote column per annotator
cols = ['Adrenaline','Noradrenaline','FiO2','SpO2','MAP','HR']
votes = ['c2_ann', 'c4_ann', 'c8_ann']
merged = c2_ann.merge(c4_ann, on=cols).merge(c8_ann, on=cols)
merged.columns = cols + votes

#Row-wise mode of the three votes; [0] takes the first mode when there is more than one
ann_top = merged[votes].copy()
ann_top['Annotation'] = ann_top.mode(axis=1)[0]
ann_top = ann_top.drop(votes, axis=1)

#Measurements plus the voted Annotation column
tmv = pd.concat([merged[cols], ann_top], axis=1)

#Replace null with 0 in drug fields (as blank value indicates value=0, as confirmed by Prof Sim)
for drug in ['Adrenaline', 'Noradrenaline']:
    tmv[drug] = tmv[drug].replace(np.nan, 0)

#Append the numeric label columns
tmv = num_labels(tmv)

print(tmv.shape)
tmv.head()

# 2. Internal Validation Experiment

In [None]:
#Define Parameter Grid for hyperparameter optimisation
##Grid of SVM settings searched by GridSearchCV: every kernel is tried with both gamma options

parameters = dict(
    kernel=['linear', 'poly', 'rbf', 'sigmoid'],
    gamma=['scale', 'auto'],
)

In [None]:
#Define Function - svm Model Evaluation via 5-fold CV

def do_cv_learning_svm(X, y, verbose=False, do_scale=False, random_state=1):
    """Estimate SVM performance with nested cross-validation.

    The outer loop is a shuffled, stratified 5-fold split. Inside each outer fold a
    GridSearchCV (cv=5, scoring 'f1_micro') over the module-level ``parameters`` grid
    picks the best SVC on the training part, which is then scored on the held-out part.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix; converted with ``np.asarray`` so positional fold indexing works.
    y : array-like of shape (n_samples,)
        Class labels.
    verbose : bool
        When True, print the best parameters found in each outer fold.
    do_scale : bool
        When True, standardise the features. The scaler is fitted on the training part of
        each outer fold only, so held-out rows do not influence the scaling.
    random_state : int
        Seed for the outer split and for the SVC (previously accepted but ignored).

    Returns
    -------
    pandas.DataFrame
        One row with columns Annotator ('ann' placeholder), Model, Optimisation,
        F1_micro (mean over outer folds) and S.D. (``np.std`` over outer folds).
    """
    X = np.asarray(X)
    y = np.asarray(y)

    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)
    f1s = []

    for i, (train, test) in enumerate(cv.split(X, y)):
        X_tr, X_te = X[train], X[test]
        if do_scale:
            #Fit on the training rows only to avoid leaking held-out statistics
            sc = StandardScaler().fit(X_tr)
            X_tr, X_te = sc.transform(X_tr), sc.transform(X_te)

        gcsv = GridSearchCV(svm.SVC(random_state=random_state),
                            param_grid=parameters,
                            cv=5,
                            scoring='f1_micro')
        grid_result = gcsv.fit(X_tr, y[train])
        if verbose:
            print('fold', i, 'best_params', grid_result.best_params_)

        clf = grid_result.best_estimator_
        f1s.append(metrics.f1_score(y[test], clf.predict(X_te), average='micro'))

    ##Performance metrics as a one-row DataFrame
    return pd.DataFrame(
        data=[['ann', 'multi', 'F1_micro', np.mean(f1s), np.std(f1s)]],
        columns=['Annotator', 'Model', 'Optimisation', 'F1_micro', 'S.D.'],
    )

In [None]:
#Define Function - Find highest Performing model after 5-fold CV

def model_opt_svm(X, y, verbose=False, do_scale=False, random_state=1):
    """Return the best-scoring fold model from nested cross-validation.

    Runs the same procedure as the evaluation function: a shuffled, stratified 5-fold
    outer split with a GridSearchCV (cv=5, scoring 'f1_micro') over the module-level
    ``parameters`` grid inside each fold. The model from the outer fold with the highest
    held-out micro-F1 is returned (the first one on ties).

    Note that the returned model was fitted on the training part of one outer fold only,
    and was chosen by its score on that fold's held-out part.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix; converted with ``np.asarray`` so positional fold indexing works.
    y : array-like of shape (n_samples,)
        Class labels.
    verbose : bool
        When True, print the best parameters found in each outer fold.
    do_scale : bool
        When True, standardise the features with a scaler fitted on the training part of
        each outer fold. The returned object is then a Pipeline of that scaler and the
        SVC, so it can be applied to unscaled data.
    random_state : int
        Seed for the outer split and for the SVC (previously accepted but ignored).

    Returns
    -------
    estimator
        A fitted ``SVC`` (``do_scale=False``) or a fitted scaler+SVC ``Pipeline``.
    """
    from sklearn.pipeline import Pipeline  #only needed when do_scale=True

    X = np.asarray(X)
    y = np.asarray(y)

    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)
    f1s = []
    models = []

    for i, (train, test) in enumerate(cv.split(X, y)):
        X_tr, X_te = X[train], X[test]
        sc = None
        if do_scale:
            #Fit on the training rows only to avoid leaking held-out statistics
            sc = StandardScaler().fit(X_tr)
            X_tr, X_te = sc.transform(X_tr), sc.transform(X_te)

        gcsv = GridSearchCV(svm.SVC(random_state=random_state),
                            param_grid=parameters,
                            cv=5,
                            scoring='f1_micro')
        grid_result = gcsv.fit(X_tr, y[train])
        if verbose:
            print('fold', i, 'best_params', grid_result.best_params_)

        clf = grid_result.best_estimator_
        f1s.append(metrics.f1_score(y[test], clf.predict(X_te), average='micro'))

        #Keep the scaler with the model; otherwise the model would later be applied to
        #features on a different scale from the one it was trained on
        models.append(clf if sc is None else Pipeline([('scaler', sc), ('svc', clf)]))

    #find opt model: first fold reaching the maximum held-out F1
    opt_model = models[f1s.index(max(f1s))]

    return opt_model

In [None]:
#C1 - IntVal

#Positional split: columns 0-5 are the model inputs and column 7 is read as the label
#(presumably Annotation_Num - confirm against the column order printed at import)
array = c1.to_numpy()
X = array[:,0:6].astype(float)
y = array[:,7].astype(int)

#The encoder is fitted only to report which classes this annotator used. y is NOT re-encoded:
#LabelEncoder renumbers classes to 0..k-1, so if any class is absent the model's predictions
#would no longer match the 0-4 coding that the external validation labels (0/4) rely on.
le = LabelEncoder()
le.fit(y)

print(X.shape)
print(y.shape)
print(le.classes_)

#5-fold CV Model Eval
c1svm_multi_f1data = do_cv_learning_svm(X,y)
c1svm_multi_f1data['Annotator'] = 'C1'

#Find Opt model
c1svm_multi_opt = model_opt_svm(X,y)

print(c1svm_multi_opt)
c1svm_multi_f1data

In [None]:
#C2 - IntVal

#Positional split: columns 0-5 are the model inputs and column 7 is read as the label
#(presumably Annotation_Num - confirm against the column order printed at import)
array = c2.to_numpy()
X = array[:,0:6].astype(float)
y = array[:,7].astype(int)

#The encoder is fitted only to report which classes this annotator used. y is NOT re-encoded:
#LabelEncoder renumbers classes to 0..k-1, so if any class is absent the model's predictions
#would no longer match the 0-4 coding that the external validation labels (0/4) rely on.
le = LabelEncoder()
le.fit(y)

print(X.shape)
print(y.shape)
print(le.classes_)

#5-fold CV Model Eval
c2svm_multi_f1data = do_cv_learning_svm(X,y)
c2svm_multi_f1data['Annotator'] = 'C2'

#Find Opt model
c2svm_multi_opt = model_opt_svm(X,y)

print(c2svm_multi_opt)
c2svm_multi_f1data

In [None]:
#C3 - IntVal

#Positional split: columns 0-5 are the model inputs and column 7 is read as the label
#(presumably Annotation_Num - confirm against the column order printed at import)
array = c3.to_numpy()
X = array[:,0:6].astype(float)
y = array[:,7].astype(int)

#The encoder is fitted only to report which classes this annotator used. y is NOT re-encoded:
#LabelEncoder renumbers classes to 0..k-1, so if any class is absent the model's predictions
#would no longer match the 0-4 coding that the external validation labels (0/4) rely on.
le = LabelEncoder()
le.fit(y)

print(X.shape)
print(y.shape)
print(le.classes_)

#5-fold CV Model Eval
c3svm_multi_f1data = do_cv_learning_svm(X,y)
c3svm_multi_f1data['Annotator'] = 'C3'

#Find Opt model
c3svm_multi_opt = model_opt_svm(X,y)

print(c3svm_multi_opt)
c3svm_multi_f1data

In [None]:
#C4 - IntVal

#Positional split: columns 0-5 are the model inputs and column 7 is read as the label
#(presumably Annotation_Num - confirm against the column order printed at import)
array = c4.to_numpy()
X = array[:,0:6].astype(float)
y = array[:,7].astype(int)

#The encoder is fitted only to report which classes this annotator used. y is NOT re-encoded:
#LabelEncoder renumbers classes to 0..k-1, so if any class is absent the model's predictions
#would no longer match the 0-4 coding that the external validation labels (0/4) rely on.
le = LabelEncoder()
le.fit(y)

print(X.shape)
print(y.shape)
print(le.classes_)

#5-fold CV Model Eval
c4svm_multi_f1data = do_cv_learning_svm(X,y)
c4svm_multi_f1data['Annotator'] = 'C4'

#Find Opt model
c4svm_multi_opt = model_opt_svm(X,y)

print(c4svm_multi_opt)
c4svm_multi_f1data

In [None]:
#C5 - IntVal

#Positional split: columns 0-5 are the model inputs and column 7 is read as the label
#(presumably Annotation_Num - confirm against the column order printed at import)
array = c5.to_numpy()
X = array[:,0:6].astype(float)
y = array[:,7].astype(int)

#The encoder is fitted only to report which classes this annotator used. y is NOT re-encoded:
#LabelEncoder renumbers classes to 0..k-1, so if any class is absent the model's predictions
#would no longer match the 0-4 coding that the external validation labels (0/4) rely on.
le = LabelEncoder()
le.fit(y)

print(X.shape)
print(y.shape)
print(le.classes_)

#5-fold CV Model Eval
c5svm_multi_f1data = do_cv_learning_svm(X,y)
c5svm_multi_f1data['Annotator'] = 'C5'

#Find Opt model
c5svm_multi_opt = model_opt_svm(X,y)

print(c5svm_multi_opt)
c5svm_multi_f1data

In [None]:
#C6 - IntVal

#Positional split: columns 0-5 are the model inputs and column 7 is read as the label
#(presumably Annotation_Num - confirm against the column order printed at import)
array = c6.to_numpy()
X = array[:,0:6].astype(float)
y = array[:,7].astype(int)

#The encoder is fitted only to report which classes this annotator used. y is NOT re-encoded:
#LabelEncoder renumbers classes to 0..k-1, so if any class is absent the model's predictions
#would no longer match the 0-4 coding that the external validation labels (0/4) rely on.
le = LabelEncoder()
le.fit(y)

print(X.shape)
print(y.shape)
print(le.classes_)

#5-fold CV Model Eval
c6svm_multi_f1data = do_cv_learning_svm(X,y)
c6svm_multi_f1data['Annotator'] = 'C6'

#Find Opt model
c6svm_multi_opt = model_opt_svm(X,y)

print(c6svm_multi_opt)
c6svm_multi_f1data

In [None]:
#C7 - IntVal

#Positional split: columns 0-5 are the model inputs and column 7 is read as the label
#(presumably Annotation_Num - confirm against the column order printed at import)
array = c7.to_numpy()
X = array[:,0:6].astype(float)
y = array[:,7].astype(int)

#The encoder is fitted only to report which classes this annotator used. y is NOT re-encoded:
#LabelEncoder renumbers classes to 0..k-1, so if any class is absent the model's predictions
#would no longer match the 0-4 coding that the external validation labels (0/4) rely on.
le = LabelEncoder()
le.fit(y)

print(X.shape)
print(y.shape)
print(le.classes_)

#5-fold CV Model Eval
c7svm_multi_f1data = do_cv_learning_svm(X,y)
c7svm_multi_f1data['Annotator'] = 'C7'

#Find Opt model
c7svm_multi_opt = model_opt_svm(X,y)

print(c7svm_multi_opt)
c7svm_multi_f1data

In [None]:
#C8 - IntVal

#Positional split: columns 0-5 are the model inputs and column 7 is read as the label
#(presumably Annotation_Num - confirm against the column order printed at import)
array = c8.to_numpy()
X = array[:,0:6].astype(float)
y = array[:,7].astype(int)

#The encoder is fitted only to report which classes this annotator used. y is NOT re-encoded:
#LabelEncoder renumbers classes to 0..k-1, so if any class is absent the model's predictions
#would no longer match the 0-4 coding that the external validation labels (0/4) rely on.
le = LabelEncoder()
le.fit(y)

print(X.shape)
print(y.shape)
print(le.classes_)

#5-fold CV Model Eval
c8svm_multi_f1data = do_cv_learning_svm(X,y)
c8svm_multi_f1data['Annotator'] = 'C8'

#Find Opt model
c8svm_multi_opt = model_opt_svm(X,y)

print(c8svm_multi_opt)
c8svm_multi_f1data

In [None]:
#C9 - IntVal

#Positional split: columns 0-5 are the model inputs and column 7 is read as the label
#(presumably Annotation_Num - confirm against the column order printed at import)
array = c9.to_numpy()
X = array[:,0:6].astype(float)
y = array[:,7].astype(int)

#The encoder is fitted only to report which classes this annotator used. y is NOT re-encoded:
#LabelEncoder renumbers classes to 0..k-1, so if any class is absent the model's predictions
#would no longer match the 0-4 coding that the external validation labels (0/4) rely on.
le = LabelEncoder()
le.fit(y)

print(X.shape)
print(y.shape)
print(le.classes_)

#5-fold CV Model Eval
c9svm_multi_f1data = do_cv_learning_svm(X,y)
c9svm_multi_f1data['Annotator'] = 'C9'

#Find Opt model
c9svm_multi_opt = model_opt_svm(X,y)

print(c9svm_multi_opt)
c9svm_multi_f1data

In [None]:
#C10 - IntVal

#Positional split: columns 0-5 are the model inputs and column 7 is read as the label
#(presumably Annotation_Num - confirm against the column order printed at import)
array = c10.to_numpy()
X = array[:,0:6].astype(float)
y = array[:,7].astype(int)

#The encoder is fitted only to report which classes this annotator used. y is NOT re-encoded:
#LabelEncoder renumbers classes to 0..k-1, so if any class is absent the model's predictions
#would no longer match the 0-4 coding that the external validation labels (0/4) rely on.
le = LabelEncoder()
le.fit(y)

print(X.shape)
print(y.shape)
print(le.classes_)

#5-fold CV Model Eval
c10svm_multi_f1data = do_cv_learning_svm(X,y)
c10svm_multi_f1data['Annotator'] = 'C10'

#Find Opt model
c10svm_multi_opt = model_opt_svm(X,y)

print(c10svm_multi_opt)
c10svm_multi_f1data

In [None]:
#C11 - IntVal

#Positional split: columns 0-5 are the model inputs and column 7 is read as the label
#(presumably Annotation_Num - confirm against the column order printed at import)
array = c11.to_numpy()
X = array[:,0:6].astype(float)
y = array[:,7].astype(int)

#The encoder is fitted only to report which classes this annotator used. y is NOT re-encoded:
#LabelEncoder renumbers classes to 0..k-1, so if any class is absent the model's predictions
#would no longer match the 0-4 coding that the external validation labels (0/4) rely on.
le = LabelEncoder()
le.fit(y)

print(X.shape)
print(y.shape)
print(le.classes_)

#5-fold CV Model Eval
c11svm_multi_f1data = do_cv_learning_svm(X,y)
c11svm_multi_f1data['Annotator'] = 'C11'

#Find Opt model
c11svm_multi_opt = model_opt_svm(X,y)

print(c11svm_multi_opt)
c11svm_multi_f1data

In [None]:
#MV - IntVal

#Positional split: columns 0-5 are the model inputs and column 7 is read as the label
#(presumably Annotation_Num - confirm against the column order printed at import)
array = mv.to_numpy()
X = array[:,0:6].astype(float)
y = array[:,7].astype(int)

#The encoder is fitted only to report which classes are present. y is NOT re-encoded:
#LabelEncoder renumbers classes to 0..k-1, so if any class is absent the model's predictions
#would no longer match the 0-4 coding that the external validation labels (0/4) rely on.
le = LabelEncoder()
le.fit(y)

print(X.shape)
print(y.shape)
print(le.classes_)

#5-fold CV Model Eval
mvsvm_multi_f1data = do_cv_learning_svm(X,y)
mvsvm_multi_f1data['Annotator'] = 'MV'

#Find Opt model
mvsvm_multi_opt = model_opt_svm(X,y)

print(mvsvm_multi_opt)
mvsvm_multi_f1data

In [None]:
#TMV - IntVal

#Positional split: columns 0-5 are the model inputs and column 7 is read as the label
#(presumably Annotation_Num - confirm against the column order printed at import)
array = tmv.to_numpy()
X = array[:,0:6].astype(float)
y = array[:,7].astype(int)

#The encoder is fitted only to report which classes are present. y is NOT re-encoded:
#LabelEncoder renumbers classes to 0..k-1, so if any class is absent the model's predictions
#would no longer match the 0-4 coding that the external validation labels (0/4) rely on.
le = LabelEncoder()
le.fit(y)

print(X.shape)
print(y.shape)
print(le.classes_)

#5-fold CV Model Eval
tmvsvm_multi_f1data = do_cv_learning_svm(X,y)
tmvsvm_multi_f1data['Annotator'] = 'TMV'

#Find Opt model
tmvsvm_multi_opt = model_opt_svm(X,y)

print(tmvsvm_multi_opt)
tmvsvm_multi_f1data

In [None]:
#Internal Validation Performances - Summary

#One results frame per label source, in reporting order (C1-C11, MV, TMV)
frames = [
    c1svm_multi_f1data,
    c2svm_multi_f1data,
    c3svm_multi_f1data,
    c4svm_multi_f1data,
    c5svm_multi_f1data,
    c6svm_multi_f1data,
    c7svm_multi_f1data,
    c8svm_multi_f1data,
    c9svm_multi_f1data,
    c10svm_multi_f1data,
    c11svm_multi_f1data,
    mvsvm_multi_f1data,
    tmvsvm_multi_f1data,
]

#Stack the frames row-wise; each frame keeps its own row index
multi_int = pd.concat(objs=frames)
print(multi_int.shape)
multi_int

In [None]:
#Plot chart - Internal Validation F1 (micro)

plt.style.use('ggplot')

#Series to plot: one point per row of the internal validation summary
x1, y1 = multi_int['Annotator'], multi_int['F1_micro']

#Draw the line chart on a new figure
plt.figure(figsize=(8, 2.5))
plt.plot(x1, y1, color='#1F57C8', marker='o', linestyle="solid", label='Multi')

#Fixed y-axis: limits 0 to 1.1 with a tick every 0.2 up to 1.0
plt.ylim([0.0, 1.1])
plt.yticks(np.arange(0.0, 1.01, 0.2))

#Title, axis labels and tick font sizes
plt.title('Internal Validation: Multiclass - svm', fontsize=14)
plt.xlabel('Annotator', fontsize=14)
plt.ylabel('F1_micro', fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=14)
plt.grid(True)
plt.tight_layout()

plt.show()

# 3. External Validation Experiment

## 3.1 Define HiRID External Validation Dataset

In [None]:
#Import HiRID 'Patient' table (contains discharge_status info)

#Read the whole table over the open database connection
pat = pd.read_sql_query(sql="SELECT * FROM hirid.patient", con=conn)

#Keep a local CSV copy (the row index is written as the first column)
pat.to_csv(path_or_buf='patient_table.csv')

print(pat.shape)
pat.head()

#33,905 records

In [None]:
#Import HiRID Validation Dataset - data for patients 1hr before discharge/death
##See jupyter notebook 'npjDM-HiRID_ExtVal_Dataset' to see steps on creating this HiRID External Validation Dataset

#Load the saved dataset and discard the saved index column
params1hr = pd.read_csv("HiRID_extval_params1hr.csv").drop(columns='Unnamed: 0')

#Numeric outcome: 0 when discharge_status is 'alive', 4 for any other status
outcome = np.where(params1hr['discharge_status']== 'alive', 0, 4)
params1hr['binary_status'] = outcome

print(params1hr.shape)
params1hr.head()

In [None]:
#Check discharge status classes are balanced

#Only the last expression of a cell is displayed, so the first table is printed explicitly
print(params1hr.discharge_status.value_counts())
params1hr.binary_status.value_counts()

In [None]:
#Define hirid validation dataset

#Positional split of the HiRID frame: columns 3-8 are used as features and column 12 as the
#outcome (presumably binary_status - confirm against the column order shown at load time)
array = params1hr.to_numpy()
X_test = array[:, 3:9].astype(float)
y_test = array[:, 12]

print(X_test.shape, y_test.shape, sep='\n')

In [None]:
# Display the external-validation feature matrix
X_test

In [None]:
# Display the external-validation outcome labels
y_test

## 3.2 Run QEUH models on HiRID External Validation Dataset

In [None]:
#C1 - HiRID Ext val 

#Micro-averaged F1 of the C1 model's predictions against the HiRID outcome labels
f1 = metrics.f1_score(list(y_test), c1svm_multi_opt.predict(X_test), average='micro')

#One-row results frame
c1svm_multi_ext = pd.DataFrame(
    data=[['C1', 'multi', 'F1_micro', f1]],
    columns=['Annotator','Model','Optimisation','F1_micro'],
)
c1svm_multi_ext

In [None]:
#C2 - HiRID Ext val 

#Micro-averaged F1 of the C2 model's predictions against the HiRID outcome labels
f1 = metrics.f1_score(list(y_test), c2svm_multi_opt.predict(X_test), average='micro')

#One-row results frame
c2svm_multi_ext = pd.DataFrame(
    data=[['C2', 'multi', 'F1_micro', f1]],
    columns=['Annotator','Model','Optimisation','F1_micro'],
)
c2svm_multi_ext

In [None]:
#C3 - HiRID Ext val 

#Micro-averaged F1 of the C3 model's predictions against the HiRID outcome labels
f1 = metrics.f1_score(list(y_test), c3svm_multi_opt.predict(X_test), average='micro')

##One-row results frame
c3svm_multi_ext = pd.DataFrame(
    data=[['C3', 'multi', 'F1_micro', f1]],
    columns=['Annotator','Model','Optimisation','F1_micro'],
)
c3svm_multi_ext

In [None]:
#C4 - HiRID Ext val  

#Micro-averaged F1 of the C4 model's predictions against the HiRID outcome labels
f1 = metrics.f1_score(list(y_test), c4svm_multi_opt.predict(X_test), average='micro')

#One-row results frame
c4svm_multi_ext = pd.DataFrame(
    data=[['C4', 'multi', 'F1_micro', f1]],
    columns=['Annotator','Model','Optimisation','F1_micro'],
)
c4svm_multi_ext

In [None]:
#C5 - HiRID Ext val  

#Micro-averaged F1 of the C5 model's predictions against the HiRID outcome labels
f1 = metrics.f1_score(list(y_test), c5svm_multi_opt.predict(X_test), average='micro')

#One-row results frame
c5svm_multi_ext = pd.DataFrame(
    data=[['C5', 'multi', 'F1_micro', f1]],
    columns=['Annotator','Model','Optimisation','F1_micro'],
)
c5svm_multi_ext

In [None]:
#C6 - HiRID Ext val 

#Micro-averaged F1 of the C6 model's predictions against the HiRID outcome labels
f1 = metrics.f1_score(list(y_test), c6svm_multi_opt.predict(X_test), average='micro')

#One-row results frame
c6svm_multi_ext = pd.DataFrame(
    data=[['C6', 'multi', 'F1_micro', f1]],
    columns=['Annotator','Model','Optimisation','F1_micro'],
)
c6svm_multi_ext

In [None]:
#C7 - HiRID Ext val 

#Micro-averaged F1 of the C7 model's predictions against the HiRID outcome labels
f1 = metrics.f1_score(list(y_test), c7svm_multi_opt.predict(X_test), average='micro')

#One-row results frame
c7svm_multi_ext = pd.DataFrame(
    data=[['C7', 'multi', 'F1_micro', f1]],
    columns=['Annotator','Model','Optimisation','F1_micro'],
)
c7svm_multi_ext

In [None]:
#C8 - HiRID Ext val 

#Micro-averaged F1 of the C8 model's predictions against the HiRID outcome labels
f1 = metrics.f1_score(list(y_test), c8svm_multi_opt.predict(X_test), average='micro')

#One-row results frame
c8svm_multi_ext = pd.DataFrame(
    data=[['C8', 'multi', 'F1_micro', f1]],
    columns=['Annotator','Model','Optimisation','F1_micro'],
)
c8svm_multi_ext

In [None]:
#C9 - HiRID Ext val  

#Micro-averaged F1 of the C9 model's predictions against the HiRID outcome labels
f1 = metrics.f1_score(list(y_test), c9svm_multi_opt.predict(X_test), average='micro')

#One-row results frame
c9svm_multi_ext = pd.DataFrame(
    data=[['C9', 'multi', 'F1_micro', f1]],
    columns=['Annotator','Model','Optimisation','F1_micro'],
)
c9svm_multi_ext

In [None]:
#C10 - HiRID Ext val 

#Micro-averaged F1 of the C10 model's predictions against the HiRID outcome labels
f1 = metrics.f1_score(list(y_test), c10svm_multi_opt.predict(X_test), average='micro')

#One-row results frame
c10svm_multi_ext = pd.DataFrame(
    data=[['C10', 'multi', 'F1_micro', f1]],
    columns=['Annotator','Model','Optimisation','F1_micro'],
)
c10svm_multi_ext

In [None]:
#C11- HiRID Ext val 

#Micro-averaged F1 of the C11 model's predictions against the HiRID outcome labels
f1 = metrics.f1_score(list(y_test), c11svm_multi_opt.predict(X_test), average='micro')

#One-row results frame
c11svm_multi_ext = pd.DataFrame(
    data=[['C11', 'multi', 'F1_micro', f1]],
    columns=['Annotator','Model','Optimisation','F1_micro'],
)
c11svm_multi_ext

In [None]:
#MV - HiRID Ext val 

#Micro-averaged F1 of the MV model's predictions against the HiRID outcome labels
f1 = metrics.f1_score(list(y_test), mvsvm_multi_opt.predict(X_test), average='micro')

#One-row results frame
mvsvm_multi_ext = pd.DataFrame(
    data=[['MV', 'multi', 'F1_micro', f1]],
    columns=['Annotator','Model','Optimisation','F1_micro'],
)
mvsvm_multi_ext

In [None]:
#TMV - HiRID Ext val 

#Micro-averaged F1 of the TMV model's predictions against the HiRID outcome labels
f1 = metrics.f1_score(list(y_test), tmvsvm_multi_opt.predict(X_test), average='micro')

#One-row results frame
tmvsvm_multi_ext = pd.DataFrame(
    data=[['TMV', 'multi', 'F1_micro', f1]],
    columns=['Annotator','Model','Optimisation','F1_micro'],
)
tmvsvm_multi_ext

In [None]:
#External Validation - Summary

#One results frame per label source, in reporting order (C1-C11, MV, TMV)
frames = [
    c1svm_multi_ext,
    c2svm_multi_ext,
    c3svm_multi_ext,
    c4svm_multi_ext,
    c5svm_multi_ext,
    c6svm_multi_ext,
    c7svm_multi_ext,
    c8svm_multi_ext,
    c9svm_multi_ext,
    c10svm_multi_ext,
    c11svm_multi_ext,
    mvsvm_multi_ext,
    tmvsvm_multi_ext,
]

#Stack the frames row-wise; each frame keeps its own row index
multi_ext = pd.concat(objs=frames)

print(multi_ext.shape)
multi_ext

In [None]:
#Plot chart - External Validation

plt.style.use('ggplot')

#Split the external summary into the individual annotators (plotted as a line) and the two
#consensus models (plotted as reference lines). These two frames are derived here so the
#cell runs from a fresh kernel.
is_consensus = multi_ext['Annotator'].isin(['MV', 'TMV'])
multi_ext_ann = multi_ext[~is_consensus]
multi_ext_mvs = multi_ext[is_consensus]

#Define x and y data
x1 = multi_ext_ann['Annotator']
y1 = multi_ext_ann['F1_micro']
#Scores are looked up by annotator name and stored under their own names, so the `mv` and
#`tmv` training DataFrames are not overwritten with floats
mv_f1 = multi_ext_mvs.loc[multi_ext_mvs['Annotator'] == 'MV', 'F1_micro'].iloc[0]
tmv_f1 = multi_ext_mvs.loc[multi_ext_mvs['Annotator'] == 'TMV', 'F1_micro'].iloc[0]

#Plot chart data
plt.figure(figsize=(8.5,4))
plt.plot(x1, y1, color='#1F57C8', marker='o', linestyle="solid")
plt.ylim([0.0,0.61])
plt.yticks(np.arange(0.0,0.61, 0.1))
plt.axhline(y=mv_f1, color='#DA4802', linestyle='-', label = 'Majority Vote (MV)')
plt.axhline(y=tmv_f1, color='#65C314', linestyle='-', label = 'Top Majority Vote (TMV)')

#Add title and labels (the models in this notebook are SVMs, not random forests)
plt.title('SVM External Validation Performance', fontsize=14)
plt.xlabel('Annotator', fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=14)
plt.ylabel('F1 micro', fontsize=14)
plt.grid(True)
plt.legend(fontsize=12, loc='upper center', bbox_to_anchor=(0.5, -0.4), fancybox=True, shadow=True, ncol=2)
plt.tight_layout()

plt.show()