In [2]:
import numpy as np
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from eipy.ei import EnsembleIntegration
from eipy.additional_ensembles import MeanAggregation, CES
from sklearn.model_selection import train_test_split
import pandas as pd
import pickle as pkl
import os
from sklearn.impute import KNNImputer

In [3]:
# Directory holding one CSV per data modality; each file name (minus its
# extension) becomes the modality's key in `modalities`.
data_csvs = "/home/opc/block_vol/COVID-19 data/Modalities"
modalities = {}
# Sort the listing so modalities are loaded in the same order on every run
# (os.listdir makes no ordering guarantee).
for file_name in sorted(os.listdir(data_csvs)):
    file_path = os.path.join(data_csvs, file_name)
    modality, extension = os.path.splitext(file_name)
    # Skip anything that is not a CSV file, e.g. the `.ipynb_checkpoints`
    # directory Jupyter may create, which pd.read_csv cannot parse.
    if extension.lower() != ".csv" or not os.path.isfile(file_path):
        continue

    data = pd.read_csv(file_path)
    # NEW_MASKED_MRN is presumably a patient identifier rather than a feature
    # (TODO confirm); it is removed from every modality.
    data = data.drop(columns=["NEW_MASKED_MRN"])
    modalities[modality] = data
# Outcome table; the following cells reduce it to one class label per row.
y = pd.read_csv("/home/opc/block_vol/COVID-19 data/outcome.csv")

In [4]:
# Drop the identifier and the listed DECEASED_* columns, invert the 0/1 coding
# of DECEASED_INDICATOR so it can be renamed to SURVIVED_INDICATOR, and treat
# every remaining missing value as 0.
y = (
    y.drop(
        columns=[
            "NEW_MASKED_MRN",
            "DECEASED_in_0-5_DAYS",
            "DECEASED_in_0-7_DAYS",
            "DECEASED_in_0-10_DAYS",
            "DECEASED_after_5_DAYS",
        ]
    )
    .assign(DECEASED_INDICATOR=lambda frame: frame["DECEASED_INDICATOR"].replace({0: 1, 1: 0}))
    .rename(columns={"DECEASED_INDICATOR": "SURVIVED_INDICATOR"})
    .fillna(0)
)

In [5]:
# A valid row flags exactly one outcome column. Rows whose indicator sum is
# not 1 are recorded in `bad_rows` (by index label) and removed from `y`.
row_sums = y.sum(axis=1)
bad_rows = row_sums[row_sums != 1].index
y = y.loc[row_sums == 1]

In [6]:
# Remove from every modality the rows whose index label is listed in `bad_rows`.
# NOTE(review): this relies on each modality CSV sharing the row index of the
# outcome table — TODO confirm.
modalities = {
    name: frame.loc[~frame.index.isin(bad_rows)]
    for name, frame in modalities.items()
}

In [7]:
# Fill missing values in every modality with a 6-nearest-neighbour imputer.
# NOTE(review): the imputer is fitted on all rows before the train/test split,
# so held-out rows influence the imputed training values; consider fitting on
# the training rows only.
imputer = KNNImputer(n_neighbors=6)
for k, v in modalities.items():
    # Keep the original row index. The rows removed via `bad_rows` leave gaps
    # in it, and rebuilding the frame without `index=` would renumber the rows
    # so their labels no longer match `y` (re-running the `bad_rows` filter
    # would then drop the wrong rows).
    modalities[k] = pd.DataFrame(
        imputer.fit_transform(v),
        columns=v.columns,
        index=v.index,
    )

In [8]:
# Sanity check after imputation: report, per modality, whether any cell is
# still missing.
for k, frame in modalities.items():
    has_missing = frame.isna().to_numpy().any()
    message = f"There are NaN values in {k}" if has_missing else f"ALL G IN {k}"
    print(message)

ALL G IN labs
ALL G IN admission
ALL G IN comorbidities
ALL G IN vitals


In [9]:
# Replace every outcome column name with its positional integer (0, 1, 2, ...).
y = y.rename(columns={col: i for i, col in enumerate(y.columns)})

In [10]:
# Collapse the indicator columns into one class label per row: the name of the
# column holding the row's maximum. Any existing "labels" column is excluded
# first, so re-running this cell does not feed the previous labels back into
# idxmax.
y["labels"] = y.drop(columns="labels", errors="ignore").idxmax(axis=1)

In [11]:
# Base predictors: one default-configured instance of each algorithm, keyed by
# a short name. These are passed to EnsembleIntegration for every modality.
base_predictors = {
    'ADAB': AdaBoostClassifier(),
    'XGB': XGBClassifier(),
    'DT': DecisionTreeClassifier(),
    'RF': RandomForestClassifier(),
    'GB': GradientBoostingClassifier(),
    'KNN': KNeighborsClassifier(),
    # `multi_class` is deprecated in scikit-learn 1.5 and scheduled for
    # removal; "auto" was the default, so omitting it keeps the same behaviour.
    'LR': LogisticRegression(solver="lbfgs"),
    'NB': GaussianNB(),
    'MLP': MLPClassifier(),
    # probability=True makes SVC expose predict_proba.
    'SVM': SVC(probability=True),
}
# Meta predictors: the two aggregation ensembles from eipy plus stacking
# variants ("S." prefix) of the same algorithms used as base predictors.
meta_predictors = {
    'Mean': MeanAggregation(),
    'CES': CES(),
    'S.ADAB': AdaBoostClassifier(),
    'S.XGB': XGBClassifier(),
    'S.DT': DecisionTreeClassifier(),
    "S.RF": RandomForestClassifier(),
    'S.GB': GradientBoostingClassifier(),
    'S.KNN': KNeighborsClassifier(),
    'S.LR': LogisticRegression(),
    'S.NB': GaussianNB(),
    'S.MLP': MLPClassifier(),
    'S.SVM': SVC(probability=True),
}

In [12]:
# Class-label vector used as the target for splitting and training.
labels = y.loc[:, "labels"]

In [13]:
# Split all four modalities and the label vector in one call so that every
# array is partitioned with the same row assignment (25% held out, seed 3).
# NOTE(review): `stratify` receives the whole `y` frame rather than `labels`;
# it is kept as-is so the resulting partition is unchanged.
(
    labs_train, labs_test,
    admission_train, admission_test,
    comorbidities_train, comorbidities_test,
    vitals_train, vitals_test,
    y_train, y_test,
) = train_test_split(
    modalities["labs"],
    modalities["admission"],
    modalities["comorbidities"],
    modalities["vitals"],
    labels,
    test_size=0.25,
    random_state=3,
    stratify=y,
)

In [14]:
# Per-modality feature tables in the dict form passed to `fit_base` / `predict`.
# The key "comorbities" is spelled the same way in both dicts and is kept
# unchanged so the train and test inputs keep matching.
X_train = dict(
    [
        ("labs", labs_train),
        ("admission", admission_train),
        ("comorbities", comorbidities_train),
        ("vitals", vitals_train),
    ]
)
X_test = dict(
    [
        ("labs", labs_test),
        ("admission", admission_test),
        ("comorbities", comorbidities_test),
        ("vitals", vitals_test),
    ]
)

In [15]:
# Train one binary (one-vs-rest) Ensemble Integration model per class label.
# The models are appended in `np.unique(labels)` order.
class_EIs = []
for label in np.unique(labels):
    # Binary target for this class: the current label maps to 1, every other
    # observed label maps to 0.
    encoding_dict = dict.fromkeys(labels, 0)
    encoding_dict[label] = 1
    y_train_label, y_test_label = (
        split.map(encoding_dict) for split in (y_train, y_test)
    )

    EI = EnsembleIntegration(
        project_name=f"COVID {label}",
        base_predictors=base_predictors,
        k_outer=5,
        k_inner=5,
        n_samples=1,
        sampling_strategy="undersampling",  # change sampling strategies
        sampling_aggregation="mean",
        n_jobs=-1,
        random_state=38,
        model_building=True,
    )
    # Fit the base predictors on every modality, then the meta predictors on
    # top of them.
    EI.fit_base(X_train, y_train_label, base_predictors=base_predictors)
    EI.fit_meta(meta_predictors=meta_predictors)
    class_EIs += [EI]

Training base predictors on labs...
        
... for ensemble performance analysis...


Generating meta training data: |          |  0%

Generating meta training data: |██████████|100%
Generating meta test data: |██████████|100%



... for final ensemble...


Generating meta training data: |██████████|100%
Training final base predictors: |██████████|100%




Training base predictors on admission...
        
... for ensemble performance analysis...


Generating meta training data: |██████████|100%
Generating meta test data: |██████████|100%



... for final ensemble...


Generating meta training data: |██████████|100%
Training final base predictors: |██████████|100%




Training base predictors on comorbities...
        
... for ensemble performance analysis...


Generating meta training data: |██████████|100%
Generating meta test data: |██████████|100%



... for final ensemble...


Generating meta training data: |██████████|100%
Training final base predictors: |██████████|100%




Training base predictors on vitals...
        
... for ensemble performance analysis...


Generating meta training data: |██████████|100%
Generating meta test data: |██████████|100%



... for final ensemble...


Generating meta training data: |██████████|100%
Training final base predictors: |██████████|100%






Analyzing ensembles: |██████████|100%
Training final meta models: |██████████|100%


Training base predictors on labs...
        
... for ensemble performance analysis...


Generating meta training data: |██████████|100%
Generating meta test data: |██████████|100%



... for final ensemble...


Generating meta training data: |██████████|100%
Training final base predictors: |██████████|100%




Training base predictors on admission...
        
... for ensemble performance analysis...


Generating meta training data: |██████████|100%
Generating meta test data: |██████████|100%



... for final ensemble...


Generating meta training data: |██████████|100%
Training final base predictors: |██████████|100%




Training base predictors on comorbities...
        
... for ensemble performance analysis...


Generating meta training data: |██████████|100%
Generating meta test data: |██████████|100%



... for final ensemble...


Generating meta training data: |██████████|100%
Training final base predictors: |██████████|100%




Training base predictors on vitals...
        
... for ensemble performance analysis...


Generating meta training data: |██████████|100%
Generating meta test data: |██████████|100%



... for final ensemble...


Generating meta training data: |██████████|100%
Training final base predictors: |██████████|100%






Analyzing ensembles: |██████████|100%
Training final meta models: |██████████|100%


Training base predictors on labs...
        
... for ensemble performance analysis...


Generating meta training data: |██████████|100%
Generating meta test data: |██████████|100%



... for final ensemble...


Generating meta training data: |██████████|100%
Training final base predictors: |██████████|100%




Training base predictors on admission...
        
... for ensemble performance analysis...


Generating meta training data: |██████████|100%
Generating meta test data: |██████████|100%



... for final ensemble...


Generating meta training data: |██████████|100%
Training final base predictors: |██████████|100%




Training base predictors on comorbities...
        
... for ensemble performance analysis...


Generating meta training data: |██████████|100%
Generating meta test data: |██████████|100%



... for final ensemble...


Generating meta training data: |██████████|100%
Training final base predictors: |██████████|100%




Training base predictors on vitals...
        
... for ensemble performance analysis...


Generating meta training data: |██████████|100%
Generating meta test data: |██████████|100%



... for final ensemble...


Generating meta training data: |██████████|100%
Training final base predictors: |██████████|100%






Analyzing ensembles: |██████████|100%
Training final meta models: |██████████|100%


Training base predictors on labs...
        
... for ensemble performance analysis...


Generating meta training data: |██████████|100%
Generating meta test data: |██████████|100%



... for final ensemble...


Generating meta training data: |██████████|100%
Training final base predictors: |██████████|100%




Training base predictors on admission...
        
... for ensemble performance analysis...


Generating meta training data: |██████████|100%
Generating meta test data: |██████████|100%



... for final ensemble...


Generating meta training data: |██████████|100%
Training final base predictors: |██████████|100%




Training base predictors on comorbities...
        
... for ensemble performance analysis...


Generating meta training data: |██████████|100%
Generating meta test data: |██████████|100%



... for final ensemble...


Generating meta training data: |██████████|100%
Training final base predictors: |██████████|100%




Training base predictors on vitals...
        
... for ensemble performance analysis...


Generating meta training data: |██████████|100%
Generating meta test data: |██████████|100%



... for final ensemble...


Generating meta training data: |██████████|100%
Training final base predictors: |██████████|100%






Analyzing ensembles: |██████████|100%
Training final meta models: |██████████|100%


Training base predictors on labs...
        
... for ensemble performance analysis...


Generating meta training data: |██████████|100%
Generating meta test data: |██████████|100%



... for final ensemble...


Generating meta training data: |██████████|100%
Training final base predictors: |██████████|100%




Training base predictors on admission...
        
... for ensemble performance analysis...


Generating meta training data: |██████████|100%
Generating meta test data: |██████████|100%



... for final ensemble...


Generating meta training data: |██████████|100%
Training final base predictors: |██████████|100%




Training base predictors on comorbities...
        
... for ensemble performance analysis...


Generating meta training data: |██████████|100%
Generating meta test data: |██████████|100%



... for final ensemble...


Generating meta training data: |██████████|100%
Training final base predictors: |██████████|100%




Training base predictors on vitals...
        
... for ensemble performance analysis...


Generating meta training data: |██████████|100%
Generating meta test data: |██████████|100%



... for final ensemble...


Generating meta training data: |██████████|100%
Training final base predictors: |██████████|100%






Analyzing ensembles: |██████████|100%
Training final meta models: |██████████|100%


Training base predictors on labs...
        
... for ensemble performance analysis...


Generating meta training data: |██████████|100%
Generating meta test data: |██████████|100%



... for final ensemble...


Generating meta training data: |██████████|100%
Training final base predictors: |██████████|100%




Training base predictors on admission...
        
... for ensemble performance analysis...


Generating meta training data: |██████████|100%
Generating meta test data: |██████████|100%



... for final ensemble...


Generating meta training data: |██████████|100%
Training final base predictors: |██████████|100%




Training base predictors on comorbities...
        
... for ensemble performance analysis...


Generating meta training data: |██████████|100%
Generating meta test data: |██████████|100%



... for final ensemble...


Generating meta training data: |██████████|100%
Training final base predictors: |██████████|100%




Training base predictors on vitals...
        
... for ensemble performance analysis...


Generating meta training data: |██████████|100%
Generating meta test data: |██████████|100%



... for final ensemble...


Generating meta training data: |██████████|100%
Training final base predictors: |██████████|100%






Analyzing ensembles: |██████████|100%
Training final meta models: |██████████|100%


In [16]:
# Display the meta-model metrics table stored in the summary of the ensemble at
# index 1 of `class_EIs` (rows are metrics, columns are meta-model keys).
class_EIs[1].meta_summary["metrics"]

Unnamed: 0,Mean,CES,S.ADAB,S.XGB,S.DT,S.RF,S.GB,S.KNN,S.LR,S.NB,S.MLP,S.SVM
fmax (minority),0.32,0.254107,0.281016,0.27451,0.213457,0.307414,0.312812,0.257325,0.324921,0.332779,0.308642,0.227882
f (majority),0.957141,0.891127,0.925989,0.91974,0.949681,0.942101,0.93711,0.908664,0.934496,0.938937,0.931288,0.910308
AUC,0.849634,0.798922,0.801936,0.819137,0.592325,0.828647,0.840996,0.699774,0.860438,0.854752,0.842243,0.723112
max MCC,0.287728,0.241187,0.241176,0.241769,0.165199,0.277635,0.283838,0.23245,0.308175,0.304364,0.280491,0.186543


In [25]:
# For every binary metric reported in the meta summaries, choose per class the
# meta-model with the highest value of that metric and collect its predictions
# on the held-out modalities.
binary_metrics = list(class_EIs[0].meta_summary["metrics"].index)
predictions_for_preferred_metric = {}
for metric in binary_metrics:
    # Best-scoring meta-model key for each one-vs-rest ensemble.
    preferred_models = [
        ensemble.meta_summary["metrics"].loc[metric].idxmax()
        for ensemble in class_EIs
    ]
    class_predictions = [
        ensemble.predict(X_test, meta_model_key=model_key)
        for ensemble, model_key in zip(class_EIs, preferred_models)
    ]
    # Transpose from one prediction sequence per class to one tuple per
    # position, holding that position's prediction from every class ensemble.
    all_predictions = list(zip(*class_predictions))
    predictions_for_preferred_metric[metric] = [preferred_models, all_predictions]

In [28]:
def confidence(list_of_tups, approach="max"):
    """Pick, for every entry, the position of its largest score.

    Parameters
    ----------
    list_of_tups : iterable of sequences
        One indexable sequence of comparable scores per sample (tuple, list,
        array, ...).
    approach : str, default "max"
        Decision rule. Only "max" is implemented.

    Returns
    -------
    list of int
        For each input sequence, the index of its largest score. When several
        scores tie for the maximum, the first position wins.

    Raises
    ------
    ValueError
        If `approach` is not supported (previously this silently returned
        None), or if one of the sequences is empty.
    """
    if approach != "max":
        raise ValueError(
            f"Unsupported approach {approach!r}; only 'max' is implemented."
        )
    # Search positions rather than values so any indexable sequence works, not
    # only those providing an `.index` method.
    return [
        max(range(len(scores)), key=scores.__getitem__)
        for scores in list_of_tups
    ]

In [29]:
# Reduce each metric's per-class predictions (second element of the stored
# pair) to one winning class position per entry.
final_predictions = {
    metric: confidence(models_and_predictions[1])
    for metric, models_and_predictions in predictions_for_preferred_metric.items()
}

final_predictions

{'fmax (minority)': [0,
  0,
  2,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  2,
  0,
  0,
  0,
  2,
  0,
  0,
  0,
  5,
  5,
  0,
  0,
  2,
  5,
  5,
  1,
  0,
  0,
  1,
  0,
  5,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  2,
  0,
  0,
  2,
  0,
  2,
  0,
  5,
  0,
  1,
  0,
  1,
  0,
  0,
  0,
  5,
  0,
  0,
  5,
  0,
  1,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  5,
  0,
  5,
  2,
  2,
  2,
  0,
  5,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  2,
  0,
  0,
  0,
  1,
  5,
  0,
  0,
  1,
  0,
  0,
  2,
  0,
  0,
  0,
  2,
  5,
  0,
  0,
  0,
  1,
  5,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  5,
  0,
  0,
  5,
  0,
  5,
  2,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  5,
  2,
  1,
  5,
  5,
  2,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  2,
  0,
  2,
  2,
  1,
  0,
  0,
  0,
  0,
  0,
  1,
  2,
  0,
  0,
  0,
  0,
  0,
  0,
  5,
  5,
  0,
  1,
  2,
  0,
  0,
  5,
  2,
  0,
  0,
  2,
  0,
  1,
  2,
  0,
  1,
  0,
  1,
  5,
  0,
  2,
  0,
  1,
  5,
  0,
 

In [30]:
from sklearn.metrics import (
    precision_score, 
    recall_score, 
    f1_score
)

In [32]:
# Multiclass performance of the final predictions: one column per binary
# selection metric, one row per entry of `multi_metrics`.
performance_frame = pd.DataFrame()
multi_metrics = ["accuracy", "precision", "recall", "f1"]
import eipy.utils as ut  # NOTE(review): `ut` is not used in this cell
# Macro-averaged scorers, looked up by the names used in `multi_metrics`.
macro_scorers = {
    "precision": precision_score,
    "recall": recall_score,
    "f1": f1_score,
}
for bin_metric, y_pred in final_predictions.items():
    scores = []
    for multi_metric in multi_metrics:
        if multi_metric == "accuracy":
            # Fraction of test labels matched exactly by the prediction.
            n_correct = sum(
                int(actual == predicted)
                for actual, predicted in zip(y_test, y_pred)
            )
            scores.append(n_correct / len(y_test))
        elif multi_metric in macro_scorers:
            scorer = macro_scorers[multi_metric]
            scores.append(scorer(y_test, y_pred, average='macro'))
    performance_frame[bin_metric] = scores

In [33]:
# Label the rows with the multiclass metric names, then display the table.
metric_index = pd.Index(multi_metrics)
performance_frame = performance_frame.set_index(metric_index)
performance_frame

Unnamed: 0,fmax (minority),f (majority),AUC,max MCC
accuracy,0.594979,0.579079,0.728033,0.725523
precision,0.22746,0.180599,0.242838,0.209591
recall,0.296788,0.272742,0.209961,0.202185
f1,0.24238,0.184496,0.206483,0.195173


In [34]:
import pickle as pkl
# Save each one-vs-rest ensemble under the class label it was trained for.
# `class_EIs` was filled in `np.unique(labels)` order, so zipping the two pairs
# every label with its own model. The leftover `encoding_dict` from the
# training loop only holds the values 0 and 1, so using those as indices would
# store just the first two ensembles, under the wrong file names.
for label, ensemble in zip(np.unique(labels), class_EIs):
    with open(f"/home/opc/eipy/eipy/saved_test_EIs/COVID/{label}.pkl", "wb") as file:
        pkl.dump(file=file, obj=ensemble)

In [35]:
# Persist the multiclass performance table next to the saved ensembles.
performance_path = "/home/opc/eipy/eipy/saved_test_EIs/COVID/performance.pkl"
with open(performance_path, "wb") as file:
    pkl.dump(performance_frame, file)