In [1]:
import numpy as np
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from eipy.ei import EnsembleIntegration
from eipy.additional_ensembles import MeanAggregation, CES
from sklearn.model_selection import train_test_split
import pandas as pd
import pickle as pkl
import os
from sklearn.impute import KNNImputer

In [2]:
# Load every modality CSV found in the data directory into a dict keyed by the
# file name without its extension, then load the outcome table.
# NOTE(review): absolute, machine-specific paths - consider a configurable DATA_DIR.
data_csvs = "/home/opc/block_vol/COVID-19 data/Modalities"
modalities = {}
# os.listdir returns entries in arbitrary order; sorting makes the load order
# (and therefore the dict order) reproducible across machines.
for file_name in sorted(os.listdir(data_csvs)):
    file_path = os.path.join(data_csvs, file_name)
    # Skip sub-directories (e.g. .ipynb_checkpoints) that pd.read_csv cannot open.
    if not os.path.isfile(file_path):
        continue
    modality = os.path.splitext(file_name)[0]

    data = pd.read_csv(file_path)
    # NEW_MASKED_MRN is removed before modelling (presumably a patient identifier - confirm).
    data = data.drop(columns=["NEW_MASKED_MRN"])
    modalities[modality] = data
y = pd.read_csv("/home/opc/block_vol/COVID-19 data/outcome.csv")

In [3]:
# Reduce the outcome table to the columns used as classes:
#   1. drop the identifier and the outcome windows that are not modelled,
#   2. flip DECEASED_INDICATOR (0 <-> 1) and rename it to SURVIVED_INDICATOR,
#   3. treat missing outcome flags as 0.
unused_outcome_columns = [
    "NEW_MASKED_MRN",
    "DECEASED_in_0-5_DAYS",
    "DECEASED_in_0-7_DAYS",
    "DECEASED_in_0-10_DAYS",
    "DECEASED_after_5_DAYS",
]
y = (
    y.drop(columns=unused_outcome_columns)
    .assign(DECEASED_INDICATOR=lambda frame: frame["DECEASED_INDICATOR"].replace({0: 1, 1: 0}))
    .rename(columns={"DECEASED_INDICATOR": "SURVIVED_INDICATOR"})
    .fillna(0)
)

In [4]:
# Keep only the rows whose outcome columns sum to exactly 1; remember the index
# labels of every other row so the same rows can be removed from the modalities.
row_sums = y.sum(axis=1)
has_single_outcome = row_sums == 1
bad_rows = row_sums[~has_single_outcome].index
y = y.loc[has_single_outcome]

In [5]:
# Remove from every modality the rows whose index label is in `bad_rows`.
# NOTE(review): assumes each modality shares its row index with the outcome table - confirm.
for name in list(modalities):
    frame = modalities[name]
    keep_mask = ~frame.index.isin(bad_rows)
    modalities[name] = frame.loc[keep_mask]

In [6]:
# Fill missing values in every modality with a 6-nearest-neighbour imputer.
# NOTE(review): the imputer is fitted on all rows before the train/test split,
# so held-out rows influence the imputed training values; consider fitting on
# the training portion only.
imputer = KNNImputer(n_neighbors=6)
for k, v in modalities.items():
    # fit_transform returns a bare ndarray. Pass index=v.index so the rebuilt
    # frame keeps its original row labels; without it the rows are renumbered
    # 0..n-1 and no longer line up by label with `y`, which keeps its filtered index.
    modalities[k] = pd.DataFrame(
        imputer.fit_transform(v),
        columns=v.columns,
        index=v.index,
    )

In [7]:
# Sanity check: print, for each modality, whether any NaN is still present.
for k, frame in modalities.items():
    still_missing = frame.isna().to_numpy().any()
    message = f"There are NaN values in {k}" if still_missing else f"ALL G IN {k}"
    print(message)

ALL G IN labs
ALL G IN admission
ALL G IN comorbidities
ALL G IN vitals


In [8]:
# Replace each outcome column name with its positional index (0, 1, 2, ...),
# so the column labels can be used directly as integer class ids.
position_by_name = {col: i for i, col in enumerate(y.columns)}
y = y.rename(columns=position_by_name)

In [9]:
# Collapse the outcome columns into one class label per row: the name of the
# column holding the row maximum.
# An existing "labels" column is excluded first, so re-running this cell does
# not feed the previous labels back into idxmax and corrupt the result.
y["labels"] = y.drop(columns="labels", errors="ignore").idxmax(axis=1)

In [10]:
# Base predictors: one instance of each is handed to EnsembleIntegration and
# trained per modality (see the "Training base predictors on ..." log output).
base_predictors = {
                    'ADAB': AdaBoostClassifier(),
                    'XGB': XGBClassifier(),
                    'DT': DecisionTreeClassifier(),
                    'RF': RandomForestClassifier(),
                    'GB': GradientBoostingClassifier(),
                    'KNN': KNeighborsClassifier(),
                    # `multi_class` is deprecated since scikit-learn 1.5 and "auto"
                    # is already its default, so it is no longer passed explicitly.
                    'LR': LogisticRegression(solver="lbfgs"),
                    'NB': GaussianNB(),
                    'MLP': MLPClassifier(),
                    # probability=True so the SVM exposes predict_proba.
                    'SVM': SVC(probability=True)
}
# Meta predictors: two aggregation ensembles (Mean, CES) plus stacking models,
# prefixed with "S.", passed to EI.fit_meta.
meta_predictors = {
                    'Mean' : MeanAggregation(),
                    'CES' : CES(),
                    'S.ADAB': AdaBoostClassifier(),
                    'S.XGB': XGBClassifier(),
                    'S.DT': DecisionTreeClassifier(),
                    "S.RF": RandomForestClassifier(),
                    'S.GB': GradientBoostingClassifier(),
                    'S.KNN': KNeighborsClassifier(),
                    'S.LR': LogisticRegression(),
                    'S.NB': GaussianNB(),
                    'S.MLP': MLPClassifier(),
                    'S.SVM': SVC(probability=True),
}

In [11]:
# Keep the class-label column as a standalone Series; it is what gets split into
# y_train / y_test and encoded per class further down.
labels = y["labels"]

In [12]:
# Split all modalities and the labels in a single call: train_test_split applies
# one shuffled row partition to every array it is given, so the train/test rows
# stay aligned across modalities (75% train / 25% test, stratified on `y`).
(
    labs_train, labs_test,
    admission_train, admission_test,
    comorbidities_train, comorbidities_test,
    vitals_train, vitals_test,
    y_train, y_test,
) = train_test_split(
    modalities["labs"],
    modalities["admission"],
    modalities["comorbidities"],
    modalities["vitals"],
    labels,
    test_size=0.25,
    random_state=3,
    stratify=y,
)

In [13]:
# Bundle the per-modality splits into the {modality name: frame} dictionaries
# that are passed to EI.fit_base / EI.predict.
# NOTE(review): "comorbities" is spelled differently from the "comorbidities"
# key of `modalities`; it is kept as-is because it is the modality name used at
# runtime (it appears in the training log).
modality_names = ("labs", "admission", "comorbities", "vitals")
X_train = dict(zip(
    modality_names,
    (labs_train, admission_train, comorbidities_train, vitals_train),
))
X_test = dict(zip(
    modality_names,
    (labs_test, admission_test, comorbidities_test, vitals_test),
))

In [14]:
# Train one binary ensemble per class: for each distinct label the targets are
# re-encoded as 1 (this class) vs 0 (every other class), and a fresh
# EnsembleIntegration is fitted on the training modalities.
class_EIs = []
for label in np.unique(labels):
    # Every observed label maps to 0, then the current class is switched to 1.
    encoding_dict = dict.fromkeys(labels, 0)
    encoding_dict[label] = 1
    y_train_label = y_train.map(encoding_dict)
    # NOTE(review): y_test_label is not used anywhere in the visible cells.
    y_test_label = y_test.map(encoding_dict)

    ei_settings = dict(
        base_predictors=base_predictors,
        k_outer=5,
        k_inner=5,
        n_samples=1,
        sampling_strategy="undersampling",  # change sampling strategies
        sampling_aggregation="mean",
        n_jobs=-1,
        random_state=38,
        project_name=f"COVID {label}",
        model_building=True,
    )
    EI = EnsembleIntegration(**ei_settings)
    EI.fit_base(X_train, y_train_label, base_predictors=base_predictors)
    EI.fit_meta(meta_predictors=meta_predictors)
    class_EIs.append(EI)

Training base predictors on labs...
        
... for ensemble performance analysis...


Generating meta training data: |          |  0%

Generating meta training data: |██████████|100%
Generating meta test data: |██████████|100%



... for final ensemble...


Generating meta training data: |██████████|100%
Training final base predictors: |██████████|100%




Training base predictors on admission...
        
... for ensemble performance analysis...


Generating meta training data: |██████████|100%
Generating meta test data: |██████████|100%



... for final ensemble...


Generating meta training data: |██████████|100%
Training final base predictors: |██████████|100%




Training base predictors on comorbities...
        
... for ensemble performance analysis...


Generating meta training data: |██████████|100%
Generating meta test data: |██████████|100%



... for final ensemble...


Generating meta training data: |██████████|100%
Training final base predictors: |██████████|100%




Training base predictors on vitals...
        
... for ensemble performance analysis...


Generating meta training data: |██████████|100%
Generating meta test data: |██████████|100%



... for final ensemble...


Generating meta training data: |██████████|100%
Training final base predictors: |██████████|100%






Analyzing ensembles: |██████████|100%
Training final meta models: |██████████|100%


Training base predictors on labs...
        
... for ensemble performance analysis...


Generating meta training data: |██████████|100%
Generating meta test data: |██████████|100%



... for final ensemble...


Generating meta training data: |██████████|100%
Training final base predictors: |██████████|100%




Training base predictors on admission...
        
... for ensemble performance analysis...


Generating meta training data: |██████████|100%
Generating meta test data: |██████████|100%



... for final ensemble...


Generating meta training data: |██████████|100%
Training final base predictors: |██████████|100%




Training base predictors on comorbities...
        
... for ensemble performance analysis...


Generating meta training data: |██████████|100%
Generating meta test data: |██████████|100%



... for final ensemble...


Generating meta training data: |██████████|100%
Training final base predictors: |██████████|100%




Training base predictors on vitals...
        
... for ensemble performance analysis...


Generating meta training data: |██████████|100%
Generating meta test data: |██████████|100%



... for final ensemble...


Generating meta training data: |██████████|100%
Training final base predictors: |██████████|100%






Analyzing ensembles: |██████████|100%
Training final meta models: |██████████|100%


Training base predictors on labs...
        
... for ensemble performance analysis...


Generating meta training data: |██████████|100%
Generating meta test data: |██████████|100%



... for final ensemble...


Generating meta training data: |██████████|100%
Training final base predictors: |██████████|100%




Training base predictors on admission...
        
... for ensemble performance analysis...


Generating meta training data: |██████████|100%
Generating meta test data: |██████████|100%



... for final ensemble...


Generating meta training data: |██████████|100%
Training final base predictors: |██████████|100%




Training base predictors on comorbities...
        
... for ensemble performance analysis...


Generating meta training data: |██████████|100%
Generating meta test data: |██████████|100%



... for final ensemble...


Generating meta training data: |██████████|100%
Training final base predictors: |██████████|100%




Training base predictors on vitals...
        
... for ensemble performance analysis...


Generating meta training data: |██████████|100%
Generating meta test data: |██████████|100%



... for final ensemble...


Generating meta training data: |██████████|100%
Training final base predictors: |██████████|100%






Analyzing ensembles: |██████████|100%
Training final meta models: |██████████|100%


Training base predictors on labs...
        
... for ensemble performance analysis...


Generating meta training data: |██████████|100%
Generating meta test data: |██████████|100%



... for final ensemble...


Generating meta training data: |██████████|100%
Training final base predictors: |██████████|100%




Training base predictors on admission...
        
... for ensemble performance analysis...


Generating meta training data: |██████████|100%
Generating meta test data: |██████████|100%



... for final ensemble...


Generating meta training data: |██████████|100%
Training final base predictors: |██████████|100%




Training base predictors on comorbities...
        
... for ensemble performance analysis...


Generating meta training data: |██████████|100%
Generating meta test data: |██████████|100%



... for final ensemble...


Generating meta training data: |██████████|100%
Training final base predictors: |██████████|100%




Training base predictors on vitals...
        
... for ensemble performance analysis...


Generating meta training data: |██████████|100%
Generating meta test data: |██████████|100%



... for final ensemble...


Generating meta training data: |██████████|100%
Training final base predictors: |██████████|100%






Analyzing ensembles: |██████████|100%
Training final meta models: |██████████|100%


Training base predictors on labs...
        
... for ensemble performance analysis...


Generating meta training data: |██████████|100%
Generating meta test data: |██████████|100%



... for final ensemble...


Generating meta training data: |██████████|100%
Training final base predictors: |██████████|100%




Training base predictors on admission...
        
... for ensemble performance analysis...


Generating meta training data: |██████████|100%
Generating meta test data: |██████████|100%



... for final ensemble...


Generating meta training data: |██████████|100%
Training final base predictors: |██████████|100%




Training base predictors on comorbities...
        
... for ensemble performance analysis...


Generating meta training data: |██████████|100%
Generating meta test data: |██████████|100%



... for final ensemble...


Generating meta training data: |██████████|100%
Training final base predictors: |██████████|100%




Training base predictors on vitals...
        
... for ensemble performance analysis...


Generating meta training data: |██████████|100%
Generating meta test data: |██████████|100%



... for final ensemble...


Generating meta training data: |██████████|100%
Training final base predictors: |██████████|100%






Analyzing ensembles: |██████████|100%
Training final meta models: |██████████|100%


Training base predictors on labs...
        
... for ensemble performance analysis...


Generating meta training data: |██████████|100%
Generating meta test data: |██████████|100%



... for final ensemble...


Generating meta training data: |██████████|100%
Training final base predictors: |██████████|100%




Training base predictors on admission...
        
... for ensemble performance analysis...


Generating meta training data: |██████████|100%
Generating meta test data: |██████████|100%



... for final ensemble...


Generating meta training data: |██████████|100%
Training final base predictors: |██████████|100%




Training base predictors on comorbities...
        
... for ensemble performance analysis...


Generating meta training data: |██████████|100%
Generating meta test data: |██████████|100%



... for final ensemble...


Generating meta training data: |██████████|100%
Training final base predictors: |██████████|100%




Training base predictors on vitals...
        
... for ensemble performance analysis...


Generating meta training data: |██████████|100%
Generating meta test data: |██████████|100%



... for final ensemble...


Generating meta training data: |██████████|100%
Training final base predictors: |██████████|100%






Analyzing ensembles: |██████████|100%
Training final meta models: |██████████|100%


In [27]:
# Inspect the meta-model metrics table of the second ensemble (index 1 of class_EIs).
class_EIs[1].meta_summary["metrics"]

Unnamed: 0,Mean,CES,S.ADAB,S.XGB,S.DT,S.RF,S.GB,S.KNN,S.LR,S.NB,S.MLP,S.SVM
fmax (minority),0.32,0.254107,0.281016,0.27451,0.213457,0.307414,0.312812,0.257325,0.324921,0.332779,0.308642,0.227882
f (majority),0.957141,0.891127,0.925989,0.91974,0.949681,0.942101,0.93711,0.908664,0.934496,0.938937,0.931288,0.910308
AUC,0.849634,0.798922,0.801936,0.819137,0.592325,0.828647,0.840996,0.699774,0.860438,0.854752,0.842243,0.723112
max MCC,0.287728,0.241187,0.241176,0.241769,0.165199,0.277635,0.283838,0.23245,0.308175,0.304364,0.280491,0.186543


In [28]:
metrics = ["fmax (minority)", "f (majority)", "AUC", "max MCC"]


def _predict_with_best_meta_model(ensemble, metric):
    """Predict on X_test with the meta model that scores highest on `metric`.

    The best model is the column label with the maximum value in the `metric`
    row of the ensemble's meta_summary["metrics"] table.
    """
    preferred_class_model = ensemble.meta_summary["metrics"].loc[metric].idxmax()
    return ensemble.predict(X_dict=X_test, meta_model_key=preferred_class_model)


# Outer list: one entry per metric; inner list: one prediction vector per class
# ensemble, in the order of class_EIs.
metric_predictions = [
    [_predict_with_best_meta_model(ensemble, metric) for ensemble in class_EIs]
    for metric in metrics
]

In [30]:
# Stack the nested prediction lists into one ndarray: the outermost axis is the
# metric, the next axis is the class ensemble.
metric_predictions = np.array(metric_predictions)
metric_predictions

array([[[9.50000000e-01, 8.10000000e-01, 3.80000000e-01, ...,
         8.30000000e-01, 8.80000000e-01, 8.60000000e-01],
        [2.98968093e-19, 1.06259102e-10, 4.87224386e-02, ...,
         3.78958153e-11, 2.24547439e-10, 6.82649849e-12],
        [1.83329525e-05, 3.60095372e-05, 9.99999797e-01, ...,
         1.70981560e-03, 1.61067047e-06, 8.46504832e-03],
        [4.87656781e-03, 4.79104073e-03, 3.74552195e-02, ...,
         8.06705016e-02, 9.11710391e-03, 7.05308423e-03],
        [1.08804558e-02, 1.88387677e-02, 8.58915827e-02, ...,
         9.70064988e-03, 4.41971295e-02, 2.14357901e-02],
        [3.23249343e-06, 7.83270434e-06, 4.52040888e-01, ...,
         1.17977956e-01, 4.79948377e-04, 1.39027502e-02]],

       [[1.00000000e+00, 9.99999963e-01, 5.77192359e-05, ...,
         9.99087988e-01, 9.99991786e-01, 9.41268972e-01],
        [1.95327363e-01, 3.24580785e-01, 4.75584223e-01, ...,
         3.30874628e-01, 3.46120841e-01, 3.39195611e-01],
        [0.00000000e+00, 0.00000000e+0

In [52]:
# For every metric, transpose its prediction block so that rows are samples and
# columns are the class ensembles, then stack the blocks into a new array.
grouped_metric_predictions = np.array(
    [per_metric_block.T for per_metric_block in metric_predictions]
)

In [56]:
# Shape check of the transposed predictions for the first metric.
grouped_metric_predictions[0].shape

(1195, 6)

In [57]:
# For each metric, the predicted class of a sample is the column (class
# ensemble) with the highest score in its row; results are stored as (n, 1)
# column vectors keyed by metric name.
metric_preds_dict = {
    metrics[position]: np.argmax(per_metric_scores, axis=1).reshape(-1, 1)
    for position, per_metric_scores in enumerate(grouped_metric_predictions)
}

In [60]:
# Display the predicted class indices stored under the "AUC" key.
metric_preds_dict["AUC"]

array([0, 1, 2, 5])

In [61]:
def accuracy(preds, test):
    """Return the fraction of predictions that equal the true labels.

    Parameters
    ----------
    preds : array-like
        Predicted labels. Any shape is accepted and flattened before the
        comparison, so an ``(n, 1)`` column vector works like a 1-D sequence.
    test : array-like
        True labels, flattened the same way.

    Returns
    -------
    float
        Share of positions where prediction and label match, between 0 and 1.

    Raises
    ------
    ValueError
        If `preds` and `test` do not contain the same number of elements
        (previously the extra elements were silently ignored by ``zip``).
    ZeroDivisionError
        If `test` is empty.
    """
    predicted = np.asarray(preds).ravel()
    actual = np.asarray(test).ravel()
    if predicted.size != actual.size:
        raise ValueError(
            f"preds has {predicted.size} elements but test has {actual.size}"
        )
    # count_nonzero returns a Python int, so an empty `test` still raises
    # ZeroDivisionError exactly as the original implementation did.
    matches = np.count_nonzero(predicted == actual)
    return float(matches / actual.size)

In [62]:
# Report, for each selection metric, the accuracy of its predicted classes on
# the held-out labels.
for k in metric_preds_dict:
    v = metric_preds_dict[k]
    print(f"{k}, {accuracy(v,y_test)}")

fmax (minority), [0.59497908]
f (majority), [0.5790795]
AUC, [0.72803347]
max MCC, [0.72552301]
