In [1]:
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
import pandas as pd
import eipy.ei as e
from eipy.additional_ensembles import MeanAggregation, CES

In [2]:
from eipy.metrics import fmax_score
from sklearn.metrics import roc_auc_score, matthews_corrcoef

metrics = {
            'f_max': fmax_score,
            'auc': roc_auc_score,
            'mcc': matthews_corrcoef
            }

In [3]:
base_predictors = {
                    'ADAB': AdaBoostClassifier(),
                    'XGB': XGBClassifier(),
                    'DT': DecisionTreeClassifier(),
                    'RF': RandomForestClassifier(),
                    'GB': GradientBoostingClassifier(),
                    'KNN': KNeighborsClassifier(),
                    'LR': LogisticRegression(),
                    'NB': GaussianNB(),
                    'MLP': MLPClassifier(),
                    'SVM': SVC(probability=True),
}

In [4]:
import pickle as pkl
with open("/home/opc/eipy/tadpole/tadpole_data_time_imptn_norm_v1.pickle", "rb") as file:
    data = pkl.load(file)
with open("/home/opc/eipy/tadpole/tadpole_labels_time_imptn_norm_v1.pickle", "rb") as file:
    labels = pkl.load(file)

In [5]:
data = [data[k] for k in data.keys()]

In [6]:
for k,v in labels.items():
    labels[k] = v.reset_index(drop=True)

In [7]:
#intermediate transformation to make sure labels are ordered correctly in time
labels = pd.DataFrame(labels)

labels = labels.to_numpy()

In [8]:
import numpy as np
encoding_dict = {'NL': 0, 'MCI': 1, 'Dementia': 2}

# Use numpy.vectorize with a lambda function to apply the encoding
labels = np.vectorize(lambda x: encoding_dict[x])(labels)

In [9]:
labels.shape

(702, 5)

In [275]:
for i in range(5):
    print(pd.Series(labels[:,i]).value_counts())

1    502
0    200
Name: count, dtype: int64
1    480
0    195
2     27
Name: count, dtype: int64
1    451
0    192
2     59
Name: count, dtype: int64
1    378
0    188
2    136
Name: count, dtype: int64
1    348
2    178
0    176
Name: count, dtype: int64


In [10]:
meta_data = []
for t in range(len(data)):
    #time dependent data splitting
    X_train_timestep = data[t]
    labels_at_timestep = labels[:, t]
    EI_for_timestep = e.EnsembleIntegration(
                        base_predictors=base_predictors,
                        k_outer=5,
                        k_inner=5,
                        n_samples=1,
                        sampling_strategy=None,
                        sampling_aggregation="mean",
                        n_jobs=-1,
                        metrics=metrics,
                        random_state=38,
                        project_name=f"time step {t}",
                        model_building=False,
                        )
    print(f"generating metadata for timestep {t}")
    EI_for_timestep.fit_base(X_train_timestep, labels_at_timestep)
    meta_data.append([EI_for_timestep.ensemble_training_data, EI_for_timestep.ensemble_test_data, EI_for_timestep.ensemble_training_data_final, EI_for_timestep.base_summary])


generating metadata for timestep 0
Training base predictors on Main cognitive tests...
        
... for ensemble performance analysis...


Generating ensemble training data: |██████████|100%
Generating ensemble test data: |██████████|100%




Training base predictors on MRI volumes...
        
... for ensemble performance analysis...


Generating ensemble training data: |██████████|100%
Generating ensemble test data: |██████████|100%




Training base predictors on Demo, APOE4 & others...
        
... for ensemble performance analysis...


Generating ensemble training data: |██████████|100%
Generating ensemble test data: |██████████|100%




Training base predictors on MRI ROI: Volume (WM Parcellation)...
        
... for ensemble performance analysis...


Generating ensemble training data: |██████████|100%
Generating ensemble test data: |██████████|100%




Training base predictors on MRI ROI: Volume (Cortical Parcellation)...
        
... for ensemble performance analysis...


Generating ensemble training data: |██████████|100%
Generating ensemble test data: |██████████|100%




Training base predictors on MRI ROI: Surface Area...
        
... for ensemble performance analysis...


Generating ensemble training data: |██████████|100%
Generating ensemble test data: |██████████|100%




Training base predictors on MRI ROI: Cortical Thickness Average...
        
... for ensemble performance analysis...


Generating ensemble training data: |██████████|100%
Generating ensemble test data: |██████████|100%




Training base predictors on MRI ROI: Cortical Thickness Standard Deviation...
        
... for ensemble performance analysis...


Generating ensemble training data: |██████████|100%
Generating ensemble test data: |██████████|100%




generating metadata for timestep 1
Training base predictors on Main cognitive tests...
        
... for ensemble performance analysis...


Generating ensemble training data: |██████████|100%
Generating ensemble test data: |██████████|100%




Training base predictors on MRI volumes...
        
... for ensemble performance analysis...


Generating ensemble training data: |██████████|100%
Generating ensemble test data: |██████████|100%




Training base predictors on Demo, APOE4 & others...
        
... for ensemble performance analysis...


Generating ensemble training data: |██████████|100%
Generating ensemble test data: |██████████|100%




Training base predictors on MRI ROI: Volume (WM Parcellation)...
        
... for ensemble performance analysis...


Generating ensemble training data: |██████████|100%
Generating ensemble test data: |██████████|100%




Training base predictors on MRI ROI: Volume (Cortical Parcellation)...
        
... for ensemble performance analysis...


Generating ensemble training data: |██████████|100%
Generating ensemble test data: |██████████|100%




Training base predictors on MRI ROI: Surface Area...
        
... for ensemble performance analysis...


Generating ensemble training data: |██████████|100%
Generating ensemble test data: |██████████|100%




Training base predictors on MRI ROI: Cortical Thickness Average...
        
... for ensemble performance analysis...


Generating ensemble training data: |██████████|100%
Generating ensemble test data: |██████████|100%




Training base predictors on MRI ROI: Cortical Thickness Standard Deviation...
        
... for ensemble performance analysis...


Generating ensemble training data: |██████████|100%
Generating ensemble test data: |██████████|100%




generating metadata for timestep 2
Training base predictors on Main cognitive tests...
        
... for ensemble performance analysis...


Generating ensemble training data: |██████████|100%
Generating ensemble test data: |██████████|100%




Training base predictors on MRI volumes...
        
... for ensemble performance analysis...


Generating ensemble training data: |██████████|100%
Generating ensemble test data: |██████████|100%




Training base predictors on Demo, APOE4 & others...
        
... for ensemble performance analysis...


Generating ensemble training data: |██████████|100%
Generating ensemble test data: |██████████|100%




Training base predictors on MRI ROI: Volume (WM Parcellation)...
        
... for ensemble performance analysis...


Generating ensemble training data: |██████████|100%
Generating ensemble test data: |██████████|100%




Training base predictors on MRI ROI: Volume (Cortical Parcellation)...
        
... for ensemble performance analysis...


Generating ensemble training data: |██████████|100%
Generating ensemble test data: |██████████|100%




Training base predictors on MRI ROI: Surface Area...
        
... for ensemble performance analysis...


Generating ensemble training data: |██████████|100%
Generating ensemble test data: |██████████|100%




Training base predictors on MRI ROI: Cortical Thickness Average...
        
... for ensemble performance analysis...


Generating ensemble training data: |██████████|100%
Generating ensemble test data: |██████████|100%




Training base predictors on MRI ROI: Cortical Thickness Standard Deviation...
        
... for ensemble performance analysis...


Generating ensemble training data: |██████████|100%
Generating ensemble test data: |██████████|100%




generating metadata for timestep 3
Training base predictors on Main cognitive tests...
        
... for ensemble performance analysis...


Generating ensemble training data: |██████████|100%
Generating ensemble test data: |██████████|100%




Training base predictors on MRI volumes...
        
... for ensemble performance analysis...


Generating ensemble training data: |██████████|100%
Generating ensemble test data: |██████████|100%




Training base predictors on Demo, APOE4 & others...
        
... for ensemble performance analysis...


Generating ensemble training data: |██████████|100%
Generating ensemble test data: |██████████|100%




Training base predictors on MRI ROI: Volume (WM Parcellation)...
        
... for ensemble performance analysis...


Generating ensemble training data: |██████████|100%
Generating ensemble test data: |██████████|100%




Training base predictors on MRI ROI: Volume (Cortical Parcellation)...
        
... for ensemble performance analysis...


Generating ensemble training data: |██████████|100%
Generating ensemble test data: |██████████|100%




Training base predictors on MRI ROI: Surface Area...
        
... for ensemble performance analysis...


Generating ensemble training data: |██████████|100%
Generating ensemble test data: |██████████|100%




Training base predictors on MRI ROI: Cortical Thickness Average...
        
... for ensemble performance analysis...


Generating ensemble training data: |██████████|100%
Generating ensemble test data: |██████████|100%




Training base predictors on MRI ROI: Cortical Thickness Standard Deviation...
        
... for ensemble performance analysis...


Generating ensemble training data: |██████████|100%
Generating ensemble test data: |██████████|100%




generating metadata for timestep 4
Training base predictors on Main cognitive tests...
        
... for ensemble performance analysis...


Generating ensemble training data: |██████████|100%
Generating ensemble test data: |██████████|100%




Training base predictors on MRI volumes...
        
... for ensemble performance analysis...


Generating ensemble training data: |██████████|100%
Generating ensemble test data: |██████████|100%




Training base predictors on Demo, APOE4 & others...
        
... for ensemble performance analysis...


Generating ensemble training data: |██████████|100%
Generating ensemble test data: |██████████|100%




Training base predictors on MRI ROI: Volume (WM Parcellation)...
        
... for ensemble performance analysis...


Generating ensemble training data: |██████████|100%
Generating ensemble test data: |██████████|100%




Training base predictors on MRI ROI: Volume (Cortical Parcellation)...
        
... for ensemble performance analysis...


Generating ensemble training data: |██████████|100%
Generating ensemble test data: |██████████|100%




Training base predictors on MRI ROI: Surface Area...
        
... for ensemble performance analysis...


Generating ensemble training data: |██████████|100%
Generating ensemble test data: |██████████|100%




Training base predictors on MRI ROI: Cortical Thickness Average...
        
... for ensemble performance analysis...


Generating ensemble training data: |██████████|100%
Generating ensemble test data: |██████████|100%




Training base predictors on MRI ROI: Cortical Thickness Standard Deviation...
        
... for ensemble performance analysis...


Generating ensemble training data: |██████████|100%
Generating ensemble test data: |██████████|100%






In [290]:
RNN_training_data = [[dfs[0][i] for dfs in meta_data] for i in range(5)]
RNN_test_data = [[dfs[1][i] for dfs in meta_data] for i in range(5)]
RNN_training_data_final = [df[2] for df in meta_data]

### make first time point in meta-data multiclass

In [291]:
def get_column_names(frame):
    column_names = []
    for i in range(frame.columns.nlevels):
        if i == 0:
            column_names.append(frame.columns.get_level_values(i).unique().drop("labels"))
            
        else:
            column_names.append(frame.columns.get_level_values(i).unique().drop(''))
    
    return column_names

In [292]:
def fix_first_time_point(df):
    new_columns = get_column_names(df)
    classes=[0,1,2]
    new_columns.append(classes)
    new_mux=pd.MultiIndex.from_product(iterables=new_columns, names=["modality", "base predictor", "sample", "class"])
    new_df = pd.DataFrame(columns=new_mux)

    for col in new_df.columns:
        if col[-1] == 0:
            new_df[col] = 1 - df[col[:-1]]
        elif col[-1] == 1:
            new_df[col] = df[col[:-1]]
        else:
            new_df[col] = 0
    
    new_df['labels'] = df['labels']

    return new_df

In [293]:
for i in range(len(RNN_training_data)):
    RNN_training_data[i][0] = fix_first_time_point(RNN_training_data[i][0])
    RNN_test_data[i][0] = fix_first_time_point(RNN_test_data[i][0])

  new_df['labels'] = df['labels']
  new_df['labels'] = df['labels']
  new_df['labels'] = df['labels']
  new_df['labels'] = df['labels']
  new_df['labels'] = df['labels']
  new_df['labels'] = df['labels']
  new_df['labels'] = df['labels']
  new_df['labels'] = df['labels']
  new_df['labels'] = df['labels']
  new_df['labels'] = df['labels']


In [302]:
from keras.models import Sequential
from keras.layers import LSTM,Dense
lstm = Sequential()
lstm.add(LSTM(units=50, input_shape=(5,240), return_sequences=True))
lstm.add(Dense(units=3, activation='softmax'))
lstm.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [303]:
from keras.utils import to_categorical

y_preds = []
for i in range(len(RNN_training_data)):
    
    labels_across_time = np.column_stack([df['labels'].values for df in RNN_training_data[i]])
    labels_across_time = np.eye(3)[labels_across_time]

    RNN_training_data_fold = [data.drop(columns=["labels"], axis=1) for data in RNN_training_data[i]]
    data_arrays_per_timepoint = [df.to_numpy() for df in RNN_training_data_fold]
    tensor_3d = np.stack(data_arrays_per_timepoint, axis=0)
    tensor_3d = np.transpose(tensor_3d, (1,0,2))
    

    lstm.fit(tensor_3d, labels_across_time)

    RNN_test_data_fold = [data.drop(columns=["labels"], axis=1) for data in RNN_test_data[i]]
    data_arrays_per_timepoint_test = [df.to_numpy() for df in RNN_test_data_fold]
    tensor_3d_test = np.stack(data_arrays_per_timepoint_test, axis=0)
    tensor_3d_test = np.transpose(tensor_3d_test, (1,0,2))

    y_preds.append(lstm.predict(tensor_3d_test))



  RNN_training_data_fold = [data.drop(columns=["labels"], axis=1) for data in RNN_training_data[i]]




  RNN_test_data_fold = [data.drop(columns=["labels"], axis=1) for data in RNN_test_data[i]]




In [304]:
y_preds[0][:,0,:]

array([[0.2892171 , 0.55862945, 0.15215343],
       [0.50869006, 0.34574533, 0.14556473],
       [0.43819684, 0.4228513 , 0.13895188],
       [0.4513141 , 0.3999439 , 0.14874193],
       [0.4006186 , 0.44250277, 0.15687862],
       [0.4422505 , 0.4032001 , 0.15454942],
       [0.5173019 , 0.31166005, 0.17103796],
       [0.4365838 , 0.4230678 , 0.14034826],
       [0.28535244, 0.5920469 , 0.12260057],
       [0.48393804, 0.34801358, 0.16804838],
       [0.51637983, 0.34004176, 0.14357844],
       [0.24066319, 0.63235205, 0.12698469],
       [0.47202122, 0.35804096, 0.1699377 ],
       [0.3278752 , 0.5234721 , 0.14865282],
       [0.24992186, 0.6335824 , 0.11649568],
       [0.3698078 , 0.48064047, 0.14955173],
       [0.25639412, 0.61504775, 0.12855807],
       [0.2340338 , 0.64874464, 0.11722147],
       [0.24333149, 0.6153802 , 0.14128833],
       [0.25898972, 0.606531  , 0.13447928],
       [0.49671724, 0.34253013, 0.1607526 ],
       [0.5274381 , 0.32509467, 0.14746724],
       [0.

In [305]:
y_preds_argmax = [np.argmax(pred, axis=-1) for pred in y_preds]
y_preds_argmax[0].shape

(141, 5)

In [334]:
for i in range(y_preds_argmax[0].shape[-1]):
    print(pd.Series(y_preds_argmax[0][:,i]).value_counts())

1    108
0     33
Name: count, dtype: int64
1    120
0     21
Name: count, dtype: int64
1    122
0     19
Name: count, dtype: int64
1    126
0     15
Name: count, dtype: int64
1    118
0     22
2      1
Name: count, dtype: int64


In [306]:
y_trues = []
for i in range(len(RNN_test_data)):
    y_true = pd.concat([data["labels"] for data in RNN_test_data[i]], axis=1).to_numpy()
    y_trues.append(y_true)


In [349]:
y_preds[0][:,1]

array([[0.25870648, 0.62872607, 0.11256742],
       [0.35726115, 0.49770045, 0.14503835],
       [0.2573836 , 0.6308709 , 0.11174551],
       [0.39560208, 0.49185944, 0.11253837],
       [0.38930804, 0.5056786 , 0.10501342],
       [0.32825482, 0.57680845, 0.09493675],
       [0.5113296 , 0.3614841 , 0.12718624],
       [0.49916416, 0.3993196 , 0.10151617],
       [0.22782141, 0.68628585, 0.08589279],
       [0.5741786 , 0.30780116, 0.1180202 ],
       [0.5843612 , 0.29264516, 0.12299377],
       [0.39028478, 0.4919643 , 0.11775091],
       [0.531603  , 0.35685268, 0.1115444 ],
       [0.48574847, 0.40921295, 0.10503855],
       [0.2015098 , 0.6946773 , 0.10381285],
       [0.2440068 , 0.65217906, 0.10381409],
       [0.22060096, 0.67907447, 0.10032459],
       [0.2254173 , 0.6770225 , 0.09756014],
       [0.23808244, 0.5909734 , 0.17094421],
       [0.22634669, 0.6688215 , 0.10483183],
       [0.3552478 , 0.4993132 , 0.14543894],
       [0.3188491 , 0.58286846, 0.09828252],
       [0.

In [351]:
from sklearn.metrics import roc_auc_score
aucs=[]
for i in range(len(y_preds)):
    aucs_for_fold=[]
    for j in range(1, y_preds[i].shape[-2]):
        aucs_for_fold.append(roc_auc_score(y_trues[i][:,j], y_preds[i][:,j], multi_class='ovr'))
    aucs.append(aucs_for_fold)

auc_df = pd.DataFrame(data=aucs, columns=["m06", "m12", "m24", "m36"])
auc_df


Unnamed: 0,m06,m12,m24,m36
0,0.854729,0.90045,0.885206,0.900067
1,0.893392,0.946689,0.914982,0.905733
2,0.954628,0.907251,0.9457,0.954376
3,0.966661,0.973757,0.954954,0.945488
4,0.946802,0.901527,0.960094,0.972329


In [346]:
from sklearn.metrics import f1_score
f1s=[]
for i in range(len(y_preds_argmax)):
    f1s_for_fold=[]
    for j in range(y_preds_argmax[i].shape[-1]):
        f1s_for_fold.append(f1_score(y_preds_argmax[i][:,j], y_trues[i][:,j], average='micro'))
    f1s.append(f1s_for_fold)

f1_df = pd.DataFrame(data=f1s, columns=["bl", "m06", "m12", "m24", "m36"])
f1_df


Unnamed: 0,bl,m06,m12,m24,m36
0,0.950355,0.829787,0.758865,0.631206,0.624113
1,0.957447,0.879433,0.808511,0.787234,0.765957
2,0.971429,0.907143,0.864286,0.85,0.857143
3,0.971429,0.935714,0.928571,0.857143,0.842857
4,0.992857,0.928571,0.842857,0.85,0.864286


In [353]:
from sklearn.metrics import confusion_matrix, classification_report
import numpy as np

confusion_matrices = [ [confusion_matrix(y_pred=y_preds_argmax[i][:,j], y_true=y_trues[i][:,j]) for j in range(5)] for i in range(len(y_trues))]

In [358]:
classification_reports = [ [classification_report(y_pred=y_preds_argmax[i][:,j], y_true=y_trues[i][:,j]) for j in range(5)] for i in range(len(y_trues))]

In [366]:
for j in range(5):
    print(classification_report(y_pred=y_preds_argmax[0][:,j], y_true=y_trues[0][:,j]))

              precision    recall  f1-score   support

           0       1.00      0.82      0.90        40
           1       0.94      1.00      0.97       101

    accuracy                           0.95       141
   macro avg       0.97      0.91      0.94       141
weighted avg       0.95      0.95      0.95       141

              precision    recall  f1-score   support

           0       1.00      0.54      0.70        39
           1       0.80      1.00      0.89        96
           2       0.00      0.00      0.00         6

    accuracy                           0.83       141
   macro avg       0.60      0.51      0.53       141
weighted avg       0.82      0.83      0.80       141

              precision    recall  f1-score   support

           0       0.95      0.46      0.62        39
           1       0.73      0.99      0.84        90
           2       0.00      0.00      0.00        12

    accuracy                           0.76       141
   macro avg       0