In [None]:
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
import pandas as pd
import numpy as np
import eipy.ei as e

In [None]:
from eipy.metrics import fmax_score
from sklearn.metrics import roc_auc_score, matthews_corrcoef

# Scoring functions handed to EnsembleIntegration through its `metrics`
# argument, keyed by a short name.
metrics = dict(
    f_max=fmax_score,
    auc=roc_auc_score,
    mcc=matthews_corrcoef,
)

In [None]:
# Base classifiers handed to EnsembleIntegration, keyed by a short identifier.
# Every estimator is built with library defaults except the SVM, which is
# created with probability=True.
base_predictors = dict(
    ADAB=AdaBoostClassifier(),
    XGB=XGBClassifier(),
    DT=DecisionTreeClassifier(),
    RF=RandomForestClassifier(),
    GB=GradientBoostingClassifier(),
    KNN=KNeighborsClassifier(),
    LR=LogisticRegression(),
    NB=GaussianNB(),
    MLP=MLPClassifier(),
    SVM=SVC(probability=True),
)

In [None]:
import os
import pickle as pkl
from pathlib import Path

# Directory holding the TADPOLE pickles. Defaults to the original location;
# set the TADPOLE_DATA_DIR environment variable to run the notebook on a
# machine where the files live somewhere else.
TADPOLE_DATA_DIR = Path(os.environ.get("TADPOLE_DATA_DIR", "/home/opc/eipy/tadpole"))

# NOTE(security): pickle.load can execute arbitrary code contained in the file.
# Only point TADPOLE_DATA_DIR at files from a trusted source.
with open(TADPOLE_DATA_DIR / "tadpole_data_time_imptn_norm_v1.pickle", "rb") as file:
    data = pkl.load(file)
with open(TADPOLE_DATA_DIR / "tadpole_labels_time_imptn_norm_v1.pickle", "rb") as file:
    labels = pkl.load(file)

In [None]:
# Swap the keyed container for a plain list of its entries, in the order
# .keys() yields them, so later cells can address each entry by position.
# Not idempotent: a second run fails because a list has no .keys().
data = [data[key] for key in data.keys()]

In [None]:
# Replace every entry's index with a fresh 0..n-1 range, discarding the old one.
for key, series in labels.items():
    labels[key] = series.reset_index(drop=True)

In [None]:
# Intermediate transformation to make sure labels are ordered correctly in
# time: each entry of `labels` becomes one DataFrame column, and the frame is
# then taken as a plain NumPy array.
labels = pd.DataFrame(labels).to_numpy()

In [None]:
# Multiclass version of the data: every diagnosis string is replaced by its
# integer code. A value missing from the mapping raises KeyError.
encoding_dict = {'NL': 0, 'MCI': 1, 'Dementia': 2}

labels = np.vectorize(encoding_dict.__getitem__)(labels)

In [None]:
# Sanity check on the encoded label array. Columns are indexed as time points
# further down (presumably rows are subjects — TODO confirm).
labels.shape

In [None]:
# Prevalence of each label across time: one value_counts() table per column
# of `labels`. The loop runs over the actual number of columns rather than a
# hard-coded 5, so it stays correct if the number of time points changes.
for i in range(labels.shape[1]):
    print(pd.Series(labels[:,i]).value_counts())

In [None]:
'''misalign data'''
# Drop the first entry of `data` and the last column of `labels`. After this
# cell data[t] is what used to be data[t+1], while labels[:, t] is still the
# original column t, i.e. inputs and labels are offset by one position.
# Not idempotent: every re-run removes one more entry/column.
data = data[1:]
labels = labels[:,:-1]

In [None]:
# Settings shared by every per-time-step EnsembleIntegration run; only the
# project name differs between time steps.
ei_settings = dict(
    base_predictors=base_predictors,
    k_outer=5,
    k_inner=5,
    n_samples=1,
    sampling_strategy=None,
    sampling_aggregation="mean",
    n_jobs=-1,
    metrics=metrics,
    random_state=38,
    model_building=False,
)

# One entry per time step, each a list of
# [ensemble_training_data, ensemble_test_data, ensemble_training_data_final, base_summary].
meta_data = []
for t, X_train_test_timestep in enumerate(data):
    # time dependent data splitting: column t of `labels` goes with data[t]
    labels_at_timestep = labels[:, t]
    EI_for_timestep = e.EnsembleIntegration(project_name=f"time step {t}", **ei_settings)
    print(f"generating metadata for timestep {t}")
    EI_for_timestep.fit_base(X_train_test_timestep, labels_at_timestep)
    meta_data.append([
        EI_for_timestep.ensemble_training_data,
        EI_for_timestep.ensemble_test_data,
        EI_for_timestep.ensemble_training_data_final,
        EI_for_timestep.base_summary,
    ])


In [None]:
# Regroup the per-time-step results by fold: entry [fold][t] is the fold's
# training (resp. test) data produced at time step t. Positions 0, 1 and 2 of
# each meta_data entry are the training, test and final training data.
RNN_training_data = []
RNN_test_data = []
for fold in range(5):
    RNN_training_data.append([per_timestep[0][fold] for per_timestep in meta_data])
    RNN_test_data.append([per_timestep[1][fold] for per_timestep in meta_data])
RNN_training_data_final = [per_timestep[2] for per_timestep in meta_data]

### make first time point in meta-data multiclass (if necessary)

In [None]:
def get_column_names(df):
    """Return the distinct values of every column level of ``df``.

    The result is a list with one pandas Index per column level, in level
    order. The entry "labels" is removed from level 0 and the empty string
    is removed from every other level; ``Index.drop`` raises ``KeyError``
    when the entry to remove is not present.
    """
    column_index = df.columns
    return [
        column_index.get_level_values(level).unique().drop("labels" if level == 0 else '')
        for level in range(column_index.nlevels)
    ]

def fix_first_time_point(df):
    """Expand single-score prediction columns into three per-class columns.

    For every (modality, base predictor, sample) column ``p`` of ``df`` the
    result holds three columns with an extra "class" level:
    class 0 is ``1 - p``, class 1 is ``p`` and class 2 is the constant 0.
    The "labels" column of ``df`` is copied over unchanged.
    (Presumably ``p`` is the predicted probability of class 1 — TODO confirm.)
    """
    level_values = get_column_names(df)
    level_values.append([0, 1, 2])
    expanded_columns = pd.MultiIndex.from_product(
        iterables=level_values,
        names=["modality", "base predictor", "sample", "class"],
    )
    expanded = pd.DataFrame(columns=expanded_columns)

    # Columns are filled in their index order; the first assignment (a class-0
    # Series) is what gives the initially empty frame its row index.
    for column in expanded.columns:
        source_key, class_id = column[:-1], column[-1]
        if class_id == 1:
            expanded[column] = df[source_key]
        elif class_id == 0:
            expanded[column] = 1 - df[source_key]
        else:
            expanded[column] = 0

    expanded['labels'] = df['labels']
    return expanded

In [None]:
# Rewrite time point 0 of every fold, first in the training data and then in
# the test data, into the three-class column layout.
n_folds = len(RNN_training_data)
for fold in range(n_folds):
    for split in (RNN_training_data, RNN_test_data):
        split[fold][0] = fix_first_time_point(split[fold][0])

# TIME SERIES TIME

In [None]:
'''Alternate RNN loss function for ordinal labels'''
from keras import backend as K

def ordinal_regression_loss(y_true, y_pred, from_logits=False):
    """
    Ordinal regression loss function.

    Sums, over every element of the batch, the squared difference between
    the cumulative class distribution of the targets and that of the
    predictions. Classes are expected on the last axis, in their ordinal
    order, so mass placed far from the true class is penalised more than
    mass placed on a neighbouring class.

    Parameters
    ----------
    y_true : tensor
        Target class probabilities (e.g. one-hot rows). Used as given:
        pushing them through a softmax would flatten them towards uniform
        and the cumulative sums would no longer be the true CDF.
    y_pred : tensor
        Predicted class probabilities, e.g. the output of a softmax layer.
        Pass ``from_logits=True`` when the model emits raw scores instead.
    from_logits : bool, default False
        When True, a softmax over the last axis is applied to ``y_pred``
        before the cumulative sum.

    Returns
    -------
    tensor
        Scalar loss (sum over all elements).
    """
    if from_logits:
        y_pred = K.softmax(y_pred, axis=-1)

    # Calculate cumulative probabilities for true and predicted labels
    true_cum_probs = K.cumsum(y_true, axis=-1)
    pred_cum_probs = K.cumsum(y_pred, axis=-1)

    # Calculate the ordinal regression loss
    loss = K.sum((true_cum_probs - pred_cum_probs) ** 2)

    return loss

In [None]:
from keras.models import Sequential
from keras.layers import LSTM,Dense

# Number of output classes; matches the three integer codes used for the labels.
N_CLASSES = 3

y_preds = [] # LSTM predictions at every time point. Will be populated by one array per fold
for i in range(len(RNN_training_data)):
    # reformat labels: one column per time point, then one-hot encode every entry
    labels_across_time = np.column_stack([df['labels'].values for df in RNN_training_data[i]])
    labels_across_time = np.eye(N_CLASSES)[labels_across_time]
    # reformat data: drop the label column and stack the time points on axis 1
    RNN_training_data_fold = [df.drop(columns=["labels"], axis=1, level=0) for df in RNN_training_data[i]]
    data_arrays_per_timepoint = [df.to_numpy() for df in RNN_training_data_fold]
    tensor_3d = np.stack(data_arrays_per_timepoint, axis=1)

    # The input shape is read from the training tensor instead of being
    # hard-coded, so the model follows the number of time points and features
    # actually produced upstream.
    # NOTE(review): the number of LSTM units grows with the fold index, so each
    # fold trains a different architecture — confirm this is intended.
    lstm = Sequential()
    lstm.add(LSTM(units=50+10*i, input_shape=tensor_3d.shape[1:], return_sequences=True))
    lstm.add(Dense(units=N_CLASSES, activation='softmax'))
    # ordinal_regression_loss (defined above) is the alternative loss for ordinal labels
    lstm.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    # NOTE(review): fit() is called with its default epochs and batch size.
    lstm.fit(tensor_3d, labels_across_time)

    #reformat test data
    RNN_test_data_fold = [df.drop(columns=["labels"], axis=1, level=0) for df in RNN_test_data[i]]
    data_arrays_per_timepoint_test = [df.to_numpy() for df in RNN_test_data_fold]
    tensor_3d_test = np.stack(data_arrays_per_timepoint_test, axis=1)

    y_preds.append(lstm.predict(tensor_3d_test))

In [None]:
# Reduce each fold's class scores to the index of the highest-scoring class
# along the last axis.
y_preds_argmax = [fold_scores.argmax(axis=-1) for fold_scores in y_preds]

In [None]:
# For every fold, place the "labels" column of each time point's test frame
# side by side and keep the result as a NumPy array.
y_trues = [
    pd.concat([frame["labels"] for frame in fold_frames], axis=1).to_numpy()
    for fold_frames in RNN_test_data
]

In [None]:
from sklearn.metrics import classification_report

# One classification report per fold and per time point. The inner loop runs
# over the number of time-point columns actually present in the fold's labels
# rather than a hard-coded 4.
for i in range(len(y_preds_argmax)):
    print(f" \n FOLD {i+1} \n")
    for j in range(y_trues[i].shape[1]):
        print(classification_report(y_pred=y_preds_argmax[i][:,j], y_true=y_trues[i][:,j]))

In [None]:
'''get mean and standard deviation of dementia class f1 score across folds'''

from sklearn.metrics import precision_recall_fscore_support

# F1 of class 2 (Dementia) at the last time point, one value per fold.
dem_f1 = []
for i in range(len(y_preds_argmax)):
    # labels=[0, 1, 2] pins the per-class output to exactly these three
    # classes in this order. Without it, a class that appears in neither
    # y_true nor y_pred for a fold is dropped from the output and f1[2]
    # raises IndexError.
    _, _, f1, _ = precision_recall_fscore_support(
        y_pred=y_preds_argmax[i][:,-1],
        y_true=y_trues[i][:,-1],
        labels=[0, 1, 2],
    )
    dem_f1.append(f1[2])

np.mean(dem_f1), np.std(dem_f1)