In [1]:
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
import pandas as pd
import eipy.ei as e
from eipy.additional_ensembles import MeanAggregation, CES

In [2]:
from eipy.metrics import fmax_score
from sklearn.metrics import roc_auc_score, matthews_corrcoef

# Scoring functions keyed by a short metric name; handed to EnsembleIntegration
# through its `metrics=` argument further down.
metrics = dict(
    f_max=fmax_score,
    auc=roc_auc_score,
    mcc=matthews_corrcoef,
)

In [3]:
# Base classifiers, keyed by a short label and all left at their library defaults
# (SVC enables probability=True so it can emit class probabilities).
base_predictors = dict(
    ADAB=AdaBoostClassifier(),
    XGB=XGBClassifier(),
    DT=DecisionTreeClassifier(),
    RF=RandomForestClassifier(),
    GB=GradientBoostingClassifier(),
    KNN=KNeighborsClassifier(),
    LR=LogisticRegression(),
    NB=GaussianNB(),
    MLP=MLPClassifier(),
    SVM=SVC(probability=True),
)

## data generation

In [4]:
import numpy as np

# Seed NumPy's global random state so the synthetic data generated below is repeatable.
np.random.seed(seed=38)

# Total number of samples to generate, shared across all classes.
n_samples = 600

### single label data

In [5]:
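# Disabled alternative generator: one label per sample (1-D y) instead of one
# label per time step. Kept for reference only; nothing in this cell runs.
# import numpy as np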
# n_samples = 600

# n_time_steps = 8

# n_features = 30

# n_classes = 3 # 2 or 3


# if n_classes == 2:
#     # Generate toy dataset with regularity
#     X_class0 = np.random.randn(n_samples // 2, n_features, n_time_steps) + 1.5  # Add a bias to Class 0
#     X_class1 = np.random.randn(n_samples // 2, n_features, n_time_steps) - 1.5  # Subtract a bias from Class 1
#     X = np.concatenate([X_class0, X_class1])

#     # Generate labels (two classes)
#     y = np.concatenate([np.zeros(n_samples // 2), np.ones(n_samples // 2)])

# else:
#     # Generate toy dataset with less order and three classes
#     X_class0 = np.random.randn(n_samples // 3, n_features, n_time_steps) + 1.5  # Add a bias to Class 0
#     X_class1 = np.random.randn(n_samples // 3, n_features, n_time_steps) - 1.5  # Subtract a bias from Class 1
#     X_class2 = np.random.randn(n_samples // 3, n_features, n_time_steps) * 0.5  # Introduce disorder in Class 2
#     X = np.concatenate([X_class0, X_class1, X_class2])

#     # Generate labels (three classes)
#     y = np.concatenate([np.zeros(n_samples // 3), np.ones(n_samples // 3), 2 * np.ones(n_samples // 3)])

# # Shuffle the dataset
# shuffle_indices = np.random.permutation(n_samples)
# X = X[shuffle_indices]
# y = y[shuffle_indices]

### time dependent label data

In [6]:
import numpy as np

n_time_steps = 8
n_features = 30
n_classes = 3  # 2 or 3

# Every sample is an (n_features, n_time_steps) block of standard-normal noise that is
# shifted or scaled according to its class. Each sample carries one label per time step;
# in this toy set the label is constant over time.
if n_classes == 2:
    # Two well-separated classes.
    samples_per_class = n_samples // 2
    class_shape = (samples_per_class, n_features, n_time_steps)

    X_class0 = np.random.randn(*class_shape) + 1.5  # Add a bias to Class 0
    X_class1 = np.random.randn(*class_shape) - 1.5  # Subtract a bias from Class 1
    X = np.concatenate([X_class0, X_class1])

    # Labels: shape (samples, time steps), values 0 / 1
    y_class0 = np.zeros((samples_per_class, n_time_steps))
    y_class1 = np.ones((samples_per_class, n_time_steps))
    y = np.concatenate([y_class0, y_class1])

else:
    # Three classes; the third is zero-centred with reduced variance, so it is less ordered.
    samples_per_class = n_samples // 3
    class_shape = (samples_per_class, n_features, n_time_steps)

    X_class0 = np.random.randn(*class_shape) + 1.5  # Add a bias to Class 0
    X_class1 = np.random.randn(*class_shape) - 1.5  # Subtract a bias from Class 1
    X_class2 = np.random.randn(*class_shape) * 0.5  # Introduce disorder in Class 2
    X = np.concatenate([X_class0, X_class1, X_class2])

    # Labels: shape (samples, time steps), values 0 / 1 / 2
    y_class0 = np.zeros((samples_per_class, n_time_steps))
    y_class1 = np.ones((samples_per_class, n_time_steps))
    y_class2 = 2 * np.ones((samples_per_class, n_time_steps))
    y = np.concatenate([y_class0, y_class1, y_class2])

# Shuffle samples and labels with one shared permutation. The permutation is sized from
# the generated data rather than from n_samples: the floor division above drops samples
# when n_samples is not a multiple of the class count, and indexing X with
# permutation(n_samples) would then raise an IndexError.
shuffle_indices = np.random.permutation(len(X))
X = X[shuffle_indices]
y = y[shuffle_indices]

In [7]:
# Sanity check: labels should be laid out as (samples, time steps).
np.shape(y)

(600, 8)

## Training
Fit the base predictors (BPs) separately at every time point, then train an LSTM on their predictions and gather its outputs for classification.

In [8]:
from sklearn.model_selection import train_test_split

n_modalities = 3
features_per_modality = n_features // n_modalities


def _split_modalities(data):
    """Slice the feature axis (axis 1) of `data` into equal, contiguous modality blocks.

    Returns a dict mapping "Modality_<i>" to the i-th block; all other axes are kept.
    """
    return {
        f"Modality_{i}": data[:, features_per_modality * i: features_per_modality * (i + 1), :]
        for i in range(n_modalities)
    }


# The split uses a fixed random_state and no stratification, so it selects the same rows
# no matter which time step's labels are passed in. Split once with the full label matrix
# and slice per time step inside the loop instead of re-splitting on every iteration.
X_train, X_test, y_train_all, y_test_all = train_test_split(
    X, y, test_size=0.25, random_state=38
)
X_train_dict = _split_modalities(X_train)
X_test_dict = _split_modalities(X_test)  # not consumed in this cell; kept available as before

meta_data = []
for t in range(n_time_steps):
    # Labels and features observed at this time step only.
    y_train, y_test = y_train_all[:, t], y_test_all[:, t]
    X_train_timestep = {k: v[:, :, t] for k, v in X_train_dict.items()}

    # A fresh EnsembleIntegration per time step, so each step gets its own base predictors.
    EI_for_timestep = e.EnsembleIntegration(
        base_predictors=base_predictors,
        k_outer=5,
        k_inner=5,
        n_samples=1,
        sampling_strategy=None,
        sampling_aggregation="mean",
        n_jobs=-1,
        metrics=metrics,
        random_state=38,
        project_name=f"time step {t}",
        model_building=False,
    )
    print(f"generating metadata for timestep {t}")
    EI_for_timestep.fit_base(X_train_timestep, y_train)

    # Order matters: later cells index these entries positionally (0, 1, 2).
    meta_data.append([
        EI_for_timestep.ensemble_training_data,
        EI_for_timestep.ensemble_test_data,
        EI_for_timestep.ensemble_training_data_final,
        EI_for_timestep.base_summary,
    ])

generating metadata for timestep 0
Training base predictors on Modality_0...
        
... for ensemble performance analysis...


Generating ensemble training data: |██████████|100%
Generating ensemble test data: |██████████|100%




Training base predictors on Modality_1...
        
... for ensemble performance analysis...


Generating ensemble training data: |██████████|100%
Generating ensemble test data: |██████████|100%




Training base predictors on Modality_2...
        
... for ensemble performance analysis...


Generating ensemble training data: |██████████|100%
Generating ensemble test data: |██████████|100%




generating metadata for timestep 1
Training base predictors on Modality_0...
        
... for ensemble performance analysis...


Generating ensemble training data: |██████████|100%
Generating ensemble test data: |██████████|100%




Training base predictors on Modality_1...
        
... for ensemble performance analysis...


Generating ensemble training data: |██████████|100%
Generating ensemble test data: |██████████|100%




Training base predictors on Modality_2...
        
... for ensemble performance analysis...


Generating ensemble training data: |██████████|100%
Generating ensemble test data: |██████████|100%




generating metadata for timestep 2
Training base predictors on Modality_0...
        
... for ensemble performance analysis...


Generating ensemble training data: |██████████|100%
Generating ensemble test data: |██████████|100%




Training base predictors on Modality_1...
        
... for ensemble performance analysis...


Generating ensemble training data: |██████████|100%
Generating ensemble test data: |██████████|100%




Training base predictors on Modality_2...
        
... for ensemble performance analysis...


Generating ensemble training data: |██████████|100%
Generating ensemble test data: |██████████|100%




generating metadata for timestep 3
Training base predictors on Modality_0...
        
... for ensemble performance analysis...


Generating ensemble training data: |██████████|100%
Generating ensemble test data: |██████████|100%




Training base predictors on Modality_1...
        
... for ensemble performance analysis...


Generating ensemble training data: |██████████|100%
Generating ensemble test data: |██████████|100%




Training base predictors on Modality_2...
        
... for ensemble performance analysis...


Generating ensemble training data: |██████████|100%
Generating ensemble test data: |██████████|100%




generating metadata for timestep 4
Training base predictors on Modality_0...
        
... for ensemble performance analysis...


Generating ensemble training data: |██████████|100%
Generating ensemble test data: |██████████|100%




Training base predictors on Modality_1...
        
... for ensemble performance analysis...


Generating ensemble training data: |██████████|100%
Generating ensemble test data: |██████████|100%




Training base predictors on Modality_2...
        
... for ensemble performance analysis...


Generating ensemble training data: |██████████|100%
Generating ensemble test data: |██████████|100%




generating metadata for timestep 5
Training base predictors on Modality_0...
        
... for ensemble performance analysis...


Generating ensemble training data: |██████████|100%
Generating ensemble test data: |██████████|100%




Training base predictors on Modality_1...
        
... for ensemble performance analysis...


Generating ensemble training data: |██████████|100%
Generating ensemble test data: |██████████|100%




Training base predictors on Modality_2...
        
... for ensemble performance analysis...


Generating ensemble training data: |██████████|100%
Generating ensemble test data: |██████████|100%




generating metadata for timestep 6
Training base predictors on Modality_0...
        
... for ensemble performance analysis...


Generating ensemble training data: |██████████|100%
Generating ensemble test data: |██████████|100%




Training base predictors on Modality_1...
        
... for ensemble performance analysis...


Generating ensemble training data: |██████████|100%
Generating ensemble test data: |██████████|100%




Training base predictors on Modality_2...
        
... for ensemble performance analysis...


Generating ensemble training data: |██████████|100%
Generating ensemble test data: |██████████|100%




generating metadata for timestep 7
Training base predictors on Modality_0...
        
... for ensemble performance analysis...


Generating ensemble training data: |██████████|100%
Generating ensemble test data: |██████████|100%




Training base predictors on Modality_1...
        
... for ensemble performance analysis...


Generating ensemble training data: |██████████|100%
Generating ensemble test data: |██████████|100%




Training base predictors on Modality_2...
        
... for ensemble performance analysis...


Generating ensemble training data: |██████████|100%
Generating ensemble test data: |██████████|100%






Rearrange the metadata so that it is grouped by fold first and, within each fold, ordered by time step.

In [9]:
def _regroup_by_fold(slot, n_folds=5):
    """Return one list per fold holding that fold's `slot` entry for every time step, in order.

    `slot` is the position inside each meta_data item (0 = training data, 1 = test data).
    """
    return [[per_step[slot][fold] for per_step in meta_data] for fold in range(n_folds)]


RNN_training_data = _regroup_by_fold(0)
RNN_test_data = _regroup_by_fold(1)
# The final training data is not split by fold here: one entry per time step.
RNN_training_data_final = [per_step[2] for per_step in meta_data]

In [10]:
# Fold 0 only: strip the label column from each time step's frame and convert to arrays.
RNN_training_data_fold_zero = [frame.drop(columns=["labels"]) for frame in RNN_training_data[0]]
data_arrays_per_timepoint = [frame.to_numpy() for frame in RNN_training_data_fold_zero]

# Stack along a new leading time axis, then swap the first two axes so the layout is
# (samples, time steps, features).
tensor_3d = np.swapaxes(np.stack(data_arrays_per_timepoint, axis=0), 0, 1)

In [15]:
from keras.utils import to_categorical

# Build a (samples, time steps) label matrix by reading the labels from each time step's
# own frame. Repeating the time-step-0 labels across the sequence would silently discard
# any label that changes over time.
training_labels_fold_zero = np.stack(
    [frame["labels"].to_numpy() for frame in RNN_training_data[0]], axis=1
)

# One-hot encode to (samples, time steps, n_classes) for categorical cross-entropy.
training_labels_fold_zero = to_categorical(training_labels_fold_zero, num_classes=n_classes)

## single lstm seq2seq model

In [16]:
from keras.models import Sequential
from keras.layers import LSTM, Dense

# Sequence-to-sequence classifier: return_sequences=True makes the LSTM emit a vector at
# every time step, and the softmax Dense layer maps each of them to class probabilities.
# The input shape (time steps, features) is read from the training tensor and the output
# width from n_classes, so the model follows the data instead of hard-coded sizes.
lstm = Sequential()
lstm.add(LSTM(units=50, input_shape=tensor_3d.shape[1:], return_sequences=True))
lstm.add(Dense(units=n_classes, activation='softmax'))
lstm.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])



In [17]:
# Inputs and one-hot targets must agree on their first two axes (samples, time steps).
np.shape(tensor_3d), np.shape(training_labels_fold_zero)

((360, 8, 90), (360, 8, 3))

In [18]:
# Train on fold 0 with Keras' default fit settings.
lstm.fit(x=tensor_3d, y=training_labels_fold_zero)



<keras.src.callbacks.History at 0x7f40d82b0be0>

In [20]:
# Same preparation as for the training tensor, applied to fold 0's test frames:
# drop the label column, convert each time step to an array ...
RNN_test_data_fold_zero = [frame.drop(columns=["labels"]) for frame in RNN_test_data[0]]
data_arrays_per_timepoint_test = [frame.to_numpy() for frame in RNN_test_data_fold_zero]

# ... stack on a leading time axis and swap to (samples, time steps, features).
tensor_3d_test = np.swapaxes(np.stack(data_arrays_per_timepoint_test, axis=0), 0, 1)

In [21]:
# Per-time-step class probabilities for fold 0's test samples.
y_pred = lstm.predict(x=tensor_3d_test)



In [22]:
# Expect (test samples, time steps, classes).
np.shape(y_pred)

(90, 8, 3)

In [23]:
# Build the (samples, time steps) test label matrix from the test frames themselves:
# one column per time step, read from that time step's own frame. This keeps the
# sequence length tied to the test data (not the training tensor) and preserves labels
# that change over time.
test_labels_fold_zero = np.stack(
    [frame["labels"].to_numpy() for frame in RNN_test_data[0]], axis=1
)

# One-hot encode to (samples, time steps, n_classes), matching the model output.
test_labels_fold_zero = to_categorical(test_labels_fold_zero, num_classes=n_classes)

In [24]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Reduce one-hot targets and predicted probabilities to class ids along the last axis,
# then pool every (sample, time step) pair into a single flat vector.
y_true_flat = test_labels_fold_zero.argmax(axis=-1).ravel()
y_pred_flat = y_pred.argmax(axis=-1).ravel()

conf_matrix = confusion_matrix(y_true=y_true_flat, y_pred=y_pred_flat)
accuracy_score(y_true=y_true_flat, y_pred=y_pred_flat)

1.0

In [None]:
# Reload the eipy.ei module (imported at the top as `e`) in the running kernel --
# presumably so that local edits to the package take effect without a restart.
import importlib
importlib.reload(e)

In [None]:
from keras.models import Sequential
from keras.layers import LSTM, GRU, Dense, Masking

# Width of the base-predictor output per class at each time step
# (10 base predictors x 3 modalities) -- TODO confirm if either count changes.
N_BASE_OUTPUTS = 30
RNN_UNITS = 50

# Recurrent layer types to evaluate as ensemble models, keyed by the name reported in the
# summary. Add `"GRU": GRU` to evaluate a GRU as well; only listed types are built.
recurrent_layers = {"LSTM": LSTM}


def build_rnn_classifier(recurrent_layer, n_steps):
    """Build and compile a single-output sequence classifier.

    Parameters
    ----------
    recurrent_layer : Keras recurrent layer class (e.g. LSTM or GRU).
    n_steps : number of time steps in each input sequence.

    Returns
    -------
    A compiled Sequential model: one recurrent layer that returns only its final state,
    followed by a Dense output layer. For n_classes == 2 the output is one sigmoid unit
    trained with binary cross-entropy; otherwise it is a softmax over n_classes trained
    with sparse categorical cross-entropy (integer labels).
    """
    if n_classes == 2:
        n_inputs, n_outputs = N_BASE_OUTPUTS, 1
        activation, loss = 'sigmoid', 'binary_crossentropy'
    else:
        n_inputs, n_outputs = N_BASE_OUTPUTS * n_classes, n_classes
        activation, loss = 'softmax', 'sparse_categorical_crossentropy'

    model = Sequential()
    # A Masking(mask_value=0.0, input_shape=(None, n_inputs)) layer was considered here
    # for variable-length input but is not used.
    model.add(recurrent_layer(units=RNN_UNITS, input_shape=(n_steps, n_inputs)))
    model.add(Dense(units=n_outputs, activation=activation))
    model.compile(optimizer='adam', loss=loss, metrics=['accuracy'])
    return model


performance_for_timesteps = []
for t in range(n_time_steps):
    # Evaluate on the sequence prefix made of time steps 0..t.
    n_steps = t + 1

    # Fresh, untrained models for every prefix length (the input shape depends on it).
    ensembles = {
        name: build_rnn_classifier(layer, n_steps)
        for name, layer in recurrent_layers.items()
    }

    # Per fold, keep only the first n_steps time steps of the base-predictor outputs.
    ensemble_training_data = [x[:n_steps] for x in RNN_training_data]
    ensemble_test_data = [x[:n_steps] for x in RNN_test_data]
    # ensemble_training_data_final = [x[:t] for x in RNN_training_data_final]

    RNNs = e.EnsembleIntegration(
        base_predictors=base_predictors,
        k_outer=5,
        k_inner=5,
        n_samples=1,
        sampling_strategy=None,
        sampling_aggregation=None,
        n_jobs=-1,
        metrics=metrics,
        random_state=38,
        project_name="test",
        model_building=False,
    )
    # Inject the precomputed base-predictor outputs directly instead of calling fit_base.
    RNNs.modality_names = ["stuff"]
    RNNs.ensemble_training_data = ensemble_training_data
    RNNs.ensemble_test_data = ensemble_test_data
    # RNNs.ensemble_training_data_final = ensemble_training_data_final

    RNNs.fit_ensemble(ensemble_predictors=ensembles)
    performance_for_timesteps.append(RNNs.ensemble_summary["metrics"])


In [None]:
# Place every prefix length's metrics side by side; the outer column level is the
# position of that entry in performance_for_timesteps.
dfs = performance_for_timesteps
n_entries = len(dfs)
result_df = pd.concat(objs=dfs, axis="columns", keys=range(n_entries))

In [None]:
# Display the combined metrics table built in the previous cell.
result_df