# **A Novel Approach for Three-Way Classification of Lumbar Spine Degeneration Using Pseudo-Modality Learning to Handle Missing MRI Data**

## Libs

In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.losses import BinaryCrossentropy
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score, roc_auc_score

## Training Pipeline

In [2]:
def train(embedding_path, labels_path, loss_fn='binary_crossentropy', n_splits=5):
    embeddings = pd.read_csv(embedding_path)
    labels = pd.read_csv(labels_path)
    
    id_cols = labels[['study_id', 'series_id']]
    cols_to_impute = labels.drop(columns=['study_id', 'series_id'])
    imputed_cols = cols_to_impute.apply(lambda x: x.fillna(x.mode()[0]))
    final_df = pd.concat([id_cols, imputed_cols], axis=1)
    labels = final_df

    id_cols = labels[['study_id', 'series_id']]
    cols_to_encode = labels.drop(columns=['study_id', 'series_id'])
    encoder = OneHotEncoder(sparse_output=False)
    encoded_cols = encoder.fit_transform(cols_to_encode)
    encoded_df = pd.DataFrame(encoded_cols, columns=encoder.get_feature_names_out(cols_to_encode.columns))
    final_df = pd.concat([id_cols, encoded_df], axis=1)
    df = pd.merge(embeddings, final_df, on='study_id', how='inner')

    fc_layer = len(df.columns[515:])
    X = df.iloc[:, :512].values
    Y = df.iloc[:, 515:].values

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    fold_accuracies = []
    fold_val_accuracies = []
    fold_f1_scores = []
    fold_roc_auc_scores = []
    
    best_test_accuracy = 0.0
    best_val_accuracy = 0.0

    for fold, (train_index, test_index) in enumerate(kf.split(X), 1):
        X_train, X_test = X[train_index], X[test_index]
        Y_train, Y_test = Y[train_index], Y[test_index]

        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        model = Sequential([
            Input(shape=(512,)),
            Dense(256, activation='relu'),
            Dense(128, activation='relu'),
            Dense(64, activation='relu'),
            Dense(fc_layer, activation='sigmoid')
        ])

        model.compile(optimizer='adam', loss=loss_fn, metrics=['accuracy'])
        history = model.fit(X_train, Y_train, epochs=20, batch_size=16, verbose=0, validation_data=(X_test, Y_test))

        loss, accuracy = model.evaluate(X_test, Y_test, verbose=0)
        fold_accuracies.append(accuracy)

        val_accuracy = np.mean(history.history['val_accuracy'])
        fold_val_accuracies.append(val_accuracy)

        # Predict on test set
        Y_pred = model.predict(X_test)

        # Calculate F1 Score and ROC AUC
        f1 = f1_score(Y_test, (Y_pred > 0.5).astype(int), average='macro', zero_division=1)
        
        if Y_test.sum(axis=0).min() == 0:
            print(f"Warning: Skipping ROC AUC for fold {fold} due to missing classes.")
            roc_auc = np.nan
        else:
            try:
                roc_auc = roc_auc_score(Y_test, Y_pred, average='macro', multi_class='ovr')
            except ValueError as e:
                print(f"Error calculating ROC AUC for fold {fold}: {e}.")
                roc_auc = np.nan

        fold_f1_scores.append(f1)
        fold_roc_auc_scores.append(roc_auc)

        if accuracy > best_test_accuracy:
            best_test_accuracy = accuracy
            best_val_accuracy = val_accuracy

        print(f'Fold {fold}: Test Accuracy = {accuracy:.4f}, Validation Accuracy = {val_accuracy:.4f}, '
              f'F1 Score = {f1:.4f}, ROC AUC = {roc_auc:.4f}')

    avg_accuracy = np.mean(fold_accuracies)
    avg_val_accuracy = np.mean(fold_val_accuracies)
    avg_f1_score = np.mean(fold_f1_scores)
    avg_roc_auc = np.mean(fold_roc_auc_scores)

    print(f'Average Test Accuracy across {n_splits} folds: {avg_accuracy:.4f}, '
          f'Average Validation Accuracy across {n_splits} folds: {avg_val_accuracy:.4f}, '
          f'Average F1 Score across {n_splits} folds: {avg_f1_score:.4f}, '
          f'Average ROC AUC across {n_splits} folds: {avg_roc_auc:.4f}')

    return avg_accuracy, avg_val_accuracy, best_test_accuracy, best_val_accuracy, avg_f1_score, avg_roc_auc


In [3]:
attention_embeddings_paths = [
    '/kaggle/input/attention-embeddings-for-rsna/AT2_attention_embeddings_gsl.csv',
    '/kaggle/input/attention-embeddings-for-rsna/AT2_attention_embeddings_hist.csv',
    '/kaggle/input/attention-embeddings-for-rsna/ST1_attention_embeddings_gsl.csv',
    '/kaggle/input/attention-embeddings-for-rsna/ST1_attention_embeddings_hist.csv',
    '/kaggle/input/attention-embeddings-for-rsna/ST2_attention_embeddings_gsl.csv',
    '/kaggle/input/attention-embeddings-for-rsna/ST2_attention_embeddings_hist.csv'    
]

average_embeddings_paths = [
    '/kaggle/input/embeddings-for-rsna/at2-greyscl/final_embeddings.csv',
    '/kaggle/input/embeddings-for-rsna/at2-hist/final_embeddings.csv',
    '/kaggle/input/embeddings-for-rsna/st1-greyscl/final_embeddings.csv',
    '/kaggle/input/embeddings-for-rsna/st1-hist/final_embeddings.csv',
    '/kaggle/input/embeddings-for-rsna/st2-greyscl/final_embeddings.csv',
    '/kaggle/input/embeddings-for-rsna/st2-hist/final_embeddings.csv'
]

medicalnet_embeddings_paths = [
    '/kaggle/input/medicalnet-attention-layers-for-rsna/AT2_attention_embeddings_gsl.csv',
    '/kaggle/input/medicalnet-attention-layers-for-rsna/AT2_attention_embeddings_hist.csv',
    '/kaggle/input/medicalnet-attention-layers-for-rsna/ST1_attention_embeddings_gsl.csv',
    '/kaggle/input/medicalnet-attention-layers-for-rsna/ST1_attention_embeddings_hist.csv',
    '/kaggle/input/medicalnet-attention-layers-for-rsna/ST2_attention_embeddings_gsl.csv',
    '/kaggle/input/medicalnet-attention-layers-for-rsna/ST2_attention_embeddings_hist.csv',
]

labels_paths = [
    '/kaggle/input/preprocessed-dataset/train_data_AT2.csv',
    '/kaggle/input/preprocessed-dataset/train_data_ST1.csv',
    '/kaggle/input/preprocessed-dataset/train_data_ST2.csv'
]

list_of_combination = [
    'AT2 - GSL - Attention Network',
    'AT2 - HIST - Attention Network',
    'ST1 - GSL - Attention Network',
    'ST1 - HIST - Attention Network',
    'ST2 - GSL - Attention Network',
    'ST2 - HIST - Attention Network',
    
    'AT2 - GSL - Average ResNet50',
    'AT2 - HIST - Average ResNet50',
    'ST1 - GSL - Average ResNet50',
    'ST1 - HIST - Average ResNet50',
    'ST2 - GSL - Average ResNet50',
    'ST2 - HIST - Average ResNet50',
    
    'AT2 - GSL - MedicalNet Network',
    'AT2 - HIST - MedicalNet Network',
    'ST1 - GSL - MedicalNet Network',
    'ST1 - HIST - MedicalNet Network',
    'ST2 - GSL - MedicalNet Network',
    'ST2 - HIST - MedicalNet Network'
]

results_df = pd.DataFrame(columns=['Combination', 'Avg_Test_Accuracy', 'Avg_Val_Accuracy'])

all_embedding_paths = [
    *attention_embeddings_paths,
    *average_embeddings_paths,
    *medicalnet_embeddings_paths
]

corresponding_labels_paths = [
    labels_paths[0],  # AT2 - GSL - Attention Network
    labels_paths[0],  # AT2 - HIST - Attention Network
    labels_paths[1],  # ST1 - GSL - Attention Network
    labels_paths[1],  # ST1 - HIST - Attention Network
    labels_paths[2],  # ST2 - GSL - Attention Network
    labels_paths[2],  # ST2 - HIST - Attention Network
    
    labels_paths[0],  # AT2 - GSL - Average ResNet50
    labels_paths[0],  # AT2 - HIST - Average ResNet50
    labels_paths[1],  # ST1 - GSL - Average ResNet50
    labels_paths[1],  # ST1 - HIST - Average ResNet50
    labels_paths[2],  # ST2 - GSL - Average ResNet50
    labels_paths[2],  # ST2 - HIST - Average ResNet50
    
    labels_paths[0],  # AT2 - GSL - MedicalNet Network
    labels_paths[0],  # AT2 - HIST - MedicalNet Network
    labels_paths[1],  # ST1 - GSL - MedicalNet Network
    labels_paths[1],  # ST1 - HIST - MedicalNet Network
    labels_paths[2],  # ST2 - GSL - MedicalNet Network
    labels_paths[2],  # ST2 - HIST - MedicalNet Network
]

In [4]:
results_df = pd.DataFrame(columns=['Combination', 'Avg_Test_Accuracy', 'Avg_Val_Accuracy', 'Best_Test_Accuracy', 'Best_Val_Accuracy', 'Avg_F1_Score', 'Avg_ROC_AUC'])

for embedding_path, label_path, name in zip(all_embedding_paths, corresponding_labels_paths, list_of_combination):
    print(f"\nTraining for: {name} - {embedding_path}")
    avg_accuracy, avg_val_accuracy, best_test_accuracy, best_val_accuracy, avg_f1_score, avg_roc_auc = train(embedding_path, label_path)
    
    result_row = pd.DataFrame({
        'Combination': [name],
        'Avg_Test_Accuracy': [avg_accuracy],
        'Avg_Val_Accuracy': [avg_val_accuracy],
        'Best_Test_Accuracy': [best_test_accuracy],
        'Best_Val_Accuracy': [best_val_accuracy],
        'Avg_F1_Score' : [avg_f1_score], 
        'Avg_ROC_AUC': [avg_roc_auc],
    })
    
    results_df = pd.concat([results_df, result_row], ignore_index=True)

print("\nFinal Results DataFrame:")


Training for: AT2 - GSL - Attention Network - /kaggle/input/attention-embeddings-for-rsna/AT2_attention_embeddings_gsl.csv
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step
Fold 1: Test Accuracy = 0.2328, Validation Accuracy = 0.3526, F1 Score = 0.6685, ROC AUC = nan
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step
Fold 2: Test Accuracy = 0.2714, Validation Accuracy = 0.3071, F1 Score = 0.6835, ROC AUC = nan
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step
Fold 3: Test Accuracy = 0.3015, Validation Accuracy = 0.2141, F1 Score = 0.6907, ROC AUC = 0.8457
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step
Fold 4: Test Accuracy = 0.1591, Validation Accuracy = 0.2263, F1 Score = 0.7356, ROC AUC = nan
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step
Fold 5: Test Accuracy = 0.2097, Validation Accuracy = 0.2801, F1 Score = 0.6693, ROC AUC = 0.8487
Average Test Accuracy across 5

  results_df = pd.concat([results_df, result_row], ignore_index=True)


[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step
Fold 1: Test Accuracy = 0.3869, Validation Accuracy = 0.3480, F1 Score = 0.6552, ROC AUC = nan
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step
Fold 2: Test Accuracy = 0.3049, Validation Accuracy = 0.2818, F1 Score = 0.6749, ROC AUC = nan
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step
Fold 3: Test Accuracy = 0.2513, Validation Accuracy = 0.3458, F1 Score = 0.6932, ROC AUC = 0.8443
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step
Fold 4: Test Accuracy = 0.2328, Validation Accuracy = 0.3564, F1 Score = 0.7201, ROC AUC = nan
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step
Fold 5: Test Accuracy = 0.2164, Validation Accuracy = 0.1988, F1 Score = 0.6394, ROC AUC = 0.8105
Average Test Accuracy across 5 folds: 0.2785, Average Validation Accuracy across 5 folds: 0.3062, Average F1 Score across 5 folds: 0.6766, Average ROC AUC

In [5]:
results_df

Unnamed: 0,Combination,Avg_Test_Accuracy,Avg_Val_Accuracy,Best_Test_Accuracy,Best_Val_Accuracy,Avg_F1_Score,Avg_ROC_AUC
0,AT2 - GSL - Attention Network,0.234911,0.27604,0.301508,0.21407,0.689527,
1,AT2 - HIST - Attention Network,0.278464,0.306164,0.386935,0.34799,0.676578,
2,ST1 - GSL - Attention Network,0.294794,0.343626,0.377309,0.394723,0.356223,0.619086
3,ST1 - HIST - Attention Network,0.2821,0.368207,0.335092,0.320712,0.36248,0.627817
4,ST2 - GSL - Attention Network,0.303843,0.31628,0.437333,0.309467,0.341935,
5,ST2 - HIST - Attention Network,0.264901,0.290281,0.31117,0.273803,0.354975,
6,AT2 - GSL - Average ResNet50,0.470814,0.54464,0.576214,0.519095,0.412726,
7,AT2 - HIST - Average ResNet50,0.46582,0.538888,0.522613,0.536516,0.479706,
8,ST1 - GSL - Average ResNet50,0.465318,0.535535,0.598945,0.577441,0.277259,0.575243
9,ST1 - HIST - Average ResNet50,0.446298,0.52381,0.567282,0.53905,0.306173,0.600305


In [6]:
def random_predictions(path):
    labels = pd.read_csv(path)

    id_cols = labels[['study_id', 'series_id']]
    cols_to_impute = labels.drop(columns=['study_id', 'series_id'])
    imputed_cols = cols_to_impute.apply(lambda x: x.fillna(x.mode()[0]))
    final_df = pd.concat([id_cols, imputed_cols], axis=1)

    id_cols = final_df[['study_id', 'series_id']]
    cols_to_encode = final_df.drop(columns=['study_id', 'series_id'])
    encoder = OneHotEncoder(sparse_output=False)
    encoded_cols = encoder.fit_transform(cols_to_encode)
    encoded_df = pd.DataFrame(encoded_cols, columns=encoder.get_feature_names_out(cols_to_encode.columns))
    final_df = pd.concat([id_cols, encoded_df], axis=1)

    Y_true = final_df.drop(columns=['study_id', 'series_id']).values

    np.random.seed(42)  # For reproducibility
    random_predictions = np.random.rand(*Y_true.shape)  # Random floats in [0.0, 1.0]

    threshold = np.mean(random_predictions)

    binary_predictions = (random_predictions > threshold).astype(float)

    accuracy = np.mean(np.all(binary_predictions == Y_true, axis=1))

    print(f"Random Predictions Accuracy: {accuracy:.4f}")
    print(f"Self-adjusting Threshold: {threshold:.4f}")

## Random Accuracy

In [7]:
random_predictions('/kaggle/input/preprocessed-dataset/train_data_AT2.csv')
random_predictions('/kaggle/input/preprocessed-dataset/train_data_ST1.csv')
random_predictions('/kaggle/input/preprocessed-dataset/train_data_ST2.csv')

Random Predictions Accuracy: 0.0000
Self-adjusting Threshold: 0.4996
Random Predictions Accuracy: 0.0000
Self-adjusting Threshold: 0.4995
Random Predictions Accuracy: 0.0000
Self-adjusting Threshold: 0.5004
