In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.losses import BinaryCrossentropy
from sklearn.model_selection import KFold

In [2]:
def train(embedding_path, labels_path, loss_fn='binary_crossentropy', n_splits=5, n_features=512):
    """Cross-validate a small dense multi-label classifier on precomputed embeddings.

    The labels CSV is mode-imputed and one-hot encoded, inner-joined to the
    embeddings on ``study_id``, and a fresh 256-128-64 ReLU network with one
    sigmoid output per one-hot label column is trained for 20 epochs on every
    K-fold split. Per-fold and average accuracies are printed.

    Args:
        embedding_path: CSV with a ``study_id`` column; its first ``n_features``
            columns are used as the model input.
            (assumes those leading columns are the embedding vector — TODO confirm)
        labels_path: CSV with ``study_id``, ``series_id`` and the label columns.
        loss_fn: Loss passed to ``model.compile``.
        n_splits: Number of shuffled K-fold splits (``random_state=42``).
        n_features: Number of leading embedding columns fed to the network.

    Returns:
        Tuple ``(avg_accuracy, avg_val_accuracy, best_test_accuracy, best_val_accuracy)``:
        the mean final test-fold accuracy, the mean of the per-fold
        epoch-averaged validation accuracy, the highest final test-fold
        accuracy, and the epoch-averaged validation accuracy of that same fold.

    Raises:
        ValueError: If the join between embeddings and labels yields no rows.
    """
    embeddings = pd.read_csv(embedding_path)
    labels = pd.read_csv(labels_path)

    id_columns = ['study_id', 'series_id']
    id_cols = labels[id_columns]

    # Fill gaps in every label column with that column's most frequent value.
    imputed_cols = labels.drop(columns=id_columns).apply(lambda col: col.fillna(col.mode()[0]))

    # One-hot encode every label column; the encoder's feature names become
    # the target column names, so targets can be selected by name below.
    encoder = OneHotEncoder(sparse_output=False)
    encoded_cols = encoder.fit_transform(imputed_cols)
    label_columns = list(encoder.get_feature_names_out(imputed_cols.columns))
    encoded_df = pd.DataFrame(encoded_cols, columns=label_columns)
    final_df = pd.concat([id_cols, encoded_df], axis=1)

    # NOTE(review): the join key is study_id only, so a study with several rows
    # on either side is expanded to every pairing — confirm this is intended.
    df = pd.merge(embeddings, final_df, on='study_id', how='inner')
    if df.empty:
        raise ValueError(
            f"No rows left after joining {embedding_path} with {labels_path} on 'study_id'"
        )

    # Features stay positional (leading columns of the embeddings frame), but
    # targets are picked by name instead of a hard-coded column offset, so a
    # change in the number of id/metadata columns cannot shift the labels.
    X = df.iloc[:, :n_features].values
    Y = df[label_columns].values
    fc_layer = len(label_columns)
    input_dim = X.shape[1]

    # NOTE(review): splitting is row-wise; if one study_id spans several rows
    # they can land on both sides of a split — consider a grouped splitter.
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    fold_accuracies = []
    fold_val_accuracies = []
    best_test_accuracy = 0.0
    best_val_accuracy = 0.0

    for fold, (train_index, test_index) in enumerate(kf.split(X), 1):
        X_train, X_test = X[train_index], X[test_index]
        Y_train, Y_test = Y[train_index], Y[test_index]

        # Fit the scaler on the training fold only, then apply it to the held-out fold.
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        # A new, untrained model per fold.
        model = Sequential([
            Input(shape=(input_dim,)),
            Dense(256, activation='relu'),
            Dense(128, activation='relu'),
            Dense(64, activation='relu'),
            Dense(fc_layer, activation='sigmoid')
        ])

        # NOTE(review): with a multi-column sigmoid head, how Keras resolves the
        # string 'accuracy' (binary vs. categorical) may depend on the Keras
        # version — confirm it is the metric you intend to report.
        model.compile(optimizer='adam', loss=loss_fn, metrics=['accuracy'])

        # The held-out fold doubles as validation data, so "validation accuracy"
        # below is the accuracy on the test fold averaged over all epochs.
        history = model.fit(X_train, Y_train, epochs=20, batch_size=16, verbose=0, validation_data=(X_test, Y_test))

        loss, accuracy = model.evaluate(X_test, Y_test, verbose=0)
        fold_accuracies.append(accuracy)

        epoch_val_accuracies = history.history['val_accuracy']
        val_accuracy = sum(epoch_val_accuracies) / len(epoch_val_accuracies)
        fold_val_accuracies.append(val_accuracy)

        # "Best" is decided by test accuracy; the validation figure stored with
        # it belongs to that same fold (it is not the maximum validation value).
        if accuracy > best_test_accuracy:
            best_test_accuracy = accuracy
            best_val_accuracy = val_accuracy

        print(f'Fold {fold}: Test Accuracy = {accuracy:.4f}, Validation Accuracy = {val_accuracy:.4f}')

    avg_accuracy = np.mean(fold_accuracies)
    avg_val_accuracy = np.mean(fold_val_accuracies)
    print(f'Average Test Accuracy across {n_splits} folds: {avg_accuracy:.4f}, Average Validation Accuracy across {n_splits} folds: {avg_val_accuracy:.4f}')

    return avg_accuracy, avg_val_accuracy, best_test_accuracy, best_val_accuracy

In [3]:
# Embedding CSVs, one list per embedding source. Within each list the order is
# AT2/ST1/ST2, and for each of those the GSL (greyscale) file precedes the HIST file.
attention_embeddings_paths = [
    '/kaggle/input/attention-embeddings-for-rsna/AT2_attention_embeddings_gsl.csv',
    '/kaggle/input/attention-embeddings-for-rsna/AT2_attention_embeddings_hist.csv',
    '/kaggle/input/attention-embeddings-for-rsna/ST1_attention_embeddings_gsl.csv',
    '/kaggle/input/attention-embeddings-for-rsna/ST1_attention_embeddings_hist.csv',
    '/kaggle/input/attention-embeddings-for-rsna/ST2_attention_embeddings_gsl.csv',
    '/kaggle/input/attention-embeddings-for-rsna/ST2_attention_embeddings_hist.csv'
]

average_embeddings_paths = [
    '/kaggle/input/embeddings-for-rsna/at2-greyscl/final_embeddings.csv',
    '/kaggle/input/embeddings-for-rsna/at2-hist/final_embeddings.csv',
    '/kaggle/input/embeddings-for-rsna/st1-greyscl/final_embeddings.csv',
    '/kaggle/input/embeddings-for-rsna/st1-hist/final_embeddings.csv',
    '/kaggle/input/embeddings-for-rsna/st2-greyscl/final_embeddings.csv',
    '/kaggle/input/embeddings-for-rsna/st2-hist/final_embeddings.csv'
]

medicalnet_embeddings_paths = [
    '/kaggle/input/medicalnet-attention-layers-for-rsna/AT2_attention_embeddings_gsl.csv',
    '/kaggle/input/medicalnet-attention-layers-for-rsna/AT2_attention_embeddings_hist.csv',
    '/kaggle/input/medicalnet-attention-layers-for-rsna/ST1_attention_embeddings_gsl.csv',
    '/kaggle/input/medicalnet-attention-layers-for-rsna/ST1_attention_embeddings_hist.csv',
    '/kaggle/input/medicalnet-attention-layers-for-rsna/ST2_attention_embeddings_gsl.csv',
    '/kaggle/input/medicalnet-attention-layers-for-rsna/ST2_attention_embeddings_hist.csv',
]

# Label CSVs, in AT2/ST1/ST2 order (same order as DATASET_KEYS below).
labels_paths = [
    '/kaggle/input/preprocessed-dataset/train_data_AT2.csv',
    '/kaggle/input/preprocessed-dataset/train_data_ST1.csv',
    '/kaggle/input/preprocessed-dataset/train_data_ST2.csv'
]

# The three factors every run is a combination of. Their nesting order
# (network -> dataset -> condition) mirrors the order of the path lists above.
NETWORK_NAMES = ('Attention Network', 'Average ResNet50', 'MedicalNet Network')
DATASET_KEYS = ('AT2', 'ST1', 'ST2')
CONDITION_NAMES = ('GSL', 'HIST')

# Human-readable name of each run, e.g. 'AT2 - GSL - Attention Network'.
list_of_combination = [
    f'{dataset} - {condition} - {network}'
    for network in NETWORK_NAMES
    for dataset in DATASET_KEYS
    for condition in CONDITION_NAMES
]

all_embedding_paths = [
    *attention_embeddings_paths,
    *average_embeddings_paths,
    *medicalnet_embeddings_paths
]

# Labels CSV for each entry of all_embedding_paths: both conditions of a
# dataset share one labels file, and the pattern repeats for every network.
corresponding_labels_paths = [
    labels_path
    for _network in NETWORK_NAMES
    for labels_path in labels_paths
    for _condition in CONDITION_NAMES
]

# The three lists are consumed in lockstep with zip(), which would silently
# drop trailing entries if they ever got out of sync.
assert len(all_embedding_paths) == len(corresponding_labels_paths) == len(list_of_combination), \
    "embedding paths, label paths and combination names must line up one-to-one"

In [4]:
# Train and evaluate every (embeddings, labels) combination and collect one
# summary record per run. Records are accumulated in a plain list and turned
# into a DataFrame once at the end: concatenating onto an initially empty
# frame inside the loop is quadratic and has pandas building the frame from
# an empty, typeless starting point.
result_records = []

for embedding_path, label_path, name in zip(all_embedding_paths, corresponding_labels_paths, list_of_combination):
    print(f"\nTraining for: {name} - {embedding_path}")

    avg_accuracy, avg_val_accuracy, best_test_accuracy, best_val_accuracy = train(embedding_path, label_path)

    result_records.append({
        'Combination': name,
        'Avg_Test_Accuracy': avg_accuracy,
        'Avg_Val_Accuracy': avg_val_accuracy,
        'Best_Test_Accuracy': best_test_accuracy,
        'Best_Val_Accuracy': best_val_accuracy
    })

# Passing `columns` fixes the column order and yields a correctly shaped
# (empty) frame even when no combination was run.
results_df = pd.DataFrame(
    result_records,
    columns=['Combination', 'Avg_Test_Accuracy', 'Avg_Val_Accuracy', 'Best_Test_Accuracy', 'Best_Val_Accuracy']
)

print("\nFinal Results DataFrame:")
results_df


Training for: AT2 - GSL - Attention Network - /kaggle/input/attention-embeddings-for-rsna/AT2_attention_embeddings_gsl.csv


I0000 00:00:1728175569.253817      68 service.cc:145] XLA service 0x7d98d4004310 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1728175569.253860      68 service.cc:153]   StreamExecutor device (0): Tesla T4, Compute Capability 7.5
I0000 00:00:1728175569.253864      68 service.cc:153]   StreamExecutor device (1): Tesla T4, Compute Capability 7.5
I0000 00:00:1728175572.056809      68 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Fold 1: Test Accuracy = 0.6047, Validation Accuracy = 0.4476
Fold 2: Test Accuracy = 0.5762, Validation Accuracy = 0.4170
Fold 3: Test Accuracy = 0.6265, Validation Accuracy = 0.4410
Fold 4: Test Accuracy = 0.5846, Validation Accuracy = 0.4619
Fold 5: Test Accuracy = 0.5185, Validation Accuracy = 0.4899
Average Test Accuracy across 5 folds: 0.5821, Average Validation Accuracy across 5 folds: 0.4515

Training for: AT2 - HIST - Attention Network - /kaggle/input/attention-embeddings-for-rsna/AT2_attention_embeddings_hist.csv


  results_df = pd.concat([results_df, result_row], ignore_index=True)


Fold 1: Test Accuracy = 0.6131, Validation Accuracy = 0.4503
Fold 2: Test Accuracy = 0.5528, Validation Accuracy = 0.5000
Fold 3: Test Accuracy = 0.5846, Validation Accuracy = 0.4947
Fold 4: Test Accuracy = 0.6583, Validation Accuracy = 0.5235
Fold 5: Test Accuracy = 0.5302, Validation Accuracy = 0.3953
Average Test Accuracy across 5 folds: 0.5878, Average Validation Accuracy across 5 folds: 0.4728

Training for: ST1 - GSL - Attention Network - /kaggle/input/attention-embeddings-for-rsna/ST1_attention_embeddings_gsl.csv
Fold 1: Test Accuracy = 0.5066, Validation Accuracy = 0.4423
Fold 2: Test Accuracy = 0.5409, Validation Accuracy = 0.5120
Fold 3: Test Accuracy = 0.3694, Validation Accuracy = 0.4277
Fold 4: Test Accuracy = 0.5820, Validation Accuracy = 0.4718
Fold 5: Test Accuracy = 0.5159, Validation Accuracy = 0.3604
Average Test Accuracy across 5 folds: 0.5030, Average Validation Accuracy across 5 folds: 0.4429

Training for: ST1 - HIST - Attention Network - /kaggle/input/attention-

Unnamed: 0,Combination,Avg_Test_Accuracy,Avg_Val_Accuracy,Best_Test_Accuracy,Best_Val_Accuracy
0,AT2 - GSL - Attention Network,0.582083,0.451487,0.626466,0.441039
1,AT2 - HIST - Attention Network,0.587782,0.472779,0.658291,0.523534
2,ST1 - GSL - Attention Network,0.502954,0.442867,0.582011,0.471825
3,ST1 - HIST - Attention Network,0.498658,0.397725,0.620053,0.424538
4,ST2 - GSL - Attention Network,0.485687,0.45733,0.602667,0.449333
5,ST2 - HIST - Attention Network,0.547942,0.457101,0.609043,0.585638
6,AT2 - GSL - Average ResNet50,0.53383,0.578722,0.609715,0.604606
7,AT2 - HIST - Average ResNet50,0.477177,0.530458,0.616415,0.619095
8,ST1 - GSL - Average ResNet50,0.448525,0.520543,0.550265,0.439021
9,ST1 - HIST - Average ResNet50,0.471737,0.530865,0.522427,0.59934


In [5]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder

def random_predictions(path):
    """Print the exact-match accuracy of uniformly random predictions for a labels CSV.

    The label columns (everything except ``study_id`` and ``series_id``) are
    mode-imputed and one-hot encoded. Uniform random scores of the same shape
    are thresholded at their own mean, and a row counts as correct only when
    every one-hot column matches the truth.

    Args:
        path: Path to a labels CSV containing ``study_id`` and ``series_id`` columns.

    Returns:
        None. The accuracy and the threshold are printed.
    """
    labels = pd.read_csv(path)

    # Fill gaps in every label column with that column's most frequent value.
    label_cols = labels.drop(columns=['study_id', 'series_id'])
    imputed_cols = label_cols.apply(lambda col: col.fillna(col.mode()[0]))

    # The dense one-hot matrix is the ground truth; no need to round-trip it
    # through a DataFrame with the id columns attached.
    encoder = OneHotEncoder(sparse_output=False)
    Y_true = encoder.fit_transform(imputed_cols)

    # A private RandomState(42) produces the same draws as seeding the global
    # generator with 42, without resetting NumPy's global RNG state for
    # everything that runs after this call.
    rng = np.random.RandomState(42)
    random_scores = rng.rand(*Y_true.shape)  # Random floats in [0.0, 1.0)

    threshold = np.mean(random_scores)

    binary_predictions = (random_scores > threshold).astype(float)

    # Exact-match accuracy: all columns of a row must agree with the truth.
    # NOTE(review): this is a stricter criterion than a per-label accuracy —
    # confirm it is comparable to the metric reported by the trained models.
    accuracy = np.mean(np.all(binary_predictions == Y_true, axis=1))

    print(f"Random Predictions Accuracy: {accuracy:.4f}")
    print(f"Self-adjusting Threshold: {threshold:.4f}")

In [6]:
# Random-guess baseline for each of the three label sets, in AT2, ST1, ST2 order.
for baseline_labels_csv in (
    '/kaggle/input/preprocessed-dataset/train_data_AT2.csv',
    '/kaggle/input/preprocessed-dataset/train_data_ST1.csv',
    '/kaggle/input/preprocessed-dataset/train_data_ST2.csv',
):
    random_predictions(baseline_labels_csv)

Random Predictions Accuracy: 0.0000
Self-adjusting Threshold: 0.4996
Random Predictions Accuracy: 0.0000
Self-adjusting Threshold: 0.4995
Random Predictions Accuracy: 0.0000
Self-adjusting Threshold: 0.5004


In [7]:
# Summarise the per-combination results by network (rows) and preprocessing
# condition (columns), averaging over the AT2/ST1/ST2 label sets.
avg_metric_columns = ['Avg_Test_Accuracy', 'Avg_Val_Accuracy']
best_metric_columns = ['Best_Test_Accuracy', 'Best_Val_Accuracy']

# Combination names have the form '<dataset> - <condition> - <network>'.
combination_parts = results_df['Combination'].str.split(' - ')

# Build a new frame instead of aliasing results_df, so this cell does not
# add columns to results_df as a side effect. The metric columns are cast to
# float so the means below do not depend on how results_df was assembled.
df = results_df.assign(
    Network=combination_parts.str[2],
    Condition=combination_parts.str[1],
).astype({column: float for column in avg_metric_columns + best_metric_columns})

avg_accuracy_df = df.groupby(['Network', 'Condition']).agg({
    'Avg_Test_Accuracy': 'mean',
    'Avg_Val_Accuracy': 'mean'
}).reset_index()

best_accuracy_df = df.groupby(['Network', 'Condition']).agg({
    'Best_Test_Accuracy': 'mean',
    'Best_Val_Accuracy': 'mean'
}).reset_index()

# Pivoting with a list of value columns yields (metric, condition) column
# pairs, which are flattened into single labels right after.
pivot_avg_accuracy_df = avg_accuracy_df.pivot(index='Network', columns='Condition', values=avg_metric_columns)
pivot_avg_accuracy_df.columns = [f'Avg_{stat} | {cond}' for stat, cond in pivot_avg_accuracy_df.columns]

# Note: the values here are *means* of the per-combination best-fold
# accuracies; the 'Best_' label prefix is kept unchanged for compatibility.
pivot_best_accuracy_df = best_accuracy_df.pivot(index='Network', columns='Condition', values=best_metric_columns)
pivot_best_accuracy_df.columns = [f'Best_{stat} | {cond}' for stat, cond in pivot_best_accuracy_df.columns]

pivot_avg_accuracy_df = pivot_avg_accuracy_df.sort_index()
pivot_best_accuracy_df = pivot_best_accuracy_df.sort_index()

In [8]:
# Show the per-network / per-condition means of the average k-fold accuracies.
# The bare expression on the last line is what the notebook renders as the table.
print("Average of Average k-Fold Accuracies:")
pivot_avg_accuracy_df

Average of Average k-Fold Accuracies:


Unnamed: 0_level_0,Avg_Avg_Test_Accuracy | GSL,Avg_Avg_Test_Accuracy | HIST,Avg_Avg_Val_Accuracy | GSL,Avg_Avg_Val_Accuracy | HIST
Network,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Attention Network,0.523575,0.544794,0.450561,0.442535
Average ResNet50,0.499114,0.482791,0.52897,0.518649
MedicalNet Network,0.532181,0.448081,0.526492,0.516924


In [9]:
# Show the per-network / per-condition means of the best-fold accuracies.
# The bare expression on the last line is what the notebook renders as the table.
print("Average of Best-Fold Accuracies:")
pivot_best_accuracy_df

Average of Best-Fold Accuracies:


Unnamed: 0_level_0,Best_Best_Test_Accuracy | GSL,Best_Best_Test_Accuracy | HIST,Best_Best_Val_Accuracy | GSL,Best_Best_Val_Accuracy | HIST
Network,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Attention Network,0.603714,0.629129,0.454066,0.511237
Average ResNet50,0.611549,0.588503,0.542676,0.578056
MedicalNet Network,0.666824,0.680684,0.421487,0.526814
