In [None]:
# Mount Google Drive in the Colab runtime; the dataset paths used below live
# under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

import os
import glob
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, Model
from tensorflow.keras.models import Sequential
from sklearn.model_selection import train_test_split
import csv

Mounted at /content/drive


# Extract data

In [None]:
# -----------------------------------------------------------
# Loading Data
# -----------------------------------------------------------

# Paths (Google Drive must already be mounted at /content/drive)
data_path = '/content/drive/MyDrive/Colab_Notebooks/Projet_Owkin/'
train_features_dir = os.path.join(data_path, "train_input", "moco_features")
test_features_dir = os.path.join(data_path, "test_input", "moco_features")
train_output_path = os.path.join(data_path, "train_output.csv")

y_train = pd.read_csv(train_output_path)['Target'].values
npy_files_train = sorted(glob.glob(os.path.join(train_features_dir, '*.npy')))

# Fail early with an explicit message instead of an obscure reshape/broadcast
# error further down.
if not npy_files_train:
    raise FileNotFoundError(
        f"No .npy feature files found in {train_features_dir}")

# Bags and labels are paired purely by position (alphabetical file order vs.
# CSV row order), so at the very least their counts have to agree.
# NOTE(review): this assumes train_output.csv is sorted like the file names —
# confirm against the CSV's sample-id column.
if len(npy_files_train) != len(y_train):
    raise ValueError(
        f"Found {len(npy_files_train)} feature files but {len(y_train)} labels")

# One bag per file. The first three columns of each array are dropped and the
# remaining ones kept as features (presumably the leading columns are tile
# metadata — TODO confirm). np.stack raises if bags differ in shape.
X_train_bags = np.stack(
    [np.load(npy_file)[:, 3:] for npy_file in npy_files_train])

print(f"Forme des données d'entraînement (sacs) : {X_train_bags.shape}")
print(f"Forme des labels d'entraînement : {y_train.shape}")

# Features Normalization

# Per-feature statistics are computed over every tile of every bag.
# NOTE(review): they are computed before the train/validation split done in the
# training cell, so the validation bags contribute to them.
flat_features_train = X_train_bags.reshape(-1, X_train_bags.shape[-1])
mean_train = np.mean(flat_features_train, axis=0)
std_train = np.std(flat_features_train, axis=0)

# A constant feature has zero std; dividing by it would yield NaN/inf. Using 1
# instead leaves such a feature at 0 after centering.
std_train[std_train == 0] = 1.0

X_train_bags = (X_train_bags - mean_train) / std_train

# Training

In [None]:
# Encoder
def create_tile_encoder(input_shape=(2048,)):
    """Build the encoder that is applied independently to each tile.

    The flat feature vector is reshaped to a one-channel sequence, passed
    through a single strided 1D convolution (8 filters, kernel size 4,
    stride 8, ReLU) and flattened again.

    Args:
        input_shape: Shape of one tile's feature vector, as a 1-tuple.

    Returns:
        An uncompiled Keras ``Sequential`` model named ``'tile_encoder'``.
    """
    n_features = input_shape[0]
    encoder_layers = [
        layers.Input(shape=input_shape),
        # Conv1D needs a channel axis: (n_features,) -> (n_features, 1).
        layers.Reshape((n_features, 1)),
        layers.Conv1D(filters=8, kernel_size=4, strides=8, activation='relu'),
        layers.Flatten(),
    ]
    return Sequential(encoder_layers, name='tile_encoder')


def create_mil_model(n_tiles=1000, n_features=2048, dropout_rate=0.45):
    """Build and compile the full MIL model used to classify bags of tiles.

    Each tile of a bag goes through the shared tile encoder; the encoded tiles
    are max-pooled over the tile axis into one bag vector, which is passed
    through dropout and a single sigmoid unit.

    Args:
        n_tiles: Number of tiles per bag.
        n_features: Number of features per tile; also forwarded to the tile
            encoder so both input shapes always agree.
        dropout_rate: Dropout rate applied to the pooled bag vector.

    Returns:
        A Keras ``Model`` compiled with Adam, binary cross-entropy and the
        ``'AUC'`` metric.
    """
    tile_encoder = create_tile_encoder(input_shape=(n_features,))
    bag_input = layers.Input(shape=(n_tiles, n_features), name='bag_input')
    # TimeDistributed applies the same encoder (shared weights) to every tile.
    tile_features = layers.TimeDistributed(tile_encoder, name='tile_features_extractor')(bag_input)

    # Decoder
    # Max over the tile axis: one vector per bag.
    bag_features = layers.GlobalMaxPool1D(name='pooling_layer')(tile_features)

    dropout_features = layers.Dropout(dropout_rate, name='dropout_layer')(bag_features)

    output = layers.Dense(1, activation='sigmoid', name='output_layer')(dropout_features)
    mil_model = Model(inputs=bag_input, outputs=output, name='MIL_Model_m6Anet_style')

    # The metric string 'AUC' is what makes the 'val_AUC' key available to the
    # callbacks that monitor it.
    mil_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['AUC'])
    return mil_model

# Seed Python, NumPy and the Keras backend before building the model, so that
# weight initialisation, dropout and batch shuffling are repeatable. The same
# seed drives the train/validation split below. Without it, "epoch N" of one
# run is not comparable to "epoch N" of the next.
SEED = 42
keras.utils.set_random_seed(SEED)

mil_model = create_mil_model()
mil_model.summary()

# Split of the data (stratified 80/20 hold-out, reproducible via SEED)
X_train_val, X_test_val, y_train_val, y_test_val = train_test_split(
    X_train_bags, y_train, test_size=0.2, stratify=y_train, random_state=SEED)

print("\n-----------------------------------------------------------")
print("Début de l'entraînement du modèle MIL")

# One checkpoint file per epoch (save_best_only=False), so that any epoch can be
# reloaded by file name afterwards.
checkpoint_cb = keras.callbacks.ModelCheckpoint(
    'model_epoch_{epoch:02d}.keras',
    monitor='val_AUC',
    save_best_only=False,
    mode='max',
    verbose=1
)

# Stop after 15 epochs without val_AUC improvement and roll the in-memory model
# back to the best epoch's weights.
early_stopping_cb = keras.callbacks.EarlyStopping(
    patience=15,
    restore_best_weights=True,
    monitor='val_AUC',
    mode='max',
    verbose=1
)

history = mil_model.fit(
    X_train_val, y_train_val,
    epochs=100,
    batch_size=4,
    validation_data=(X_test_val, y_test_val),
    verbose=1,
    callbacks=[checkpoint_cb, early_stopping_cb]
)

# Inference on the val set
# evaluate() returns [loss, AUC] in the order given to compile().
loss, auc = mil_model.evaluate(X_test_val, y_test_val, verbose=0)
print(f"Score AUC sur le jeu de validation : {auc:.4f}")


-----------------------------------------------------------
Début de l'entraînement du modèle MIL
Epoch 1/100
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1s/step - AUC: 0.5572 - loss: 1.5258  
Epoch 1: saving model to model_epoch_01.keras
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m285s[0m 2s/step - AUC: 0.5570 - loss: 1.5278 - val_AUC: 0.5881 - val_loss: 1.2676
Epoch 2/100
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step - AUC: 0.5635 - loss: 1.4280
Epoch 2: saving model to model_epoch_02.keras
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 51ms/step - AUC: 0.5636 - loss: 1.4291 - val_AUC: 0.6382 - val_loss: 0.7411
Epoch 3/100
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step - AUC: 0.5542 - loss: 1.2654
Epoch 3: saving model to model_epoch_03.keras
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 52ms/step - AUC: 0.5547 - loss: 1.2653 - val_AUC: 0.6749 - val_loss: 0.962

In [None]:
# Reload the checkpoint judged to be the best trade-off (least overfitting).
# This replaces the in-memory model, including any weights restored by early
# stopping.
# NOTE(review): the epoch number is hand-picked from the training log; re-check
# it whenever training is re-run.
selected_checkpoint_path = 'model_epoch_08.keras'
mil_model = keras.models.load_model(selected_checkpoint_path)

# Inference

In [None]:
# Loading the data
npy_files_owkin_test = sorted(glob.glob(os.path.join(test_features_dir, '*.npy')))

# Fail early with an explicit message rather than on a broadcast error below.
if not npy_files_owkin_test:
    raise FileNotFoundError(
        f"No .npy feature files found in {test_features_dir}")

X_owkin_test_bags = []
owkin_test_file_names = []
for npy_file in npy_files_owkin_test:
    loaded_data = np.load(npy_file)
    # Same column selection as for the training bags: drop the first three
    # columns, keep the rest as features.
    features = loaded_data[:, 3:]
    X_owkin_test_bags.append(features)
    # Strip only the trailing extension (str.replace would also remove a
    # '.npy' occurring in the middle of a name).
    file_name = os.path.splitext(os.path.basename(npy_file))[0]
    owkin_test_file_names.append(file_name)

X_owkin_test_bags = np.array(X_owkin_test_bags)
print(f"Forme des données de test Owkin : {X_owkin_test_bags.shape}")

# Normalization
# Reuse the statistics computed on the training set.
X_owkin_test_bags = (X_owkin_test_bags - mean_train) / std_train

final_predictions = mil_model.predict(X_owkin_test_bags)

# One row per test bag: sample id (file name with extension) and predicted
# probability. predict() returns shape (n_bags, 1), hence the flatten().
final_csv_data = [
    [file_name + '.npy', prediction]
    for file_name, prediction in zip(owkin_test_file_names, final_predictions.flatten())
]

final_csv_data.sort(key=lambda row: row[0])

csv_file_path = 'y_test_mil_submission.csv'
with open(csv_file_path, 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['Sample ID', 'Target'])
    writer.writerows(final_csv_data)

# `drive` is imported in the first cell. This call and the print below used to
# be fused on a single line, which is a syntax error.
drive.mount('/content/drive')
print(f"\nFichier de soumission {csv_file_path} a été exporté")