In [1]:
# Directory where the per-fold split CSVs (fold_<n>_split.csv) are written and
# from which the training cell reads them back.
wsi_labels_path = '/media/yanis/LaCie/Final_execution_files/train/kfolds_csv/'
# Directory holding one '<slide_id>_features.pt' file per slide.
features_path = '/media/yanis/LaCie/Final_execution_files/train/features_final/'

In [None]:
import os
import pandas as pd
from sklearn.model_selection import StratifiedKFold

def generate_stratified_kfold_splits(directory_path,output_path, k=5):
    """Write one stratified train/val split CSV per fold.

    Every ``*.pt`` file in ``directory_path`` is treated as one slide. The
    slide id is the first two underscore-separated tokens of the file name,
    and the label is derived from the id: ``'normal'`` -> ``'normal_tissue'``,
    ``'tumor'`` -> ``'tumor_tissue'``.

    Args:
        directory_path: Directory containing the ``.pt`` feature files.
        output_path: Directory receiving ``fold_<n>_split.csv`` (n starts at 1).
            It is created if it does not exist.
        k: Number of folds.

    Raises:
        ValueError: If a slide id contains neither 'normal' nor 'tumor'.
    """
    # os.listdir returns entries in arbitrary order; sorting makes the seeded
    # StratifiedKFold below produce the same folds on every run and machine.
    filenames = sorted(f for f in os.listdir(directory_path) if f.endswith('.pt'))
    slide_ids = ['_'.join(filename.split('_')[:2]) for filename in filenames]

    labels = []
    for slide_id in slide_ids:
        if 'normal' in slide_id:
            labels.append('normal_tissue')
        elif 'tumor' in slide_id:
            labels.append('tumor_tissue')
        else:
            # Without this branch the previous slide's label would be reused
            # silently (or the name would be unbound on the first slide).
            raise ValueError(
                f"Cannot derive a label from slide id {slide_id!r}: "
                "expected it to contain 'normal' or 'tumor'"
            )

    df = pd.DataFrame({'slide_id': slide_ids, 'label': labels})

    os.makedirs(output_path, exist_ok=True)

    skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)

    for fold, (train_index, val_index) in enumerate(skf.split(df, df['label']), start=1):
        # .assign returns new frames, so no column is written onto a slice of
        # df (which is what triggers SettingWithCopyWarning).
        train_df = df.iloc[train_index].assign(split_membership='train')
        val_df = df.iloc[val_index].assign(split_membership='val')

        fold_df = pd.concat([train_df, val_df]).sort_values(by='slide_id')

        fold_csv_output_path = os.path.join(output_path, f'fold_{fold}_split.csv')
        fold_df.to_csv(fold_csv_output_path, index=False)

# The slide list is derived from the feature files, so the feature directory
# is the input directory of the split generator.
directory_path = features_path

# Writes one fold_<n>_split.csv per fold (k defaults to 5) into wsi_labels_path.
generate_stratified_kfold_splits(directory_path,wsi_labels_path)


In [3]:
import os

import pandas as pd
import torch
from torch.utils.data import Dataset

class WSI_dataset(Dataset):
    """Dataset yielding ``(features, label)`` for each slide in ``wsi_list``.

    Args:
        wsi_list: Indexable collection of records, each exposing the keys
            ``'slide_id'`` and ``'label'`` (``'normal_tissue'`` or
            ``'tumor_tissue'``).
        features_path: Directory containing ``<slide_id>_features.pt`` files.
            A trailing path separator is optional.
    """
    def __init__(self,wsi_list,features_path):
        self.wsi_list = wsi_list
        self.features_path = features_path
        # Integer class index used for each textual label.
        self.label_to_int = {'normal_tissue': 0, 'tumor_tissue': 1}

    def __len__(self):
        """Return the number of slides."""
        return len(self.wsi_list)

    def __getitem__(self, idx):
        """Load the stored features of slide ``idx`` and return them with its label.

        Returns:
            Tuple ``(features, label)`` where ``features`` is whatever object
            was saved in the slide's ``.pt`` file and ``label`` is a 0-dim
            ``torch.long`` tensor.
        """
        entry = self.wsi_list[idx]
        name = entry['slide_id']
        label = torch.tensor(self.label_to_int[entry['label']], dtype=torch.long)
        # os.path.join works with or without a trailing separator on
        # features_path, unlike plain string concatenation.
        feature_file = os.path.join(self.features_path, name + '_features.pt')
        # Always materialise on CPU: this runs inside DataLoader worker
        # processes, and the caller moves the batch to the target device.
        features = torch.load(feature_file, map_location='cpu')
        return (features,label)

In [4]:
%matplotlib inline
import matplotlib.pyplot as plt

def plot_combined_loss(train_loss_values, val_loss_values, epochs):
    """Show the training and validation loss curves on one shared figure.

    Args:
        train_loss_values: Training loss for each entry of ``epochs``.
        val_loss_values: Validation loss for each entry of ``epochs``.
        epochs: X-axis values, one per loss value.
    """
    _, ax = plt.subplots(figsize=(10, 6))

    # (values, legend entry, line colour) for each curve, drawn in this order.
    curves = (
        (train_loss_values, 'Train Loss', 'blue'),
        (val_loss_values, 'Validation Loss', 'red'),
    )
    for values, legend_name, colour in curves:
        ax.plot(epochs, values, label=legend_name, color=colour)

    ax.set_xlabel('Epochs')
    ax.set_ylabel('Loss')
    ax.set_title('Training and Validation Loss per Epoch')
    ax.legend()
    ax.grid(True)
    plt.show()

In [5]:
import pandas as pd

def load_all_csv_from_directory(directory_path):
    """Read every ``.csv`` file of a directory into a DataFrame.

    Args:
        directory_path: Directory to scan (not recursive).

    Returns:
        List of DataFrames ordered by file name, so that the position of a
        frame in the list is stable across runs. The order is lexicographic
        (``fold_10`` sorts before ``fold_2``).
    """
    # os.listdir gives no ordering guarantee; callers enumerate the result to
    # number folds, so the order has to be deterministic.
    csv_files = sorted(f for f in os.listdir(directory_path) if f.endswith('.csv'))
    return [pd.read_csv(os.path.join(directory_path, file)) for file in csv_files]

In [8]:
import pandas as pd
from torch.utils.data import DataLoader
from models.model_clam import CLAM_SB
import torch
import torch.nn as nn
import torch.optim as optim
from topk.svm import SmoothTop1SVM
from sklearn.metrics import roc_curve, auc

# Cross-validated training of CLAM_SB: one model is trained per fold CSV, the
# weights with the lowest validation loss are saved to ./CLAM_fold_<k>.pt, and
# that loss plus the ROC-AUC of the same epoch are recorded in model_perf.
folds_list = load_all_csv_from_directory(wsi_labels_path)
model_perf = []  # one [best validation loss, ROC-AUC at that epoch] pair per fold

# The device is the same for every fold, so resolve it once.
device=torch.device("cuda" if torch.cuda.is_available() else "cpu")

for k,wsi_dataframe in enumerate(folds_list):
    print('Starting fold',k)
    wsi_df = wsi_dataframe
    train_list = []
    val_list = []

    # Route every slide row to the train or validation list of this fold.
    for _, row in wsi_df.iterrows():
        membership = row['split_membership']
        if membership == 'train':
            train_list.append(row)
        elif membership == 'val':
            val_list.append(row)

    train_dataset,val_dataset = WSI_dataset(train_list,features_path),WSI_dataset(val_list,features_path)
    batch_size = 1
    shuffle = True
    num_workers = 4
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=shuffle, num_workers=num_workers)
    # The validation loss and AUC do not depend on sample order, so the
    # validation loader is not shuffled.
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers)

    loss_fn = nn.CrossEntropyLoss()
    instance_loss_fn = SmoothTop1SVM(n_classes = 2)
    # Only move the instance loss to the GPU when one is actually selected;
    # an unconditional .cuda() crashes on CPU-only machines.
    if device.type == 'cuda':
        instance_loss_fn = instance_loss_fn.cuda()

    model_dict = {"dropout": True, 'n_classes': 2,"size_arg": 'small','k_sample': 8}

    model = CLAM_SB(**model_dict, instance_loss_fn=instance_loss_fn)
    optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=0.0002, weight_decay=1e-5)

    model.relocate() # moves the model onto the device (GPU)

    bag_weight = 0.7  # weight of the bag-level loss; the instance loss gets 1 - bag_weight
    num_epochs = 50

    # Per-epoch loss history of this fold (usable with plot_combined_loss).
    train_loss_list = []
    val_loss_list = []
    # Reset the "best so far" trackers for every fold so that nothing leaks
    # from the previous fold and the summary below never hits an unbound name.
    best_loss = float('inf')
    best_epoch = None
    best_roc_auc = float('nan')
    early_stopping_counter = 0
    for epoch in range(num_epochs):
        train_loss = 0.
        val_loss = 0.

        model.train()
        for features, label in train_loader:
            features = features.to(device)
            label = label.to(device)

            logits, Y_prob, pred, _, instance_dict = model(features, label=label, instance_eval=True)
            loss = loss_fn(logits, label)
            instance_loss = instance_dict['instance_loss']
            total_loss = bag_weight * loss + (1-bag_weight) * instance_loss
            # Only the bag-level loss is tracked, which keeps it comparable
            # with the validation loss computed below.
            train_loss += loss.item()
            total_loss.backward()

            optimizer.step()
            optimizer.zero_grad()
        train_loss /= len(train_loader)
        train_loss_list.append(train_loss)

        model.eval()
        all_preds_prob = []
        all_labels = []
        with torch.no_grad():
            for features, label in val_loader:
                features = features.to(device)
                label = label.to(device)
                logits, Y_prob, Y_hat, _, instance_dict = model(features, label=label, instance_eval=False)
                loss = loss_fn(logits, label)
                val_loss += loss.item()
                # Store plain Python numbers so scikit-learn receives flat
                # numeric lists rather than lists of tensors.
                all_preds_prob.append(Y_prob[0][1].item())
                all_labels.append(label.item())

        val_loss /= len(val_loader)
        val_loss_list.append(val_loss)
        fpr, tpr, thresholds = roc_curve(all_labels, all_preds_prob)
        roc_auc = auc(fpr, tpr)

        if val_loss < best_loss:
            # New best epoch: checkpoint the weights and remember its metrics.
            model_path = './CLAM_fold_'+str(k)+'.pt'
            torch.save(model.state_dict(), model_path)
            best_roc_auc = roc_auc
            best_loss = val_loss
            best_epoch = epoch
            early_stopping_counter = 0
        else:
            early_stopping_counter+=1
            if(early_stopping_counter>5):
                print("Loss hasn't improved for more than 5 epochs - early stopping at epoch",epoch)
                break

    print('Model '+str(k)+' done, Best epoch : ',best_epoch,'\nLoss from best epoch:',best_loss,'\nROC-AUC from best epoch : ',best_roc_auc)
    model_perf.append([best_loss,best_roc_auc])
    # Uncomment to plot the loss curves of this fold:
    #plot_combined_loss(train_loss_list, val_loss_list, range(0,epoch+1))

Starting fold 0
Setting tau to 1.0
Loss hasn't improved for more than 5 epochs - early stopping at epoch 25
Model 0done, Best epoch :  19 
,Loss from best epoch: 0.4887163035571575 
ROC-AUC from best epoch :  0.83
Starting fold 1
Setting tau to 1.0
Loss hasn't improved for more than 5 epochs - early stopping at epoch 33
Model 1done, Best epoch :  27 
,Loss from best epoch: 0.2955414322670549 
ROC-AUC from best epoch :  0.9500000000000001
Starting fold 2
Setting tau to 1.0
Loss hasn't improved for more than 5 epochs - early stopping at epoch 22
Model 2done, Best epoch :  16 
,Loss from best epoch: 0.5497371047735214 
ROC-AUC from best epoch :  0.81
Starting fold 3
Setting tau to 1.0
Loss hasn't improved for more than 5 epochs - early stopping at epoch 26
Model 3done, Best epoch :  20 
,Loss from best epoch: 0.3736212524144273 
ROC-AUC from best epoch :  0.9444444444444444
Starting fold 4
Setting tau to 1.0
Loss hasn't improved for more than 5 epochs - early stopping at epoch 24
Model 4d

In [9]:
# Per-fold results collected by the training loop:
# [best validation loss, ROC-AUC at the epoch of that loss].
model_perf

[[0.4887163035571575, 0.83],
 [0.2955414322670549, 0.9500000000000001],
 [0.5497371047735214, 0.81],
 [0.3736212524144273, 0.9444444444444444],
 [0.4527982924329607, 0.9111111111111112]]

In [12]:
# Fold whose best validation loss is the lowest (first one wins on ties).
fold_losses = [perf[0] for perf in model_perf]
min_first_element_index = fold_losses.index(min(fold_losses))
print("Model with the smallest loss : model ", min_first_element_index)

# Average of the per-fold ROC-AUC values.
fold_aucs = [perf[1] for perf in model_perf]
mean_second_elements = sum(fold_aucs) / len(model_perf)
print("Mean ROC-AUC: {:.2f}".format(mean_second_elements))

Model with the smallest loss : model  1
Mean ROC-AUC: 0.89
