## 📙 Import libraries and modules

In [1]:
# Importing essential libraries
import gc
import os
import random
import warnings
import numpy as np
import pandas as pd
from IPython.display import display

# PyTorch for deep learning
import timm
import torch
import torch.nn as nn  
import torch.optim as optim
import torch.nn.functional as F
from torch.optim.lr_scheduler import CosineAnnealingLR

# torchvision for image processing and augmentation
import torchvision.transforms as transforms

# Suppress all warnings (category=Warning matches every warning class) to keep the output clean
warnings.filterwarnings('ignore', category=Warning)

# Reclaim memory no longer in use.
gc.collect()

46

## ⚙️ Configuration

In [2]:
# Configuration class containing hyperparameters and settings
class Config:
    """Run-wide hyperparameters and settings, read elsewhere as ``Config.<name>``."""
    seed = 42  # value handed to set_seed() right after this class is defined
    image_transform = transforms.Resize((512,512))  # applied to every spectrogram tensor in get_batch()
    batch_size = 16  # step size used when slicing train/test indices into batches
    num_epochs = 9  # epochs per fold; also used as T_max for CosineAnnealingLR
    num_folds = 5  # number of cross-validation folds in train_and_save_models()

# Set the seed for reproducibility across multiple libraries
def set_seed(seed):
    """Seed Python, NumPy and PyTorch RNGs and put cuDNN in deterministic mode.

    Args:
        seed: Integer seed applied to ``random``, ``numpy.random`` and
            ``torch`` (``torch.manual_seed`` covers CPU and CUDA generators).
    """
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner (benchmark=True) may select different convolution
    # algorithms from run to run, which undermines the deterministic flag
    # above, so it has to be switched off for reproducible results.
    torch.backends.cudnn.benchmark = False
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    
set_seed(Config.seed)

# Define the 'Kullback Leibler Divergence' loss function
def KL_loss(p,q):
    """Mean KL divergence between target probabilities and predicted logits.

    Args:
        p: Target probabilities; reduced over dim 1, so at least 2-D.
        q: Raw model outputs (logits) with the same layout as ``p``;
            a log-softmax over dim 1 is applied here.

    Returns:
        Scalar tensor: sum over dim 1 of ``p * (log p - log_softmax(q))``,
        averaged over the remaining dimensions.
    """
    tiny = 10**(-15)
    # Keep the targets strictly inside (0, 1) so log() never sees an exact 0.
    target = torch.clip(p, tiny, 1 - tiny)
    log_pred = nn.functional.log_softmax(q, dim=1)
    per_sample = (target * (torch.log(target) - log_pred)).sum(dim=1)
    return per_sample.mean()

# Reclaim memory no longer in use.
gc.collect()

0

## 📂 Data Loading

In [3]:
# Load training data
DATA_DIR = "/kaggle/input/hms-harmful-brain-activity-classification"
train_df = pd.read_csv(f"{DATA_DIR}/train.csv")

# Define labels for classification
labels = ['seizure', 'lpd', 'gpd', 'lrda', 'grda', 'other']
vote_cols = [f'{label}_vote' for label in labels]
vote_sum_cols = [f'{label}_vote_sum' for label in labels]

# Sum the votes of every label per 'spectrogram_id' in a single groupby pass
# (one row per spectrogram, sorted by id), then expose the id as a column again.
train_feats = (
    train_df.groupby('spectrogram_id')[vote_cols]
    .sum()
    .rename(columns=dict(zip(vote_cols, vote_sum_cols)))
    .reset_index()
)

# Calculate the total votes for each spectrogram
train_feats['total_vote'] = train_feats[vote_sum_cols].sum(axis=1)

# Normalize the summed votes of each label into a fraction of the total
for vote_col, sum_col in zip(vote_cols, vote_sum_cols):
    train_feats[vote_col] = train_feats[sum_col] / train_feats['total_vote']

# Add a column with the path to the spectrogram files
train_feats['path'] = train_feats['spectrogram_id'].apply(lambda x: f"{DATA_DIR}/train_spectrograms/{x}.parquet")

# Split the dataset based on the total votes (fewer than 10 vs. 10 or more)
train_feats_less10 = train_feats[train_feats['total_vote'] < 10].reset_index(drop=True)
train_feats_more10 = train_feats[train_feats['total_vote'] >= 10].reset_index(drop=True)

# From here on, train on train_feats_less10 / train_feats_more10 rather than on train_df directly.

## 🎰 Data Preprocessing

In [4]:
def get_batch(paths, batch_size=Config.batch_size):
    """Load spectrogram parquet files and stack them into one tensor.

    Args:
        paths: Iterable of rows whose first element is a parquet file path
            (the callers pass ``train_feats[['path']].iloc[...].values``).
        batch_size: Not used inside the function; kept so existing calls
            that pass it keep working.

    Returns:
        A tensor stacking one transformed spectrogram per entry of ``paths``,
        each produced by ``Config.image_transform`` on a tensor with a
        leading singleton dimension.
    """
    # Small constant that keeps the normalisation from dividing by zero.
    eps = 1e-6
    lower, upper = np.exp(-6), np.exp(10)

    def _load_one(row):
        frame = pd.read_parquet(row[0])
        # Missing values become -1, the first column is dropped, and the
        # matrix is transposed.
        spec = frame.fillna(-1).values[:, 1:].T
        # Clip into [e^-6, e^10] before the log so the -1 fill (and any other
        # value below the floor) maps to -6 instead of NaN/-inf.
        spec = np.log(np.clip(spec, lower, upper))
        # Standardise with the mean/std taken over both axes of this file.
        spec = (spec - spec.mean(axis=(0, 1))) / (spec.std(axis=(0, 1)) + eps)
        # Add a leading singleton dimension, then resize via the configured transform.
        return Config.image_transform(torch.unsqueeze(torch.Tensor(spec), dim=0))

    return torch.stack([_load_one(row) for row in paths])

## 🤖 Model Training

In [5]:
def train_and_save_models(train_feats, folder_name):
    """Train one ResNet34d per cross-validation fold and keep each fold's best weights.

    For every fold the shuffled row positions are split into a held-out slice
    and the remaining training rows, a fresh pretrained ``resnet34d`` is
    trained for ``Config.num_epochs`` epochs, and the state dict with the
    lowest held-out loss is written to ``<folder_name>/resnet34d_fold<k>.pth``.

    Args:
        train_feats: DataFrame with a ``path`` column and the six
            ``<label>_vote`` target columns; rows are addressed positionally.
        folder_name: Output directory for the checkpoints (created if missing).

    Returns:
        None. Progress and losses are printed.

    Note:
        Epoch losses are the plain mean of per-batch mean losses, so a smaller
        final batch carries the same weight as a full one.
    """
    # Ensure the folder exists
    os.makedirs(folder_name, exist_ok=True)

    # Determine device availability
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print(f"Using device: {device}")

    label_cols = ['seizure_vote', 'lpd_vote', 'gpd_vote', 'lrda_vote', 'grda_vote', 'other_vote']

    n_samples = len(train_feats)
    total_idx = np.arange(n_samples)
    np.random.shuffle(total_idx)

    gc.collect()

    # Cross-validation loop
    for fold in range(Config.num_folds):
        print(f"Starting training for fold {fold + 1}")
        # The held-out fold is a contiguous slice of the shuffled positions;
        # everything outside the slice (in the same shuffled order) is training
        # data. Slicing avoids a per-element membership scan of test_idx.
        start = fold * n_samples // Config.num_folds
        stop = (fold + 1) * n_samples // Config.num_folds
        test_idx = total_idx[start:stop]
        train_idx = np.concatenate([total_idx[:start], total_idx[stop:]])

        # Initialize ResNet34d model with pretrained weights
        model = timm.create_model('resnet34d', pretrained=True, num_classes=6, in_chans=1)
        model.to(device)

        optimizer = optim.AdamW(model.parameters(), lr=0.001, betas=(0.5, 0.999), weight_decay=0.01)
        scheduler = CosineAnnealingLR(optimizer, T_max=Config.num_epochs)

        best_test_loss = float('inf')

        # Training loop
        for epoch in range(Config.num_epochs):
            model.train()
            train_loss = []
            # Reshuffle the training order at the start of every epoch
            train_idx = train_idx[np.random.permutation(len(train_idx))]

            # Iterate over batches in the training set
            for idx in range(0, len(train_idx), Config.batch_size):
                optimizer.zero_grad()
                loss = _batch_loss(model, train_feats, train_idx[idx:idx + Config.batch_size], device, label_cols)
                loss.backward()
                optimizer.step()
                train_loss.append(loss.item())

            epoch_train_loss = np.mean(train_loss)
            print(f"Epoch {epoch + 1}: Train Loss = {epoch_train_loss:.2f}")

            scheduler.step()

            # Evaluation loop
            model.eval()
            test_loss = []
            with torch.no_grad():
                for idx in range(0, len(test_idx), Config.batch_size):
                    loss = _batch_loss(model, train_feats, test_idx[idx:idx + Config.batch_size], device, label_cols)
                    test_loss.append(loss.item())

            epoch_test_loss = np.mean(test_loss)
            print(f"Epoch {epoch + 1}: Test Loss = {epoch_test_loss:.2f}")

            # Save the model if it has the best test loss so far
            if epoch_test_loss < best_test_loss:
                best_test_loss = epoch_test_loss
                torch.save(model.state_dict(), os.path.join(folder_name, f"resnet34d_fold{fold}.pth"))

            gc.collect()

        print(f"Fold {fold + 1} Best Test Loss: {best_test_loss:.2f}")

        # Release this fold's model and optimizer state before the next fold
        # allocates a new model, so memory does not pile up across folds.
        del model, optimizer, scheduler
        gc.collect()
        if device == 'cuda':
            torch.cuda.empty_cache()


def _batch_loss(model, feats, batch_idx, device, label_cols):
    """Load the rows at positions ``batch_idx`` and return the model's KL loss on them."""
    batch_paths = feats[['path']].iloc[batch_idx].values
    inputs = get_batch(batch_paths, batch_size=Config.batch_size)
    targets = torch.tensor(feats[label_cols].iloc[batch_idx].values, dtype=torch.float32)
    preds = model(inputs.to(device))
    return KL_loss(targets.to(device), preds)

In [6]:
# Train and save one set of fold models per vote-count subset:
# first the rows with fewer than 10 total votes, then those with 10 or more.
for subset_feats, out_dir in (
    (train_feats_less10, 'models_less10'),
    (train_feats_more10, 'models_more10'),
):
    train_and_save_models(subset_feats, out_dir)

Using device: cuda
Starting training for fold 1


model.safetensors:   0%|          | 0.00/87.4M [00:00<?, ?B/s]

Epoch 1: Train Loss = 1.05
Epoch 1: Test Loss = 0.96
Epoch 2: Train Loss = 0.88
Epoch 2: Test Loss = 0.75
Epoch 3: Train Loss = 0.77
Epoch 3: Test Loss = 0.77
Epoch 4: Train Loss = 0.70
Epoch 4: Test Loss = 0.73
Epoch 5: Train Loss = 0.59
Epoch 5: Test Loss = 0.81
Epoch 6: Train Loss = 0.49
Epoch 6: Test Loss = 0.72
Epoch 7: Train Loss = 0.37
Epoch 7: Test Loss = 0.79
Epoch 8: Train Loss = 0.24
Epoch 8: Test Loss = 0.72
Epoch 9: Train Loss = 0.16
Epoch 9: Test Loss = 0.69
Fold 1 Best Test Loss: 0.69
Starting training for fold 2
Epoch 1: Train Loss = 1.08
Epoch 1: Test Loss = 1.15
Epoch 2: Train Loss = 0.86
Epoch 2: Test Loss = 0.87
Epoch 3: Train Loss = 0.76
Epoch 3: Test Loss = 0.74
Epoch 4: Train Loss = 0.69
Epoch 4: Test Loss = 0.78
Epoch 5: Train Loss = 0.61
Epoch 5: Test Loss = 0.78
Epoch 6: Train Loss = 0.52
Epoch 6: Test Loss = 0.69
Epoch 7: Train Loss = 0.41
Epoch 7: Test Loss = 0.69
Epoch 8: Train Loss = 0.28
Epoch 8: Test Loss = 0.70
Epoch 9: Train Loss = 0.20
Epoch 9: Test L