<a href="https://colab.research.google.com/github/bhargav23/AI/blob/master/HMB_HSV4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# IMPORTANT: SOME KAGGLE DATA SOURCES ARE PRIVATE
# RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES.
# `kagglehub.login()` is interactive (it renders a credential widget), so this
# cell needs user input before the download cell that follows can run.
import kagglehub
kagglehub.login()

VBox(children=(HTML(value='<center> <img\nsrc=https://www.kaggle.com/static/images/site-logo.png\nalt=\'Kaggle…

Kaggle credentials set.
Kaggle credentials successfully validated.


In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

# Local path of the competition files; later assigned to BASE_PATH and used to
# locate train.csv / test.csv and the spectrogram parquet files.
hms_harmful_brain_activity_classification_path = kagglehub.competition_download('hms-harmful-brain-activity-classification')
# NOTE(review): this EfficientNetV2-B2 download path is not referenced by any
# later cell shown in this notebook (CFG.preset selects the B0 preset by name).
keras_efficientnetv2_keras_efficientnetv2_b2_imagenet_2_path = kagglehub.model_download('keras/efficientnetv2/Keras/efficientnetv2_b2_imagenet/2')

print('Data source import complete.')


Data source import complete.


In [None]:
!pip install keras-cv
import os
os.environ["KERAS_BACKEND"] = "jax"  # You can also use tensorflow or torch


In [None]:
# -*- coding: utf-8 -*-
"""HMS - Harmful Brain Activity Classification"""

# Install required libraries
# NOTE(review): versions are not pinned, so results may drift between runs.
!pip install keras-cv tensorflow joblib tqdm

# Import necessary libraries
# The environment variables below are assigned before `keras` / `tensorflow` are
# imported further down in this cell, and supersede any KERAS_BACKEND value set
# by an earlier cell.
import os
os.environ["KERAS_BACKEND"] = "tensorflow"  # Use TensorFlow backend
os.environ["TF_FORCE_GPU_ALLOW_GROWTH"] = "true"  # Enable GPU memory growth

import keras_cv
import keras
from keras import ops
import tensorflow as tf
import pandas as pd
import numpy as np
from glob import glob
from tqdm.notebook import tqdm
import joblib
import matplotlib.pyplot as plt

# Configuration
class CFG:
    """Central place for the knobs used throughout the notebook."""

    # --- run control ---
    verbose = 1  # Keras fit verbosity
    seed = 42  # seed handed to keras.utils.set_random_seed and dataset shuffling

    # --- model / input ---
    preset = "efficientnetv2_b0_imagenet"  # KerasCV preset name (EfficientNetV2-B0)
    image_size = [80, 60]  # nominal input size; only used as a default argument value

    # --- training schedule ---
    epochs_stage1 = 3  # epochs for the first (high-confidence) stage
    epochs_stage2 = 5  # epochs for the second (full training split) stage
    batch_size = 2  # small batch to save memory
    lr_mode = "cos"  # learning-rate schedule shape: 'exp', 'step' or 'cos'
    drop_remainder = True  # flag for dropping incomplete batches

    # --- labels / validation ---
    num_classes = 6  # number of target classes
    fold = 0  # fold index held out as validation data
    class_names = ['Seizure', 'LPD', 'GPD', 'LRDA', 'GRDA', 'Other']
    # Integer id -> class name, in the order of `class_names`
    label2name = {idx: name for idx, name in enumerate(class_names)}
    # Class name -> integer id (inverse of `label2name`)
    name2label = {name: idx for idx, name in enumerate(class_names)}

# Set random seed for reproducibility (uses CFG.seed; the same value is also
# passed to the tf.data shuffle in build_dataset).
keras.utils.set_random_seed(CFG.seed)





In [None]:
# Dataset paths: raw competition download plus a scratch area for converted spectrograms
BASE_PATH = hms_harmful_brain_activity_classification_path
SPEC_DIR = "/tmp/dataset/hms-hbac"
for split_name in ("train", "test"):
    os.makedirs(f"{SPEC_DIR}/{split_name}_spectrograms", exist_ok=True)

# Training metadata: attach source/converted file paths and the class columns
df = pd.read_csv(f"{BASE_PATH}/train.csv")
train_eeg_names = df["eeg_id"].astype(str)
train_spec_names = df["spectrogram_id"].astype(str)
df["eeg_path"] = f"{BASE_PATH}/train_eegs/" + train_eeg_names + ".parquet"
df["spec_path"] = f"{BASE_PATH}/train_spectrograms/" + train_spec_names + ".parquet"
df["spec2_path"] = f"{SPEC_DIR}/train_spectrograms/" + train_spec_names + ".npy"
df["class_name"] = df["expert_consensus"].copy()
df["class_label"] = df["expert_consensus"].map(CFG.name2label)

# Test metadata: same path columns, no labels
test_df = pd.read_csv(f"{BASE_PATH}/test.csv")
test_eeg_names = test_df["eeg_id"].astype(str)
test_spec_names = test_df["spectrogram_id"].astype(str)
test_df["eeg_path"] = f"{BASE_PATH}/test_eegs/" + test_eeg_names + ".parquet"
test_df["spec_path"] = f"{BASE_PATH}/test_spectrograms/" + test_spec_names + ".parquet"
test_df["spec2_path"] = f"{SPEC_DIR}/test_spectrograms/" + test_spec_names + ".npy"

# Convert `.parquet` to `.npy`
def process_spec(spec_id, split="train"):
    """Convert one spectrogram parquet file into a float32 `.npy` file.

    Reads `{BASE_PATH}/{split}_spectrograms/{spec_id}.parquet`, replaces NaN with 0,
    drops the first column, transposes the remaining values, and writes the result
    to `{SPEC_DIR}/{split}_spectrograms/{spec_id}.npy`.

    Args:
        spec_id: Spectrogram identifier used to build both file names.
        split: Sub-directory prefix, "train" or "test".

    Returns:
        None. The only effect is the file written to disk.
    """
    spec_path = f"{BASE_PATH}/{split}_spectrograms/{spec_id}.parquet"
    spec = pd.read_parquet(spec_path)
    # Fill NaN values with 0, transpose for (Time, Freq) -> (Freq, Time)
    spec = spec.fillna(0).values[:, 1:].T
    # The decoder reads the raw bytes of this file and reshapes them row-major,
    # without going through np.load. np.save writes Fortran-ordered bytes for an
    # F-contiguous array, so force a C-contiguous float32 array instead of relying
    # on whatever layout pandas happened to return.
    spec = np.ascontiguousarray(spec, dtype="float32")
    np.save(f"{SPEC_DIR}/{split}_spectrograms/{spec_id}.npy", spec)

# Convert each distinct spectrogram once per split, spreading the work over all
# available cores with joblib's loky backend.
spec_ids = df["spectrogram_id"].unique()
test_spec_ids = test_df["spectrogram_id"].unique()

for split_name, split_ids in (("train", spec_ids), ("test", test_spec_ids)):
    convert_jobs = (
        joblib.delayed(process_spec)(spec_id, split_name)
        for spec_id in tqdm(split_ids, total=len(split_ids))
    )
    _ = joblib.Parallel(n_jobs=-1, backend="loky")(convert_jobs)



  0%|          | 0/11138 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
# DataLoader
def build_augmenter(dim=CFG.image_size):
    """Create the batch-level augmentation function used by `build_dataset`.

    Three KerasCV layers are instantiated once: MixUp, a full-height / narrow-width
    cutout (labelled "freq-masking" in the original code) and a narrow-height /
    full-width cutout (labelled "time-masking"). The returned function applies each
    layer independently whenever a fresh uniform random draw is below 0.5.

    Args:
        dim: Accepted for interface compatibility; not used by the body.

    Returns:
        A function `augment(img, label) -> (img, label)`.
    """
    mixup = keras_cv.layers.MixUp(alpha=2.0)
    tall_cutout = keras_cv.layers.RandomCutout(height_factor=(1.0, 1.0), width_factor=(0.06, 0.1))
    wide_cutout = keras_cv.layers.RandomCutout(height_factor=(0.06, 0.1), width_factor=(1.0, 1.0))
    pipeline = (mixup, tall_cutout, wide_cutout)

    def augment(img, label):
        # KerasCV layers take and return a dict with "images" / "labels" keys
        batch = {"images": img, "labels": label}
        for layer in pipeline:
            apply_layer = tf.random.uniform([]) < 0.5
            if apply_layer:
                batch = layer(batch, training=True)
        return batch["images"], batch["labels"]

    return augment


def build_decoder(with_labels=True, target_size=CFG.image_size, npy_header_bytes=128):
    """Build the tf.data decode function for the converted `.npy` spectrograms.

    The files are read as raw bytes and reinterpreted as float32, so the `.npy`
    header has to be skipped by hand. The header written by `np.save` for these
    small 2-D arrays is 128 bytes, i.e. 32 float32 elements. The previous code
    skipped 1024 elements (4096 bytes), which discarded 992 real samples and
    shifted every row of the reshaped spectrogram.

    Args:
        with_labels: If True, return a function taking `(path, offset, label)` and
            yielding `(signal, one_hot_label)`; otherwise return the signal-only
            decoder taking `(path, offset)`.
        target_size: Accepted for interface compatibility; not used by the body.
        npy_header_bytes: Size in bytes of the `.npy` header to skip. Expected to be
            a multiple of 4 because the file is decoded as float32.

    Returns:
        The decode function described above. When `offset` is given, the decoded
        signal has shape [400, 300, 3]; without an offset the second dimension is
        whatever the file contains.
    """
    # Number of float32 elements taken up by the header
    header_floats = npy_header_bytes // 4

    def decode_signal(path, offset=None):
        file_bytes = tf.io.read_file(path)
        sig = tf.io.decode_raw(file_bytes, tf.float32)
        sig = sig[header_floats:]  # Remove the .npy header

        # Defensive: zero-pad so the length is a multiple of 400 before reshaping.
        # With the correct header skip this is expected to add nothing.
        current_length = tf.shape(sig)[0]
        padding_needed = tf.math.floormod(-current_length, 400)
        sig = tf.pad(sig, [[0, padding_needed]])

        sig = tf.reshape(sig, [400, -1])

        if offset is not None:
            offset = offset // 2  # Only odd values are given
            sig = sig[:, offset:offset + 300]

            # Zero-pad short windows so every sample has exactly 300 columns
            pad_size = tf.math.maximum(0, 300 - tf.shape(sig)[1])
            sig = tf.pad(sig, [[0, 0], [0, pad_size]])
            sig = tf.reshape(sig, [400, 300])

        # Log-scale, clipping first so the logarithm never sees 0
        sig = tf.clip_by_value(sig, tf.math.exp(-4.0), tf.math.exp(8.0))
        sig = tf.math.log(sig)

        # Standardise over the whole sample
        sig -= tf.math.reduce_mean(sig)
        sig /= tf.math.reduce_std(sig) + 1e-6

        sig = tf.tile(sig[..., None], [1, 1, 3])  # Mono channel to 3 channels
        return sig

    def decode_label(label):
        # Integer class id -> float32 one-hot vector of length CFG.num_classes
        label = tf.one_hot(label, CFG.num_classes)
        label = tf.cast(label, tf.float32)
        label = tf.reshape(label, [CFG.num_classes])
        return label

    def decode_with_labels(path, offset=None, label=None):
        sig = decode_signal(path, offset)
        label = decode_label(label)
        return (sig, label)

    return decode_with_labels if with_labels else decode_signal


def build_dataset(paths, offsets=None, labels=None, batch_size=CFG.batch_size, cache=False,
                  decode_fn=None, augment_fn=None, augment=False, repeat=True, shuffle=1024,
                  cache_dir="", drop_remainder=False):
    """Assemble the tf.data pipeline: decode -> cache -> repeat -> shuffle -> batch -> augment.

    Args:
        paths: Paths of the `.npy` spectrogram files.
        offsets: Per-sample offsets passed to the decoder as its second argument,
            or None.
        labels: Integer class labels, or None for an unlabeled dataset.
        batch_size: Number of samples per batch.
        cache: Whether to cache decoded samples (in `cache_dir` if it is non-empty).
        decode_fn: Custom decode function; defaults to `build_decoder(labels is not None)`.
        augment_fn: Custom batch augmentation; defaults to `build_augmenter()` and is
            only built when `augment` is True.
        augment: Whether to apply `augment_fn` to each batch.
        repeat: Whether to repeat the dataset indefinitely.
        shuffle: Shuffle buffer size. A falsy value disables shuffling; `True` means
            "shuffle with the default buffer of 1024".
        cache_dir: Directory for the on-disk cache; created if needed when caching.
        drop_remainder: Whether to drop the last incomplete batch.

    Returns:
        A prefetched `tf.data.Dataset` of batches.
    """
    if cache_dir != "" and cache is True:
        os.makedirs(cache_dir, exist_ok=True)

    if decode_fn is None:
        decode_fn = build_decoder(labels is not None)

    # Only instantiate the augmentation layers when they will actually be used
    if augment and augment_fn is None:
        augment_fn = build_augmenter()

    AUTO = tf.data.AUTOTUNE
    slices = (paths, offsets) if labels is None else (paths, offsets, labels)

    ds = tf.data.Dataset.from_tensor_slices(slices)
    ds = ds.map(decode_fn, num_parallel_calls=AUTO)
    ds = ds.cache(cache_dir) if cache else ds
    ds = ds.repeat() if repeat else ds
    if shuffle:
        # Callers pass `shuffle=True`. As a buffer size that is the integer 1, which
        # makes Dataset.shuffle a no-op, so map the boolean to the default buffer.
        buffer_size = 1024 if isinstance(shuffle, bool) else int(shuffle)
        ds = ds.shuffle(buffer_size, seed=CFG.seed)
        opt = tf.data.Options()
        opt.deterministic = False  # allow out-of-order elements for throughput
        ds = ds.with_options(opt)
    ds = ds.batch(batch_size, drop_remainder=drop_remainder)
    ds = ds.map(augment_fn, num_parallel_calls=AUTO) if augment else ds
    ds = ds.prefetch(AUTO)
    return ds



In [None]:
# Learning Rate Scheduler
import math

def get_lr_callback(batch_size=CFG.batch_size, mode=CFG.lr_mode, epochs=CFG.epochs_stage1, plot=False):
    """Create a per-epoch learning-rate scheduler callback.

    The schedule has a linear phase from `lr_start` to `lr_max` over the first
    `lr_ramp_ep` epochs, an optional plateau of `lr_sus_ep` epochs, and then a decay
    whose shape is chosen by `mode`.

    NOTE(review): `lr_max` is `6e-6 * batch_size`. With the notebook's batch size of
    2 that is 1.2e-5, which is below `lr_start` (5e-5), so the "ramp" phase actually
    decreases the rate. Confirm this is intended.

    Args:
        batch_size: Used only to scale `lr_max`.
        mode: Decay shape after the ramp/plateau: 'exp', 'step' or 'cos'.
        epochs: Total number of epochs the schedule is designed for (sets the length
            of the cosine decay and the x-range of the plot).
        plot: If True, plot the schedule over `epochs` epochs.

    Returns:
        A `keras.callbacks.LearningRateScheduler`.

    Raises:
        ValueError: From the schedule function, when the decay phase is reached with
            an unsupported `mode`. Previously this surfaced as an UnboundLocalError.
    """
    lr_start, lr_max, lr_min = 5e-5, 6e-6 * batch_size, 1e-5
    lr_ramp_ep, lr_sus_ep, lr_decay = 3, 0, 0.75

    def lrfn(epoch):  # Learning rate update function
        if epoch < lr_ramp_ep:
            lr = (lr_max - lr_start) / lr_ramp_ep * epoch + lr_start
        elif epoch < lr_ramp_ep + lr_sus_ep:
            lr = lr_max
        elif mode == 'exp':
            lr = (lr_max - lr_min) * lr_decay**(epoch - lr_ramp_ep - lr_sus_ep) + lr_min
        elif mode == 'step':
            lr = lr_max * lr_decay**((epoch - lr_ramp_ep - lr_sus_ep) // 2)
        elif mode == 'cos':
            # Half-cosine from lr_max towards lr_min; the "+ 3" stretches the period
            # so the final epoch does not land exactly on lr_min.
            decay_total_epochs, decay_epoch_index = epochs - lr_ramp_ep - lr_sus_ep + 3, epoch - lr_ramp_ep - lr_sus_ep
            phase = math.pi * decay_epoch_index / decay_total_epochs
            lr = (lr_max - lr_min) * 0.5 * (1 + math.cos(phase)) + lr_min
        else:
            raise ValueError(f"Unsupported lr mode {mode!r}; expected 'exp', 'step' or 'cos'.")
        return lr

    if plot:  # Plot lr curve if plot is True
        plt.figure(figsize=(10, 5))
        plt.plot(np.arange(epochs), [lrfn(epoch) for epoch in np.arange(epochs)], marker='o')
        plt.xlabel('epoch')
        plt.ylabel('lr')
        plt.title('LR Scheduler')
        plt.show()

    return keras.callbacks.LearningRateScheduler(lrfn, verbose=False)



In [None]:
# Data Split
from sklearn.model_selection import StratifiedGroupKFold

# Stratify by class label while keeping every patient inside a single fold
sgkf = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=CFG.seed)

df["fold"] = -1
df.reset_index(drop=True, inplace=True)
for fold, (train_idx, valid_idx) in enumerate(
        sgkf.split(df, y=df["class_label"], groups=df["patient_id"])
):
    df.loc[valid_idx, "fold"] = fold

# Rows per (fold, class). Kept in a variable because a bare expression in the
# middle of a cell is neither displayed nor stored.
fold_class_counts = df.groupby(["fold", "class_name"])[["eeg_id"]].count().T

# Build Train & Valid Dataset (first row of each spectrogram only)
sample_df = df.groupby("spectrogram_id").head(1).reset_index(drop=True)
train_df = sample_df[sample_df.fold != CFG.fold]
valid_df = sample_df[sample_df.fold == CFG.fold]

# Split the training rows by total number of votes. The total is computed once,
# vectorised, instead of a row-wise Python `apply` evaluated twice.
# Assumes the vote columns contain no NaN — TODO confirm for other data.
vote_cols = ['seizure_vote', 'lpd_vote', 'gpd_vote', 'lrda_vote', 'grda_vote', 'other_vote']
total_votes = train_df[vote_cols].sum(axis=1)

high_confidence_df = train_df[total_votes >= 10]

low_confidence_df = train_df[total_votes < 10]



In [None]:
# Import necessary libraries for metrics calculation
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Evaluate the model on the validation dataset
def evaluate_model(model, valid_ds):
    """Print accuracy and weighted F1 / precision / recall on a labeled dataset.

    Predictions and one-hot labels are collected batch by batch and reduced to class
    ids with argmax before scoring.

    Args:
        model: Model exposing `predict_on_batch`, returning per-class scores.
        valid_ds: Finite iterable of `(images, one_hot_labels)` batches.

    Raises:
        ValueError: If `valid_ds` yields no batches.
    """
    true_batches = []
    pred_batches = []

    for images, labels in valid_ds:
        true_batches.append(np.asarray(labels))
        # predict_on_batch avoids the per-call progress bar and setup overhead of
        # `model.predict`, which otherwise prints one progress line per batch.
        pred_batches.append(np.asarray(model.predict_on_batch(images)))

    if not true_batches:
        raise ValueError("valid_ds yielded no batches; nothing to evaluate.")

    # Convert one-hot labels / predicted scores to class ids (argmax for multi-class)
    y_true = np.argmax(np.concatenate(true_batches, axis=0), axis=1)
    y_pred = np.argmax(np.concatenate(pred_batches, axis=0), axis=1)

    # Calculate metrics (weighted averages account for class imbalance)
    accuracy = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred, average='weighted')
    precision = precision_score(y_true, y_pred, average='weighted')
    recall = recall_score(y_true, y_pred, average='weighted')

    # Print metrics
    print(f"Accuracy: {accuracy:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")



In [None]:
# Model Building
# Instantiate the classifier from the preset named in CFG
classifier_kwargs = {"num_classes": CFG.num_classes}
model = keras_cv.models.ImageClassifier.from_preset(CFG.preset, **classifier_kwargs)

# Compile with Adam and a KL-divergence loss. The learning rate given here is the
# optimizer's initial value; the fit calls below attach an LR scheduler callback.
adam = keras.optimizers.Adam(learning_rate=1e-4)
kl_loss = keras.losses.KLDivergence()
model.compile(optimizer=adam, loss=kl_loss)



In [None]:
# Two-Stage Training
# Stage 1: High-confidence data
# Training inputs: rows whose total vote count is at least 10
train_paths = high_confidence_df["spec2_path"].values
train_offsets = high_confidence_df["spectrogram_label_offset_seconds"].values.astype(int)
train_labels = high_confidence_df["class_label"].values
train_ds = build_dataset(
    train_paths, train_offsets, train_labels,
    batch_size=CFG.batch_size, repeat=True, shuffle=True, augment=True, cache=False,
)

# Validation inputs: the held-out fold, single pass, no shuffling or augmentation
valid_paths = valid_df["spec2_path"].values
valid_offsets = valid_df["spectrogram_label_offset_seconds"].values.astype(int)
valid_labels = valid_df["class_label"].values
valid_ds = build_dataset(
    valid_paths, valid_offsets, valid_labels,
    batch_size=CFG.batch_size, repeat=False, shuffle=False, augment=False, cache=False,
)

# The training dataset repeats forever, so the epoch length is set explicitly
stage1_steps = len(high_confidence_df) // CFG.batch_size
stage1_lr_callback = get_lr_callback(CFG.batch_size, mode=CFG.lr_mode)

history_stage1 = model.fit(
    train_ds,
    validation_data=valid_ds,
    epochs=CFG.epochs_stage1,
    steps_per_epoch=stage1_steps,
    callbacks=[stage1_lr_callback],
    verbose=CFG.verbose,
)



Epoch 1/3
[1m1662/1662[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m159s[0m 46ms/step - loss: 1.3175 - val_loss: 1.7335 - learning_rate: 5.0000e-05
Epoch 2/3
[1m1662/1662[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m79s[0m 47ms/step - loss: 1.1861 - val_loss: 1.6770 - learning_rate: 3.7333e-05
Epoch 3/3
[1m1662/1662[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 41ms/step - loss: 1.1187 - val_loss: 1.6619 - learning_rate: 2.4667e-05


In [None]:
# After Stage 1 Training
# Score the model on the held-out fold (valid_ds is built from rows with
# fold == CFG.fold) before the second training stage starts.
print("Stage 1 Evaluation:")
evaluate_model(model, valid_ds)

In [None]:
# Stage 2: Full dataset
# Training inputs: every training-fold row (high- and low-confidence alike)
train_paths = train_df.spec2_path.values
train_offsets = train_df.spectrogram_label_offset_seconds.values.astype(int)
train_labels = train_df.class_label.values
train_ds = build_dataset(train_paths, train_offsets, train_labels, batch_size=CFG.batch_size,
                         repeat=True, shuffle=True, augment=True, cache=False)

history_stage2 = model.fit(
    train_ds,
    epochs=CFG.epochs_stage2,
    # Pass the stage-2 epoch count explicitly. The callback's default is
    # CFG.epochs_stage1, which shaped the cosine decay for 3 epochs while this
    # stage trains for CFG.epochs_stage2.
    callbacks=[get_lr_callback(CFG.batch_size, mode=CFG.lr_mode, epochs=CFG.epochs_stage2)],
    # The dataset repeats forever, so the epoch length is set explicitly
    steps_per_epoch=len(train_df) // CFG.batch_size,
    validation_data=valid_ds,
    verbose=CFG.verbose
)



Epoch 1/5
[1m4583/4583[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m180s[0m 39ms/step - loss: 1.3392 - val_loss: 1.1816 - learning_rate: 5.0000e-05
Epoch 2/5
[1m4583/4583[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m187s[0m 41ms/step - loss: 1.1859 - val_loss: 1.1292 - learning_rate: 3.7333e-05
Epoch 3/5
[1m4583/4583[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m197s[0m 43ms/step - loss: 1.0995 - val_loss: 1.1265 - learning_rate: 2.4667e-05
Epoch 4/5
[1m4583/4583[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m173s[0m 38ms/step - loss: 1.0492 - val_loss: 1.1269 - learning_rate: 1.2000e-05
Epoch 5/5
[1m4583/4583[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m182s[0m 40ms/step - loss: 1.0275 - val_loss: 1.1495 - learning_rate: 1.1500e-05


In [None]:
# After Stage 2 Training
# Same held-out fold as the stage-1 evaluation, so the two reports are comparable.
print("Stage 2 Evaluation:")
evaluate_model(model, valid_ds)

Stage 2 Evaluation:
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 67ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 65ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 63ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[3

In [None]:
# Persist the weights as they stand after stage 2.
# NOTE(review): despite the file name, no checkpoint/best-epoch selection is
# visible in this notebook — these are the last-epoch weights.
model.save_weights("best_model.weights.h5")

In [None]:
# Inference and Submission
# Restore the weights written by the previous cell
model.load_weights("best_model.weights.h5")

# Unlabeled, single-pass dataset over the converted test spectrograms
test_paths = test_df["spec2_path"].values
inference_batch_size = min(CFG.batch_size, len(test_df))
test_ds = build_dataset(
    test_paths,
    batch_size=inference_batch_size,
    repeat=False, shuffle=False, cache=False, augment=False,
)

preds = model.predict(test_ds)

# One "<class>_vote" column per class, in CFG.class_names order
target_cols = [f"{name.lower()}_vote" for name in CFG.class_names]
pred_df = test_df[["eeg_id"]].copy()
pred_df[target_cols] = preds.tolist()

# Keep the row order of the sample submission and attach predictions by eeg_id
sub_df = (
    pd.read_csv(f'{BASE_PATH}/sample_submission.csv')[["eeg_id"]]
    .copy()
    .merge(pred_df, on="eeg_id", how="left")
)
sub_df.to_csv("submission.csv", index=False)
sub_df.head()

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 11s/step


Unnamed: 0,eeg_id,seizure_vote,lpd_vote,gpd_vote,lrda_vote,grda_vote,other_vote
0,3911565283,0.078013,0.017123,0.008233,0.103243,0.557143,0.236244
