In [19]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import matplotlib.pyplot as plt
import os

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [20]:
from scipy.fft import fft, fftfreq

def fourier_transform(signals, sampling_rate):
    """
    Perform Fourier Transform on the given signal.
    """
    for signal in signals:
        fft_result = np.abs(fft(signal)) # Get rid of imaginary values
        frequencies = fftfreq(len(signal), 1 / sampling_rate)  # Frequency bins
        fft_magnitudes = fft_result[:len(fft_result)//2]
        frequencies = frequencies[:len(frequencies)//2]
        
    return fft_magnitudes, frequencies

from scipy.signal import find_peaks

def extract_channel_features(i, signal, sampling_rate):
    """Extract specified features for a single channel."""
    # FFT computation
    fft_result, frequencies = fourier_transform([signal], sampling_rate)
    
    # Keep only positive frequencies
    positive_frequencies = frequencies > 0
    fft_result = fft_result[positive_frequencies]
    frequencies = frequencies[positive_frequencies]
    
    # Dominant frequency and amplitude
    dominant_idx = np.argmax(fft_result)
    dominant_frequency = frequencies[dominant_idx]
    dominant_amplitude = fft_result[dominant_idx]
    
    # Total power
    total_power = np.sum(fft_result**2)/len(fft_result)
    
    # Spectral centroid
    spectral_centroid = np.sum(frequencies * fft_result) / np.sum(fft_result)
    
    # Spectral bandwidth
    spectral_bandwidth = np.sqrt(np.sum(((frequencies - spectral_centroid)**2) * fft_result) / np.sum(fft_result))
    
    # Shannon entropy
    spectral_prob = fft_result / np.sum(fft_result)
    spectral_entropy = -np.sum(spectral_prob * np.log2(spectral_prob + 1e-12))  # Adding a small value for numerical stability
    
    # Frequency variance
    frequency_variance = np.var(fft_result)
    
    # Zero-crossing rate (ZCR) of the original signal
    zcr = np.sum(np.sign(np.diff(signal)) != 0)/len(signal)

    return {
        "Channel":i,
        "Dominant_Frequency": dominant_frequency,
        "Dominant_Amplitude": dominant_amplitude,
        "Total_Power": total_power,
        "Spectral_Centroid": spectral_centroid,
        "Spectral_Bandwidth": spectral_bandwidth,
        "Spectral_Entropy": spectral_entropy,
        "Frequency_Variance": frequency_variance,
        "Zero_Crossing_Rate": zcr
    }

def signal_features(signals, sampling_rate):
    features = []
    for i in range(len(signals)):
        signal = signals[i]
        features.append(extract_channel_features(i+1, signal, sampling_rate))
    return pd.DataFrame(features)

def flatten_features(features_df):
    flattened_features = {}
    for column in features_df.columns:
        if column != "Channel":  # Skip the Channel column as it's used for indexing
            for idx, value in enumerate(features_df[column]):
                key = f"{column}_Ch{idx + 1}"
                flattened_features[key] = value
    
    return pd.DataFrame([flattened_features])


In [21]:
# Get .npy files from a class folder and make a dataframe containing its Fourier features
def features_from_folder(folder_path, sampling_rate):

    all_features = []

    for filename in os.listdir(folder_path):
        if filename.endswith(".npy"):  
            file_path = os.path.join(folder_path, filename)
            signals = np.load(file_path) 
                
            sig = signal_features(signals, sampling_rate)
            flat = flatten_features(sig)
            all_features.append(flat)
    
    features_df = pd.concat(all_features,ignore_index=True)
    
    return features_df


In [22]:
# Training Data

cps_df = features_from_folder("/kaggle/input/eeg-datas/EEG_Data/train_data/Complex_Partial_Seizures", 1000)
cps_df["Label"] = 1
normal_df = features_from_folder("/kaggle/input/eeg-datas/EEG_Data/train_data/Normal", 1000)
normal_df["Label"] = 0
es_df = features_from_folder("/kaggle/input/eeg-datas/EEG_Data/train_data/Electrographic_Seizures", 1000)
es_df["Label"] = 2
vds_df = features_from_folder("/kaggle/input/eeg-datas/EEG_Data/train_data/Video_detected_Seizures_with_no_visual_change_over_EEG", 1000)
vds_df["Label"] = 3

train_data = pd.concat([cps_df, normal_df, es_df, vds_df], ignore_index=True)
train_data = train_data.sample(frac=1).reset_index(drop=True)
X_train = train_data.drop("Label", axis=1)
y_train = train_data["Label"]

In [23]:
X_train

Unnamed: 0,Dominant_Frequency_Ch1,Dominant_Frequency_Ch2,Dominant_Frequency_Ch3,Dominant_Frequency_Ch4,Dominant_Frequency_Ch5,Dominant_Frequency_Ch6,Dominant_Frequency_Ch7,Dominant_Frequency_Ch8,Dominant_Frequency_Ch9,Dominant_Frequency_Ch10,...,Zero_Crossing_Rate_Ch10,Zero_Crossing_Rate_Ch11,Zero_Crossing_Rate_Ch12,Zero_Crossing_Rate_Ch13,Zero_Crossing_Rate_Ch14,Zero_Crossing_Rate_Ch15,Zero_Crossing_Rate_Ch16,Zero_Crossing_Rate_Ch17,Zero_Crossing_Rate_Ch18,Zero_Crossing_Rate_Ch19
0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,...,0.986,0.986,0.990,0.990,0.994,0.996,0.990,0.992,0.994,0.984
1,6.0,6.0,2.0,8.0,6.0,8.0,8.0,2.0,8.0,2.0,...,0.988,0.980,0.994,0.988,0.994,0.990,0.998,0.994,0.992,0.998
2,6.0,6.0,6.0,6.0,4.0,6.0,6.0,4.0,4.0,4.0,...,0.984,0.984,0.990,0.974,0.988,0.990,0.988,0.984,0.992,0.986
3,4.0,2.0,4.0,2.0,4.0,4.0,4.0,4.0,4.0,4.0,...,0.984,0.980,0.986,0.992,0.996,0.976,0.980,0.992,0.982,0.990
4,4.0,2.0,4.0,4.0,4.0,2.0,2.0,4.0,4.0,2.0,...,0.954,0.962,0.976,0.974,0.990,0.974,0.970,0.968,0.976,0.978
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5603,10.0,10.0,4.0,6.0,10.0,10.0,10.0,6.0,4.0,4.0,...,0.946,0.958,0.974,0.978,0.966,0.972,0.954,0.978,0.978,0.976
5604,8.0,2.0,2.0,2.0,4.0,2.0,2.0,6.0,4.0,2.0,...,0.822,0.942,0.946,0.926,0.916,0.960,0.944,0.914,0.942,0.966
5605,4.0,2.0,4.0,4.0,6.0,2.0,2.0,2.0,2.0,4.0,...,0.978,0.974,0.990,0.992,0.992,0.988,0.978,0.986,0.992,0.980
5606,6.0,14.0,6.0,2.0,2.0,4.0,4.0,4.0,2.0,4.0,...,0.970,0.986,0.980,0.980,0.980,0.980,0.992,0.978,0.982,0.982


In [24]:
# Validation Data

cps_val_df = features_from_folder("/kaggle/input/eeg-datas/EEG_Data/validation_data/Complex_Partial_Seizures", 1000)
cps_val_df["Label"] = 1
normal_val_df = features_from_folder("/kaggle/input/eeg-datas/EEG_Data/validation_data/Normal", 1000)
normal_val_df["Label"] = 0
es_val_df = features_from_folder("/kaggle/input/eeg-datas/EEG_Data/validation_data/Electrographic_Seizures", 1000)
es_val_df["Label"] = 2
vds_val_df = features_from_folder("/kaggle/input/eeg-datas/EEG_Data/validation_data/Video_detected_Seizures_with_no_visual_change_over_EEG", 1000)
vds_val_df["Label"] = 3

val_data = pd.concat([cps_val_df, normal_val_df, es_val_df, vds_val_df], ignore_index=True)
val_data = val_data.sample(frac=1).reset_index(drop=True)

X_val = val_data.drop("Label", axis=1)
y_val = val_data["Label"]

In [25]:
X_val

Unnamed: 0,Dominant_Frequency_Ch1,Dominant_Frequency_Ch2,Dominant_Frequency_Ch3,Dominant_Frequency_Ch4,Dominant_Frequency_Ch5,Dominant_Frequency_Ch6,Dominant_Frequency_Ch7,Dominant_Frequency_Ch8,Dominant_Frequency_Ch9,Dominant_Frequency_Ch10,...,Zero_Crossing_Rate_Ch10,Zero_Crossing_Rate_Ch11,Zero_Crossing_Rate_Ch12,Zero_Crossing_Rate_Ch13,Zero_Crossing_Rate_Ch14,Zero_Crossing_Rate_Ch15,Zero_Crossing_Rate_Ch16,Zero_Crossing_Rate_Ch17,Zero_Crossing_Rate_Ch18,Zero_Crossing_Rate_Ch19
0,4.0,4.0,4.0,2.0,4.0,4.0,4.0,4.0,4.0,4.0,...,0.984,0.992,0.996,0.994,0.990,0.990,0.994,0.994,0.988,0.996
1,8.0,10.0,4.0,6.0,10.0,10.0,10.0,4.0,4.0,2.0,...,0.962,0.968,0.984,0.956,0.982,0.958,0.978,0.978,0.976,0.974
2,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,24.0,6.0,...,0.936,0.966,0.972,0.982,0.972,0.966,0.978,0.980,0.974,0.980
3,8.0,10.0,8.0,8.0,8.0,12.0,10.0,8.0,8.0,8.0,...,0.992,0.992,0.996,0.990,0.982,0.990,0.996,0.336,0.992,0.994
4,2.0,6.0,2.0,4.0,2.0,6.0,6.0,4.0,4.0,8.0,...,0.966,0.960,0.984,0.972,0.992,0.992,0.974,0.992,0.976,0.974
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1398,10.0,6.0,8.0,6.0,6.0,6.0,4.0,6.0,6.0,6.0,...,0.982,0.978,0.988,0.986,0.992,0.980,0.980,0.978,0.968,0.982
1399,10.0,6.0,4.0,4.0,6.0,6.0,14.0,10.0,6.0,6.0,...,0.976,0.988,0.988,0.996,0.980,0.972,0.980,0.988,0.990,0.982
1400,4.0,4.0,4.0,6.0,6.0,4.0,4.0,4.0,4.0,6.0,...,0.942,0.978,0.970,0.978,0.974,0.970,0.980,0.986,0.972,0.978
1401,4.0,2.0,2.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,...,0.966,0.964,0.984,0.984,0.980,0.968,0.968,0.978,0.986,0.970


In [26]:
# Test Data
test_data = features_from_folder("/kaggle/input/eeg-datas/EEG_Data/test_data", 1000)
test_data.shape

(779, 152)

BASELINE MODEL -> SVM

In [27]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

model = SVC(C=100, kernel='rbf', class_weight='balanced', probability = True)
model.fit(X_train, y_train)

y_pred = model.predict(X_val)


print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.89      0.94      0.91       696
           1       0.95      0.85      0.90       549
           2       0.78      0.87      0.82       137
           3       0.90      0.90      0.90        21

    accuracy                           0.90      1403
   macro avg       0.88      0.89      0.88      1403
weighted avg       0.90      0.90      0.90      1403



In [28]:
from sklearn.metrics import balanced_accuracy_score

# Compute the balanced accuracy score
score = balanced_accuracy_score(y_val, y_pred)

print(f"Balanced Accuracy Score: {score:.2f}")


Balanced Accuracy Score: 0.89


In [29]:
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import label_binarize

# Binarize the true labels for multi-class
y_true_binarized = label_binarize(y_val, classes=[0, 1, 2, 3])

# Predict probabilities for each class
y_scores = model.predict_proba(X_val)  # Returns a 2D array of shape (n_samples, n_classes)

# Compute ROC AUC score (macro average)
roc_auc = roc_auc_score(y_true_binarized, y_scores, multi_class="ovr", average="macro")

print(f"ROC AUC Score (Macro): {roc_auc:.2f}")


ROC AUC Score (Macro): 0.98
