# Harmful Brain Activity Detection

## Approach

Since the data consists of EEG recordings and spectrograms, we use signal-processing methods for pre-processing and feature extraction and then, depending on the type of feature, an appropriate neural network for prediction.

### Data Pre-Processing

* **Band Pass Filter** 
* **ICA**  - Removal of artifacts such as eye blinks, heart beats, muscle movements, etc.

### Feature Extraction

* **Power Spectral Density** - time domain
* **FFT** - Frequency domain 

* **Waveform** - Time-Frequency Domain 


### Predictive Model

* CNN for FFT 
* Vision Transformer
* LSTM for Power Spectral Density

### Evaluation

* Kullback–Leibler divergence

In [1]:
import multiprocessing
import multiprocessing as mp
import os

import dask
import keras
import numpy as np
import pandas as pd
import scipy
from matplotlib import pyplot as plt
from mne.preprocessing import ICA
from scipy import signal
from scipy.signal import butter, lfilter, sosfilt

In [2]:
# Path to files

# Root folder of the dataset; every path below is derived from it so the
# location only has to be changed in one place.
DATA_ROOT = "../../Research Project/hms-harmful-brain-activity-classification"

path_eeg = f"{DATA_ROOT}/train_eegs/"                  # folder of EEG files
path_spec = f"{DATA_ROOT}/train_spectrograms/"         # folder of spectrogram files
path_train_csv = f"{DATA_ROOT}/train.csv"              # training metadata
path_test_csv = f"{DATA_ROOT}/test.csv"                # test metadata

In [20]:
# setup train, validation and test folder structure for Modelling

# exist_ok=True makes the cell safe to re-run and avoids the check-then-create
# race of a separate os.path.isdir() test.
for split_dir in ("train", "validation", "test"):
    os.makedirs(split_dir, exist_ok=True)

In [21]:
# create class folders inside the train and validation folders

df_train = pd.read_csv(path_train_csv)

# One sub-folder per distinct 'expert_consensus' label under each split.
# exist_ok=True keeps the cell re-runnable without a separate isdir() check.
for c in df_train['expert_consensus'].unique():
    for split_dir in ("train", "validation"):
        os.makedirs(os.path.join(split_dir, str(c)), exist_ok=True)

### Read EEG and Spectrogram files

In [6]:
# List the entries of a data folder (EEG or spectrogram) as joined paths
def read_file(folder):
    """
    Return one path per entry of ``folder``.

    params: folder - directory to list

    return: list of ``os.path.join(folder, entry)`` strings, in the order
    reported by ``os.listdir`` (no sorting or filtering is applied)
    """
    paths = []
    for entry in os.listdir(folder):
        paths.append(os.path.join(folder, entry))
    return paths

## Data Pre-Processing

### Extract Labeled Subsample from consolidated EEG recording

In [7]:
# Get offsets for a particular eeg subsample from the consolidated eeg
def get_offset(eeg_id, metadata=None):
    """
    Func : Look up every labelled sub-sample of one consolidated EEG recording.

    params:
        eeg_id: id of the consolidated EEG recording, matched against the
            'eeg_id' column.
        metadata: optional DataFrame with 'eeg_id', 'eeg_label_offset_seconds'
            and 'expert_consensus' columns. Defaults to the notebook-level
            df_train, so existing one-argument calls behave as before.

    return: (offsets, classes) - two lists of equal length. offsets holds the
        'eeg_label_offset_seconds' values and classes the matching
        'expert_consensus' values, in metadata row order. Both lists are
        empty when eeg_id has no rows.
    """
    if metadata is None:
        # Fall back to the notebook-wide training metadata.
        metadata = df_train

    # Filter once and reuse the selection for both columns.
    rows = metadata.loc[metadata['eeg_id'] == eeg_id]

    _offsets = rows['eeg_label_offset_seconds'].tolist()
    _class = rows['expert_consensus'].tolist()

    return _offsets, _class

# Get subsample based on offset from eeg data
def Extract_Subsample(eeg_df, offset, sfreq=200, duration=50):
    """
    Extract one labelled sub-sample from a consolidated EEG recording.

    params:
        eeg_df: DataFrame with one row per time sample.
        offset: start of the sub-sample in seconds (the values come from the
            'eeg_label_offset_seconds' column and may be floats).
        sfreq: sampling frequency in Hz used to convert seconds to rows.
        duration: length of the sub-sample in seconds.

    return: DataFrame with up to duration * sfreq rows starting at
        offset * sfreq, with the last column dropped. Fewer rows are
        returned when the recording ends early.
    """
    # iloc needs integer row positions, while the offset is expressed in
    # seconds. Slicing offset:offset+50 directly yields 50 samples
    # (~0.25 s at 200 Hz) rather than a 50 second window.
    start = int(round(offset * sfreq))
    stop = start + int(round(duration * sfreq))

    # NOTE(review): the last column is dropped as in the original code;
    # presumably it is a non-EEG channel - confirm against the data schema.
    return eeg_df.iloc[start:stop, :-1]
    

### Noise Removal - Bandpass Filter

We will use the Butterworth Bandpass Filter to remove noise from the EEG data

In [8]:
# Function to get Bandpass filter coefficients
def get_filter_Coeff(lowcut, highcut, freq, order, output='ba'):
    """
    Design a Butterworth band-pass filter.

    params:
        lowcut: lower cut-off frequency, in the same unit as freq.
        highcut: upper cut-off frequency, in the same unit as freq.
        freq: sampling frequency of the signal.
        order: order passed to scipy.signal.butter.
        output: filter representation passed to scipy.signal.butter.
            'ba' (default) returns the (b, a) pair as before; 'sos' returns
            second-order sections.

    return: (b, a) for output='ba', otherwise whatever butter returns for
        the requested representation.
    """
    nyq_freq = 0.5 * freq  # Nyquist Frequency
    # butter expects cut-offs as a fraction of the Nyquist frequency.
    low = lowcut / nyq_freq
    high = highcut / nyq_freq
    return butter(order, [low, high], btype='band', output=output)


# Function to apply the filter
def Bandpass_filter(data, lowcut, highcut, freq, order=5):
    """
    Band-pass filter ``data`` along its last axis.

    params:
        data: array-like signal.
        lowcut, highcut: pass-band edges, in the same unit as freq.
        freq: sampling frequency of the signal.
        order: Butterworth order (default 5).

    return: numpy array with the filtered signal, same shape as data.
    """
    # The filter is applied as second-order sections: SciPy's documentation
    # recommends SOS over the (b, a) polynomial form, which can lose
    # precision for band-pass designs of this order.
    sos = get_filter_Coeff(lowcut, highcut, freq, order, output='sos')
    fil_sig = sosfilt(sos, data)
    return fil_sig
    

### ICA - Independent Component Analysis

Performing ICA to remove artifacts such as eye blinks, heart beats, muscle movements, etc.


First, we will transform the dataframe into an array suitable to perform ICA

In [22]:
# import mne
# import numpy as np

# sfreq = 200  # sampling frequency in Hz
# channel_names = filtered_df.columns.tolist()  # Get the column names as channel names
# channel_types = ['eeg'] * len(channel_names)  # Assuming all channels are EEG

# # Convert DataFrame to numpy array
# data = filtered_df.T.to_numpy()  # Transpose because MNE expects channels x times

# # Create an MNE Info object
# info = mne.create_info(ch_names=channel_names, sfreq=sfreq, ch_types=channel_types)

# # Convert to volts (from microvolts, if your data is in uV)
# # data = data / 1e6  # Scale if your data is not already in Volts (V)

# # Create RawArray
# raw = mne.io.RawArray(data, info)



Creating RawArray with float64 data, n_channels=19, n_times=50
    Range : 0 ... 49 =      0.000 ...     0.245 secs
Ready.


#### ICA - (WIP)

In [24]:
# sfreq = raw.info['sfreq']

# # Create and fit an ICA model
# ica = ICA(n_components=15, random_state=97, max_iter=800)
# ica.fit(raw)

# # Plot the components to manually identify artifacts
# ica.plot_components()

Fitting ICA to data using 19 channels (please be patient, this may take a while)
Selecting by number: 15 components


  ica.fit(raw)


Fitting ICA took 0.5s.




RuntimeError: No digitization points found.

## Feature Extraction

In [9]:
def normalize_array(arr, low=1, high=255):
    """
    Min-max scale ``arr`` into the range [low, high].

    params:
        arr: array-like of numbers; converted to a float numpy array.
        low: value the minimum of arr is mapped to (default 1).
        high: value the maximum of arr is mapped to (default 255).

    return: float numpy array with the same shape as arr. When every
        element of arr is equal, the result is filled with ``low``.
    """
    arr = np.asarray(arr, dtype=float)
    min_val = np.min(arr)
    max_val = np.max(arr)
    span = max_val - min_val

    if span == 0:
        # Constant input: the scaling below would compute 0 / 0 and return
        # NaN everywhere, so map it to the lower bound instead.
        return np.full_like(arr, low)

    norm_arr = (arr - min_val) / span * (high - low) + low
    return norm_arr


#### Power Spectral Density - time domain

#### FFT - Frequency domain


In [10]:
# Convert EEG to Spectrogram

def EEG_to_Spectrogram(df, path, eeg_Id, fs=200):
    """
    Render the spectrogram of one EEG sub-sample and save it as a PNG.

    params:
        df: EEG samples. Assumed to be laid out as (time samples x channels),
            which is what slicing rows out of the recording produces -
            TODO confirm for any other caller. A 1-D signal is treated as a
            single channel.
        path: folder the image is written to.
        eeg_Id: identifier used as the file name (<path>/<eeg_Id>.png).
        fs: sampling frequency in Hz (default 200).

    return: None. The saved image is the only output.
    """
    samples = np.asarray(df, dtype=float)
    if samples.ndim == 1:
        samples = samples[:, np.newaxis]

    # generate spectrogram from eeg
    # axis=0 runs the transform along time. With the default (last axis) it
    # would run across the channel columns of a time x channel frame.
    _, _, power = signal.spectrogram(samples, fs=fs, axis=0)

    # power is (frequency, channel, time segment). Averaging over channels
    # gives one 2-D image per sub-sample.
    spectrogram = power.mean(axis=1)

    # Normalize the values of the spectrogram array to help in generalization
    spectrogram = normalize_array(spectrogram)

    # Plot on a dedicated figure and close it afterwards, so figures do not
    # accumulate when this is called once per sub-sample in a loop.
    fig, ax = plt.subplots()
    ax.imshow(spectrogram, aspect='auto', cmap='inferno', origin='lower')
    ax.axis('off')
    # Save the spectrogram as an image
    fig.savefig(f"{path}/{eeg_Id}.png")
    plt.close(fig)

In [111]:
# import numpy as np
# from PIL import Image

# # Load the spectrogram image
# spectrogram_image = Image.open('spectrogram.png')

# # Convert the image to a NumPy array
# # spectrogram_array = np.array(spectrogram_image)

# # Display the shape of the array
# print(spectrogram_array.shape)

(480, 640, 4)


#### Waveform - Time-Frequency Domain

## Modelling

Based on the type of the Feature Extracted, the following models will be trained on the respective features and an ensemble of these models will be created for prediction/inference.

The models will be evaluated on the `Kullback–Leibler divergence` metric.

**Predictive Models** :

1) CNN for FFT
2) LSTM for Power Spectral Density
3) Wavenets for Waveforms

project_root/
│
├── data/
│   ├── train/
│   │   ├── class1/
│   │   │   ├── image1.jpg
│   │   │   ├── image2.jpg
│   │   │   └── ...
│   │   ├── class2/
│   │   │   ├── image1.jpg
│   │   │   ├── image2.jpg
│   │   │   └── ...
│   │   └── ...
│   │
│   └── validation/
│       ├── class1/
│       │   ├── image1.jpg
│       │   ├── image2.jpg
│       │   └── ...
│       ├── class2/
│       │   ├── image1.jpg
│       │   ├── image2.jpg
│       │   └── ...
│       └── ...
│
├── models/
│   ├── cnn_model.py
│   └── ...
│
├── utils/
│   ├── data_preprocessing.py
│   └── ...
│
└── train.py


In [None]:
import torch  # torch is not imported anywhere else in the visible notebook

# Report whether PyTorch can use a CUDA device.
torch.cuda.is_available()

## Main Function

In [11]:
# Paths of every entry in the EEG folder and in the spectrogram folder
# (only the paths are collected here; no file is opened yet)
eeg_files, spec_files = read_file(path_eeg), read_file(path_spec)

# Training metadata, used by get_offset to look up offsets and classes
df_train = pd.read_csv(path_train_csv)
    

In [13]:
# constants for filter function
lowcut = 1      # lower band-pass edge
highcut = 40    # upper band-pass edge
freq = 200      # sampling frequency

# iterate over eeg files: for every labelled sub-sample of every recording,
# band-pass filter it and save its spectrogram under train/<class>/

for eeg in eeg_files:

    # skip stray entries in the folder; file names are parsed as integer ids
    if not eeg.endswith(".parquet"):
        continue

    # read eeg df
    df_eeg = pd.read_parquet(eeg)

    # get eeg id from the file name (portable, unlike splitting on "/")
    _id = int(os.path.splitext(os.path.basename(eeg))[0])

    # get offsets for the eeg sample
    offsets_, classes = get_offset(_id)

    # extract subsamples from consolidated eeg
    for sub_idx, (offset, c) in enumerate(zip(offsets_, classes)):
        df_sub = Extract_Subsample(df_eeg, offset)

        # nothing to filter or plot when the offset lies past the recording
        if df_sub.empty:
            continue

        # filter the eeg sample, one column at a time
        filtered_df = df_sub.apply(lambda x: Bandpass_filter(x, lowcut, highcut, freq, order=5))

        # Convert eeg to spectrogram. The sub-sample index is part of the
        # file name so sub-samples of one recording do not overwrite each other.
        EEG_to_Spectrogram(filtered_df, os.path.join("train", str(c)), f"{_id}_{sub_idx}")

4222573799
1622210937
678648250
2148256831
1469546440
3329232596
2718549792
2510219397
2436487142
2014107973
1037773381
413742710
2484641754
4099033954
2302302452
837639030
3854492726
3715407957
839230176
1573558442
1039058822
3945925489
1687123085
2015905115
3204007697
1809434330
3383534481
4285210475
2752736460
231432766
1765478799
4257527824
3859607471
3064792501
481358497
4010993438
2894643287
2132912634
1330263395
2740692471
2411672943
992798064
527970310
1655308883
2322119862
2573770198
175001942
3900459244
280907771
3184243660
4215350496
3751149087
3774430289
369912379
8922554
465677924
1504413365
2964873930
679047082
688446763
2090900070
3158461194
3485489953
3268523216
2320261988
1326562309
77987479
3630706891
3287904898
330358359
1791299141
2396110242
3500185798
3646330278
2530656361
3701641197
2539873802
3568435119
3572902772
2209965074
3552918494
1315636310
2495006719
4014410930
878032028
2701220687
1468936383
2941389142
1122434994
598783499
1198730722
4246314327
3279947709