# Harmful Brain Activity Detection

In [1]:
import dask
import os
import pandas as pd
import scipy
from scipy.signal import butter, lfilter
from scipy import signal
from matplotlib import pyplot as plt 
import multiprocessing as mp
import numpy as np
import time
from dask.distributed import Client
import dask.bag as db
from sklearn.model_selection import train_test_split
import shutil
import time
import warnings
from scipy.signal import stft

In [2]:
# Number of worker processes requested for the multiprocessing pool; the same
# value is written next to the elapsed time in proc_time.csv at the end of the run.
num_cpus = 24

In [3]:
# Both inputs live under one dataset directory: the per-recording EEG files
# and the label CSV.
dataset_root = "/scratch/kunigalharish.a/hms-brain-activity-classification/dataset/"
train_eeg_path = dataset_root + "train_eegs/"
train_csv_path = dataset_root + "train.csv"

In [4]:
# Label table. Later cells read its eeg_id, eeg_label_offset_seconds and
# expert_consensus columns.
df_train = pd.read_csv(train_csv_path)

In [5]:
# Create one output folder per consensus class inside dataset_24/.
# exist_ok=True replaces the separate isdir() check: the cell stays safe to
# re-run and there is no window between "check" and "create".
for c in df_train['expert_consensus'].unique():
    os.makedirs("dataset_24/" + str(c), exist_ok=True)
        

In [6]:
# List the entries of a directory as full paths
def read_file(folder):
    """
    Func : Build the path of every entry found directly inside a folder

    params: folder - directory to list

    return: list of paths (folder joined with each entry name), in the
    order os.listdir reports them
    """
    paths = []
    for entry in os.listdir(folder):
        paths.append(os.path.join(folder, entry))
    return paths

## Data Pre-Processing

### Noise Removal - Band Pass Filter


In [7]:

# Look up the label offsets and classes recorded for one EEG
def get_offset(eeg_id):
    """
    Func : Collect, from the global df_train table, every label offset and
    consensus class listed for the given EEG id

    params: eeg_id - value matched against df_train['eeg_id']

    return: (offsets, classes) - two parallel lists taken from the
    'eeg_label_offset_seconds' and 'expert_consensus' columns of the
    matching rows, in table order

    """
    # Select the matching rows once and read both columns from that selection
    matching_rows = df_train.loc[df_train['eeg_id'] == eeg_id]

    offsets = matching_rows['eeg_label_offset_seconds'].tolist()
    classes = matching_rows['expert_consensus'].tolist()

    return offsets, classes

# Get subsample based on offset from eeg data
def Extract_Subsample(eeg_df, offset, freq=200, duration=50):
    """
    Func : Extract the `duration`-second window that starts `offset`
    seconds into the recording

    The offset is expressed in seconds while the frame is sliced by row, so
    both the start position and the window length are multiplied by the
    sampling rate before slicing. (Slicing with the raw offset would return
    only 50 rows, i.e. a quarter of a second at 200 Hz.)

    params: eeg_df   - EEG DataFrame; assumed to hold one row per sample
                       at `freq` Hz (TODO confirm against the parquet files)
            offset   - window start, in seconds
            freq     - sampling rate in Hz (default 200, the rate used by the
                       filter and STFT in this notebook)
            duration - window length in seconds (default 50)

    return: DataFrame slice of at most duration*freq rows, without the
    last column
    """
    start_row = int(round(offset * freq))
    stop_row = start_row + int(round(duration * freq))

    # The last column is dropped, as before -- presumably a non-EEG channel;
    # verify against the file schema.
    return eeg_df.iloc[start_row:stop_row, :-1]
    

In [8]:
# Design helper for the Butterworth band-pass filter
def get_filter_Coeff(lowcut, highcut, freq, order):
    """
    Func : Compute Butterworth band-pass coefficients

    params: lowcut  - lower band edge, same unit as freq
            highcut - upper band edge, same unit as freq
            freq    - sampling frequency
            order   - filter order passed to scipy.signal.butter

    return: (b, a) - numerator and denominator coefficient arrays
    """
    # butter() expects the band edges as fractions of the Nyquist frequency
    nyquist = 0.5 * freq
    band_edges = [lowcut / nyquist, highcut / nyquist]

    numerator, denominator = butter(order, band_edges, btype='band')
    return numerator, denominator


# Run a signal through the Butterworth band-pass filter
def Bandpass_filter(data, lowcut, highcut, freq, order=5):
    """
    Func : Band-pass filter a signal

    params: data    - signal to filter
            lowcut  - lower band edge
            highcut - upper band edge
            freq    - sampling frequency
            order   - Butterworth order (default 5)

    return: filtered signal as returned by scipy.signal.lfilter
    """
    numerator, denominator = get_filter_Coeff(lowcut, highcut, freq, order)
    return lfilter(numerator, denominator, data)


def noise_removal(df_eeg):
    """
    Func : Band-pass filter every column of an EEG DataFrame

    params: df_eeg - DataFrame whose columns are filtered one by one

    return: DataFrame produced by applying the 0.5-20 band-pass (sampling
    frequency 200, order 5) to each column
    """
    # filter settings
    low_edge, high_edge, sampling_freq = 0.5, 20, 200

    def _filter_column(column):
        return Bandpass_filter(column, low_edge, high_edge, sampling_freq, order=5)

    return df_eeg.apply(_filter_column)

## Feature Extraction

### EEG to Spectrogram

In [9]:
# Feature Extraction

def z_score_normalization(data):
    """
    Func : Standardise data by subtracting np.mean(data) and dividing by
    np.std(data)

    params: data - array-like accepted by np.mean / np.std

    return: the centred and scaled data
    """
    centred = data - np.mean(data)
    return centred / np.std(data)


# Convert EEG to Spectrogram

def EEG_to_Spectrogram(df, path):
    """
    Func : Turn EEG data into a magnitude spectrogram and save it as an image

    params: df   - EEG data to transform
            path - output path without extension; ".png" is appended

    return: None (the image is written to disk)
    """
    # Normalize the values to help in generalization
    norm_df = z_score_normalization(df)

    # generate spectrogram from eeg
    # NOTE(review): stft works along the last axis by default. If df is laid
    # out as samples x channels this transforms across channels rather than
    # time -- confirm the layout and pass axis=0 (plus a channel reduction)
    # if a time-frequency image per window is intended.
    freqs, t, spec_df = stft(norm_df, 200, nperseg=50*200)

    # Use the absolute value to get the magnitude spectrogram
    spectrogram = np.abs(spec_df)

    # Draw on a figure owned by this call. Plotting through the implicit
    # pyplot figure and never closing it stacks every image drawn by the same
    # process on one axes and keeps all of them in memory.
    fig, ax = plt.subplots()
    try:
        ax.imshow(spectrogram, aspect='auto', origin='lower')
        ax.axis('off')
        # Save the spectrogram as an image
        fig.savefig(f"{path}.png")
    finally:
        plt.close(fig)

In [10]:

def Feature_Extraction(filtered_df, group, off_class):
    """
    Func : Produce the spectrogram image for one labelled window of an EEG

    params: filtered_df - noise-filtered EEG DataFrame of the whole recording
            group       - EEG id, used in the output file name
            off_class   - (offset, class) pair; the class selects the output
                          folder and the offset completes the file name

    return: None (the image is written under dataset_24/<class>/)
    """
    print("Feature Extraction..........................")
    offset, c = off_class

    # extract the subsample for this offset from the consolidated eeg
    df_sub = Extract_Subsample(filtered_df, int(offset))

    path = "dataset_24/"+str(c)+"/"+str(group)+"_"+str(offset)

    # Convert the extracted window -- not the whole recording -- to a
    # spectrogram; otherwise every offset of one EEG yields the same image.
    EEG_to_Spectrogram(df_sub, path)
    
# Function to generate spectrograms for each EEG file
def generate_spectrograms_parallel(df_eeg, group, offsets_classes):
    """
    Func : Run Feature_Extraction for every (offset, class) pair of one EEG
    in a multiprocessing pool

    params: df_eeg          - filtered EEG DataFrame handed to each task
            group           - EEG id handed to each task
            offsets_classes - iterable of (offset, class) pairs

    return: True once all tasks have finished (also for an empty task list)
    """
    print("generate spectrograms")

    tasks = [(df_eeg, group, off_class) for off_class in offsets_classes]
    if not tasks:
        # nothing to do; do not start a pool for it
        return True

    print(f"No of CPUs : {num_cpus}")

    # Never start more workers than there are tasks to hand out
    workers = max(1, min(num_cpus, len(tasks)))
    pool = mp.Pool(processes=workers)
    try:
        pool.starmap(Feature_Extraction, tasks)
    finally:
        # release the worker processes even if a task raised
        pool.close()
        pool.join()

    return True
    

In [11]:
# Main
# Paths of all entries in the training EEG directory
eeg_files = read_file(train_eeg_path)

# Shallow copy, so the copy can be modified without changing eeg_files
eeg_files_copy = list(eeg_files)


In [None]:

# Main
# For every EEG id in the label table: load the recording, band-pass filter it
# and generate one spectrogram per labelled offset, timing the whole run.
warnings.filterwarnings('ignore')
start_time = time.time()

r = 0

# Group the label rows by EEG id (first-appearance order). Iterating groups
# processes every recording exactly once -- including the last one, which a
# "flush when the id changes" loop never reaches.
for group, group_rows in df_train.groupby('eeg_id', sort=False):

    off_classes = list(zip(group_rows['eeg_label_offset_seconds'].tolist(),
                           group_rows['expert_consensus'].tolist()))

    # get respective eeg file
    eeg = f"{train_eeg_path}{str(group)}.parquet"

    # read eeg df
    df_eeg = pd.read_parquet(eeg)

    # Pre-Processing - Noise Removal
    filtered_df = noise_removal(df_eeg)

    # Parallelization of Feature Extraction
    ret = generate_spectrograms_parallel(filtered_df, group, off_classes)

    # progress: number of label rows handled so far
    r += len(off_classes)
    print("r------------- : ", r)

elapsed_time = time.time() - start_time

print(f"Elapsed time : {elapsed_time}")

# Write to csv - proc time
# The plotting cells read the columns Num_CPUs and Elapsed_time, so a new or
# empty file gets that header before the first data row.
proc_time_file = "proc_time.csv"
needs_header = (not os.path.exists(proc_time_file)
                or os.path.getsize(proc_time_file) == 0)
with open(proc_time_file, "a+") as pc:
    if needs_header:
        pc.write("Num_CPUs,Elapsed_time\n")
    pc.write(f'{num_cpus},{elapsed_time}\n')



#### Plotting Speedup and Elapsed Time against the Number of CPUs

In [None]:
# Timing log of previous runs. The plotting cells below read the columns
# Num_CPUs and Elapsed_time, so the file must start with that header row.
df_cpu = pd.read_csv("proc_time.csv")

In [None]:
# No of CPUs vs Elapsed Time

# proc_time.csv is appended to in run order, so sort by CPU count before
# drawing; otherwise the line doubles back on itself.
timings_by_cpu = df_cpu.sort_values('Num_CPUs')

# Use a figure owned by this cell instead of whatever figure is current
fig, ax = plt.subplots()
ax.plot(timings_by_cpu['Num_CPUs'], timings_by_cpu['Elapsed_time'], marker='o', linestyle='-')

# Add labels and title
ax.set_xlabel('Number of CPUs')
ax.set_ylabel('Elapsed Time (seconds)')
ax.set_title('Number of CPUs vs Elapsed Time')

ax.grid(True)

fig.savefig('cpu_elapsed_time_plot.png')

# Show plot
plt.show()


In [None]:
# Calculate speedup relative to the run with the fewest CPUs. Runs are
# appended to proc_time.csv in execution order, so the first row is not
# guaranteed to be that baseline.
baseline_time = df_cpu.loc[df_cpu['Num_CPUs'].idxmin(), 'Elapsed_time']
df_cpu['Speedup'] = baseline_time / df_cpu['Elapsed_time']

# Plot Num_CPUs vs Speedup, ordered by CPU count so the line does not double back
speedup_by_cpu = df_cpu.sort_values('Num_CPUs')

# Use a figure owned by this cell instead of whatever figure is current
fig, ax = plt.subplots()
ax.plot(speedup_by_cpu['Num_CPUs'], speedup_by_cpu['Speedup'], marker='o', linestyle='-')

# Add labels and title
ax.set_xlabel('Number of CPUs')
ax.set_ylabel('Speed Up')
ax.set_title('Number of CPUs vs Speed Up')

ax.grid(True)

fig.savefig('cpu_speedup_plot.png')

# Show plot
plt.show()
