### Upload dataset and collect all EEG IDs, EEG segments (10 sec), and vote labels

In [19]:
import os
import numpy as np
import pandas as pd
import torch
import matplotlib.pyplot as plt

In [20]:
# Read the competition's training metadata, then list every distinct recording id it references.
train = pd.read_csv(
    '/kaggle/input/hms-harmful-brain-activity-classification/train.csv'
)
ids = train['eeg_id'].unique()

In [21]:
# Work on a small sample of recordings instead of the whole dataset.
import random

n = 10
SAMPLE_SEED = 42  # fixed so that "Restart & Run All" reproduces the same sample and outputs

# A dedicated Random instance keeps the draw repeatable without touching the
# global random state; min() avoids a ValueError when fewer than n ids exist.
ids1 = random.Random(SAMPLE_SEED).sample(list(ids), min(n, len(ids)))
ids1

[279165890,
 3875462264,
 2059148071,
 2860837259,
 985265752,
 1765692389,
 12784157,
 1384814292,
 3514562115,
 2250812463]

In [22]:
# CREATING DATA LOADER FOR SPECIFIC EEG IDS

import pandas as pd

def create_data_loader(eeg_ids, eeg_data_dir, train_data, segment_length=10, sampling_rate=200):
    """
    Create a data loader function that extracts one EEG segment per labelled sub-sample.

    Args:
    - eeg_ids (list): List of EEG IDs for which segments need to be extracted.
    - eeg_data_dir (str): Directory holding one ``<eeg_id>.parquet`` file per recording.
    - train_data (DataFrame): Training metadata. Must contain the columns ``eeg_id``,
      ``eeg_label_offset_seconds`` and the six ``*_vote`` columns listed below.
    - segment_length (int): Length of each EEG segment in seconds.
    - sampling_rate (int): Samples per second of the recordings (defaults to 200,
      the value that was previously hard-coded).

    Returns:
    - data_loader (function): Zero-argument function; calling it returns a generator
      yielding ``(eeg_segment, target_labels, eeg_id)`` tuples, where ``eeg_segment``
      is a DataFrame with a fresh 0-based index and ``target_labels`` is a Series
      holding the vote counts of the metadata row the segment was cut for.

    Notes:
    - Each segment uses the votes of its own metadata row (previously every segment
      of a recording received the votes of the recording's first row).
    - The segment spans ``segment_length * sampling_rate`` samples for any
      ``segment_length`` (previously odd lengths were silently shortened).
    - A segment that runs past the end of the recording is returned shorter than
      requested, because positional slicing clips silently.
    - NOTE(review): the window *starts* at the label offset. The earlier comment said
      "centered around the offset", which the arithmetic never did. Confirm against the
      dataset description whether the labelled span is the start or the centre of a
      longer window.
    """
    vote_columns = ['seizure_vote', 'lpd_vote', 'gpd_vote', 'lrda_vote', 'grda_vote', 'other_vote']
    samples_per_segment = int(round(segment_length * sampling_rate))

    def data_loader():
        for eeg_id in eeg_ids:
            # Load EEG data for the current EEG ID
            eeg_data_path = f"{eeg_data_dir}/{eeg_id}.parquet"
            example = pd.read_parquet(eeg_data_path)

            # Metadata rows (one per labelled sub-sample) of the current EEG ID
            train_eegid = train_data[train_data['eeg_id'] == eeg_id]
            print("Number of offset subsamples for EEG ID", eeg_id, ":", len(train_eegid))

            for _, row in train_eegid.iterrows():
                # Convert seconds to a sample position only after scaling, so that
                # fractional offsets are not truncated to whole seconds first.
                segment_start = int(round(row['eeg_label_offset_seconds'] * sampling_rate))
                segment_end = segment_start + samples_per_segment

                eeg_segment = example.iloc[segment_start:segment_end].reset_index(drop=True)

                # Yield the segment together with the votes of *this* sub-sample
                yield eeg_segment, row[vote_columns], eeg_id

    return data_loader

# Configure the loader for this run.
eeg_ids = ids1  # swap in `ids` to process every recording
eeg_data_dir = "/kaggle/input/hms-harmful-brain-activity-classification/train_eegs"
train_data = train
segment_length = 10  # seconds of signal per extracted segment
loader = create_data_loader(
    eeg_ids=eeg_ids,
    eeg_data_dir=eeg_data_dir,
    train_data=train_data,
    segment_length=segment_length,
)

In [23]:
def eeg_dataframe(loader):
    """
    Stack every segment produced by ``loader`` into one long, labelled DataFrame.

    Args:
    - loader (function): Zero-argument function returning an iterable of
      ``(eeg_segment, target_labels, eeg_id)`` tuples, where ``eeg_segment`` is a
      DataFrame and ``target_labels`` is a Series.

    Returns:
    - eeg_dataset (DataFrame): One row per EEG sample with a fresh 0-based index.
      Columns are ``eeg_ids`` first, then the segment's own columns, then one column
      per entry of ``target_labels`` (the label values are repeated on every row
      of the segment they belong to).

    Raises:
    - ValueError: If the loader yields no segments at all.
    """
    segment_frames = []

    for eeg_segment, target_labels, eeg_id in loader():
        # Copy so the frame handed over by the loader is left untouched
        frame = eeg_segment.copy()
        frame.insert(0, 'eeg_ids', eeg_id)

        # Scalar assignment broadcasts the label down the segment; this replaces
        # building a list of len(segment) Series objects just to repeat one row.
        for label_name, label_value in target_labels.items():
            frame[label_name] = label_value

        segment_frames.append(frame)

    if not segment_frames:
        raise ValueError("The loader yielded no EEG segments; nothing to assemble.")

    # A single concat at the end keeps the assembly linear in the number of segments
    eeg_dataset = pd.concat(segment_frames, ignore_index=True)

    return eeg_dataset


# usage: run the loader once and materialise every segment, tagged with its
# recording id and vote labels, as one long DataFrame (one row per EEG sample).
eeg_df = eeg_dataframe(loader)
eeg_df

Number of offset subsamples for EEG ID 279165890 : 1
Number of offset subsamples for EEG ID 3875462264 : 1
Number of offset subsamples for EEG ID 2059148071 : 1
Number of offset subsamples for EEG ID 2860837259 : 8
Number of offset subsamples for EEG ID 985265752 : 24
Number of offset subsamples for EEG ID 1765692389 : 1
Number of offset subsamples for EEG ID 12784157 : 1
Number of offset subsamples for EEG ID 1384814292 : 2
Number of offset subsamples for EEG ID 3514562115 : 4
Number of offset subsamples for EEG ID 2250812463 : 6


Unnamed: 0,eeg_ids,Fp1,F3,C3,P3,F7,T3,T5,O1,Fz,...,T4,T6,O2,EKG,seizure_vote,lpd_vote,gpd_vote,lrda_vote,grda_vote,other_vote
0,279165890,7.890000,-28.549999,-28.910000,-33.259998,-26.670000,0.31,-22.690001,-23.920000,-7.650000,...,7.740000,-8.770000,12.340000,6.450000,0,0,0,1,0,1
1,279165890,3.810000,-22.620001,-20.020000,-19.469999,-18.020000,11.22,-10.220000,-11.760000,-13.970000,...,19.020000,4.410000,26.920000,25.160000,0,0,0,1,0,1
2,279165890,2.750000,-20.780001,-16.820000,-13.510000,-16.150000,15.58,-5.580000,-7.410000,-15.730000,...,21.070000,8.830000,30.000000,12.720000,0,0,0,1,0,1
3,279165890,3.750000,-30.610001,-24.180000,-25.980000,-27.910000,4.78,-17.330000,-18.370001,-11.380000,...,6.330000,-2.810000,16.879999,7.170000,0,0,0,1,0,1
4,279165890,2.710000,-29.240000,-19.129999,-21.690001,-26.040001,9.21,-12.360000,-14.320000,-13.190000,...,7.600000,-1.250000,20.049999,24.170000,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97995,2250812463,-23.100000,-30.910000,-9.900000,-6.230000,-25.750000,-15.91,-7.560000,-14.250000,-40.730000,...,-39.580002,-22.799999,4.050000,19600.990234,0,0,0,1,2,0
97996,2250812463,-22.410000,-27.740000,-7.490000,-5.280000,-11.440000,-5.14,2.660000,-10.850000,-47.160000,...,-26.879999,-1.530000,19.520000,6243.009766,0,0,0,1,2,0
97997,2250812463,-3.010000,-8.460000,12.200000,12.820000,6.960000,14.37,17.830000,6.780000,-24.420000,...,-5.070000,-0.100000,34.630001,-6752.600098,0,0,0,1,2,0
97998,2250812463,-7.000000,-14.620000,6.790000,9.940000,-10.100000,-0.61,8.920000,-1.510000,-28.440001,...,-21.209999,-11.580000,15.830000,5006.660156,0,0,0,1,2,0


### Functions for processing: denoising

In [24]:
# Positional slice: drop the first column and the last seven columns.
# In the run shown here that removes `eeg_ids`, `EKG` and the six vote columns,
# leaving the 19 channel columns Fp1 ... O2.
# NOTE(review): this relies on the exact column order of eeg_df; it silently
# selects the wrong columns if that layout changes.
eeg_data = eeg_df.iloc[:, 1:-7]
eeg_data

Unnamed: 0,Fp1,F3,C3,P3,F7,T3,T5,O1,Fz,Cz,Pz,Fp2,F4,C4,P4,F8,T4,T6,O2
0,7.890000,-28.549999,-28.910000,-33.259998,-26.670000,0.31,-22.690001,-23.920000,-7.650000,-16.760000,-24.700001,-28.730000,-9.77,-1.310000,0.480000,0.600000,7.740000,-8.770000,12.340000
1,3.810000,-22.620001,-20.020000,-19.469999,-18.020000,11.22,-10.220000,-11.760000,-13.970000,-7.960000,-13.470000,-26.260000,-4.11,10.670000,16.540001,9.010000,19.020000,4.410000,26.920000
2,2.750000,-20.780001,-16.820000,-13.510000,-16.150000,15.58,-5.580000,-7.410000,-15.730000,-3.570000,-8.720000,-24.309999,-1.17,14.160000,21.059999,7.800000,21.070000,8.830000,30.000000
3,3.750000,-30.610001,-24.180000,-25.980000,-27.910000,4.78,-17.330000,-18.370001,-11.380000,-12.660000,-18.639999,-35.570000,-10.26,3.450000,7.460000,-4.740000,6.330000,-2.810000,16.879999
4,2.710000,-29.240000,-19.129999,-21.690001,-26.040001,9.21,-12.360000,-14.320000,-13.190000,-9.890000,-15.800000,-37.919998,-10.87,5.300000,10.590000,-5.840000,7.600000,-1.250000,20.049999
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97995,-23.100000,-30.910000,-9.900000,-6.230000,-25.750000,-15.91,-7.560000,-14.250000,-40.730000,12.650000,-7.320000,-7.530000,-13.85,20.440001,-19.930000,-35.980000,-39.580002,-22.799999,4.050000
97996,-22.410000,-27.740000,-7.490000,-5.280000,-11.440000,-5.14,2.660000,-10.850000,-47.160000,14.190000,-16.809999,-4.130000,-5.27,32.480000,-3.360000,-26.870001,-26.879999,-1.530000,19.520000
97997,-3.010000,-8.460000,12.200000,12.820000,6.960000,14.37,17.830000,6.780000,-24.420000,33.320000,-1.390000,13.930000,7.59,47.849998,6.840000,-11.080000,-5.070000,-0.100000,34.630001
97998,-7.000000,-14.620000,6.790000,9.940000,-10.100000,-0.61,8.920000,-1.510000,-28.440001,23.950001,2.120000,10.070000,-0.61,32.470001,-7.600000,-18.200001,-21.209999,-11.580000,15.830000


In [33]:
# denoising function using wavelet transform
import pywt

def maddest(d, axis=None):
    """
    Mean absolute deviation of ``d`` about its mean.

    Args:
    - d (array-like): Input values.
    - axis (int or None): Axis along which to compute; ``None`` uses all values
      and returns a scalar.

    Returns:
    - Scalar (``axis=None``) or array with ``axis`` reduced away.
    """
    d = np.asarray(d)
    # keepdims lets the mean broadcast against d for any axis; without it only
    # axis=None and axis=0 worked on multi-dimensional input.
    centre = np.mean(d, axis=axis, keepdims=True)
    return np.mean(np.absolute(d - centre), axis=axis)

def denoise(x, wavelet='db8', level=1):
    """
    Denoise every column of ``x`` by hard-thresholding its wavelet detail coefficients.

    Each column is decomposed with ``pywt.wavedec`` (periodization mode, default
    depth). A single threshold ``sigma * sqrt(2 * log(len(x)))`` is then applied
    to all detail coefficient arrays, and the signal is rebuilt with ``pywt.waverec``.

    Args:
    - x (DataFrame): One signal per column; every column is treated as a single
      continuous signal.
    - wavelet (str): Wavelet name passed to PyWavelets.
    - level (int): Which coefficient array, counted from the end of the
      decomposition, is used to estimate the noise scale (1 = last array).

    Returns:
    - DataFrame with the same columns, the same number of rows and the same index as ``x``.
    """
    n_samples = len(x)
    denoised = {}

    for pos in x.columns:
        coeff = pywt.wavedec(x[pos], wavelet, mode="per")

        # NOTE(review): 0.6745 is the scale factor conventionally paired with the
        # *median* absolute deviation, while maddest() uses the mean. Confirm
        # that this is the intended noise estimate.
        sigma = (1/0.6745) * maddest(coeff[-level])
        uthresh = sigma * np.sqrt(2*np.log(n_samples))

        # Leave the approximation (coeff[0]) alone; threshold every detail array
        coeff[1:] = [pywt.threshold(c, value=uthresh, mode='hard') for c in coeff[1:]]

        # The periodized reconstruction has even length, so an odd-length input
        # comes back one sample longer; trim to keep the result aligned with x.
        denoised[pos] = pywt.waverec(coeff, wavelet, mode='per')[:n_samples]

    return pd.DataFrame(denoised, index=x.index)

# usage: denoise every channel column of eeg_data with the db8 wavelet.
# NOTE(review): eeg_data stacks segments from different recordings end to end and
# denoise() treats each column as one continuous signal, so the filtering also
# acts across segment boundaries; confirm this is intended.
denoised_eeg_data = denoise(eeg_data, wavelet='db8')
denoised_eeg_data

Unnamed: 0,Fp1,F3,C3,P3,F7,T3,T5,O1,Fz,Cz,Pz,Fp2,F4,C4,P4,F8,T4,T6,O2
0,4.332842,-17.772476,-12.873289,-5.542573,-18.605249,5.188459,1.148719,-14.124043,-19.953886,6.831732,-13.232243,-17.716484,-5.783617,14.174281,8.615520,-6.371781,15.550014,-1.517132,17.200834
1,4.618274,-17.845270,-13.157931,-5.556269,-18.920111,5.013489,1.026922,-14.223031,-19.795052,6.775439,-13.245786,-17.720200,-5.785892,14.157237,8.784243,-5.557992,16.454737,-1.198766,17.196074
2,4.888547,-17.919270,-13.438831,-5.570139,-19.231485,4.841174,0.883310,-14.323848,-19.643494,6.718515,-13.257651,-17.736265,-5.788189,14.139989,8.953579,-4.766209,17.342197,-0.886742,17.193796
3,5.143430,-17.994612,-13.716125,-5.584208,-19.539206,4.671593,0.713407,-14.426377,-19.499321,6.660882,-13.267573,-17.764832,-5.790508,14.122518,9.123903,-3.997958,18.210545,-0.581886,17.194372
4,5.383331,-18.070974,-13.989039,-5.598442,-19.843225,4.504611,0.523737,-14.530319,-19.362339,6.602631,-13.275880,-17.805347,-5.792848,14.104843,9.294744,-3.255015,19.056608,-0.286395,17.197292
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97995,2.747608,-17.407801,-11.390155,-5.475345,-16.978352,6.097236,1.626115,-13.641122,-20.823524,7.107841,-13.152782,-17.812592,-5.772555,14.257503,7.761717,-10.674914,10.835187,-3.177447,17.242012
97996,3.079128,-17.481842,-11.693113,-5.488642,-17.311031,5.911173,1.550369,-13.737527,-20.642784,7.053278,-13.170074,-17.784893,-5.774726,14.241109,7.934026,-9.793250,11.795159,-2.841069,17.231705
97997,3.408704,-17.554131,-11.993169,-5.501973,-17.639576,5.727014,1.467555,-13.832481,-20.462849,6.998502,-13.187038,-17.755960,-5.776918,14.224616,8.105019,-8.916706,12.749425,-2.505538,17.221855
97998,3.730586,-17.625984,-12.290049,-5.515377,-17.964548,5.545038,1.375613,-13.927716,-20.286551,6.943407,-13.203325,-17.731638,-5.779131,14.208004,8.275190,-8.050985,13.695091,-2.172224,17.213017


### Functions for feature extraction: DWT, Statistics

In [35]:
from pywt import wavedec

def wavelet_decompose_channels(data, level, output=False):
    """
    Compute the db4 wavelet detail coefficients (D1 ... D<level>) of every channel.

    The input is first downsampled by keeping every second row. Each channel is
    then decomposed with ``wavedec``, and the detail coefficients of all levels
    are laid out side by side; the approximation coefficients are discarded.

    Args:
    - data (DataFrame): One channel per column, one sample per row. It is not modified.
    - level (int): Decomposition depth; must be at least 1.
    - output (bool): When True, print the number of coefficient arrays returned
      by ``wavedec`` (``level + 1``).

    Returns:
    - DataFrame whose columns are a ``(channel, level)`` MultiIndex sorted by channel
      and then level name, and whose rows are coefficient positions. Rows containing
      any NaN are dropped, so only as many rows as the shortest level remain.

    Raises:
    - ValueError: If ``level`` is smaller than 1.

    NOTE(review): finer levels have more coefficients than coarser ones, so after
    the final dropna() row k of different levels does not describe the same stretch
    of signal, and most fine-level coefficients are discarded. Confirm that this
    truncation is the intended feature layout.
    """
    if level < 1:
        raise ValueError("level must be a positive integer")

    # Keep every second sample (start:stop:step)
    downsampled = data.iloc[0::2]

    # rename() returns a new Index. Assigning to `.columns.name` would also rename
    # the caller's columns, because a row slice shares its column Index.
    channels = downsampled.columns.rename('channel')

    # wavedec works along the last axis, hence channels as rows; it returns
    # [A<level>, D<level>, ..., D1]
    coeffs_list = wavedec(downsampled.to_numpy().T, wavelet='db4', level=level)
    if output:
        print(len(coeffs_list))

    # Names matching coeffs_list[1:], i.e. counting down from the coarsest detail level
    detail_names = ['D' + str(num) for num in range(level, 0, -1)]

    level_frames = []
    for name, coeffs in zip(detail_names, coeffs_list[1:]):
        columns = pd.MultiIndex.from_product([channels, [name]], names=['channel', 'level'])
        # coeffs is (channels, n_coefficients); transpose to one column per channel
        level_frames.append(pd.DataFrame(coeffs.T, columns=columns))

    # Levels have different lengths; the outer join pads the shorter ones with NaN
    wavelets = pd.concat(level_frames, axis=1, sort=True)

    # Order the columns by channel, then by level name
    wavelets = wavelets.sort_index(axis=1)

    wavelets_cleaned = wavelets.dropna()

    return wavelets_cleaned

# 5-level decomposition of the denoised channels; output=True is passed through to the function.
dwt_wavelets = wavelet_decompose_channels(denoised_eeg_data, level=5, output=True)
dwt_wavelets

6


channel,C3,C3,C3,C3,C3,C4,C4,C4,C4,C4,...,T5,T5,T5,T5,T5,T6,T6,T6,T6,T6
level,D1,D2,D3,D4,D5,D1,D2,D3,D4,D5,...,D1,D2,D3,D4,D5,D1,D2,D3,D4,D5
0,-0.015101,0.242372,-0.880665,0.039720,0.076819,-0.000856,0.014575,-0.058673,0.002690,0.004782,...,-0.017564,0.123236,-0.580517,0.031778,0.050417,0.019178,-0.273309,0.924476,-0.041470,-0.086687
1,-0.025187,0.654762,-2.784487,0.299885,0.423376,-0.001426,0.038736,-0.185096,0.020594,0.027349,...,-0.020117,0.310950,-1.834036,0.212097,0.273469,0.031083,-0.745016,2.929285,-0.305989,-0.442245
2,0.038240,-0.576681,1.261823,2.410795,3.675301,0.002232,-0.034781,0.090290,0.163552,0.237959,...,0.023944,-0.399120,0.809517,1.472014,2.459178,-0.043591,0.663394,-1.232204,-2.392656,-3.634939
3,0.000817,0.197189,-0.697858,-2.673963,-2.442850,0.000012,0.011360,-0.042815,-0.166298,-0.432029,...,0.007248,0.201667,-0.347540,-2.164778,-5.644333,-0.002610,-0.240390,0.620932,2.399833,-2.770917
4,-0.000948,-0.004396,0.608152,1.397838,-1.238197,-0.000016,0.000483,0.017338,0.046849,0.088974,...,-0.006170,-0.051235,0.184384,1.159339,2.933892,0.000646,-0.012462,-0.731629,-2.541135,9.865311
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1533,0.000043,-0.026566,0.004074,-0.001284,23.796558,-0.002918,0.005614,-0.001419,2.077418,-0.005866,...,0.003065,-0.021910,0.001663,-0.084142,-10.412962,-0.002227,-0.005409,-0.000504,2.528481,-16.430883
1534,-0.000003,-0.017005,-0.002106,-0.002294,-4.111217,0.002899,-0.000827,-0.000671,3.489241,0.260254,...,-0.001671,-0.037491,0.002197,-0.115690,7.211676,0.001218,0.005715,-0.003799,4.183121,6.807277
1535,-0.000020,0.026174,-0.003540,0.005750,-1.326857,-0.011215,-0.007290,-0.000656,-2.591595,-0.237134,...,-0.005485,0.040389,-0.002792,0.139646,-4.271112,0.003875,-0.011047,0.003159,-3.129589,-0.750194
1536,0.000028,0.008068,-0.003830,0.011316,-7.682982,0.011362,-0.000383,-0.001938,-3.422895,-0.004943,...,0.002610,0.029901,-0.001312,0.293750,-6.559363,-0.001833,-0.014293,-0.001205,-3.807739,9.279816


In [30]:
def MAV(data):
    """
    Mean absolute value of each channel's coefficients, computed row by row.

    ``data`` must have two-level columns (channel, level). For every channel the
    absolute values of its level columns are averaged across the row.

    Returns a DataFrame indexed like ``data`` with one ``<channel>_DT_MAV`` column
    per channel.
    """
    channel_names = data.columns.get_level_values(0).unique()

    per_channel = {
        f"{name}_DT_MAV": data[name].abs().mean(axis=1)
        for name in channel_names
    }

    return pd.DataFrame(per_channel, index=data.index)


def MAVP(data):
    """
    Mean of the squared coefficients of each channel, computed row by row.

    ``data`` must have two-level columns (channel, level). For every channel the
    squares of its level columns are averaged across the row.

    Returns a DataFrame indexed like ``data`` with one ``<channel>_DT_MAVP`` column
    per channel.
    """
    channel_names = data.columns.get_level_values(0).unique()

    per_channel = {
        f"{name}_DT_MAVP": (data[name] ** 2).mean(axis=1)
        for name in channel_names
    }

    return pd.DataFrame(per_channel, index=data.index)



def std_val(data):
    """
    Standard deviation of each channel's coefficients, computed row by row.

    ``data`` must have two-level columns (channel, level). For every channel the
    pandas default (sample) standard deviation is taken across its level columns.

    Returns a DataFrame indexed like ``data`` with one ``<channel>_DT_STD`` column
    per channel.
    """
    channel_names = data.columns.get_level_values(0).unique()

    per_channel = {
        f"{name}_DT_STD": data[name].std(axis=1)
        for name in channel_names
    }

    return pd.DataFrame(per_channel, index=data.index)


def var_val(data):
    """
    Variance of each channel's coefficients, computed row by row.

    ``data`` must have two-level columns (channel, level). For every channel the
    pandas default (sample) variance is taken across its level columns.

    Returns a DataFrame indexed like ``data`` with one ``<channel>_DT_VAR`` column
    per channel.
    """
    channel_names = data.columns.get_level_values(0).unique()

    per_channel = {
        f"{name}_DT_VAR": data[name].var(axis=1)
        for name in channel_names
    }

    return pd.DataFrame(per_channel, index=data.index)


def ratio_channels(epoch_data):
    """
    Element-wise ratio of every column to the column immediately to its right.

    For columns c0, c1, ..., cN the result holds c0/c1, c1/c2, ..., c(N-1)/cN, each
    stored under the name ``"<left>-<right>_Ratio"``. The pairing follows column
    position only, so with (channel, level) columns a pair may span two channels.

    Returns a DataFrame indexed like ``epoch_data`` with one column per adjacent pair
    (no columns when fewer than two input columns exist).
    """
    column_labels = list(epoch_data.columns)

    # zip against the list shifted by one to walk the neighbouring pairs
    ratios = {
        f"{numerator}-{denominator}_Ratio": epoch_data[numerator] / epoch_data[denominator]
        for numerator, denominator in zip(column_labels, column_labels[1:])
    }

    return pd.DataFrame(ratios, index=epoch_data.index)


from scipy.stats import entropy

def shannon_entropy(data):
    """
    Shannon entropy (in bits) of each channel's coefficient magnitudes, row by row.

    ``data`` must have two-level columns (channel, level). For every channel and
    every row, the absolute values of the level columns are passed to
    ``scipy.stats.entropy``, which normalises them to sum to 1 before computing the
    base-2 entropy. A row whose magnitudes are all zero therefore yields NaN.

    Returns a DataFrame indexed like ``data`` with one ``<channel>_entropy`` column
    per channel.
    """
    entropy_df = pd.DataFrame(index=data.index)

    for channel in data.columns.get_level_values(0).unique():
        magnitudes = data[channel].abs().to_numpy(dtype=float)

        # One vectorised call per channel (axis=1 treats each row as a distribution)
        # instead of a Python-level apply over every row. The f-string also accepts
        # non-string channel labels, like the other feature functions.
        entropy_df[f"{channel}_entropy"] = entropy(magnitudes, base=2, axis=1)

    return entropy_df



In [37]:
def eeg_feature(data):
    """
    Append every derived statistic to the wavelet coefficients.

    The feature functions are run in a fixed order (MAV, MAVP, std_val, var_val,
    ratio_channels, shannon_entropy) on the same input. Their outputs are placed
    to the right of the original ``data`` columns, aligned on the row index.

    Returns the combined DataFrame.
    """
    feature_builders = (MAV, MAVP, std_val, var_val, ratio_channels, shannon_entropy)

    # Original coefficients first, then one block of columns per feature function
    feature_blocks = [data]
    feature_blocks.extend(build(data) for build in feature_builders)

    return pd.concat(feature_blocks, axis=1)


# Build the full feature table: wavelet coefficients plus every derived statistic.
eeg_features = eeg_feature(dwt_wavelets)
eeg_features

Unnamed: 0,"(C3, D1)","(C3, D2)","(C3, D3)","(C3, D4)","(C3, D5)","(C4, D1)","(C4, D2)","(C4, D3)","(C4, D4)","(C4, D5)",...,Fz_entropy,O1_entropy,O2_entropy,P3_entropy,P4_entropy,Pz_entropy,T3_entropy,T4_entropy,T5_entropy,T6_entropy
0,-0.015101,0.242372,-0.880665,0.039720,0.076819,-0.000856,0.014575,-0.058673,0.002690,0.004782,...,1.286874,1.292708,1.604140,1.263786,1.279123,1.256582,1.277983,1.312528,1.309182,1.336101
1,-0.025187,0.654762,-2.784487,0.299885,0.423376,-0.001426,0.038736,-0.185096,0.020594,0.027349,...,1.472392,1.444563,1.519524,1.435429,1.445017,1.496133,1.452349,1.466671,1.413379,1.475386
2,0.038240,-0.576681,1.261823,2.410795,3.675301,0.002232,-0.034781,0.090290,0.163552,0.237959,...,1.746954,1.779199,1.320384,1.772311,1.775504,1.629387,1.761552,1.779818,1.766360,1.793911
3,0.000817,0.197189,-0.697858,-2.673963,-2.442850,0.000012,0.011360,-0.042815,-0.166298,-0.432029,...,1.105465,1.130920,0.790497,1.241476,1.183186,0.950185,1.140343,1.547433,1.216704,1.572275
4,-0.000948,-0.004396,0.608152,1.397838,-1.238197,-0.000016,0.000483,0.017338,0.046849,0.088974,...,1.334094,0.971230,1.006736,1.283219,1.035051,0.951479,1.001088,1.413223,1.172927,1.011481
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1533,0.000043,-0.026566,0.004074,-0.001284,23.796558,-0.002918,0.005614,-0.001419,2.077418,-0.005866,...,0.030388,0.725611,0.845270,1.455034,0.739859,0.809132,0.457232,0.874427,0.094770,0.572260
1534,-0.000003,-0.017005,-0.002106,-0.002294,-4.111217,0.002899,-0.000827,-0.000671,3.489241,0.260254,...,0.109953,1.010452,0.894709,0.370518,1.008303,0.963120,0.194373,0.985328,0.169510,0.970042
1535,-0.000020,0.026174,-0.003540,0.005750,-1.326857,-0.011215,-0.007290,-0.000656,-2.591595,-0.237134,...,0.301669,1.117538,0.446315,0.402347,1.003022,0.106768,0.350606,0.978979,0.296118,0.754058
1536,0.000028,0.008068,-0.003830,0.011316,-7.682982,0.011362,-0.000383,-0.001938,-3.422895,-0.004943,...,0.023956,1.007477,0.998330,1.697991,0.172810,1.006373,0.046366,0.931595,0.301820,0.884450
