In [1]:
import os, gc

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
import torchvision

In [2]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

device(type='cuda')

## Load in the data

In [3]:
# Root folder of all input files. The trailing slash matters: the cells that
# use it build paths by plain string concatenation.
data_folder = "./data/"

In [4]:
# Read the label table; `targets` holds the names of its last six columns
train_data = pd.read_csv(f"{data_folder}train.csv")
targets = train_data.columns[-6:]

In [5]:
# Show the column names held in `targets`
targets

Index(['seizure_vote', 'lpd_vote', 'gpd_vote', 'lrda_vote', 'grda_vote',
       'other_vote'],
      dtype='object')

In [6]:
# Report the table's (rows, columns) and preview its first rows
print(train_data.shape)
train_data.head()

(106800, 15)


Unnamed: 0,eeg_id,eeg_sub_id,eeg_label_offset_seconds,spectrogram_id,spectrogram_sub_id,spectrogram_label_offset_seconds,label_id,patient_id,expert_consensus,seizure_vote,lpd_vote,gpd_vote,lrda_vote,grda_vote,other_vote
0,1628180742,0,0.0,353733,0,0.0,127492639,42516,Seizure,3,0,0,0,0,0
1,1628180742,1,6.0,353733,1,6.0,3887563113,42516,Seizure,3,0,0,0,0,0
2,1628180742,2,8.0,353733,2,8.0,1142670488,42516,Seizure,3,0,0,0,0,0
3,1628180742,3,18.0,353733,3,18.0,2718991173,42516,Seizure,3,0,0,0,0,0
4,1628180742,4,24.0,353733,4,24.0,3080632009,42516,Seizure,3,0,0,0,0,0


### Create non-overlapping training data

In [7]:
# Collapse the label table to one row per EEG recording
by_eeg = train_data.groupby("eeg_id")

# First spectrogram id, the smallest and largest spectrogram label offset,
# and the first patient id seen for each recording
train = by_eeg.agg(
    spec_id=("spectrogram_id", "first"),
    min=("spectrogram_label_offset_seconds", "min"),
    max=("spectrogram_label_offset_seconds", "max"),
    patient_id=("patient_id", "first"),
)

# Sum the vote counts over all rows of a recording, then rescale every row
# so its six values add up to 1 (values in [0, 1])
vote_sums = by_eeg[targets].agg("sum")
y_data = vote_sums.values
y_data = y_data / y_data.sum(axis=1, keepdims=True)
train = train.join(pd.DataFrame(y_data, index=vote_sums.index, columns=targets))

# Keep the first expert consensus label of each recording as a string target
train["target"] = by_eeg["expert_consensus"].agg("first")

# Turn the eeg_id index back into an ordinary column
train = train.reset_index()
print(f"Train shape: {train.shape}")

train.head()

Train shape: (17089, 12)


Unnamed: 0,eeg_id,spec_id,min,max,patient_id,seizure_vote,lpd_vote,gpd_vote,lrda_vote,grda_vote,other_vote,target
0,568657,789577333,0.0,16.0,20654,0.0,0.0,0.25,0.0,0.166667,0.583333,Other
1,582999,1552638400,0.0,38.0,20230,0.0,0.857143,0.0,0.071429,0.0,0.071429,LPD
2,642382,14960202,1008.0,1032.0,5955,0.0,0.0,0.0,0.0,0.0,1.0,Other
3,751790,618728447,908.0,908.0,38549,0.0,0.0,1.0,0.0,0.0,0.0,GPD
4,778705,52296320,0.0,0.0,40955,0.0,0.0,0.0,0.0,0.0,1.0,Other


### Read train spectrograms

In [9]:
%%time
# Set to True to rebuild the cached dictionary from the individual parquet
# files; with False the previously saved dictionary is loaded instead.
READ_SPEC_FILES = False

# Folder with one parquet per spectrogram, and the cache file holding all of them
path = data_folder + "train_spectrograms/"
spec_cache = "./data/brain-spectrograms/specs.npy"

# Only keep parquet files so stray entries (e.g. hidden OS files) cannot break
# the integer parsing of the file name below
files = [f for f in os.listdir(path) if f.endswith(".parquet")]
print(f"There are {len(files)} spectrogram parquets")

if READ_SPEC_FILES:
    spectrograms = {}
    # Iterate over the list itself (not enumerate) so tqdm knows the total
    # and can show percentage and ETA; no manual counter is needed
    for f in tqdm(files):
        tmp = pd.read_parquet(f"{path}{f}")
        # The file stem is used as the integer dictionary key
        name = int(f.split(".")[0])
        # Drop the first column (presumably the time axis -- TODO confirm)
        spectrograms[name] = tmp.iloc[:, 1:].values
    # Make sure the cache folder exists before writing into it
    os.makedirs(os.path.dirname(spec_cache), exist_ok=True)
    with open(spec_cache, "wb") as file:
        # A dict is stored as a pickled 0-d object array
        np.save(file, spectrograms, allow_pickle=True)
else:
    # NOTE(security): allow_pickle=True executes pickle on load; only use it on
    # a cache file produced by this notebook, never on an untrusted file.
    # .item() unwraps the 0-d object array back into the dictionary.
    spectrograms = np.load(spec_cache, allow_pickle=True).item()

There are 11138 spectrogram parquets
CPU times: user 132 ms, sys: 8.74 s, total: 8.88 s
Wall time: 56.7 s


### Create spectrograms from the raw EEG signals

In [33]:
import librosa

# Electrode chains used to build the spectrogram channels: each inner list is
# an ordered sequence of EEG column names, and spectrogram_from_eeg subtracts
# every pair of consecutive entries within a chain.
FEATS = [
    chain.split()
    for chain in (
        "Fp1 F7 T3 T5 O1",
        "Fp1 F3 C3 P3 O1",
        "Fp2 F8 T4 T6 O2",
        "Fp2 F4 C4 P4 O2",
    )
]

# Wavelet name handed to the denoising step; None (falsy) skips denoising
USE_WAVELET = None

def spectrogram_from_eeg(parquet_path):
    """Build a multi-channel log-mel spectrogram image from one EEG parquet file.

    The middle 10,000 rows of the recording are used. For every electrode
    chain in the module-level ``FEATS``, the signals of consecutive electrodes
    are subtracted, each difference signal is converted to a log-mel
    spectrogram, and the spectrograms of one chain are averaged into one
    channel of the returned image.

    Parameters
    ----------
    parquet_path : str or path-like
        Parquet file readable by ``pd.read_parquet`` that contains a column
        for every electrode name listed in ``FEATS``.

    Returns
    -------
    numpy.ndarray
        ``float32`` array of shape ``(128, 256, len(FEATS))``; the last axis
        indexes the electrode chains in ``FEATS`` order.

    Raises
    ------
    ValueError
        If the recording has fewer than 10,000 rows, since no full middle
        window can be cut from it.
    """
    n_samples = 10_000           # 50 s at the 200 Hz rate passed to librosa below
    n_mels, n_frames = 128, 256  # height and width of every image channel

    # Load the recording and keep its middle window
    eeg = pd.read_parquet(parquet_path)
    if len(eeg) < n_samples:
        # A negative slice start would silently select the wrong rows and
        # fail later with an unrelated shape error; fail early and clearly.
        raise ValueError(
            f"{parquet_path} has {len(eeg)} rows; at least {n_samples} are required"
        )
    middle = (len(eeg) - n_samples) // 2
    eeg = eeg.iloc[middle:middle + n_samples]

    # One image channel per electrode chain
    img = np.zeros((n_mels, n_frames, len(FEATS)), dtype='float32')

    for k, cols in enumerate(FEATS):
        # Consecutive electrode pairs within the chain
        pairs = list(zip(cols[:-1], cols[1:]))

        for first, second in pairs:
            # Difference of the two electrode signals
            x = eeg[first].values - eeg[second].values

            # Fill NaNs with the mean of the valid samples; a signal that is
            # entirely NaN becomes all zeros. The mean is only computed when
            # valid samples exist, which avoids numpy's empty-slice warning.
            if np.isnan(x).mean() < 1:
                x = np.nan_to_num(x, nan=np.nanmean(x))
            else:
                x[:] = 0

            # Optional denoising.
            # NOTE(review): `denoise` is not defined in the visible part of this
            # notebook; it must be defined before USE_WAVELET is enabled.
            if USE_WAVELET:
                x = denoise(x, wavelet=USE_WAVELET)

            # Mel spectrogram with a hop chosen to give roughly n_frames columns
            mel_spec = librosa.feature.melspectrogram(
                y=x, sr=200, hop_length=len(x) // n_frames,
                n_fft=1024, n_mels=n_mels, fmin=0, fmax=20, win_length=128)

            # Log transform, cropping the width down to a multiple of 32
            width = (mel_spec.shape[1] // 32) * 32
            mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max).astype(np.float32)[:, :width]

            # Shift and scale the dB values (intended range -1 to 1)
            mel_spec_db = (mel_spec_db + 40) / 40
            img[:, :, k] += mel_spec_db

        # Average the difference spectrograms of this chain
        img[:, :, k] /= len(pairs)

    return img


In [None]:
%%time
# Folder holding one raw EEG parquet per eeg_id
PATH = './data/train_eegs/'

# Output folder for the per-recording spectrogram arrays; exist_ok makes the
# call safe whether or not the folder is already there
directory_path = 'data/EEG_Spectrograms/'
os.makedirs(directory_path, exist_ok=True)

EEG_IDS = train.eeg_id.unique()
all_eegs = {}

# Iterate over the id array directly (not enumerate) so tqdm can read its
# length and display the total and an ETA for this long-running loop
for eeg_id in tqdm(EEG_IDS):

    # Create the spectrogram image from the EEG parquet
    img = spectrogram_from_eeg(f'{PATH}{eeg_id}.parquet')

    # Save one array per recording (np.save appends ".npy") and keep it in memory
    np.save(f'{directory_path}{eeg_id}', img)
    all_eegs[eeg_id] = img

# Save the whole dictionary in the working directory as "eeg_specs.npy";
# it is stored as a pickled object array, so reading it back needs
# np.load(..., allow_pickle=True).item()
np.save('eeg_specs', all_eegs)

8937it [20:26,  4.96it/s]