In [4]:
import pandas
import numpy as np
from utils import *
import mne
import matplotlib.pyplot as plt
from os import walk
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import normalize, StandardScaler
from itertools import combinations
from tqdm.notebook import tqdm
import pickle

from multiprocessing import Pool
plt.style.use('seaborn-whitegrid')

# EEG data

In [37]:
def load_data():
    path, folders, filenames = next(walk('./data'))

    filenames.remove('11-audio.csv')
    filenames.remove('11-image.csv')

    filenames.remove('36-audio.csv')
    filenames.remove('36-image.csv')

    path = './data'
    columns = {'Unnamed: 1':'Fp1',
            'Unnamed: 2':'Fp2',
            'Unnamed: 3':'F3',
            'Unnamed: 4':'F4',
            'Unnamed: 5':'F7',
            'Unnamed: 6':'F8',
            'Unnamed: 7':'P7',
            'Unnamed: 8':'P8'}

    EEG_audio, EEG_image = dict(), dict()
    from itertools import product
    categories = [1,2,3,4,5]
    blocks = [1,2]
    with tqdm(filenames) as t:
        for filename in t:
            t.set_description(f"{filename}")
            participant_id, stimuli = filename.split('-')
            stimuli = stimuli.rstrip('.csv')
            data = pandas.read_csv(f'{path}/{filename}', dtype={'Marker': str}).rename(columns=columns).drop(columns='timestamps')
            # Aviod warning on stim has negative value
            marker = np.array(data['Marker'])
            marker[marker == '-1'] = '1'
            data['Marker'] = marker

            if(stimuli == 'audio'):
                EEG_audio[int(participant_id)] = data
            elif(stimuli == 'image'):
                EEG_image[int(participant_id)] = data
            else:
                raise ValueError(f"Stimuli:{stimuli} is unexpected.")
    return EEG_audio, EEG_image


In [38]:
# clear_cache()
try:
    # Load from cache
    EEG_audio = load('EEG_audio')
    EEG_image = load('EEG_image')
    print('Load data from cache')
    if( set(EEG_audio.keys()) != set(EEG_image.keys()) ):
        extra = None
        if(len(EEG_audio.keys()) > len(EEG_image.keys())):
            extra = set(EEG_audio.keys()).difference( set(EEG_image.keys()) )
        else:
            extra = set(EEG_image.keys()).difference( set(EEG_audio.keys()) )
        raise ValueError(f"In equal keys. audio has {len(EEG_audio.keys())} and image has {len(EEG_image.keys())}. The extra key is {extra}")

except FileNotFoundError as e:
    EEG_audio, EEG_image = load_data()
    # Save to cache
    save(EEG_audio, 'EEG_audio')
    save(EEG_image, 'EEG_image')

  0%|          | 0/60 [00:00<?, ?it/s]

# Preprocess data

In [183]:
def build_data(p_num, EEG,ids):
    X = []
    y = []
    # Delta, Theta, Alpha, Beta, Gamma
    filter_list = [[0,4],[4,8],[8,12],[13,30],[30,125]]
    with tqdm(ids) as t:
        for id in t:
            t.set_description(f"{id}")
            raw = dataframe_to_raw(EEG[id], sfreq=250)
            
            # Preprocess
            raw.notch_filter([50,100],filter_length='auto', phase='zero', verbose=False) # Line power
            raw.filter(1., None, fir_design='firwin', verbose=False) # Slow drift
            events = mne.find_events(raw, stim_channel='Marker', initial_event=True, verbose=False, uint_cast=False)
            events = np.delete(events,np.argwhere(events[:,2] == 1), axis=0) # break
            # Check data
            if(events.shape[0] != 50):
                raise ValueError(f"Event missing: {events[:,2]}. len(events.shape[0])={events.shape[0]}")
            epochs = mne.Epochs(raw, events, tmin=0.3, tmax=5.8, baseline=(0.3,0.3), verbose=False)
            if(epochs.get_data().shape[0] != 50):
                raise ValueError(f"There might be a bad data. epochs.get_data().shape = {epochs.get_data().shape}")

            # Extract features
            for evoked in tqdm(epochs.iter_evoked(), leave=False):
                event = int(evoked.comment[0])
                sft = abs(mne.time_frequency.stft(epoch.data[:8], wsize=256, verbose=False).mean(axis=2).mean(axis=0))
                features = []
                for f in filter_list:
                    features.append(sft[f[0]:f[1]+1].mean())
                X.append(features)
                y.append(event)
    print(f"{p_num} done| {ids}")
    return np.array(X),np.array(y)

In [192]:
# [33,2,10,12,16]
pool = Pool()
ids = np.array(list(EEG_image.keys()))
p1 = pool.apply_async(build_data, [1,EEG_image,ids[0::6]])
p2 = pool.apply_async(build_data, [2,EEG_image,ids[1::6]])
p3 = pool.apply_async(build_data, [3,EEG_image,ids[2::6]])
p4 = pool.apply_async(build_data, [4,EEG_image,ids[3::6]])
p5 = pool.apply_async(build_data, [5,EEG_image,ids[4::6]])
p6 = pool.apply_async(build_data, [6,EEG_image,ids[5::6]])
ans1 = p1.get(timeout=(40))
ans2 = p2.get(timeout=(40))
ans3 = p3.get(timeout=(40))
ans4 = p4.get(timeout=(40))
ans5 = p5.get(timeout=(40))
ans6 = p6.get(timeout=(40))
X = np.concatenate([ans1[0] , ans2[0], ans3[0], ans4[0] ,ans5[0], ans6[0]])
y = np.concatenate([ans1[1] , ans2[1], ans3[1], ans4[1] ,ans5[1], ans6[1]])
print(X.shape, y.shape)

1 done| [10 17 23  3 35]
6 done| [16 22 29 34  9]
4 done| [14 20 27 32  7]
3 done| [13  2 26 31  6]
5 done| [15 21 28 33  8]
2 done| [12 18 25 30  5]


In [133]:
def get_acc(X,y):
    X_norm = normalize(X, axis=0)
    gnb = GaussianNB()
    gnb.fit(X_norm, y)  
    pred = gnb.predict(X_norm)
    acc = sum(y == pred)/len(y)
    return acc

In [137]:
# X_norm = normalize(X, axis=0)
gnb = GaussianNB()
gnb.fit(X, y)  
pred = gnb.predict(X)
acc = sum(y == pred)/len(y)
acc

0.2

In [138]:
print(y)

[5 5 5 ... 3 3 3]
