In [1]:
import pandas
import numpy as np
from utils import *
import mne
import matplotlib.pyplot as plt
from os import walk
from scipy import signal
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import normalize, StandardScaler
from itertools import combinations
from tqdm.notebook import tqdm
import pickle

from multiprocessing import Pool
plt.style.use('seaborn-whitegrid')

# EEG data

In [2]:
def load_data():
    path, folders, filenames = next(walk('./data'))

    filenames.remove('11-audio.csv')
    filenames.remove('11-image.csv')

    filenames.remove('36-audio.csv')
    filenames.remove('36-image.csv')

    path = './data'
    columns = {'Unnamed: 1':'Fp1',
            'Unnamed: 2':'Fp2',
            'Unnamed: 3':'F3',
            'Unnamed: 4':'F4',
            'Unnamed: 5':'F7',
            'Unnamed: 6':'F8',
            'Unnamed: 7':'P7',
            'Unnamed: 8':'P8'}

    EEG_audio, EEG_image = dict(), dict()
    from itertools import product
    categories = [1,2,3,4,5]
    blocks = [1,2]
    with tqdm(filenames) as t:
        for filename in t:
            t.set_description(f"{filename}")
            participant_id, stimuli = filename.split('-')
            stimuli = stimuli.rstrip('.csv')
            data = pandas.read_csv(f'{path}/{filename}', dtype={'Marker': str}).rename(columns=columns).drop(columns='timestamps')
            # Aviod warning on stim has negative value
            marker = np.array(data['Marker'])
            marker[marker == '-1'] = '1'
            data['Marker'] = marker

            if(stimuli == 'audio'):
                EEG_audio[int(participant_id)] = data
            elif(stimuli == 'image'):
                EEG_image[int(participant_id)] = data
            else:
                raise ValueError(f"Stimuli:{stimuli} is unexpected.")
    return EEG_audio, EEG_image


In [3]:
# clear_cache()
try:
    # Load from cache
    EEG_audio = load('EEG_audio')
    EEG_image = load('EEG_image')
    print('Load data from cache')
    if( set(EEG_audio.keys()) != set(EEG_image.keys()) ):
        extra = None
        if(len(EEG_audio.keys()) > len(EEG_image.keys())):
            extra = set(EEG_audio.keys()).difference( set(EEG_image.keys()) )
        else:
            extra = set(EEG_image.keys()).difference( set(EEG_audio.keys()) )
        raise ValueError(f"In equal keys. audio has {len(EEG_audio.keys())} and image has {len(EEG_image.keys())}. The extra key is {extra}")

except FileNotFoundError as e:
    EEG_audio, EEG_image = load_data()
    # Save to cache
    save(EEG_audio, 'EEG_audio')
    save(EEG_image, 'EEG_image')

Load data from cache


# Preprocess data
## STFT

In [35]:
#### Test section ####
raw = dataframe_to_raw(EEG_image[33], sfreq=250)

# Preprocess
raw.notch_filter([50,100],filter_length='auto', phase='zero', verbose=False) # Line power
raw.filter(1., None, fir_design='firwin', verbose=False) # Slow drift
events = mne.find_events(raw, stim_channel='Marker', initial_event=True, verbose=False, uint_cast=False)
events = np.delete(events,np.argwhere(events[:,2] == 1), axis=0) # break
# Check data
if(events.shape[0] != 50):
    raise ValueError(f"Event missing: {events[:,2]}. len(events.shape[0])={events.shape[0]}")
epochs = mne.Epochs(raw, events, tmin=0.3, tmax=5.8, baseline=(0.3,0.3), verbose=False)
if(epochs.get_data().shape[0] != 50):
    raise ValueError(f"There might be a bad data. epochs.get_data().shape = {epochs.get_data().shape}")
for evoked in tqdm(epochs.iter_evoked(), leave=False):
    break

0it [00:00, ?it/s]

In [48]:
sft = abs(mne.time_frequency.stft(evoked.data[:8], wsize=256, verbose=False))

In [49]:
filter_list = [[0,4],[4,8],[8,13],[13,30],[30,125]]
sft = sft.mean(axis=2)
sft.shape

(8, 129)

In [50]:
sft[:,0:4].mean(axis=1).shape
features = []
for f in filter_list:
    features.append(sft[:,f[0]:f[1]+1].mean(axis=1))

In [54]:
np.array(features).reshape(-1).shape

(40,)

In [56]:
def build_data(p_num, EEG,ids):
    X = []
    y = []
    # Delta, Theta, Alpha, Beta, Gamma
    filter_list = [[0,4],[4,8],[8,13],[13,30],[30,125]]
    with tqdm(ids) as t:
        for index, id in enumerate(t):
            t.set_description(f"{id}")
            print(f"p_no={p_num}|index={index}|id={id}")
            raw = dataframe_to_raw(EEG[id], sfreq=250)
            
            # Preprocess
            raw.notch_filter([50,100],filter_length='auto', phase='zero', verbose=False) # Line power
            raw.filter(1., None, fir_design='firwin', verbose=False) # Slow drift
            events = mne.find_events(raw, stim_channel='Marker', initial_event=True, verbose=False, uint_cast=False)
            events = np.delete(events,np.argwhere(events[:,2] == 1), axis=0) # break
            # Check data
            if(events.shape[0] != 50):
                raise ValueError(f"Event missing: {events[:,2]}. len(events.shape[0])={events.shape[0]}")
            epochs = mne.Epochs(raw, events, tmin=0.3, tmax=5.8, baseline=(0.3,0.3), verbose=False)
            if(epochs.get_data().shape[0] != 50):
                raise ValueError(f"There might be a bad data. epochs.get_data().shape = {epochs.get_data().shape}")

            # Extract features
            for evoked in tqdm(epochs.iter_evoked(), leave=False):
                event = int(evoked.comment[0])
                sft = abs(mne.time_frequency.stft(evoked.data[:8], wsize=256, verbose=False).mean(axis=2))
                features = []
                for f in filter_list:
                    features.append(sft[:,f[0]:f[1]+1].mean(axis=1))
                X.append(np.array(features).reshape(-1))
                y.append(event)
    print(f"{p_num} done| {ids}")
    return np.array(X),np.array(y)

In [57]:
try:
    # [33,2,10,12,16]
    t_out = 100
    pool = Pool()
    ids = np.array(list(EEG_image.keys()))
    p1 = pool.apply_async(build_data, [1,EEG_image,ids[0::6]])
    p2 = pool.apply_async(build_data, [2,EEG_image,ids[1::6]])
    p3 = pool.apply_async(build_data, [3,EEG_image,ids[2::6]])
    p4 = pool.apply_async(build_data, [4,EEG_image,ids[3::6]])
    p5 = pool.apply_async(build_data, [5,EEG_image,ids[4::6]])
    p6 = pool.apply_async(build_data, [6,EEG_image,ids[5::6]])
    ans1 = p1.get(timeout=t_out)
    ans2 = p2.get(timeout=t_out)
    ans3 = p3.get(timeout=t_out)
    ans4 = p4.get(timeout=t_out)
    ans5 = p5.get(timeout=t_out)
    ans6 = p6.get(timeout=t_out)
    X = np.concatenate([ans1[0] , ans2[0], ans3[0], ans4[0] ,ans5[0], ans6[0]])
    y = np.concatenate([ans1[1] , ans2[1], ans3[1], ans4[1] ,ans5[1], ans6[1]])
    print(X.shape, y.shape)
finally:
    print("========= close ========")
    pool.close() 
    pool.terminate()

p_no=1|index=0|id=10
p_no=2|index=0|id=12
p_no=3|index=0|id=13
p_no=4|index=0|id=14
p_no=5|index=0|id=15
p_no=6|index=0|id=16


0it [00:00, ?it/s]

p_no=1|index=1|id=17


0it [00:00, ?it/s]

p_no=3|index=1|id=2


0it [00:00, ?it/s]

p_no=4|index=1|id=20


0it [00:00, ?it/s]

p_no=2|index=1|id=18


0it [00:00, ?it/s]

p_no=6|index=1|id=22


0it [00:00, ?it/s]

p_no=5|index=1|id=21


0it [00:00, ?it/s]

p_no=3|index=2|id=26


0it [00:00, ?it/s]

0it [00:00, ?it/s]

p_no=1|index=2|id=23
p_no=2|index=2|id=25


0it [00:00, ?it/s]

p_no=4|index=2|id=27


0it [00:00, ?it/s]

p_no=6|index=2|id=29


0it [00:00, ?it/s]

p_no=5|index=2|id=28


0it [00:00, ?it/s]

p_no=3|index=3|id=31


0it [00:00, ?it/s]

p_no=2|index=3|id=30


0it [00:00, ?it/s]

p_no=1|index=3|id=3


0it [00:00, ?it/s]

p_no=4|index=3|id=32


0it [00:00, ?it/s]

p_no=5|index=3|id=33


0it [00:00, ?it/s]

p_no=6|index=3|id=34


0it [00:00, ?it/s]

p_no=3|index=4|id=6


0it [00:00, ?it/s]

p_no=1|index=4|id=35


0it [00:00, ?it/s]

p_no=4|index=4|id=7


0it [00:00, ?it/s]

p_no=5|index=4|id=8


0it [00:00, ?it/s]

p_no=6|index=4|id=9


0it [00:00, ?it/s]

p_no=2|index=4|id=5


0it [00:00, ?it/s]

1 done| [10 17 23  3 35]


0it [00:00, ?it/s]

0it [00:00, ?it/s]

5 done| [15 21 28 33  8]
3 done| [13  2 26 31  6]


0it [00:00, ?it/s]

6 done| [16 22 29 34  9]


0it [00:00, ?it/s]

4 done| [14 20 27 32  7]


0it [00:00, ?it/s]

2 done| [12 18 25 30  5]
(1500, 40) (1500,)


In [59]:
def get_acc(X,y):
    X_norm = normalize(X, axis=0)
    gnb = GaussianNB()
    gnb.fit(X_norm, y)  
    pred = gnb.predict(X_norm)
    acc = sum(y == pred)/len(y)
    return acc
get_acc(X,y)

0.218

## PSD

In [95]:
def build_data(p_num, EEG,ids):
    X = None
    y = []
    # Delta, Theta, Alpha, Beta, Gamma
    filter_list = [[0,4],[4,8],[8,13],[13,30],[30,125]]
    with tqdm(ids) as t:
        for index, id in enumerate(t):
            t.set_description(f"{id}")
            print(f"p_no={p_num}|index={index}|id={id}")
            raw = dataframe_to_raw(EEG[id], sfreq=250)
            
            # Preprocess
            raw.notch_filter([50,100],filter_length='auto', phase='zero', verbose=False) # Line power
            raw.filter(1., None, fir_design='firwin', verbose=False) # Slow drift
            events = mne.find_events(raw, stim_channel='Marker', initial_event=True, verbose=False, uint_cast=False)
            events = np.delete(events,np.argwhere(events[:,2] == 1), axis=0) # break
            # Check data
            if(events.shape[0] != 50):
                raise ValueError(f"Event missing: {events[:,2]}. len(events.shape[0])={events.shape[0]}")
            epochs = mne.Epochs(raw, events, tmin=0.3, tmax=5.8, baseline=(0.3,0.3), verbose=False)
            if(epochs.get_data().shape[0] != 50):
                raise ValueError(f"There might be a bad data. epochs.get_data().shape = {epochs.get_data().shape}")

            # Extract features
            powers,freq = mne.time_frequency.psd_welch(epochs,n_fft = 250, verbose=False)
            features = np.mean(powers, axis=1)
            # print(features.shape) #(50,8,65)
            for e in range(features.shape[0]):
                row = np.expand_dims(features[e].reshape(-1), axis=0)
                row = 10 * np.log10(row)
                if(type(X) == type(None)): X = row
                else: X = np.concatenate( [X, row ], axis=0 )

                y.append(events[e][0])

    print(f"{p_num} done| {ids}")
    return np.array(X),np.array(y)

In [96]:
try:
    # [33,2,10,12,16]
    t_out = 100
    pool = Pool()
    ids = np.array(list(EEG_image.keys()))
    p1 = pool.apply_async(build_data, [1,EEG_image,ids[0::6]])
    p2 = pool.apply_async(build_data, [2,EEG_image,ids[1::6]])
    p3 = pool.apply_async(build_data, [3,EEG_image,ids[2::6]])
    p4 = pool.apply_async(build_data, [4,EEG_image,ids[3::6]])
    p5 = pool.apply_async(build_data, [5,EEG_image,ids[4::6]])
    p6 = pool.apply_async(build_data, [6,EEG_image,ids[5::6]])
    ans1 = p1.get(timeout=t_out)
    ans2 = p2.get(timeout=t_out)
    ans3 = p3.get(timeout=t_out)
    ans4 = p4.get(timeout=t_out)
    ans5 = p5.get(timeout=t_out)
    ans6 = p6.get(timeout=t_out)
    X = np.concatenate([ans1[0] , ans2[0], ans3[0], ans4[0] ,ans5[0], ans6[0]])
    y = np.concatenate([ans1[1] , ans2[1], ans3[1], ans4[1] ,ans5[1], ans6[1]])
    print(X.shape, y.shape)
finally:
    print("========= close ========")
    pool.close() 
    pool.terminate()

p_no=1|index=0|id=10
p_no=2|index=0|id=12
p_no=3|index=0|id=13
p_no=4|index=0|id=14
p_no=5|index=0|id=15
p_no=6|index=0|id=16
p_no=1|index=1|id=17
p_no=2|index=1|id=18
p_no=3|index=1|id=2
p_no=4|index=1|id=20
p_no=6|index=1|id=22
p_no=5|index=1|id=21
p_no=4|index=2|id=27
p_no=2|index=2|id=25
p_no=1|index=2|id=23
p_no=5|index=2|id=28
p_no=3|index=2|id=26
p_no=6|index=2|id=29
p_no=5|index=3|id=33
p_no=6|index=3|id=34
p_no=4|index=3|id=32
p_no=1|index=3|id=3
p_no=2|index=3|id=30
p_no=3|index=3|id=31
p_no=5|index=4|id=8
p_no=4|index=4|id=7
p_no=6|index=4|id=9
p_no=3|index=4|id=6
p_no=1|index=4|id=35
p_no=2|index=4|id=5
5 done| [15 21 28 33  8]
4 done| [14 20 27 32  7]
3 done| [13  2 26 31  6]
1 done| [10 17 23  3 35]
6 done| [16 22 29 34  9]
2 done| [12 18 25 30  5]
(1500, 126) (1500,)


In [98]:
def get_acc(X,y):
    X_norm = normalize(X,axis=1)
    # X_norm = X
    gnb = GaussianNB()
    gnb.fit(X_norm, y)  
    pred = gnb.predict(X_norm)
    acc = sum(y == pred)/len(y)
    return acc
get_acc(X,y)

1.0

In [100]:
try:
    # [33,2,10,12,16]
    t_out = 100
    pool = Pool()
    ids = np.array(list(EEG_image.keys()))
    p1 = pool.apply_async(build_data, [1,EEG_audio,ids[0::6]])
    p2 = pool.apply_async(build_data, [2,EEG_audio,ids[1::6]])
    p3 = pool.apply_async(build_data, [3,EEG_audio,ids[2::6]])
    p4 = pool.apply_async(build_data, [4,EEG_audio,ids[3::6]])
    p5 = pool.apply_async(build_data, [5,EEG_audio,ids[4::6]])
    p6 = pool.apply_async(build_data, [6,EEG_audio,ids[5::6]])
    ans1 = p1.get(timeout=t_out)
    ans2 = p2.get(timeout=t_out)
    ans3 = p3.get(timeout=t_out)
    ans4 = p4.get(timeout=t_out)
    ans5 = p5.get(timeout=t_out)
    ans6 = p6.get(timeout=t_out)
    X = np.concatenate([ans1[0] , ans2[0], ans3[0], ans4[0] ,ans5[0], ans6[0]])
    y = np.concatenate([ans1[1] , ans2[1], ans3[1], ans4[1] ,ans5[1], ans6[1]])
    print(X.shape, y.shape)
finally:
    print("========= close ========")
    pool.close() 
    pool.terminate()

p_no=1|index=0|id=10
p_no=2|index=0|id=12
p_no=3|index=0|id=13
p_no=4|index=0|id=14
p_no=5|index=0|id=15
p_no=6|index=0|id=16
p_no=2|index=1|id=18
p_no=5|index=1|id=21
p_no=1|index=1|id=17
p_no=4|index=1|id=20
p_no=3|index=1|id=2
p_no=6|index=1|id=22
p_no=2|index=2|id=25
p_no=3|index=2|id=26
p_no=1|index=2|id=23
p_no=5|index=2|id=28
p_no=4|index=2|id=27
p_no=6|index=2|id=29
p_no=3|index=3|id=31
p_no=1|index=3|id=3
p_no=4|index=3|id=32
p_no=6|index=3|id=34
p_no=2|index=3|id=30
p_no=5|index=3|id=33
p_no=3|index=4|id=6
p_no=1|index=4|id=35
p_no=4|index=4|id=7
p_no=2|index=4|id=5
p_no=6|index=4|id=9
p_no=5|index=4|id=8
3 done| [13  2 26 31  6]
4 done| [14 20 27 32  7]
1 done| [10 17 23  3 35]
6 done| [16 22 29 34  9]
5 done| [15 21 28 33  8]
2 done| [12 18 25 30  5]
(1500, 126) (1500,)


In [103]:
def get_acc(X,y):
    X_norm = normalize(X,axis=1)
    # X_norm = X
    gnb = GaussianNB()
    gnb.fit(X_norm, y)  
    pred = gnb.predict(X_norm)
    acc = sum(y == pred)/len(y)
    return acc
get_acc(X,y)

1.0