In this notebook, we will see how to preprocess the data downloaded from OpenNeuro.

The preprocessing steps we used:
- `filter` to band-pass filter the signals within a desired frequency range (in Hz)
- `resample` to resample the EEG signal from the acquisition frequency to a desired frequency
- `ICA` to remove `ecg, eog` related artifacts
- `fixed length epochs` to break the continuous signal into a number of fixed-length segments

The following approaches were used to preprocess the data:

Approach 1

- filter between 1 and 40 Hz
- resample from 500 Hz to 100 Hz
- remove artifacts based on ICA
- epoch into 50 s segments

Approach 2

- filter between 1 and 40 Hz
- resample from 500 Hz to 100 Hz
- remove artifacts based on ICA
- epoch into 50 s segments and average

Approach 3

- filter between 1 and 20 Hz
- resample from 500 Hz to 100 Hz
- remove artifacts based on ICA on epochs
- epoch into 50 s segments

Approach 4

- filter between 1 and 20 Hz
- resample from 500 Hz to 100 Hz
- remove artifacts based on ICA on epochs
- epoch into 50 s segments and average


In [None]:
!pip install fastcore mne[data] mne-bids PyQt5 -Uqq

In [None]:
# Colab-only: mount Google Drive at /content/drive.
from google.colab import drive
drive.mount('/content/drive') 

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Switch to the project folder on Drive; the relative paths used below
# (hackdataset/, processeddata/) are resolved from this directory.
%cd /content/drive/MyDrive/colab_notebooks/algovera/lynxhack

/content/drive/MyDrive/colab_notebooks/algovera/lynxhack


In [None]:
import os
import matplotlib
from pathlib import Path
import mne
import mne_bids
import numpy as np
from glob import glob
from fastcore.parallel import parallel
import pandas as pd
import pickle

In [None]:
from sklearn.base import TransformerMixin,BaseEstimator
from sklearn.preprocessing import StandardScaler

In [None]:
def prepare_approach12(fn, out_dir="processeddata/individuals/afterica"):
    '''
    Preprocess one BrainVision recording and save the results for
    approach 1 and approach 2.

    Steps, in execution order:
      1. resample to 100 Hz
      2. filter between 1 and 40 Hz
      3. fit a 20-component ICA (random_state=0) and exclude the components
         flagged by find_bads_ecg / find_bads_eog (both called with channel
         'Fp1' and threshold=2)
      4. cut the cleaned signal into non-overlapping 50 s epochs

    Files written to `out_dir`:
      {label}_{sub}_{session}_approach1.npy - all epochs
      {label}_{sub}_{session}_approach2.npy - the average over all epochs

    Parameters
    ----------
    fn : str or pathlib.Path
        Path to the .vhdr header file. The file name must have at least three
        '_'-separated fields, read as <sub>_<session>_<label>_...
    out_dir : str
        Output directory; created if it does not exist yet.

    Returns
    -------
    None

    Raises
    ------
    IndexError
        If the file name has fewer than three '_'-separated fields.
    '''
    # np.save does not create missing directories, so make sure it exists.
    os.makedirs(out_dir, exist_ok=True)

    # Path(...).name is separator-agnostic, unlike splitting on '/'.
    name_parts = Path(fn).name.split('_')
    sub, session, label = name_parts[0], name_parts[1], name_parts[2]

    raw = mne.io.read_raw_brainvision(fn, preload=True)
    raw = raw.resample(100).filter(l_freq=1, h_freq=40)

    ica = mne.preprocessing.ICA(n_components=20,
                                random_state=0)
    ica.fit(raw)

    # Only the component indices are needed; the scores are discarded.
    bad_idx_ecg, _ = ica.find_bads_ecg(raw, 'Fp1', threshold=2)
    bad_idx_eog, _ = ica.find_bads_eog(raw, 'Fp1', threshold=2)
    ica.exclude = bad_idx_ecg + bad_idx_eog

    # With no explicit `exclude`, ICA.apply removes the components in ica.exclude.
    raw_after = ica.apply(raw)

    epochs_after = mne.make_fixed_length_epochs(raw_after,
                                                duration=50,
                                                overlap=0)

    fn1 = f"{out_dir}/{label}_{sub}_{session}_approach1.npy"
    fn2 = f"{out_dir}/{label}_{sub}_{session}_approach2.npy"

    # NOTE(review): float16 keeps the files small, but if the data is in volts
    # (microvolt-scale values) it may lose a lot of precision - confirm.
    np.save(fn1, epochs_after.get_data().astype(np.float16))
    np.save(fn2, epochs_after.average().get_data().astype(np.float16))

In [None]:
def prepare_approach34(fn, out_dir="processeddata/individuals/afterica"):
    '''
    Preprocess one BrainVision recording and save the results for
    approach 3 and approach 4.

    Steps, in execution order:
      1. resample to 100 Hz
      2. filter between 1 and 20 Hz
      3. fit a 20-component ICA (random_state=0) and exclude the components
         flagged by find_bads_ecg / find_bads_eog (both called with channel
         'Fp1' and threshold=2)
      4. cut the cleaned signal into non-overlapping 50 s epochs

    NOTE(review): the notebook introduction describes approaches 3/4 as
    "ICA on epochs", but this function fits ICA on the continuous recording
    before epoching - confirm which one is intended.

    Files written to `out_dir`:
      {label}_{sub}_{session}_approach3.npy - all epochs
      {label}_{sub}_{session}_approach4.npy - the average over all epochs

    Parameters
    ----------
    fn : str or pathlib.Path
        Path to the .vhdr header file. The file name must have at least three
        '_'-separated fields, read as <sub>_<session>_<label>_...
    out_dir : str
        Output directory; created if it does not exist yet.

    Returns
    -------
    None

    Raises
    ------
    IndexError
        If the file name has fewer than three '_'-separated fields.
    '''
    # np.save does not create missing directories, so make sure it exists.
    os.makedirs(out_dir, exist_ok=True)

    # Path(...).name is separator-agnostic, unlike splitting on '/'.
    name_parts = Path(fn).name.split('_')
    sub, session, label = name_parts[0], name_parts[1], name_parts[2]

    raw = mne.io.read_raw_brainvision(fn, preload=True)
    raw = raw.resample(100).filter(l_freq=1, h_freq=20)

    ica = mne.preprocessing.ICA(n_components=20,
                                random_state=0)
    ica.fit(raw)

    # Only the component indices are needed; the scores are discarded.
    bad_idx_ecg, _ = ica.find_bads_ecg(raw, 'Fp1', threshold=2)
    bad_idx_eog, _ = ica.find_bads_eog(raw, 'Fp1', threshold=2)
    ica.exclude = bad_idx_ecg + bad_idx_eog

    # With no explicit `exclude`, ICA.apply removes the components in ica.exclude.
    raw_after = ica.apply(raw)

    epochs_after = mne.make_fixed_length_epochs(raw_after,
                                                duration=50,
                                                overlap=0)

    fn1 = f"{out_dir}/{label}_{sub}_{session}_approach3.npy"
    fn2 = f"{out_dir}/{label}_{sub}_{session}_approach4.npy"

    # NOTE(review): float16 keeps the files small, but if the data is in volts
    # (microvolt-scale values) it may lose a lot of precision - confirm.
    np.save(fn1, epochs_after.get_data().astype(np.float16))
    np.save(fn2, epochs_after.average().get_data().astype(np.float16))

In [None]:
# Collect every BrainVision header (.vhdr) found anywhere under hackdataset/.
filenames = list(Path('hackdataset').rglob('*.vhdr'))

In [None]:
%%capture
# Run the approach 1/2 preprocessing on every collected recording with 8 workers.
# %%capture (which must stay on the first line of the cell) suppresses the cell's output.
parallel(prepare_approach12, 
         filenames, 
         n_workers=8, 
         progress=True)

In [None]:
%%capture
# Run the approach 3/4 preprocessing on every collected recording with 8 workers.
# %%capture (which must stay on the first line of the cell) suppresses the cell's output.
parallel(prepare_approach34, 
         filenames, 
         n_workers=8, 
         progress=True)

# Prepare df and Standard Scaler

In [None]:
class SScaler3D(BaseEstimator,TransformerMixin):
    """StandardScaler wrapper for arrays with more than two dimensions.

    Each entry along the first axis is flattened into a single row before it
    is handed to the wrapped StandardScaler; `transform` reshapes the scaled
    result back to the shape of its input.
    """

    def __init__(self):
        self.scaler = StandardScaler()

    @staticmethod
    def _as_rows(X):
        """Return X reshaped to 2-D: (X.shape[0], everything else flattened)."""
        n_rows = X.shape[0]
        return X.reshape(n_rows, -1)

    def fit(self,X,y=None):
        """Fit the wrapped scaler on the flattened X; `y` is ignored. Returns self."""
        flattened = self._as_rows(X)
        self.scaler.fit(flattened)
        return self

    def transform(self,X):
        """Scale X with the fitted scaler and return it in X's original shape."""
        original_shape = X.shape
        scaled = self.scaler.transform(self._as_rows(X))
        return scaled.reshape(original_shape)

In [None]:
def prepare_df_ss(approach):
    """Index the preprocessed files of one approach and fit its scaler.

    Side effects (paths relative to the current working directory):
      - writes ``{approach}infos.csv`` with columns fns, label, subject,
        session, is_valid (is_valid is True for subjects sub-51 .. sub-60);
      - writes ``ss_{approach}.pkl``, a pickled scaler fitted on the first
        half of the stacked non-validation data.

    For 'approach1' and 'approach3' only files whose first dimension is 6 are
    used and the scaler is an ``SScaler3D``; for any other value every
    non-validation file is used and the scaler is a plain ``StandardScaler``.

    Parameters
    ----------
    approach : str
        Suffix of the files to collect, e.g. 'approach1'. Files are matched
        with ``processeddata/individuals/afterica/*{approach}.npy`` and their
        names are read as ``<label>_<subject>_<session>_...``.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If no usable non-validation file is found.
    IndexError
        If a matched file name has fewer than three '_'-separated fields.
    """
    # glob returns files in arbitrary order; sort so that the "first half"
    # used for fitting below is the same on every run.
    fns = sorted(glob(f'processeddata/individuals/afterica/*{approach}.npy'))

    name_parts = [Path(fn).name.split('_') for fn in fns]
    df = pd.DataFrame({
        'fns': fns,
        'label': [parts[0] for parts in name_parts],
        'subject': [parts[1] for parts in name_parts],
        'session': [parts[2] for parts in name_parts],
    })

    val_subs = ['sub-51', 'sub-52', 'sub-53', 'sub-54', 'sub-55', 'sub-56', 'sub-57', 'sub-58', 'sub-59', 'sub-60']
    df['is_valid'] = df['subject'].isin(val_subs)

    df.to_csv(f'{approach}infos.csv', index=False)

    # Approaches 1 and 3 are the un-averaged epoch files; the others are averaged.
    is_epoched = approach in ['approach1', 'approach3']
    # Epoched recordings are only used when they hold exactly this many epochs.
    # NOTE(review): presumably the length of a complete recording - confirm.
    required_epochs = 6

    features = []
    for fn in df.loc[~df['is_valid'], 'fns'].values:
        data = np.load(fn)
        if is_epoched and data.shape[0] != required_epochs:
            continue
        features.append(data)

    if not features:
        raise ValueError(
            f"no usable non-validation files found for {approach!r}; "
            "run the preprocessing cells first"
        )

    features = np.vstack(features)
    ss = SScaler3D() if is_epoched else StandardScaler()
    # The scaler is fitted on the first half of the stacked data only.
    ss.fit(features[:int(0.5*features.shape[0])])

    # Context manager guarantees the pickle file is flushed and closed.
    with open(f'ss_{approach}.pkl', 'wb') as fh:
        pickle.dump(ss, fh)

In [None]:
# Writes approach1infos.csv and ss_approach1.pkl.
prepare_df_ss('approach1')

In [None]:
# Writes approach2infos.csv and ss_approach2.pkl.
prepare_df_ss('approach2')

In [None]:
# Writes approach3infos.csv and ss_approach3.pkl.
prepare_df_ss('approach3')

In [None]:
# Writes approach4infos.csv and ss_approach4.pkl.
prepare_df_ss('approach4')