<a href="https://colab.research.google.com/github/abelowska/eegML/blob/main/Classes_07_features.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Prediction of perfectionism CMDA subscale from error-related negativity ERP component

In this notebook we're going to play with different pipelines - and thus we're going to implement various `transformers`, i.e., pipeline steps that transform data.

Install additional libraries

In [None]:
!pip install MNE

Imports

In [3]:
import os
import re
import glob
import os
import sys
import ast
import os.path as op
from collections import defaultdict
from copy import deepcopy
import copy

import pickle
import mne
import scipy
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn import set_config
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn import tree
from sklearn.model_selection import cross_val_score
from sklearn.decomposition import PCA
from sklearn.model_selection import KFold
from sklearn.feature_selection import SelectKBest, SelectPercentile, f_regression, SelectFpr, SelectFdr, SequentialFeatureSelector
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn import datasets, linear_model

# NOTE(review): this silences every warning for the rest of the notebook,
# including deprecation warnings raised by pandas / NumPy / MNE / scikit-learn.
warnings.filterwarnings("ignore")

# Use seaborn's "whitegrid" style and "deep" palette for all following plots.
sns.set_theme(style="whitegrid", palette="deep")


---
## Load data

Loading EEG and questionnaire data. By default create_df_data loads all info from given .csv file but one can specify it by passing a list of desired labels.

In [4]:
def create_df_data(
    dir_path,
    info_filename=None,
    info="all",
):
    """Load the epochs of all participants into one DataFrame.

    Every BrainVision header file (``*.vhdr``) in the ``responses_100_600``
    folder is read with ``load_epochs_from_file``. Participants with fewer
    than 5 error epochs or fewer than 5 correct epochs are skipped. For every
    remaining participant the record built by ``create_df_from_epochs`` is
    added to the result.

    Parameters
    ----------
    dir_path: String
        directory that contains the ``responses_100_600`` folder. If no header
        file is found there, the folder is looked up relative to the current
        working directory instead.
    info_filename: String | None
        path to .csv file with additional data.
    info: array
        listed parameters from the info file to be loaded.
        if 'all', load all parameters

    Returns
    -------
    go_nogo_data_df : pandas.DataFrame
        participant records stacked with a fresh 0..n-1 index;
        an empty DataFrame when no participant was loaded.

    """
    print(dir_path)
    relative_glob = "responses_100_600/*.vhdr"

    # look inside dir_path first, then fall back to the working directory
    candidate_globs = [relative_glob]
    if dir_path:
        candidate_globs.insert(0, os.path.join(dir_path, relative_glob))

    # extract header files
    header_files = []
    header_files_glob = relative_glob
    for header_files_glob in candidate_globs:
        header_files = sorted(glob.glob(header_files_glob))
        if header_files:
            break
    print(header_files_glob)
    print(header_files)

    # collect the per-participant frames and concatenate once at the end:
    # DataFrame.append was removed in pandas 2.0 and copied the frame on every call
    participant_frames = []

    for file in header_files:
        # load eeg data for given participant
        participant_epochs = load_epochs_from_file(file)

        # and compute participant's id from file_name
        participant_id = re.match(r".*_(\w+).*", file).group(1)

        n_error = len(participant_epochs["error_response"])
        n_correct = len(participant_epochs["correct_response"])

        # exclude those participants who have too few samples
        if n_error < 5 or n_correct < 5:
            # not enough data for this participant
            continue

        # construct dataframe for participant with: id|epochs|additional info...
        participant_df = create_df_from_epochs(
            participant_id, participant_epochs, info_filename, info
        )

        # add participant's data to results
        print(participant_id)
        participant_frames.append(participant_df)

    if not participant_frames:
        # pd.concat refuses an empty list; keep returning an empty frame
        return pd.DataFrame()

    return pd.concat(participant_frames, ignore_index=True)

In [5]:
def load_epochs_from_file(file, reject_bad_segments="auto", mask=None):
    """Read one BrainVision recording and cut it into response-locked epochs.

    The annotations are read from the marker file that sits next to the
    header (same name, ``vmrk`` suffix). The eight response markers are
    collapsed into two event codes: 0 (``correct_response``) and
    1 (``error_response``). The epoch window is taken from the module-level
    ``tmin`` / ``tmax`` variables; no baseline correction is applied.

    Args:
        file: path to a header file (.vhdr)
        reject_bad_segments: not used by the current implementation.
        mask: not used by the current implementation.

    Returns:
        mne Epochs (preloaded)

    """
    recording = mne.io.read_raw_brainvision(file)

    # the marker file shares the header's name: replace the "vhdr" suffix by "vmrk"
    marker_annotations = mne.read_annotations(file[:-4] + "vmrk")
    recording.set_annotations(marker_annotations)

    # response markers only
    response_markers = {
        "Stimulus/RE*ex*1_n*1_c_1*R*FB": 10004,
        "Stimulus/RE*ex*1_n*1_c_1*R*FG": 10005,
        "Stimulus/RE*ex*1_n*1_c_2*R": 10006,
        "Stimulus/RE*ex*1_n*2_c_1*R": 10007,
        "Stimulus/RE*ex*2_n*1_c_1*R": 10008,
        "Stimulus/RE*ex*2_n*2_c_1*R*FB": 10009,
        "Stimulus/RE*ex*2_n*2_c_1*R*FG": 10010,
        "Stimulus/RE*ex*2_n*2_c_2*R": 10011,
    }

    # codes of the merged correct/error events
    merged_event_dict = {"correct_response": 0, "error_response": 1}

    # rebuild the event array from the annotations
    events, _ = mne.events_from_annotations(recording, event_id=response_markers)

    # collapse the individual markers: first the correct ones, then the errors
    merge_plan = (
        ([10004, 10005, 10009, 10010], "correct_response"),
        ([10006, 10007, 10008, 10011], "error_response"),
    )
    merged_events = events
    for marker_codes, label in merge_plan:
        merged_events = mne.merge_events(
            merged_events,
            marker_codes,
            merged_event_dict[label],
            replace_events=True,
        )

    return mne.Epochs(
        raw=recording,
        events=merged_events,
        event_id=merged_event_dict,
        tmin=tmin,
        tmax=tmax,
        baseline=None,
        reject_by_annotation=True,
        preload=True,
    )

In [6]:
def create_df_from_epochs(
    id, 
    participant_epochs, 
    info_filename, 
    info
):
    """Build the one-row record of a participant.

    The record has the columns ``id`` and ``epoch`` (the object passed as
    ``participant_epochs``), joined on the row index with the questionnaire
    row whose ``Demo_kod`` equals ``id``.

    Parameters
    ----------
    id: String
        participant's id extracted from filename
    participant_epochs: mne Epochs
        epoched eeg data
    info_filename: String | None
        path to .csv file with questionnaire data; None adds no extra columns.
    info: array
        listed parameters from the info file to be loaded
        (``Demo_kod`` is always loaded as well).
        if 'all', load all parameters

    Returns
    -------
    participant_df : pandas.DataFrame

    """
    # ID and eeg data of the participant
    base_record = pd.DataFrame({"id": [id], "epoch": [participant_epochs]})

    if info_filename is None:
        # no questionnaire file: nothing to attach
        return base_record.join(pd.DataFrame())

    # read the questionnaire, either completely or only the requested columns
    if info == "all":
        questionnaire = pd.read_csv(info_filename)
    else:
        questionnaire = pd.read_csv(info_filename, usecols=["Demo_kod"] + info)

    # keep this participant's answers, re-indexed from 0 so the join lines up
    own_answers = questionnaire.loc[questionnaire["Demo_kod"] == id]
    info_df = own_answers.reset_index().drop("index", axis=1)

    return base_record.join(info_df)

### Read data

In [7]:
# constants
# tmin / tmax are read as module-level globals by load_epochs_from_file
tmin, tmax = -0.101562, 0.5937525  # Start and end of the segments
signal_frequency = 256  # presumably the EEG sampling rate in Hz — TODO confirm
random_state = 42  # seed passed to train_test_split and PCA
test_size = 0.3  # fraction of the rows held out by train_test_split

# event codes; same values as merged_event_dict in load_epochs_from_file
ERROR = 1
CORRECT = 0

In [8]:
# mount google drive in colab
# NOTE(review): `google.colab` is imported here rather than in the imports cell;
# presumably it only resolves inside Google Colab — confirm before running elsewhere.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [9]:
# display data in folder
!ls gdrive/MyDrive/perfectionism_data

'GNG_perfectionism (1).pkl'  'GNG_perfectionism (6).pkl'
'GNG_perfectionism (2).pkl'   GNG_perfectionism.pkl
'GNG_perfectionism (3).pkl'   picked
'GNG_perfectionism (4).pkl'   responses_100_600.zip
'GNG_perfectionism (5).pkl'   scales


In [10]:
# paths to data
dir_path = "gdrive/MyDrive/perfectionism_data/"
questionnaire_filename = dir_path + "scales/all_scales.csv"

# define dataframe name in a way: TASK_questionnaire
df_name = "GNG_perfectionism"

# cache file with the already assembled dataframe
pickled_data_filename = dir_path + df_name + ".pkl"

if not os.path.isfile(pickled_data_filename):
    # no cache yet: build the dataframe from the raw recordings
    print("Pickled file not found. Loading data...")
    epochs_df = create_df_data(
        dir_path=dir_path, info_filename=questionnaire_filename, info="all"
    )
    epochs_df.name = df_name

    # store the result so the next run can skip the loading step
    epochs_df.to_pickle(dir_path + epochs_df.name + ".pkl")
    print("Done. Pickle file created")
else:
    # cache found: read it back
    print("Pickled file found. Loading pickled data...")
    epochs_df = pd.read_pickle(pickled_data_filename)
    print("Done")

Pickled file found. Loading pickled data...
Done


In [11]:
print(epochs_df.shape)
epochs_df.head()

(138, 11)


Unnamed: 0,id,epoch,Demo_kod,17-Perfect CM-Concern over Mistakes (9 items mean),17-Perfect PS-Personal Standards (7 items mean),17-Perfect PE-Parental Expectations (5 items mean),17-Perfect PC=Parental Criticism (4 items mean),17-Perfect D=Doubts about Actions (4 items mean),17-Perfect O=Organization (6 items mean),17-Perfectionism full scale (mean),17-Perfectionism CMDA
0,AA0303,"<Epochs | 201 events (all good), -0.101562 - ...",AA0303,2.22,2.43,1.8,1.5,2.75,3.83,2.46,4.97
1,AB0601,"<Epochs | 221 events (all good), -0.101562 - ...",AB0601,1.78,2.14,1.8,1.75,2.0,3.33,2.14,3.78
2,AB0612,"<Epochs | 253 events (all good), -0.101562 - ...",AB0612,2.56,1.86,1.4,2.25,2.5,4.0,2.46,5.06
3,AC2011,"<Epochs | 173 events (all good), -0.101562 - ...",AC2011,1.67,4.57,1.2,1.75,3.75,5.0,3.0,5.42
4,AD1308,"<Epochs | 202 events (all good), -0.101562 - ...",AD1308,4.22,4.86,3.4,2.5,4.5,3.67,3.97,8.72


### Prepare data

Check for Nans

In [None]:
epochs_df.isnull().sum()

In [None]:
# Replace missing numeric values by the column mean.
# numeric_only=True is required: the frame also holds object columns (ids, Epochs),
# for which DataFrame.mean() raises in pandas >= 2.0.
# Reassigning instead of inplace=True keeps the cell safe to re-run.
epochs_df = epochs_df.fillna(epochs_df.mean(numeric_only=True))
epochs_df.isnull().sum()

Create X and y sets

In [14]:
# features: one-column DataFrame holding the Epochs object of every participant
X = epochs_df.loc[:, ['epoch']]
# target: CMDA score as a 1-D array
y = epochs_df['17-Perfectionism CMDA'].to_numpy()

Train-test split

In [15]:
# hold out `test_size` of the rows; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=random_state, test_size=test_size
)

---
## Transformers

As mentioned before, Pipelines are objects that chain operations together into one pipe. According to the documentation:  

> Intermediate steps of the pipeline must be [**transforms**](https://scikit-learn.org/stable/glossary.html#term-transformer), that is, they must implement `fit()` and `transform()` methods.

If we want to combine various operations performed on EEG data with pipelines, we need to wrap these operations in Transformer classes that implement `fit()` and `transform()` methods.

In [61]:
# transformer for channel picking

class PickChannels(TransformerMixin, BaseEstimator):
    """Keep only the requested channels of every MNE object in ``X``.

    Parameters
    ----------
    channels_list : list of str
        channel names forwarded to ``pick_channels(ch_names=..., ordered=True)``.
    """

    def __init__(self, channels_list=['Fz']):
        self.channels_list = channels_list

    def fit(self, X, y=None):
        """Nothing is learned; returns the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Call ``pick_channels`` on every element of ``X``.

        Parameters
        ----------
        X : ndarray of objects exposing ``pick_channels`` (any shape, flattened here)

        Returns
        -------
        ndarray of shape (n_samples,), dtype object

        Raises
        ------
        TypeError
            if ``X`` is not an ndarray.

        Notes
        -----
        NOTE(review): per the MNE documentation ``pick_channels`` works in
        place, so the objects passed in are modified as well.
        """
        if not isinstance(X, np.ndarray):
            raise TypeError("Wrong data fromat. Require: ndarray ")

        # ensure shape of (n_samples,) and apply the channel selection
        picked = [
            x.pick_channels(ch_names=self.channels_list, ordered=True)
            for x in X.flatten()
        ]

        # Fill a pre-allocated object array element by element. np.array(picked)
        # would try to read each Epochs as a sequence; that only works through
        # NumPy's ragged-array fallback, which raises ValueError since NumPy 1.24.
        result = np.empty(len(picked), dtype=object)
        for position, item in enumerate(picked):
            result[position] = item

        return result

In [62]:
# transformer for epoch trimming

class TrimEpoch(TransformerMixin, BaseEstimator):
    """Crop every MNE object in ``X`` to a time window.

    Parameters
    ----------
    tmin, tmax, include_tmax, verbose
        forwarded unchanged to the ``crop`` method of each element.
    """

    def __init__(self, tmin=None, tmax=None, include_tmax=True, verbose=None):
        self.tmin = tmin
        self.tmax = tmax
        self.include_tmax = include_tmax
        self.verbose = verbose

    def fit(self, X, y=None):
        """Nothing is learned; returns the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Call ``crop`` on every element of ``X``.

        Parameters
        ----------
        X : ndarray of objects exposing ``crop`` (any shape, flattened here)

        Returns
        -------
        ndarray of shape (n_samples,), dtype object

        Raises
        ------
        TypeError
            if ``X`` is not an ndarray.

        Notes
        -----
        NOTE(review): per the MNE documentation ``crop`` works in place, so
        the objects passed in are modified as well.
        """
        if not isinstance(X, np.ndarray):
            raise TypeError("Wrong data fromat. Require: ndarray ")

        cropped = [
            x.crop(
                tmin=self.tmin,
                tmax=self.tmax,
                include_tmax=self.include_tmax,
                verbose=self.verbose,
            )
            for x in X.flatten()
        ]

        # Fill a pre-allocated object array element by element. np.array(cropped)
        # would try to read each Epochs as a sequence; that only works through
        # NumPy's ragged-array fallback, which raises ValueError since NumPy 1.24.
        result = np.empty(len(cropped), dtype=object)
        for position, item in enumerate(cropped):
            result[position] = item

        return result

In [63]:
# transformer for selecting events

class SelectEvent(TransformerMixin, BaseEstimator):
    """Select the epochs of the given event type(s) from every element of ``X``.

    Parameters
    ----------
    event_id : list of str
        key used to index each element (``x[event_id]``).
    """

    def __init__(self, event_id = ['error_response']):
        self.event_id = event_id

    def fit(self, X, y=None):
        """Nothing is learned; returns the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Index every element of ``X`` with ``self.event_id``.

        Parameters
        ----------
        X : ndarray of indexable objects (any shape, flattened here)

        Returns
        -------
        ndarray of shape (n_samples,), dtype object

        Raises
        ------
        TypeError
            if ``X`` is not an ndarray.
        """
        if not isinstance(X, np.ndarray):
            raise TypeError("Wrong data fromat. Require: ndarray ")

        selected = [x[self.event_id] for x in X.flatten()]

        # Fill a pre-allocated object array element by element. np.array(selected)
        # would try to read each Epochs as a sequence; that only works through
        # NumPy's ragged-array fallback, which raises ValueError since NumPy 1.24.
        result = np.empty(len(selected), dtype=object)
        for position, item in enumerate(selected):
            result[position] = item

        return result

In [64]:
# transformer for evoking

class Evoked(TransformerMixin, BaseEstimator):
    """Replace every element of ``X`` by the result of its ``average()`` method."""

    def fit(self, X, y=None):
        """Nothing is learned; returns the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Call ``average()`` on every element of ``X``.

        Parameters
        ----------
        X : ndarray of objects exposing ``average`` (any shape, flattened here)

        Returns
        -------
        ndarray of shape (n_samples,), dtype object

        Raises
        ------
        TypeError
            if ``X`` is not an ndarray.
        """
        if not isinstance(X, np.ndarray):
            raise TypeError("Wrong data fromat. Require: ndarray ")

        averaged = [x.average() for x in X.flatten()]

        # Store the results one by one in an object array so that NumPy never
        # tries to interpret the MNE objects themselves as nested sequences.
        result = np.empty(len(averaged), dtype=object)
        for position, item in enumerate(averaged):
            result[position] = item

        return result

In [65]:
# transformer for getting data

class ExtractData(TransformerMixin, BaseEstimator):
    """Turn an array of MNE objects into a plain numeric array via ``get_data()``."""

    def fit(self, X, y=None):
        """Nothing is learned; returns the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Stack the ``get_data()`` result of every element of ``X``.

        Raises TypeError if ``X`` is not an ndarray.
        """
        if not isinstance(X, np.ndarray):
            raise TypeError("Wrong data fromat. Require: ndarray ")

        # each element contributes an array of shape (n_channels, n_timepoints)
        per_sample = [element.get_data() for element in X.flatten()]

        return np.array(per_sample)

In [66]:
# transformer for mean amplitude in time-window

class MeanAmplitudePerChannel(TransformerMixin, BaseEstimator):
    """Average the signal over the last axis of ``X``."""

    def fit(self, X, y=None):
        """Nothing is learned; returns the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return the mean of ``X`` along its last axis.

        Raises TypeError if ``X`` is not an ndarray.
        """
        if isinstance(X, np.ndarray):
            return np.mean(X, axis=-1)

        raise TypeError("Wrong data fromat. Require: ndarray ")

In [67]:
# transformer for reshaping to (n_samples, n_features)

class Reshape(TransformerMixin, BaseEstimator):
    """Flatten every sample of ``X`` into a feature vector."""

    def fit(self, X, y=None):
        """Nothing is learned; returns the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Reshape ``X`` to (n_samples, n_features), keeping the first axis.

        Raises
        ------
        TypeError
            if ``X`` is not an ndarray.
        """
        # validate before touching X.shape, so that inputs without a `shape`
        # attribute (e.g. lists) raise the intended TypeError
        if not isinstance(X, np.ndarray):
            raise TypeError("Wrong data fromat. Require: ndarray ")

        n_samples = X.shape[0]

        # reshape to (n_samples, n_features)
        return X.reshape(n_samples, -1)

In [68]:
# transformer for spatial filter reshape
# require 3-D data: epochs x channels x timepoints

class SpatialFilterPreprocessing(TransformerMixin, BaseEstimator):
    """Rearrange epochs x channels x timepoints data into spatial-filter input.

    All epochs are joined along the time axis and the result is transposed,
    giving an array of shape (n_epochs * n_timepoints, n_channels).
    """

    def fit(self, X, y=None):
        """Nothing is learned; returns the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` as (n_samples, n_features) with channels as features.

        Raises TypeError if ``X`` is not an ndarray.
        """
        if not isinstance(X, np.ndarray):
            raise TypeError("Wrong data fromat. Require: ndarray ")

        # channels x (timepoints * epochs): epochs are laid out one after another
        channels_by_time = np.concatenate(X, axis=1)

        # samples (timepoints) in rows, channels in columns
        return channels_by_time.T

In [69]:
# X in spatial-filter shape: n_samples x n_features
# Recovers shape: epoch x channel(spatial_filter_component) x timepoints
class SpatialFilterPostprocessing(TransformerMixin, BaseEstimator):
    """Restore the epoch structure after a spatial filter.

    Parameters
    ----------
    timepoints_count : int
        number of timepoints of a single epoch.
    """

    def __init__(self, timepoints_count):
        self.timepoints_count = timepoints_count

    def fit(self, X, y=None):
        """Nothing is learned; returns the transformer itself."""
        return self

    def transform(self, X):
        """Reshape (n_samples, n_components) into epochs x components x timepoints.

        Raises TypeError if ``X`` is not an ndarray.
        """
        if not isinstance(X, np.ndarray):
            raise TypeError("Wrong data fromat. Require: ndarray ")

        # one column per component produced by the spatial filter
        n_components = X.shape[1]

        # n_samples = epochs * timepoints  ->  epochs = n_samples / timepoints
        n_epochs = int(X.shape[0] / self.timepoints_count)

        # components x epochs x timepoints ...
        per_component = X.T.reshape(n_components, n_epochs, self.timepoints_count)

        # ... reordered to epochs x components x timepoints
        per_epoch = np.transpose(per_component, (1, 0, 2))

        return np.array(per_epoch)

In [70]:
class PeakToPeak(TransformerMixin, BaseEstimator):
    """Placeholder for a peak-to-peak feature; currently returns ``X`` unchanged."""

    def fit(self, X, y=None):
        """Nothing is learned; returns the transformer itself."""
        return self

    def transform(self, X):
        """Validate the input type and pass ``X`` through.

        Raises TypeError if ``X`` is not an ndarray.
        """
        if isinstance(X, np.ndarray):
            # TODO: search for the highest and the lowest values in each channel for each participant
            return X

        raise TypeError("Wrong data fromat. Require: ndarray ")

In [71]:
class Bin(TransformerMixin, BaseEstimator):
    """Placeholder for time-binning; currently returns ``X`` unchanged.

    Parameters
    ----------
    bin_width : float
        width of a bin; stored but not used yet.
    """

    def __init__(self, bin_width=0.05):
        self.bin_width = bin_width

    def fit(self, X, y=None):
        """Nothing is learned; returns the transformer itself."""
        return self

    def transform(self, X):
        """Validate the input type and pass ``X`` through.

        Raises TypeError if ``X`` is not an ndarray.
        """
        if isinstance(X, np.ndarray):
            # TODO: for each channel and each participant average the signal within bin_width time window.
            # Assume that bin_width parameter is in s.
            return X

        raise TypeError("Wrong data fromat. Require: ndarray ")

Method for model evaluation

In [28]:
def evaluate_model_grid_search(
    pipe,
    X_train, 
    y_train, 
    X_test, 
    y_test, 
    regressor_params,
    pipeline_name,
    cv=KFold(n_splits=3),
    predict_test = True,
    predict_train = True,
    ):
    """Tune ``pipe`` with a grid search and summarise the best model in one row.

    The search scores every candidate with r2, negative MAE and negative MSE
    and refits the best candidate according to r2.

    Parameters
    ----------
    pipe : sklearn Pipeline
        pipeline to tune; the name of its last step is reported as ``model_name``.
    X_train, y_train
        data used for the cross-validated search and the final refit.
    X_test, y_test
        data used for the test score.
    regressor_params : dict
        parameter grid passed to ``GridSearchCV``.
    pipeline_name : str
        label stored in the ``pipeline_name`` column.
    cv : cross-validation splitter
        defaults to 3-fold ``KFold``.
    predict_test, predict_train : bool
        when not ``True`` the corresponding r2 score is reported as ``None``.

    Returns
    -------
    pandas.DataFrame
        a single row with the columns ``model_name``, ``pipeline_name``,
        ``train score``, ``mean_cv_score``, ``test_score``, ``best_model``,
        ``parameters`` and one ``split<k>_test_r2`` column per cv split.
    """
    # define grid search
    grid_search_model = GridSearchCV(
        pipe,
        regressor_params,
        cv=cv,
        # a list, not a set: scikit-learn documents str/callable/list/tuple/dict
        # for `scoring`, and its parameter validation (>= 1.2) rejects sets
        scoring=["r2", "neg_mean_absolute_error", "neg_mean_squared_error"],
        refit="r2",
        return_train_score=True,
        verbose=3,
    )

    # fit model
    grid_search_model.fit(X_train, y_train)

    # r2 of the refitted best model on the test data
    test_score = None
    if predict_test is True:
        test_score = r2_score(y_test, grid_search_model.predict(X_test))

    # r2 of the refitted best model on the train data
    train_score = None
    if predict_train is True:
        train_score = r2_score(y_train, grid_search_model.predict(X_train))

    # mean cross-validated r2 of the best candidate (refit metric)
    mean_cv_score = grid_search_model.best_score_

    # r2 of the best candidate on every single cv split
    best_candidate_df = pd.DataFrame(grid_search_model.cv_results_).iloc[
        [grid_search_model.best_index_]
    ]
    cv_splits_scores_df = best_candidate_df.filter(
        regex=r"split\d*_test_r2"
    ).reset_index(drop=True)

    # summary columns followed by the per-split scores
    summary_df = pd.DataFrame({
        "model_name": [pipe.steps[-1][0]],
        "pipeline_name": [pipeline_name],
        "train score": [train_score],
        "mean_cv_score": [mean_cv_score],
        "test_score": [test_score],
        "best_model": [grid_search_model.best_estimator_],
        "parameters": [grid_search_model.best_params_],
    })

    return pd.concat([summary_df, cv_splits_scores_df], axis=1)

Models to fit

In [29]:
# every model is a (step_name, estimator) pair plus a grid keyed by "<step_name>__<param>"

# linear regression: nothing to tune
ln = ("ln", LinearRegression())
ln_params = {}

# support vector regression
svr = ("svr", SVR())
svr_params = {
    "svr__kernel": ["linear", "rbf"],
    "svr__C": [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 20, 50, 100],
    "svr__epsilon": [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 2, 5, 10],
}

# k-nearest neighbours: odd neighbour counts from 1 to 49
knn = ("knn", KNeighborsRegressor())
knn_params = {
    "knn__n_neighbors": np.arange(1, 50, 2),
    "knn__weights": ['uniform', 'distance'],
}

# models evaluated for every pipeline, each with its parameter grid
estimators = [(ln, ln_params), (svr, svr_params), (knn, knn_params)]

## Let's think how we can featurise our EEG signal :)

Ideas:
- ...
- ...
- ...

## My pre-defined Pipelines :)

In [30]:
# Accumulates one row per evaluated (pipeline, model) pair.
# NOTE(review): re-running this cell discards all results collected so far.
results_df = pd.DataFrame()

### Pipeline 1 - channels & mean amplitude:
- channels: FCz
- time-window: 0-100ms
- feature: mean amplitude

In [72]:
# label stored in the `pipeline_name` column of the results table
pipeline_name = "FCz_0-100_mean_amplitude"

Create copy of X for multiple pre-processing

In [81]:
# Deep-copy the Epochs objects (through a dict round-trip) and extract them
# from the DataFrame into an ndarray, so preprocessing never touches X_train / X_test.
X_train_copy = pd.DataFrame(copy.deepcopy(X_train.to_dict())).to_numpy()
X_test_copy = pd.DataFrame(copy.deepcopy(X_test.to_dict())).to_numpy()

1. To speed up computations, steps that do not need to be optimised can be performed before the grid search. These are our base steps:

In [82]:
# base steps: FCz channel, 0-0.1 s window, averaged error responses,
# mean amplitude per channel, standardised features
base_steps = [
    ('pick_channel', PickChannels(channels_list=['FCz'])),
    ('trim_epoch', TrimEpoch(tmin=0, tmax=0.1)),
    ('select_event', SelectEvent(event_id=['error_response'])),
    ('evoked', Evoked()),
    ('get_data', ExtractData()),
    ('avg', MeanAmplitudePerChannel()),
    ('reshape', Reshape()),
    ('scaler', StandardScaler()),
]

base_pipeline = Pipeline(steps=base_steps)

In [83]:
# transform X train and test with base pipeline
# The pipeline is fitted on the train set only and then applied to the test set.
# NOTE(review): PickChannels / TrimEpoch call MNE methods that (per the MNE docs)
# work in place, so re-create the X copies before re-running this cell.

X_train_copy_transformed = base_pipeline.fit_transform(X_train_copy)
X_test_copy_transformed = base_pipeline.transform(X_test_copy)

print(f"X train set shape: {X_train_copy_transformed.shape}")
print(f"X test set shape: {X_test_copy_transformed.shape}")

X train set shape: (96, 2)
X test set shape: (42, 2)


2. Evaluate models

In [None]:
for (estimator, params) in estimators:
  print(f"Rating {estimator} \n")

  # the data is already pre-processed, so the pipeline holds the estimator only
  pipe = Pipeline([estimator])

  # enter to grid search
  this_results = evaluate_model_grid_search(
      pipe,
      X_train_copy_transformed,
      y_train,
      X_test_copy_transformed,
      y_test,
      params,
      pipeline_name=pipeline_name,
      predict_test = True,
      predict_train = True,
  )

  # DataFrame.append was removed in pandas 2.0; pd.concat keeps the same row index
  results_df = pd.concat([results_df, this_results])

In [None]:
results_df

### Pipeline 2 - channels & peak-to-peak amplitude: 
- channels: Fz
- time-window: 0-100ms
- feature: peak-to-peak amplitude

In [None]:
# label stored in the `pipeline_name` column of the results table
# NOTE(review): the PeakToPeak step is still commented out (TODO) in this pipeline's
# base steps, so results saved under this name are not peak-to-peak features yet.
pipeline_name = "Fz_0-100_peak_to_peak"

Create copy of X for multiple pre-processing

In [141]:
# Deep-copy the Epochs objects (through a dict round-trip) and extract them
# from the DataFrame into an ndarray, so preprocessing never touches X_train / X_test.
X_train_copy = pd.DataFrame(copy.deepcopy(X_train.to_dict())).to_numpy()
X_test_copy = pd.DataFrame(copy.deepcopy(X_test.to_dict())).to_numpy()

1. To speed up computations, steps that do not need to be optimised can be performed before the grid search. These are our base steps:

In [79]:
# base steps: Fz channel, 0-0.1 s window, averaged error responses,
# all time points kept as (standardised) features
base_steps = [
    ('pick_channel', PickChannels(channels_list=['Fz'])),
    ('trim_epoch', TrimEpoch(tmin=0, tmax=0.1)),
    ('select_event', SelectEvent(event_id=['error_response'])),
    ('evoked', Evoked()),
    ('get_data', ExtractData()),
    # ('peak-to-peak', PeakToPeak()), # TODO
    ('reshape', Reshape()),
    ('scaler', StandardScaler()),
]

base_pipeline = Pipeline(steps=base_steps)

In [80]:
# transform X train and test with base pipeline
# The pipeline is fitted on the train set only and then applied to the test set.
# NOTE(review): PickChannels / TrimEpoch call MNE methods that (per the MNE docs)
# work in place, so re-create the X copies before re-running this cell.

X_train_copy_transformed = base_pipeline.fit_transform(X_train_copy)
X_test_copy_transformed = base_pipeline.transform(X_test_copy)

print(f"X train set shape: {X_train_copy_transformed.shape}")
print(f"X test set shape: {X_test_copy_transformed.shape}")

2. Evaluate model

In [None]:
for (estimator, params) in estimators:
  print(f"Rating {estimator} \n")

  # the data is already pre-processed, so the pipeline holds the estimator only
  pipe = Pipeline([estimator])

  # enter to grid search
  this_results = evaluate_model_grid_search(
      pipe,
      X_train_copy_transformed,
      y_train,
      X_test_copy_transformed,
      y_test,
      params,
      pipeline_name=pipeline_name,
      predict_test = True,
      predict_train = True,
  )

  # DataFrame.append was removed in pandas 2.0; pd.concat keeps the same row index
  results_df = pd.concat([results_df, this_results])

In [86]:
results_df

Unnamed: 0,model_name,pipeline_name,train score,mean_cv_score,test_score,best_model,parameters,split0_test_r2,split1_test_r2,split2_test_r2
0,ln,FCz_0-100_mean_amplitude,0.022035,-0.140625,0.049269,(LinearRegression()),{},-0.088205,-0.354046,0.020376
0,svr,FCz_0-100_mean_amplitude,-0.002543,-0.077124,0.093865,"(SVR(C=1, epsilon=2, kernel='linear'))","{'svr__C': 1, 'svr__epsilon': 2, 'svr__kernel'...",-0.125013,-0.206596,0.100237
0,knn,FCz_0-100_mean_amplitude,0.00084,-0.113897,-0.014011,(KNeighborsRegressor(n_neighbors=49)),"{'knn__n_neighbors': 49, 'knn__weights': 'unif...",-0.107084,-0.248381,0.013775
0,ln,Fz_0-100_peak_to_peak,0.265114,-1.039627,0.006122,(LinearRegression()),{},-0.863369,-1.213246,-1.042266
0,svr,Fz_0-100_peak_to_peak,-4.1e-05,-0.089083,-0.008669,"(SVR(C=1e-05, epsilon=1))","{'svr__C': 1e-05, 'svr__epsilon': 1, 'svr__ker...",-0.107622,-0.158919,-0.000709
0,knn,Fz_0-100_peak_to_peak,0.019391,-0.115004,0.04652,(KNeighborsRegressor(n_neighbors=31)),"{'knn__n_neighbors': 31, 'knn__weights': 'unif...",-0.084702,-0.287861,0.027552


### Pipeline 3 - PCA: 
- channels: F1, Fz, F2, FC1, FCz, FC2, C1, Cz, C2, CP1, CPz, CP2;
- time-window: 0-100ms
- PCA
- feature: mean amplitude/peak-to-peak

In [None]:
# label stored in the `pipeline_name` column of the results table
# NOTE(review): the name says 0-100 (ms) but this pipeline's TrimEpoch step
# uses tmax=0.2 — confirm which window is intended.
pipeline_name = "PCA_0-100_mean_amplitude"

Create copy of X for multiple pre-processing

In [160]:
# Deep-copy the Epochs objects (through a dict round-trip) and extract them
# from the DataFrame into an ndarray, so preprocessing never touches X_train / X_test.
X_train_copy = pd.DataFrame(copy.deepcopy(X_train.to_dict())).to_numpy()
X_test_copy = pd.DataFrame(copy.deepcopy(X_test.to_dict())).to_numpy()

1. To speed up computations, steps that do not need to be optimised can be performed before the grid search. These are our base steps:

In [162]:
# channels fed into the spatial filter
channels = ['F1', 'Fz', 'F2', 'FC1', 'FCz', 'FC2', 'C1', 'Cz', 'C2', 'CP1', 'CPz', 'CP2']

# base steps: channel selection, 0-0.2 s window, averaged error responses, raw data
base_steps = [
    ('pick_channel', PickChannels(channels_list=channels)),
    ('trim_epoch', TrimEpoch(tmin=0, tmax=0.2)),
    ('select_event', SelectEvent(event_id=['error_response'])),
    ('evoked', Evoked()),
    ('get_data', ExtractData()),
]

# base pipeline, applied once before the grid search
base_pipeline = Pipeline(steps=base_steps)

In [163]:
# transform X train and test with base pipeline
# The pipeline is fitted on the train set only and then applied to the test set.
# NOTE(review): PickChannels / TrimEpoch call MNE methods that (per the MNE docs)
# work in place, so re-create the X copies before re-running this cell.
X_train_copy_transformed = base_pipeline.fit_transform(X_train_copy)
X_test_copy_transformed = base_pipeline.transform(X_test_copy)

print(f"X train set shape: {X_train_copy_transformed.shape}")
print(f"X test set shape: {X_test_copy_transformed.shape}")

2. Create main pre-processing pipeline:

In [170]:
# length of the last axis of the pre-processed train data (time points);
# needed to rebuild the epoch structure after PCA
timepoints_count = X_train_copy_transformed.shape[-1]

# tuned part of the pipeline: PCA as spatial filter, then mean amplitude per component
steps = [
    ("reshape_for_PCA", SpatialFilterPreprocessing()),
    ("spatial_feature_extraction", PCA(random_state=random_state)),
    ("reshape_after_PCA", SpatialFilterPostprocessing(timepoints_count=timepoints_count)),
    ("avg", MeanAmplitudePerChannel()),
    ("reshape", Reshape()),
    ("scaler", StandardScaler()),
]

In [181]:
# grid of the main pipeline: number of PCA components, 1 to 4
pipeline_params = {
    "spatial_feature_extraction__n_components": np.arange(1, 5),
}

3. Evaluate model

In [None]:
for (estimator, params) in estimators:
  print(f"Rating {estimator} \n")

  # create pipeline from the main pre-processing steps and the estimator
  pipe = Pipeline(steps + [estimator])

  # create params: join estimator parameters and main pipeline parameters
  parameters = {**params, **pipeline_params}

  # enter to grid search
  this_results = evaluate_model_grid_search(
      pipe,
      X_train_copy_transformed,
      y_train,
      X_test_copy_transformed,
      y_test,
      parameters,
      pipeline_name=pipeline_name,
      predict_test = True,
      predict_train = True,
  )

  # DataFrame.append was removed in pandas 2.0; pd.concat keeps the same row index
  results_df = pd.concat([results_df, this_results])

### Pipeline 4 - channels & bins: 
- channels: Fz
- time-window: 0-100ms
- 50ms bins
- feature: peak-to-peak amplitude

In [None]:
# label stored in the `pipeline_name` column of the results table
# NOTE(review): the Bin and PeakToPeak steps are still commented out (TODO) in this
# pipeline's base steps, so results saved under this name are not binned peak-to-peak features yet.
pipeline_name = "Fz_0-100_bins_peak_to_peak"

Create copy of X for multiple pre-processing

In [None]:
# Deep-copy the Epochs objects (through a dict round-trip) and extract them
# from the DataFrame into an ndarray, so preprocessing never touches X_train / X_test.
X_train_copy = pd.DataFrame(copy.deepcopy(X_train.to_dict())).to_numpy()
X_test_copy = pd.DataFrame(copy.deepcopy(X_test.to_dict())).to_numpy()

1. To speed up computations, steps that do not need to be optimised can be performed before the grid search. These are our base steps:

In [None]:
# base steps: Fz channel, 0-0.1 s window, averaged error responses,
# all time points kept as (standardised) features
base_steps = [
    ('pick_channel', PickChannels(channels_list=['Fz'])),
    ('trim_epoch', TrimEpoch(tmin=0, tmax=0.1)),
    ('select_event', SelectEvent(event_id=['error_response'])),
    ('evoked', Evoked()),
    ('get_data', ExtractData()),
    # ('bin', Bin()), # TODO
    # ('peak-to-peak', PeakToPeak()), # TODO
    ('reshape', Reshape()),
    ('scaler', StandardScaler()),
]

base_pipeline = Pipeline(steps=base_steps)

In [None]:
# transform X train and test with base pipeline
# The pipeline is fitted on the train set only and then applied to the test set.
# NOTE(review): PickChannels / TrimEpoch call MNE methods that (per the MNE docs)
# work in place, so re-create the X copies before re-running this cell.

X_train_copy_transformed = base_pipeline.fit_transform(X_train_copy)
X_test_copy_transformed = base_pipeline.transform(X_test_copy)

print(f"X train set shape: {X_train_copy_transformed.shape}")
print(f"X test set shape: {X_test_copy_transformed.shape}")

2. Evaluate model

In [None]:
for (estimator, params) in estimators:
  print(f"Rating {estimator} \n")

  # the data is already pre-processed, so the pipeline holds the estimator only
  pipe = Pipeline([estimator])

  # enter to grid search
  this_results = evaluate_model_grid_search(
      pipe,
      X_train_copy_transformed,
      y_train,
      X_test_copy_transformed,
      y_test,
      params,
      pipeline_name=pipeline_name,
      predict_test = True,
      predict_train = True,
  )

  # DataFrame.append was removed in pandas 2.0; pd.concat keeps the same row index
  results_df = pd.concat([results_df, this_results])

In [None]:
results_df

Unnamed: 0,model_name,pipeline_name,train score,mean_cv_score,test_score,best_model,parameters,split0_test_r2,split1_test_r2,split2_test_r2
0,ln,FCz_0-100_mean_amplitude,0.022035,-0.140625,0.049269,(LinearRegression()),{},-0.088205,-0.354046,0.020376
0,svr,FCz_0-100_mean_amplitude,-0.002543,-0.077124,0.093865,"(SVR(C=1, epsilon=2, kernel='linear'))","{'svr__C': 1, 'svr__epsilon': 2, 'svr__kernel'...",-0.125013,-0.206596,0.100237
0,knn,FCz_0-100_mean_amplitude,0.00084,-0.113897,-0.014011,(KNeighborsRegressor(n_neighbors=49)),"{'knn__n_neighbors': 49, 'knn__weights': 'unif...",-0.107084,-0.248381,0.013775
0,ln,Fz_0-100_peak_to_peak,0.265114,-1.039627,0.006122,(LinearRegression()),{},-0.863369,-1.213246,-1.042266
0,svr,Fz_0-100_peak_to_peak,-4.1e-05,-0.089083,-0.008669,"(SVR(C=1e-05, epsilon=1))","{'svr__C': 1e-05, 'svr__epsilon': 1, 'svr__ker...",-0.107622,-0.158919,-0.000709
0,knn,Fz_0-100_peak_to_peak,0.019391,-0.115004,0.04652,(KNeighborsRegressor(n_neighbors=31)),"{'knn__n_neighbors': 31, 'knn__weights': 'unif...",-0.084702,-0.287861,0.027552


### Pipeline 5 - PCA & bins: 
- channels: F1, Fz, F2, FC1, FCz, FC2, C1, Cz, C2, CP1, CPz, CP2;
- time-window: 0-100ms
- PCA
- 50ms bins
- feature: mean/peak-to-peak amplitude

In [None]:
# label stored in the `pipeline_name` column of the results table
# NOTE(review): the name says 0-100 (ms) and "bins", but this pipeline's TrimEpoch step
# uses tmax=0.2 and its Bin step is still commented out (TODO).
pipeline_name = "PCA_0-100_bins_mean_amplitude"

Create copy of X for multiple pre-processing

In [None]:
# Deep-copy the Epochs objects (through a dict round-trip) and extract them
# from the DataFrame into an ndarray, so preprocessing never touches X_train / X_test.
X_train_copy = pd.DataFrame(copy.deepcopy(X_train.to_dict())).to_numpy()
X_test_copy = pd.DataFrame(copy.deepcopy(X_test.to_dict())).to_numpy()

1. To speed up computations, steps that do not need to be optimised can be performed before the grid search. These are our base steps:

In [None]:
# channels fed into the spatial filter
channels = ['F1', 'Fz', 'F2', 'FC1', 'FCz', 'FC2', 'C1', 'Cz', 'C2', 'CP1', 'CPz', 'CP2']

# base steps: channel selection, 0-0.2 s window, averaged error responses, raw data
base_steps = [
    ('pick_channel', PickChannels(channels_list=channels)),
    ('trim_epoch', TrimEpoch(tmin=0, tmax=0.2)),
    ('select_event', SelectEvent(event_id=['error_response'])),
    ('evoked', Evoked()),
    ('get_data', ExtractData()),
]

# base pipeline, applied once before the grid search
base_pipeline = Pipeline(steps=base_steps)

In [None]:
# transform X train and test with base pipeline
# The pipeline is fitted on the train set only and then applied to the test set.
# NOTE(review): PickChannels / TrimEpoch call MNE methods that (per the MNE docs)
# work in place, so re-create the X copies before re-running this cell.
X_train_copy_transformed = base_pipeline.fit_transform(X_train_copy)
X_test_copy_transformed = base_pipeline.transform(X_test_copy)

print(f"X train set shape: {X_train_copy_transformed.shape}")
print(f"X test set shape: {X_test_copy_transformed.shape}")

2. Create main pre-processing pipeline:

In [None]:
# length of the last axis of the pre-processed train data (time points);
# needed to rebuild the epoch structure after PCA
timepoints_count = X_train_copy_transformed.shape[-1]

# tuned part of the pipeline: PCA as spatial filter, then mean amplitude per component
steps = [
    ("reshape_for_PCA", SpatialFilterPreprocessing()),
    ("spatial_feature_extraction", PCA(random_state=random_state)),
    ("reshape_after_PCA", SpatialFilterPostprocessing(timepoints_count=timepoints_count)),
    # ('bin', Bin()), # TODO
    ("avg", MeanAmplitudePerChannel()),
    ("reshape", Reshape()),
    ("scaler", StandardScaler()),
]

In [None]:
# grid of the main pipeline: number of PCA components, 1 to 4
pipeline_params = {
    "spatial_feature_extraction__n_components": np.arange(1, 5),
}

3. Evaluate model

In [None]:
for (estimator, params) in estimators:
  print(f"Rating {estimator} \n")

  # create pipeline from the main pre-processing steps and the estimator
  pipe = Pipeline(steps + [estimator])

  # create params: join estimator parameters and main pipeline parameters
  parameters = {**params, **pipeline_params}

  # enter to grid search
  this_results = evaluate_model_grid_search(
      pipe,
      X_train_copy_transformed,
      y_train,
      X_test_copy_transformed,
      y_test,
      parameters,
      pipeline_name=pipeline_name,
      predict_test = True,
      predict_train = True,
  )

  # DataFrame.append was removed in pandas 2.0; pd.concat keeps the same row index
  results_df = pd.concat([results_df, this_results])

In [None]:
results_df