# Rumination prediction

### Imports

In [19]:
import os
import re
import glob
import os
import ast
import os.path as op
from collections import defaultdict
from copy import deepcopy

import pickle
from time import time
import pywt
import mne
import scipy
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
import pandas as pd
import cesium.featurize
from plotly.subplots import make_subplots
from ipywidgets import Dropdown, FloatRangeSlider, IntSlider, FloatSlider, interact
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.base import TransformerMixin, BaseEstimator


import sys

# sys.path.append("..")
# from utils import *

---
## Loading data

Load the EEG data together with the data from the rumination questionnaire. By default, `create_df_data` loads every column of the given CSV file; to load only selected columns, pass a list of the desired column labels as `info`.

In [20]:
# paths TODO
# Parent directory of the current working directory (`os.path.abspath("")`
# resolves to the working directory itself).
dir_path = os.path.dirname(os.path.abspath(""))

In [21]:
# Epoch window handed to `mne.Epochs` in `load_epochs_from_file`.
tmin, tmax = -0.1, 0.6  # Start and end of the segments
# Samples per second; used below to convert a bin width in ms to timepoints.
signal_frequency = 256
# Values stored in the "marker" column of the epoch DataFrame.
ERROR = 0
CORRECT = 1
# Seed shared by the estimators and decompositions created in this notebook.
random_state = 0

In [22]:
# Electrode labels, in the order used to number the channels below.
channels_order_list = [
    "Fp1",
    "AF7",
    "AF3",
    "F1",
    "F3",
    "F5",
    "F7",
    "FT7",
    "FC5",
    "FC3",
    "FC1",
    "C1",
    "C3",
    "C5",
    "T7",
    "TP7",
    "CP5",
    "CP3",
    "CP1",
    "P1",
    "P3",
    "P5",
    "P7",
    "P9",
    "PO7",
    "PO3",
    "O1",
    "Iz",
    "Oz",
    "POz",
    "Pz",
    "CPz",
    "Fpz",
    "Fp2",
    "AF8",
    "AF4",
    "AFz",
    "Fz",
    "F2",
    "F4",
    "F6",
    "F8",
    "FT8",
    "FC6",
    "FC4",
    "FC2",
    "FCz",
    "Cz",
    "C2",
    "C4",
    "C6",
    "T8",
    "TP8",
    "CP6",
    "CP4",
    "CP2",
    "P2",
    "P4",
    "P6",
    "P8",
    "P10",
    "PO8",
    "PO4",
    "O2",
]

# Map every electrode label to its 1-based position in `channels_order_list`.
# The range is derived from the list length: a fixed `np.arange(1, 64, 1)` has
# only 63 values, so `zip` silently dropped the 64th label ("O2").
# NOTE(review): the numbering starts at 1 - confirm that the consumer of these
# numbers (ChannelExtraction) expects 1-based channel numbers.
channels_dict = dict(
    zip(channels_order_list, np.arange(1, len(channels_order_list) + 1, 1))
)

In [23]:
def create_df_data(
    test_participants=False,
    test_epochs=False,
    info_filename=None,
    info="all",
    personal=True,
    min_epochs=5,
):
    """Load the data of all participants into one DataFrame.

    Header files (``*.vhdr``) are searched in ``data/responses`` under the
    parent of the current working directory. Optional additional info is
    joined from a .csv file.

    By default a train set is loaded: 80% of the participants are chosen and,
    when ``personal`` is True, the first 80% of each participant's epochs.
    The participant split is deterministic (fixed seed).

    Participants with fewer than ``min_epochs`` epochs in either condition
    are skipped.

    If test_participants is set to True, the remaining 20% of participants
    is loaded. If test_epochs is set to True, the remaining 20% of epochs is
    loaded. Test epochs come chronologically after train epochs, because that
    reflects real usage (first calibration and then classification).

    Parameters
    ----------
    test_participants: bool
        whether to load data for training or final testing.
        If true, load the participants held out for testing.
    test_epochs: bool
        whether to load data for training or final testing.
        If true, load the held-out epochs of each participant.
        Ignored when ``personal`` is False.
    info_filename: String | None
        path to .csv file with additional data.
    info: array
        listed parameters from the info file to be loaded.
        if 'all', load all parameters
    personal: bool
        whether a model will be both trained and tested on epochs from one
        person. If false, a person's epochs aren't split into test and train.
    min_epochs: int
        minimal number of epochs required in each condition (error and
        correct) for a participant to be included.

    Returns
    -------
    go_nogo_data_df : pandas.DataFrame
        one row per epoch; empty if no participant qualified.

    Raises
    ------
    ValueError
        if a header file name does not contain a participant id
        (``<prefix>_<id>...``).

    """
    print(os.path.abspath(""))
    dir_path = os.path.dirname(os.path.abspath(""))
    print(dir_path)
    header_files_glob = os.path.join(dir_path, "data/responses/*.vhdr")

    # sorting makes the seeded participant split independent of the
    # file-system listing order
    header_files = sorted(glob.glob(header_files_glob))

    # cut 20% of data for testing
    h_train, h_test = train_test_split(header_files, test_size=0.2, random_state=0)
    header_files = h_test if test_participants else h_train

    # Frames are collected and concatenated once at the end:
    # `DataFrame.append` was removed in pandas 2.0 and copied the whole frame
    # on every call.
    participant_frames = []

    for file in header_files:
        # load eeg data for given participant
        participant_epochs = load_epochs_from_file(file)

        # and compute participant's id from the file name only, so that
        # underscores in directory names can never take part in the match
        id_match = re.match(r".*_(\w+).*", os.path.basename(file))
        if id_match is None:
            raise ValueError(f"Cannot extract participant id from {file!r}")
        participant_id = id_match.group(1)

        error = participant_epochs["error_response"]._data
        correct = participant_epochs["correct_response"]._data

        # exclude those participants who have too few samples
        if len(error) < min_epochs or len(correct) < min_epochs:
            continue

        if personal:
            # cut 20% of each participant's epochs for testing
            # shuffling is disabled to make sure test epochs are after train epochs
            # TODO: not sure if this step is necessary
            err_train, err_test = train_test_split(error, test_size=0.2, shuffle=False)
            cor_train, cor_test = train_test_split(
                correct, test_size=0.2, shuffle=False
            )
            if test_epochs:
                error, correct = err_test, cor_test
            else:
                error, correct = err_train, cor_train

        # construct dataframe for participant with: id|epoch_data|response_type|additional info...
        participant_frames.append(
            create_df_from_epochs(participant_id, correct, error, info_filename, info)
        )
        print(participant_id)

    if not participant_frames:
        # `pd.concat` rejects an empty list; keep returning an empty frame
        return pd.DataFrame()

    return pd.concat(participant_frames, ignore_index=True)

In [24]:
def create_df_from_epochs(id, correct, error, info_filename, info):
    """Create a DataFrame with one row per epoch for a single participant.

    DF structure is like: {id: String ; epoch: epoch_data ; marker: CORRECT|ERROR}.
    When an info file is given, the participant's row of that file (matched on
    its 'File' column) is joined to every epoch row, e.g.
    {id: String ; epoch: epoch_data ; marker ; File: id ; 'Rumination Full Scale': int}

    Parameters
    ----------
    id: String
        participant's id extracted from filename
    correct: array
        correct responses' data; iterated along its first axis
    error: array
        error responses' data; iterated along its first axis
    info_filename: String | None
        path to .csv file with additional data. If None, no info is joined.
    info: array
        listed parameters from the info file to be loaded.
        if 'all', load all parameters

    Returns
    -------
    participant_df : pandas.DataFrame
        rows for the correct epochs followed by rows for the error epochs;
        an empty DataFrame when both inputs are empty.

    """
    info_df = pd.DataFrame()

    # get additional info from file
    if info_filename is not None:
        if info == "all":
            rumination_df = pd.read_csv(info_filename)
        else:
            rumination_df = pd.read_csv(info_filename, usecols=["File", *info])
        # index restarts at 0 so that the join below lines up with the
        # single-row epoch frames
        info_df = rumination_df.loc[rumination_df["File"] == id].reset_index(
            drop=True
        )

    def _epoch_row(epoch, marker):
        # single-row frame for one epoch, extended with the participant info
        return pd.DataFrame({"id": [id], "epoch": [epoch], "marker": [marker]}).join(
            info_df
        )

    # Rows are collected and concatenated once: `DataFrame.append` was removed
    # in pandas 2.0 and re-copied the growing frame for every epoch.
    epoch_frames = [_epoch_row(epoch, CORRECT) for epoch in correct]
    epoch_frames += [_epoch_row(epoch, ERROR) for epoch in error]

    if not epoch_frames:
        # `pd.concat` rejects an empty list; keep returning an empty frame
        return pd.DataFrame()

    return pd.concat(epoch_frames, ignore_index=True)

In [25]:
def load_epochs_from_file(file, reject_bad_segments="auto", mask=None):
    """Load response-locked epochs from a BrainVision header file.

    The marker file is expected next to the header file, with the same name
    and the ``.vmrk`` extension. Only response markers are used; they are
    merged into two event types, ``correct_response`` (event code 0) and
    ``error_response`` (event code 1). Note that these event codes are
    internal to the returned Epochs object and are unrelated to the
    module-level ``ERROR``/``CORRECT`` marker values.

    Epochs span the module-level ``tmin``..``tmax`` window, get no baseline
    correction, are preloaded, and segments overlapping bad annotations are
    rejected.

    Args:
        file: path to a header file (.vhdr)
        reject_bad_segments: currently unused; kept for backward
            compatibility. Bad segments are always rejected by annotation.
        mask: currently unused; kept for backward compatibility.

    Returns:
        mne Epochs

    """
    # Import the BrainVision data into an MNE Raw object
    raw = mne.io.read_raw_brainvision(file)

    # Construct annotation filename (same stem as the header, .vmrk extension)
    annot_file = os.path.splitext(file)[0] + ".vmrk"

    # Read in the event information as MNE annotations and attach them to the
    # raw object so we can use them with the data
    raw.set_annotations(mne.read_annotations(annot_file))

    # Map with response markers only
    event_dict = {
        "Stimulus/RE*ex*1_n*1_c_1*R*FB": 10004,
        "Stimulus/RE*ex*1_n*1_c_1*R*FG": 10005,
        "Stimulus/RE*ex*1_n*1_c_2*R": 10006,
        "Stimulus/RE*ex*1_n*2_c_1*R": 10007,
        "Stimulus/RE*ex*2_n*1_c_1*R": 10008,
        "Stimulus/RE*ex*2_n*2_c_1*R*FB": 10009,
        "Stimulus/RE*ex*2_n*2_c_1*R*FG": 10010,
        "Stimulus/RE*ex*2_n*2_c_2*R": 10011,
    }

    # Map for merged correct/error response markers
    merged_event_dict = {"correct_response": 0, "error_response": 1}

    # Event codes that are collapsed into each merged event type
    correct_codes = [10004, 10005, 10009, 10010]
    error_codes = [10006, 10007, 10008, 10011]

    # Reconstruct the original events from Raw object
    events, _ = mne.events_from_annotations(raw, event_id=event_dict)

    # Merge correct/error response events
    merged_events = mne.merge_events(
        events,
        correct_codes,
        merged_event_dict["correct_response"],
        replace_events=True,
    )
    merged_events = mne.merge_events(
        merged_events,
        error_codes,
        merged_event_dict["error_response"],
        replace_events=True,
    )

    # Read epochs
    return mne.Epochs(
        raw=raw,
        events=merged_events,
        event_id=merged_event_dict,
        tmin=tmin,
        tmax=tmax,
        baseline=None,
        reject_by_annotation=True,
        preload=True,
    )

#### Read the data

In [26]:
# Name of the cached DataFrame and the locations of the cache and info files
# (relative to the notebook's working directory).
df_name = "go_nogo_df"
pickled_data_filename = "../data/" + df_name + ".pkl"
info_filename = "../data/Demographic_Questionnaires_Behavioral_Results_N=163.csv"

# Check if data is already loaded
if os.path.isfile(pickled_data_filename):
    print("Pickled file found. Loading pickled data...")
    # NOTE: unpickling executes arbitrary code - only load cache files that
    # were produced by this notebook.
    epochs_df = pd.read_pickle(pickled_data_filename)
    print("Done")
else:
    print("Pickled file not found. Loading data...")
    # training participants only; epochs are not split per participant
    epochs_df = create_df_data(
        test_participants=False, info="all", personal=False, info_filename=info_filename
    )
    epochs_df.name = df_name
    # save loaded data into a pickle file; write to the very path that is
    # checked above so that the cache lookup and the cache write cannot diverge
    epochs_df.to_pickle(pickled_data_filename)
    print("Done. Pickle file created")

Pickled file not found. Loading data...
/Users/anuszka/Programming/rumination_project/erpinator/notebooks
/Users/anuszka/Programming/rumination_project/erpinator
Extracting parameters from /Users/anuszka/Programming/rumination_project/erpinator/data/responses/GNG_AR0607-64 el.vhdr...
Setting channel info structure...
Finding 'sfreq' from header file: /Users/anuszka/Programming/rumination_project/erpinator/data/responses/GNG_AR0607-64 el.vhdr
Used Annotations descriptions: ['Stimulus/RE*ex*1_n*1_c_1*R*FB', 'Stimulus/RE*ex*1_n*1_c_1*R*FG', 'Stimulus/RE*ex*1_n*2_c_1*R', 'Stimulus/RE*ex*2_n*1_c_1*R', 'Stimulus/RE*ex*2_n*2_c_1*R*FB', 'Stimulus/RE*ex*2_n*2_c_1*R*FG']
Not setting metadata
Not setting metadata
237 matching events found
No baseline correction applied
0 projection items activated
Loading data for 237 events and 181 original time points ...
50 bad epochs dropped
AR0607
Extracting parameters from /Users/anuszka/Programming/rumination_project/erpinator/data/responses/GNG_HN2708-64 

  raw = mne.io.read_raw_brainvision(file)
  raw.set_annotations(annotations)


103 bad epochs dropped
Extracting parameters from /Users/anuszka/Programming/rumination_project/erpinator/data/responses/GNG_JL2111-64 el.vhdr...
Setting channel info structure...
Finding 'sfreq' from header file: /Users/anuszka/Programming/rumination_project/erpinator/data/responses/GNG_JL2111-64 el.vhdr
Used Annotations descriptions: ['Stimulus/RE*ex*1_n*1_c_1*R*FB', 'Stimulus/RE*ex*1_n*1_c_1*R*FG', 'Stimulus/RE*ex*1_n*1_c_2*R', 'Stimulus/RE*ex*1_n*2_c_1*R', 'Stimulus/RE*ex*2_n*1_c_1*R', 'Stimulus/RE*ex*2_n*2_c_1*R*FB', 'Stimulus/RE*ex*2_n*2_c_1*R*FG', 'Stimulus/RE*ex*2_n*2_c_2*R']
Not setting metadata
Not setting metadata
277 matching events found
No baseline correction applied
0 projection items activated
Loading data for 277 events and 181 original time points ...
185 bad epochs dropped
JL2111
Extracting parameters from /Users/anuszka/Programming/rumination_project/erpinator/data/responses/GNG_MZ0502-64 el.vhdr...
Setting channel info structure...
Finding 'sfreq' from header file:

  raw = mne.io.read_raw_brainvision(file)
  raw.set_annotations(annotations)


146 bad epochs dropped
AO2601
Extracting parameters from /Users/anuszka/Programming/rumination_project/erpinator/data/responses/GNG_AK2012-64 el.vhdr...
Setting channel info structure...
Finding 'sfreq' from header file: /Users/anuszka/Programming/rumination_project/erpinator/data/responses/GNG_AK2012-64 el.vhdr
Used Annotations descriptions: ['Stimulus/RE*ex*1_n*1_c_1*R*FB', 'Stimulus/RE*ex*1_n*1_c_1*R*FG', 'Stimulus/RE*ex*1_n*1_c_2*R', 'Stimulus/RE*ex*1_n*2_c_1*R', 'Stimulus/RE*ex*2_n*1_c_1*R', 'Stimulus/RE*ex*2_n*2_c_1*R*FB', 'Stimulus/RE*ex*2_n*2_c_1*R*FG', 'Stimulus/RE*ex*2_n*2_c_2*R']
Not setting metadata
Not setting metadata
262 matching events found
No baseline correction applied
0 projection items activated
Loading data for 262 events and 181 original time points ...
167 bad epochs dropped
AK2012
Extracting parameters from /Users/anuszka/Programming/rumination_project/erpinator/data/responses/GNG_KD1102-64 el.vhdr...
Setting channel info structure...
Finding 'sfreq' from heade

  raw = mne.io.read_raw_brainvision(file)
  raw.set_annotations(annotations)


118 bad epochs dropped
PC2411
Extracting parameters from /Users/anuszka/Programming/rumination_project/erpinator/data/responses/GNG_AK1507-64 el.vhdr...
Setting channel info structure...
Finding 'sfreq' from header file: /Users/anuszka/Programming/rumination_project/erpinator/data/responses/GNG_AK1507-64 el.vhdr
Used Annotations descriptions: ['Stimulus/RE*ex*1_n*1_c_1*R*FB', 'Stimulus/RE*ex*1_n*1_c_1*R*FG', 'Stimulus/RE*ex*1_n*1_c_2*R', 'Stimulus/RE*ex*1_n*2_c_1*R', 'Stimulus/RE*ex*2_n*1_c_1*R', 'Stimulus/RE*ex*2_n*2_c_1*R*FB', 'Stimulus/RE*ex*2_n*2_c_1*R*FG']
Not setting metadata
Not setting metadata
225 matching events found
No baseline correction applied
0 projection items activated
Loading data for 225 events and 181 original time points ...
104 bad epochs dropped
Extracting parameters from /Users/anuszka/Programming/rumination_project/erpinator/data/responses/GNG_AA0303-64 el.vhdr...
Setting channel info structure...
Finding 'sfreq' from header file: /Users/anuszka/Programming/ru

  raw = mne.io.read_raw_brainvision(file)
  raw.set_annotations(annotations)


240 bad epochs dropped
Extracting parameters from /Users/anuszka/Programming/rumination_project/erpinator/data/responses/GNG_SR0807-64 el.vhdr...
Setting channel info structure...
Finding 'sfreq' from header file: /Users/anuszka/Programming/rumination_project/erpinator/data/responses/GNG_SR0807-64 el.vhdr
Used Annotations descriptions: ['Stimulus/RE*ex*1_n*1_c_1*R*FB', 'Stimulus/RE*ex*1_n*1_c_1*R*FG', 'Stimulus/RE*ex*1_n*1_c_2*R', 'Stimulus/RE*ex*1_n*2_c_1*R', 'Stimulus/RE*ex*2_n*1_c_1*R', 'Stimulus/RE*ex*2_n*2_c_1*R*FB', 'Stimulus/RE*ex*2_n*2_c_1*R*FG', 'Stimulus/RE*ex*2_n*2_c_2*R']
Not setting metadata
Not setting metadata
238 matching events found
No baseline correction applied
0 projection items activated
Loading data for 238 events and 181 original time points ...
43 bad epochs dropped
SR0807
Extracting parameters from /Users/anuszka/Programming/rumination_project/erpinator/data/responses/GNG_DG1409-64 el.vhdr...
Setting channel info structure...
Finding 'sfreq' from header file: 

  raw = mne.io.read_raw_brainvision(file)
  raw.set_annotations(annotations)


72 bad epochs dropped
TZ2302
Extracting parameters from /Users/anuszka/Programming/rumination_project/erpinator/data/responses/GNG_BW1003-64 el.vhdr...
Setting channel info structure...
Finding 'sfreq' from header file: /Users/anuszka/Programming/rumination_project/erpinator/data/responses/GNG_BW1003-64 el.vhdr
Used Annotations descriptions: ['Stimulus/RE*ex*1_n*1_c_1*R*FB', 'Stimulus/RE*ex*1_n*1_c_1*R*FG', 'Stimulus/RE*ex*1_n*1_c_2*R', 'Stimulus/RE*ex*1_n*2_c_1*R', 'Stimulus/RE*ex*2_n*1_c_1*R', 'Stimulus/RE*ex*2_n*2_c_1*R*FB', 'Stimulus/RE*ex*2_n*2_c_1*R*FG', 'Stimulus/RE*ex*2_n*2_c_2*R']
Not setting metadata
Not setting metadata
251 matching events found
No baseline correction applied
0 projection items activated
Loading data for 251 events and 181 original time points ...
21 bad epochs dropped
BW1003
Extracting parameters from /Users/anuszka/Programming/rumination_project/erpinator/data/responses/GNG_IA2105-64 el.vhdr...
Setting channel info structure...
Finding 'sfreq' from header 

#### Rearrange data:  from: *one row - one epoch* to *one row - one participant* 

The `epochs` column contains the list of epochs for the given condition (marker = error or correct).

In [27]:
epochs_df

Unnamed: 0,id,epoch,marker,File,Sex,Handedness,Age,DASS-21 Stress scale,DASS-21 Depression scale,DASS-21 Anxiety scale,...,Number Error,Number Inhibited Response,Trials sum,mean Post-Hit RT,mean Post-Fast-Hit RT,mean Post-Slow-Hit RT,mean Post-Error RT,mean post-Inhibited Response,mean Post-Error RT minus mean Post-Hit RT,mean Post-Error RT minus mean Post-Inhibited Response
0,AR0607,"[[-4.607e-06, 1.95e-06, 1.0529999999999999e-05...",1,AR0607,0,1,22,1.14,1.57,1.14,...,17,95,336,0.38784,0.403654,0.370218,0.471611,0.424927,0.083771,0.046683
1,AR0607,"[[8.2e-08, -6.307e-06, -6.5709999999999994e-06...",1,AR0607,0,1,22,1.14,1.57,1.14,...,17,95,336,0.38784,0.403654,0.370218,0.471611,0.424927,0.083771,0.046683
2,AR0607,"[[8.9e-07, 3.8999999999999997e-07, -2.45e-07, ...",1,AR0607,0,1,22,1.14,1.57,1.14,...,17,95,336,0.38784,0.403654,0.370218,0.471611,0.424927,0.083771,0.046683
3,AR0607,"[[-6.958e-06, -3.568e-06, 6.4e-07, 1.074e-06, ...",1,AR0607,0,1,22,1.14,1.57,1.14,...,17,95,336,0.38784,0.403654,0.370218,0.471611,0.424927,0.083771,0.046683
4,AR0607,"[[-4.1029999999999995e-06, -4.769999999999999e...",1,AR0607,0,1,22,1.14,1.57,1.14,...,17,95,336,0.38784,0.403654,0.370218,0.471611,0.424927,0.083771,0.046683
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18540,DO0607,"[[-2.6309999999999995e-06, -2.278e-06, 3.70299...",0,DO0607,1,1,30,2.00,1.57,2.00,...,20,92,336,0.29637,0.303883,0.287442,0.347652,0.308988,0.051281,0.038664
18541,DO0607,"[[-2.805e-06, -4.706e-06, 8.38e-07, 1.00099999...",0,DO0607,1,1,30,2.00,1.57,2.00,...,20,92,336,0.29637,0.303883,0.287442,0.347652,0.308988,0.051281,0.038664
18542,DO0607,"[[-9.005000000000001e-06, -1.1927e-05, -9.9039...",0,DO0607,1,1,30,2.00,1.57,2.00,...,20,92,336,0.29637,0.303883,0.287442,0.347652,0.308988,0.051281,0.038664
18543,DO0607,"[[-1.1813999999999999e-05, -1.2176e-05, -1.205...",0,DO0607,1,1,30,2.00,1.57,2.00,...,20,92,336,0.29637,0.303883,0.287442,0.347652,0.308988,0.051281,0.038664


In [28]:
def _collapse_condition(group_df):
    """Summarise all epoch rows of one (participant, marker) group as one row."""
    # stack the per-epoch arrays of the group into a single float64 array
    stacked_epochs = np.array(group_df["epoch"].to_list(), dtype="float64")

    return pd.Series(
        {
            "epochs": stacked_epochs,
            "Rumination Full Scale": np.mean(group_df["Rumination Full Scale"]),
            "Anxiety": np.mean(group_df["DASS-21 Anxiety scale"]),
            "Stress": np.mean(group_df["DASS-21 Stress scale"]),
            "Depression": np.mean(group_df["DASS-21 Depression scale"]),
        }
    )


# one row per participant and condition, in order of first appearance
_per_condition = epochs_df.groupby(["id", "marker"], sort=False)
data_df = _per_condition.apply(_collapse_condition).reset_index()

In [29]:
# number of epochs per participant and condition, in order of first appearance
_group_sizes = epochs_df.groupby(["id", "marker"], sort=False).size()
summary_df = _group_sizes.reset_index(name="counts")

# epoch counts of the marker-0 rows only (0 is the value of ERROR)
_marker_zero_rows = summary_df["marker"] == 0
participants_data_len = np.array(summary_df[_marker_zero_rows]["counts"].tolist())

In [30]:
# (first, last) row positions of every participant's epochs in the
# concatenated data; both bounds are inclusive. Used for identifying
# participants' data after spatial filtering.

_spans = []
_next_start = 0

for _n_epochs in participants_data_len:
    _spans.append((_next_start, _next_start + _n_epochs - 1))
    _next_start += _n_epochs

participants_data_indices = np.array(_spans)

---
## Training and prediction

In [31]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import permutation_test_score
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import FunctionTransformer
from sklearn.dummy import DummyRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR
from tempfile import mkdtemp
from sklearn.linear_model import Ridge
from sklearn.kernel_ridge import KernelRidge


from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

import warnings

warnings.filterwarnings("ignore")

#### Create X train and y train sets

In [32]:
# selection of the analysed condition: erroneous responses or correct responses
dataset = ERROR
# human-readable label of the selected condition, stored with the results
dataset_name = "correct" if dataset == CORRECT else "error"

In [33]:
# rows of the analysed condition only (one row per participant)
_condition_df = data_df[data_df["marker"] == dataset]

# participant x epoch x channel x timepoints.
# Participants usually contribute different numbers of epochs, so the result
# is a 1-D object array holding one 3-D array per participant (a 4-D array
# only if every participant has the same number of epochs).
_participant_epochs = _condition_df["epochs"].tolist()
try:
    X_train = np.array(_participant_epochs)
except ValueError:
    # NumPy >= 1.24 refuses to build ragged arrays implicitly;
    # fill an object array element by element instead.
    X_train = np.empty(len(_participant_epochs), dtype=object)
    for _row, _epochs in enumerate(_participant_epochs):
        X_train[_row] = _epochs

# shape 1-D: rumination score
rumination = np.array(_condition_df["Rumination Full Scale"].to_list())

# shape 1-D: DASS-21 scale scores
anxiety = np.array(_condition_df["Anxiety"].to_list())
stress = np.array(_condition_df["Stress"].to_list())
depression = np.array(_condition_df["Depression"].to_list())

# y_train = np.stack((rumination, depression), axis=1)
y_train = rumination

# X_train = np.array(epochs_df[epochs_df["marker"] == dataset]["epoch"].tolist())

# # shape 1-D: rumination score
# y_train = np.array(
#     epochs_df[epochs_df["marker"] == dataset]["Rumination Full Scale"].to_list()
# )

In [34]:
# Empty placeholders: no held-out test data is loaded in this notebook.
X_test = []
y_test = []

In [35]:
X_train.shape

(108,)

----

In [36]:
# correlation matrix between rumination and depression scores
rum_depr_corr = np.corrcoef(rumination, depression)
rum_depr_corr

array([[1.        , 0.47601841],
       [0.47601841, 1.        ]])

In [37]:
# correlation matrix between rumination and anxiety scores
rum_anx_corr = np.corrcoef(rumination, anxiety)
rum_anx_corr

array([[1.       , 0.2259998],
       [0.2259998, 1.       ]])

In [38]:
# correlation matrix between rumination and stress scores
rum_stress_corr = np.corrcoef(rumination, stress)
rum_stress_corr

array([[1.        , 0.48562839],
       [0.48562839, 1.        ]])

---
### Experiments 

Parameters of experiments:
- regressors
- hyperparameters
- preprocessing pipelines

#### Prepare experiment estimating 
____

In [39]:
# Rating model with grid search


def rate_regressor(
    X_train, y_train, X_test, y_test, regressor, regressor_params, base_steps, cv=3
):
    """Grid-search a regressor appended to the given pipeline steps.

    Parameters
    ----------
    X_train, y_train:
        data the grid search is fitted on.
    X_test, y_test:
        currently unused; kept for backward compatibility.
    regressor: tuple
        ``(name, estimator)`` step appended after ``base_steps``.
    regressor_params: dict
        parameter grid passed to ``GridSearchCV``.
    base_steps: sequence of tuples
        pipeline steps placed before the regressor.
    cv: int | cross-validation splitter
        an integer is the number of (unshuffled) ``KFold`` splits;
        any other value is handed to ``GridSearchCV`` unchanged.

    Returns
    -------
    grid_search : fitted GridSearchCV
        scored with r2, negated MAE and negated MSE; refitted on r2.
    """
    # define cross-validation method; honour `cv` instead of a fixed 3 folds
    cv_kf = KFold(n_splits=cv) if isinstance(cv, int) else cv

    pipeline = Pipeline(steps=[*base_steps, regressor])

    grid_search = GridSearchCV(
        pipeline,
        regressor_params,
        cv=cv_kf,
        # a list (not a set) keeps the scorer order deterministic
        scoring=["r2", "neg_mean_absolute_error", "neg_mean_squared_error"],
        refit="r2",
        return_train_score=True,
        n_jobs=4,
        verbose=1,
        error_score="raise",
    )
    grid_search.fit(X_train, y_train)

    return grid_search

In [40]:
# conducting experiment and saving selected info to result df


def run_experiment(
    tested_regressors,
    regressor_params,
    pipeline_name,
    X_train,
    X_test,
    y_train,
    y_test,
    dataset_name,
    base_steps,
    results_df,
):
    """Grid-search every regressor and add one result row per regressor.

    Parameters
    ----------
    tested_regressors: iterable of ((name, estimator), params) pairs
        regressors to rate together with their own parameter grids.
    regressor_params: dict
        parameter grid shared by all regressors; merged with each regressor's
        own grid (the regressor's entries win on key clashes).
    pipeline_name: str
        label stored in the "pipeline_name" column.
    X_train, y_train:
        data used for the grid search and the permutation test.
    X_test, y_test:
        forwarded to ``rate_regressor``; not otherwise used here.
    dataset_name: str
        label stored in the "data_set" column.
    base_steps: list of tuples
        pipeline steps placed before each regressor.
    results_df: pandas.DataFrame
        previously collected results; it is not modified.

    Returns
    -------
    pandas.DataFrame
        ``results_df`` followed by the new rows. The "*_mae" and "*_mse"
        columns hold the negated errors as reported by scikit-learn.
    """
    new_rows = []

    for regressor, params in tested_regressors:
        print(f"Rating {regressor} \n")
        tested_params = {**regressor_params, **params}

        # enter to grid search
        grid_result = rate_regressor(
            X_train,
            y_train,
            X_test,
            y_test,
            regressor,
            tested_params,
            base_steps,
            cv=3,
        )

        cv_results = grid_result.cv_results_
        best_estimator_index = grid_result.best_index_

        def best(key):
            # value of a cv_results_ entry for the best parameter combination
            return cv_results[key][best_estimator_index]

        mean_cv_r2 = best("mean_test_r2")
        std_cv_r2 = best("std_test_r2")
        mean_train_r2 = best("mean_train_r2")

        print(f"     Best parameters: {grid_result.best_params_}")
        print(f"     mean r2: {mean_cv_r2}           ± {round(std_cv_r2,3)}")
        print(f"     mean r2 train: {mean_train_r2}")

        # calculate p-value
        scores_, pvalue_ = calculate_p_permutations(
            grid_result.best_estimator_, X_train, y_train
        )

        # selected info for the result df
        new_rows.append(
            {
                "data_set": dataset_name,
                "pipeline_name": pipeline_name,
                "model": regressor[0],
                "parameters": grid_result.best_params_,
                "mean_cv_r2": mean_cv_r2,
                "std_cv_r2": std_cv_r2,
                "mean_cv_mae": best("mean_test_neg_mean_absolute_error"),
                "std_cv_mae": best("std_test_neg_mean_absolute_error"),
                "mean_cv_mse": best("mean_test_neg_mean_squared_error"),
                "std_cv_mse": best("std_test_neg_mean_squared_error"),
                "cv_results": cv_results,
                "mean_train_r2": mean_train_r2,
                "mean_train_mae": best("mean_train_neg_mean_absolute_error"),
                "mean_train_mse": best("mean_train_neg_mean_squared_error"),
                "p-value": pvalue_,
                "best_estimator": grid_result.best_estimator_,
            }
        )

    if not new_rows:
        return results_df

    # `DataFrame.append` was removed in pandas 2.0; concatenate once instead
    return pd.concat([results_df, pd.DataFrame(new_rows)], ignore_index=True)

In [41]:
# Calculating p-value with permutation test


def calculate_p_permutations(estimator, X, y, cv=3, n_permutations=100, n_jobs=10):
    """Run a permutation test for ``estimator`` and report its outcome.

    The arguments are forwarded to ``permutation_test_score``. The score on
    the original targets and the permutation p-value are printed and
    returned as ``(score, pvalue)``; the per-permutation scores are discarded.
    """
    outcome = permutation_test_score(
        estimator, X, y, cv=cv, n_permutations=n_permutations, n_jobs=n_jobs
    )
    # outcome is (score, permutation scores, p-value)
    score_ = outcome[0]
    pvalue_ = outcome[2]

    # summarize
    print(f"     The permutation P-value is = {pvalue_:.3f}")
    print(f"     The permutation score is = {score_:.3f}\n")

    return score_, pvalue_

#### Define pipelines
___

In [43]:
from rumination_experiment_transformers import *

from sklearn.decomposition import FastICA
from sklearn.decomposition import PCA

# number of timepoints per epoch, passed to SpatialFilterPostprocessing
# (the loading log above reports 181 original time points per epoch)
timepoints_count = 181

In [44]:
# SPATIAL FILTER - BINS


def spatial_filter_bins_steps(spatial_filter_n_components):
    """Build the pre-processing steps: spatial filter followed by binning.

    The spatial filter is a PCA keeping ``spatial_filter_n_components``
    components. The remaining settings (``significant_channels``,
    ``timepoints_count``, ``participants_data_indices``, ``step_tp``,
    ``random_state``) are read from module-level variables at call time.

    Returns a list of ``(name, transformer)`` tuples usable as Pipeline steps.
    """
    # channel selection and spatial filtering
    spatial_stage = [
        ("channels_filtering", ChannelExtraction(significant_channels)),
        ("spatial_filter_preprocessing", SpatialFilterPreprocessing()),
        (
            "spatial_filter",
            PCA(n_components=spatial_filter_n_components, random_state=random_state),
        ),
        (
            "spatial_filter_postprocessing",
            SpatialFilterPostprocessing(
                timepoints_count=timepoints_count,
                participants_data_indices=participants_data_indices,
            ),
        ),
    ]

    # filtering, averaging and binning of the signal
    temporal_stage = [
        ("lowpass_filter", LowpassFilter()),
        ("average_epochs", AveragePerParticipant()),
        ("binning", BinTransformer(step=step_tp)),
    ]

    # reshaping and scaling before the regressor
    # ("feature_selection", PCA(n_components=2, random_state=random_state)),
    output_stage = [
        ("data_channel_swap", ChannelDataSwap()),
        ("postprocessing", PostprocessingTransformer()),
        ("scaler", StandardScaler()),
    ]

    return spatial_stage + temporal_stage + output_stage

In [45]:
# BINS
def erp_bins_steps():
    """Build the pre-processing steps for binned signals without a spatial filter.

    ``significant_channels`` and ``step_tp`` are read from module-level
    variables at call time.

    Returns a list of ``(name, transformer)`` tuples usable as Pipeline steps.
    """
    # channel selection, filtering, averaging and binning of the signal
    signal_stage = [
        ("channels_filtering", ChannelExtraction(significant_channels)),
        ("lowpass_filter", LowpassFilter()),
        ("average_epochs", AveragePerParticipant()),
        ("binning", BinTransformer(step=step_tp)),
    ]

    # reshaping and scaling before the regressor
    # ("feature_selection", PCA(random_state=random_state)),
    output_stage = [
        ("data_channel_swap", ChannelDataSwap()),
        ("postprocessing", PostprocessingTransformer()),
        ("scaler", StandardScaler()),
    ]

    return signal_stage + output_stage

Generate estimator HTML representation

In [46]:
# from sklearn.utils import estimator_html_repr

# with open("my_estimator.html", "w") as f:
#     f.write(estimator_html_repr(Pipeline(this_steps)))

### Perform Experiments
___

#### Global parameters common for each experiment

In [47]:
# channels that will be included in the experiment

# The selected electrodes form a 5 x 3 grid: five rows running from frontal
# to parietal sites, each with the left ("1"), midline ("z") and right ("2")
# electrode.
_red_box_rows = ("F", "FC", "C", "CP", "P")
_red_box_columns = ("1", "z", "2")

red_box = [f"{row}{column}" for row in _red_box_rows for column in _red_box_columns]

# channel numbers of the selected electrodes, looked up in `channels_dict`
significant_channels = [channels_dict[name] for name in red_box]

In [48]:
# spatial filters

# Available spatial-filter estimators, keyed by name; both are seeded with
# the shared `random_state`.
spatial_filters_dict = {
    "ICA": FastICA(random_state=random_state),
    "PCA": PCA(random_state=random_state),
}

In [49]:
# bins width

step_in_ms = 50  # bin width in milliseconds
# bin width in timepoints; `int` truncates the fractional part
step_tp = int(signal_frequency * step_in_ms / 1000)  # in timepoints

---
#### Experiment 1

- spatial filter
- bins
- feature selection

##### Spatial filter & binning

In [50]:
# define hyperparameters of pipeline

# name of the spatial filter; used to label the pipelines in the results
spatial_filter = "PCA"

# range of spatial filter component counts to test
# (the upper bound is exclusive in `range` / `np.arange`)
min_spatial_filter = 1
max_spatial_filter = 9
step_spatial_filter = 1

# range of feature-selection component counts to test (upper bound exclusive)
min_feature_selection = 1
max_feature_selection = 9
step_feature_selection = 1


# parameter grid shared by all regressors
regressor_params = dict(
    # spatial_filter__n_components=np.arange(
    #     min_spatial_filter, max_spatial_filter, step_spatial_filter
    # ),
    feature_selection__n_components=np.arange(
        min_feature_selection, max_feature_selection, step_feature_selection
    ),
)

# steps placed in front of every regressor inside the grid-searched pipeline
regressor_steps = [("feature_selection", PCA(random_state=random_state))]

In [51]:
# define estimators and their hyperparameters

# Each estimator is a (step name, estimator) tuple; the keys of its parameter
# grid are prefixed with that step name.

# elastic net: 20 alpha values x 17 l1_ratio values
en = ("en", ElasticNet(random_state=random_state))
en_params = dict(
    # en__alpha=np.logspace(-7, 3, num=20, base=10), en__l1_ratio=np.logspace(-8, 0, num=20,base=10)
    en__alpha=np.logspace(-7, 3, num=20, base=10),
    en__l1_ratio=np.logspace(-8, 0, num=17, base=10),
)

# kernel ridge regression with an RBF kernel
kr = ("kr", KernelRidge(kernel="rbf"))
kr_params = dict(
    kr__alpha=np.logspace(-5, 3, num=20, base=10),
    kr__gamma=np.logspace(-5, 3, num=20, base=10),
)


# support vector regression
svr = ("svr", SVR())
svr_params = dict(
    svr__kernel=["linear", "rbf"],
    svr__C=[0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10],
    svr__gamma=["scale"],
    svr__epsilon=[0.00001, 0.0001, 0.001, 0.01, 0.1, 1],
)

# (estimator, parameter grid) pairs that are actually rated;
# only the elastic net is enabled
tested_regressors = [
    # (svr, svr_params), 
    # (kr, kr_params), 
    (en, en_params)
]

#### Run Experiment

In [None]:
from warnings import filterwarnings

# Silence every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so it also hides any warnings the
# estimators emit while fitting — narrow it if those are of interest.
filterwarnings(action="ignore")

In [52]:
# Start from an empty table; the experiment loop rebinds this name with the
# frame returned by each run_experiment call.
results_static_df = pd.DataFrame(data=None)

In [53]:
# manually test different numbers of spatial filter components
# (range() excludes its upper bound, so max_spatial_filter itself is not tried)

for n_components in range(min_spatial_filter, max_spatial_filter, step_spatial_filter):

    # label for this configuration; it shows up in the `pipeline_name` column
    # of the results table
    pipeline_name = f"{spatial_filter}_{n_components}_bins"

    # build the pre-processing steps for this number of spatial-filter components
    this_steps = spatial_filter_bins_steps(spatial_filter_n_components=n_components)
    pre_processing_pipeline = Pipeline(steps=this_steps)

    # perform pre-processing
    # (the pipeline is fitted on, and applied to, the training data only)
    pre_processed_X = pre_processing_pipeline.fit_transform(X_train)
    
    # rate different models
    # NOTE(review): X_test is passed as-is, without going through
    # pre_processing_pipeline.transform — confirm that run_experiment does not
    # feed it to estimators fitted on pre_processed_X.
    # The returned frame replaces results_static_df, so results carry over from
    # one iteration to the next (presumably rows are appended — confirm).
    results_static_df = run_experiment(
        tested_regressors,
        regressor_params,
        pipeline_name,
        pre_processed_X,
        X_test,
        y_train,
        y_test,
        dataset_name,
        regressor_steps,
        results_static_df,
    )

EXTRACTION (108,)
IN LOWPASS FILTER
IN BUTTERWORTH FILTER SHAPE: (108,)
IN AVERAGE X SHAPE: (108,)
IN AVERAGE RETURN SHAPE: (108, 1, 181)
float64
IN BINS RETURN SHAPE: (108, 1, 14)
SWAP shape: (1, 108, 14)
POST SHAPE:(108, 14)
Rating ('en', ElasticNet(random_state=0)) 

Fitting 3 folds for each of 2720 candidates, totalling 8160 fits
     Best parameters: {'en__alpha': 0.6951927961775606, 'en__l1_ratio': 1e-08, 'feature_selection__n_components': 8}
     mean r2: 0.018827187074034857           ± 0.01
     mean r2 train: 0.09349037993319775
     The permutation P-value is = 0.050
     The permutation score is = 0.019

EXTRACTION (108,)
IN LOWPASS FILTER
IN BUTTERWORTH FILTER SHAPE: (108,)
IN AVERAGE X SHAPE: (108,)
IN AVERAGE RETURN SHAPE: (108, 2, 181)
float64
IN BINS RETURN SHAPE: (108, 2, 14)
SWAP shape: (2, 108, 14)
POST SHAPE:(108, 28)
Rating ('en', ElasticNet(random_state=0)) 

Fitting 3 folds for each of 2720 candidates, totalling 8160 fits
     Best parameters: {'en__alpha': 0.20

In [54]:
# Show the results collected by the spatial-filter sweep (one row per
# pipeline configuration in the captured output).
results_static_df

Unnamed: 0,best_estimator,cv_results,data_set,mean_cv_mae,mean_cv_mse,mean_cv_r2,mean_train_mae,mean_train_mse,mean_train_r2,model,p-value,parameters,pipeline_name,std_cv_mae,std_cv_mse,std_cv_r2
0,"(PCA(n_components=8, random_state=0), ElasticN...","{'mean_fit_time': [0.012073119481404623, 0.017...",error,-0.741456,-0.813407,0.018827,-0.722335,-0.753777,0.09349,en,0.049505,"{'en__alpha': 0.6951927961775606, 'en__l1_rati...",PCA_1_bins,0.044699,0.104915,0.009597
1,"(PCA(n_components=6, random_state=0), ElasticN...","{'mean_fit_time': [0.0053738753000895185, 0.03...",error,-0.70202,-0.75617,0.093554,-0.691843,-0.72468,0.132057,en,0.019802,"{'en__alpha': 0.20691380811147903, 'en__l1_rat...",PCA_2_bins,0.060528,0.135069,0.055825
2,"(PCA(n_components=8, random_state=0), ElasticN...","{'mean_fit_time': [0.0034896532694498696, 0.00...",error,-0.673311,-0.713155,0.153578,-0.666541,-0.674823,0.19286,en,0.009901,"{'en__alpha': 0.001623776739188721, 'en__l1_ra...",PCA_3_bins,0.080347,0.17489,0.108653
3,"(PCA(n_components=8, random_state=0), ElasticN...","{'mean_fit_time': [0.0048177242279052734, 0.00...",error,-0.676074,-0.727698,0.13325,-0.654202,-0.670471,0.197502,en,0.009901,"{'en__alpha': 0.01832980710832434, 'en__l1_rat...",PCA_4_bins,0.078295,0.168206,0.103002
4,"(PCA(n_components=8, random_state=0), ElasticN...","{'mean_fit_time': [0.00538333257039388, 0.0055...",error,-0.671556,-0.71392,0.145926,-0.645155,-0.670519,0.19967,en,0.009901,"{'en__alpha': 1e-07, 'en__l1_ratio': 1e-08, 'f...",PCA_5_bins,0.059293,0.135321,0.058225
5,"(PCA(n_components=8, random_state=0), ElasticN...","{'mean_fit_time': [0.00560768445332845, 0.0073...",error,-0.701762,-0.757027,0.092808,-0.658928,-0.689756,0.175099,en,0.009901,"{'en__alpha': 0.061584821106602544, 'en__l1_ra...",PCA_6_bins,0.072149,0.130227,0.03716
6,"(PCA(n_components=8, random_state=0), ElasticN...","{'mean_fit_time': [0.007640997568766276, 0.006...",error,-0.710471,-0.783725,0.063111,-0.67682,-0.710431,0.149063,en,0.019802,"{'en__alpha': 2.3357214690901213, 'en__l1_rati...",PCA_7_bins,0.077675,0.147297,0.05513
7,"(PCA(n_components=7, random_state=0), ElasticN...","{'mean_fit_time': [0.004378318786621094, 0.008...",error,-0.7264,-0.817485,0.024822,-0.689649,-0.72809,0.127411,en,0.019802,"{'en__alpha': 2.3357214690901213, 'en__l1_rati...",PCA_8_bins,0.082454,0.165957,0.07498


In [69]:
# Fit only the channel extraction -> pre-processing -> PCA part of the
# pipeline, with the spatial filter fixed at 3 components, so the fitted
# filter can be inspected afterwards.
this_steps = [
    ("channels_filtering", ChannelExtraction(significant_channels)),
    ("spatial_filter_preprocessing", SpatialFilterPreprocessing()),
    ("spatial_filter", PCA(n_components=3, random_state=random_state)),
]

pre_processing_pipeline = Pipeline(this_steps)

# Pipeline.fit returns the pipeline itself, so `pre_processed_pipeline` and
# `pre_processing_pipeline` refer to the same fitted object.
pre_processed_pipeline = pre_processing_pipeline.fit(X_train)

EXTRACTION (108,)


In [81]:
# Pull the component matrix out of the fitted "spatial_filter" (PCA) step.
fitted_spatial_filter = pre_processed_pipeline.named_steps["spatial_filter"]
pca_components = fitted_spatial_filter.components_

In [84]:
from numpy import save

# Write the PCA component matrix to a .npy file in the current working
# directory.
save(file="pca_components_static_non_averaged.npy", arr=pca_components)

---

In [None]:
# Inspect `results_mae_15_df` (defined outside this section) before it is
# pickled in the next cell.
results_mae_15_df

In [None]:
# Persist the results table under ../data, tagged with the dataset name.
# NOTE(review): unlike the sibling cells ("..._mae_A_", "..._mae_D_",
# "..._mae_R-D_"), there is no "_" between "mae" and the dataset name here;
# kept as-is because other code may load this exact path.
results_mae_15_df.to_pickle(
    f"../data/regression_PCA_sf15_fex8_mae{dataset_name}.pkl"
)

In [None]:
# Inspect `results_R_D_df` (defined outside this section) before it is
# pickled in the next cell.
results_R_D_df

In [None]:
# Persist the results table under ../data, tagged with the dataset name.
results_R_D_df.to_pickle(
    f"../data/regression_PCA_sf15_fex8_mae_R-D_{dataset_name}.pkl"
)

In [None]:
# Inspect `results_D_df` (defined outside this section); it is pickled a few
# cells further down.
results_D_df

In [None]:
# Inspect `results_A_df` (defined outside this section) before it is pickled
# in the next cell.
results_A_df

In [None]:
# Persist the results table under ../data, tagged with the dataset name.
results_A_df.to_pickle(
    f"../data/regression_PCA_sf15_fex8_mae_A_{dataset_name}.pkl"
)

In [None]:
# Persist the results table under ../data, tagged with the dataset name.
results_D_df.to_pickle(
    f"../data/regression_PCA_sf15_fex8_mae_D_{dataset_name}.pkl"
)

In [None]:
# Inspect `results_S_df` (defined outside this section) before it is pickled
# in the next cell.
results_S_df

In [None]:
# Persist the results table under ../data, tagged with the dataset name.
# Note that this file name carries an "r2" tag where the sibling cells use "mae".
results_S_df.to_pickle(
    f"../data/regression_PCA_sf15_fex8_r2_S_{dataset_name}.pkl"
)

In [None]:
# Show `results_S_df` once more (same table that was pickled in the cell above).
results_S_df