# ERN and Pe PCA components' scores

### Imports

In [1]:
import os
import re
import glob
import os
import ast
import os.path as op
from collections import defaultdict
from copy import deepcopy
import copy

import pickle
from time import time
import pywt
import mne
import scipy
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
import pandas as pd
import cesium.featurize
from plotly.subplots import make_subplots
from ipywidgets import Dropdown, FloatRangeSlider, IntSlider, FloatSlider, interact
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.base import TransformerMixin, BaseEstimator

import sys

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import FunctionTransformer
from sklearn.svm import SVR
from sklearn.decomposition import FastICA
from sklearn.decomposition import PCA

from rumination_experiment_transformers_averaged_CDS import *

import warnings

warnings.filterwarnings("ignore")


In [2]:
# Lift pandas' row/column display limits so frames are shown in full.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)

## Load pickled ERN and Pe results

In [3]:
# Load the pickled ERN and Pe results tables; their `best_estimator` and
# `pre_processed_pipeline` columns are read in the cells that follow.
# NOTE(review): unpickling can execute arbitrary code — only load trusted files.
results_ern = pd.read_pickle("../public_data/results/models_pickles/ern_results_with_external-p.pkl")
results_pe = pd.read_pickle("../public_data/results/models_pickles/pe_results_with_external-p.pkl")

## Compute ERN and Pe components scores

### 1. Extract data pre-processing pipelines for ERN and Pe

#### Extract post-PCA pipeline for ERN

In [4]:
# Post-PCA ERN steps: take the first entry of the 'features' step's
# transformer list from the first best estimator, then drop its final step.
best_ern_estimator = results_ern.best_estimator.iloc[0]
first_ern_branch = best_ern_estimator['features'].transformer_list[0]
last_steps_pipeline_ern = first_ern_branch[1][:-1]
last_steps_pipeline_ern

Pipeline(steps=[('ern_data_extraction', ErnTransformer()),
                ('ern_amplitude', ErnAmplitude2()),
                ('data_channel_swap', ChannelDataSwap()),
                ('postprocessing', PostprocessingTransformer())])

#### Extract post-PCA pipeline for Pe

In [5]:
# Post-PCA Pe steps: take the first entry of the 'features' step's
# transformer list from the first best estimator, then drop its final step.
best_pe_estimator = results_pe.best_estimator.iloc[0]
first_pe_branch = best_pe_estimator['features'].transformer_list[0]
last_steps_pipeline_pe = first_pe_branch[1][:-1]
last_steps_pipeline_pe

Pipeline(steps=[('pe_data_extraction', PeTransformer(stop_pe_bin=9)),
                ('pe_amplitude', PeAmplitude2()),
                ('data_channel_swap', ChannelDataSwap()),
                ('postprocessing', PostprocessingTransformer())])

#### Extract PCA-pipeline for both ERN ROIs

In [6]:
# ROI 1: pre-processing (PCA) pipeline stored at position 9 of the ERN results,
# plus the channel list configured on its 'channels_extraction' step.
roi_1_pipeline = results_ern.pre_processed_pipeline.iloc[9]
roi_1 = roi_1_pipeline.named_steps['channels_extraction'].channels_list

print(roi_1_pipeline, roi_1, sep="\n")

Pipeline(steps=[('channels_extraction',
                 PickChannels(channels_list=['Fpz', 'AFz', 'Fz', 'FCz', 'Cz',
                                             'CPz', 'P1', 'Pz', 'P2'])),
                ('average', Evoked()), ('extract_averaged_data', ExtractData()),
                ('spatial_filter_preprocessing', SpatialFilterPreprocessing()),
                ('spatial_filter', PCA(n_components=4, random_state=0)),
                ('spatial_filter_postprocessing',
                 SpatialFilterPostprocessing(timepoints_count=181)),
                ('lowpass_filter', LowpassFilter()),
                ('binning', BinTransformer()),
                ('centering', CenteredSignalAfterBaseline3())])
['Fpz', 'AFz', 'Fz', 'FCz', 'Cz', 'CPz', 'P1', 'Pz', 'P2']


In [7]:
# ROI 2: pre-processing (PCA) pipeline stored at position 15 of the ERN results,
# plus the channel list configured on its 'channels_extraction' step.
roi_2_pipeline = results_ern.pre_processed_pipeline.iloc[15]
roi_2 = roi_2_pipeline.named_steps['channels_extraction'].channels_list

print(roi_2_pipeline, roi_2, sep="\n")

Pipeline(steps=[('channels_extraction',
                 PickChannels(channels_list=['Fpz', 'AFz', 'F1', 'Fz', 'F2',
                                             'FCz', 'C1', 'Cz', 'C2', 'CPz',
                                             'P1', 'Pz', 'P2'])),
                ('average', Evoked()), ('extract_averaged_data', ExtractData()),
                ('spatial_filter_preprocessing', SpatialFilterPreprocessing()),
                ('spatial_filter', PCA(n_components=4, random_state=0)),
                ('spatial_filter_postprocessing',
                 SpatialFilterPostprocessing(timepoints_count=181)),
                ('lowpass_filter', LowpassFilter()),
                ('binning', BinTransformer()),
                ('centering', CenteredSignalAfterBaseline3())])
['Fpz', 'AFz', 'F1', 'Fz', 'F2', 'FCz', 'C1', 'Cz', 'C2', 'CPz', 'P1', 'Pz', 'P2']


#### Extract PCA-pipeline for both Pe ROIs

In [8]:
# ROI 3: pre-processing (PCA) pipeline stored at position 13 of the Pe results,
# plus the channel list configured on its 'channels_extraction' step.
roi_3_pipeline = results_pe.pre_processed_pipeline.iloc[13]
roi_3 = roi_3_pipeline.named_steps['channels_extraction'].channels_list

print(roi_3_pipeline, roi_3, sep="\n")

Pipeline(steps=[('channels_extraction',
                 PickChannels(channels_list=['Fpz', 'AFz', 'Fz', 'FCz', 'C1',
                                             'Cz', 'C2', 'CPz', 'P1', 'Pz',
                                             'P2'])),
                ('average', Evoked()), ('extract_averaged_data', ExtractData()),
                ('spatial_filter_preprocessing', SpatialFilterPreprocessing()),
                ('spatial_filter', PCA(n_components=4, random_state=0)),
                ('spatial_filter_postprocessing',
                 SpatialFilterPostprocessing(timepoints_count=181)),
                ('lowpass_filter', LowpassFilter()),
                ('binning', BinTransformer()), ('baseline', ErnBaselined()),
                ('centering', CenteredSignalAfterBaseline3())])
['Fpz', 'AFz', 'Fz', 'FCz', 'C1', 'Cz', 'C2', 'CPz', 'P1', 'Pz', 'P2']


In [9]:
# ROI 4: pre-processing (PCA) pipeline stored at position 1 of the Pe results,
# plus the channel list configured on its 'channels_extraction' step.
roi_4_pipeline = results_pe.pre_processed_pipeline.iloc[1]
roi_4 = roi_4_pipeline.named_steps['channels_extraction'].channels_list

print(roi_4_pipeline, roi_4, sep="\n")

Pipeline(steps=[('channels_extraction',
                 PickChannels(channels_list=['Fpz', 'AFz', 'F1', 'Fz', 'F2',
                                             'FC1', 'FCz', 'FC2', 'C1', 'Cz',
                                             'C2', 'CP1', 'CPz', 'CP2', 'P1',
                                             'Pz', 'P2'])),
                ('average', Evoked()), ('extract_averaged_data', ExtractData()),
                ('spatial_filter_preprocessing', SpatialFilterPreprocessing()),
                ('spatial_filter', PCA(n_components=4, random_state=0)),
                ('spatial_filter_postprocessing',
                 SpatialFilterPostprocessing(timepoints_count=181)),
                ('lowpass_filter', LowpassFilter()),
                ('binning', BinTransformer()), ('baseline', ErnBaselined()),
                ('centering', CenteredSignalAfterBaseline3())])
['Fpz', 'AFz', 'F1', 'Fz', 'F2', 'FC1', 'FCz', 'FC2', 'C1', 'Cz', 'C2', 'CP1', 'CPz', 'CP2', 'P1', 'Pz', 'P2']


### 2. Read the data

The train data

In [37]:
# Paths of the pickled training epochs and of the scales CSV
# (the CSV path is only defined here, not read in this cell).
df_name = "go_nogo_100_600_df_3-5_all_scales"
pickled_data_filename = "../data/responses_100_600_pickled/" + df_name + ".pkl"
info_filename = "../data/scales/all_scales.csv"

# Fail loudly when the pickle is missing: merely printing a message would let
# later cells continue with an undefined or stale `epochs_df`.
if not os.path.isfile(pickled_data_filename):
    raise FileNotFoundError(f"Pickled file not found: {pickled_data_filename}")

print("Pickled file found. Loading pickled data...")
epochs_df = pd.read_pickle(pickled_data_filename)
print("Done")

Pickled file found. Loading pickled data...
Done


In [38]:
# Bind the freshly loaded training epochs to a dedicated name; `epochs_df` is
# reassigned when the test data is loaded. This is an alias, not a copy.
X_train_df = epochs_df

The test data

In [39]:
# Paths of the pickled test epochs and of the scales CSV
# (the CSV path is only defined here, not read in this cell).
df_name = 'go_nogo_100_600_test_df_3-5_all_scales'
pickled_data_filename = "../data/responses_100_600_pickled/" + df_name + ".pkl"
info_filename = "../data/scales/all_scales.csv"

# Fail loudly when the pickle is missing: merely printing a message would leave
# `epochs_df` pointing at the previously loaded (training) data, which would
# then be silently used as the test set.
if not os.path.isfile(pickled_data_filename):
    raise FileNotFoundError(f"Pickled file not found: {pickled_data_filename}")

print("Pickled file found. Loading pickled data...")
epochs_df = pd.read_pickle(pickled_data_filename)
print("Done")

Pickled file found. Loading pickled data...
Done


In [40]:
# Bind the freshly loaded test epochs to a dedicated name (alias, not a copy).
X_test_df = epochs_df

### 3. Apply pre-processing pipelines to data and compute statistics: mean and SD of scores

In [111]:
# Compute, for the train and test sets, the scores of every PCA component in
# each of the four ROIs. Produces:
#   results_df               - one row per (dataset, ROI, component) with mean and SD
#   per_participant_results  - one row per individual score
N_COMPONENTS = 4  # every ROI pipeline uses PCA(n_components=4)

# (ROI number, ROI-specific PCA pipeline, post-PCA feature pipeline)
roi_pipelines = [
    (1, roi_1_pipeline, last_steps_pipeline_ern),
    (2, roi_2_pipeline, last_steps_pipeline_ern),
    (3, roi_3_pipeline, last_steps_pipeline_pe),
    (4, roi_4_pipeline, last_steps_pipeline_pe),
]

# Rows/frames are collected in lists and assembled once at the end instead of
# growing DataFrames inside the loop (DataFrame.append no longer exists in
# pandas >= 2.0).
summary_rows = []
participant_frames = []

for dataset_name, dataset in {"train": X_train_df, "test": X_test_df}.items():
    print(f"---- DATASET: {dataset_name}-----------")

    # Run all four ROI transforms before reporting, so anything the pipelines
    # print appears ahead of the per-ROI statistics.
    roi_scores = {}
    for roi, pca_pipeline, feature_pipeline in roi_pipelines:
        # Each pipeline gets its own deep copy of the input frame, so no
        # pipeline can affect the data seen by the next one.
        X_df_copy = pd.DataFrame(copy.deepcopy(dataset.to_dict()))
        X_roi_mid = pca_pipeline.transform(X_df_copy)
        roi_scores[roi] = feature_pipeline.transform(X_roi_mid)

    for roi, X_roi in roi_scores.items():
        print(f"---- ROI {roi}")

        for i in range(N_COMPONENTS):
            # Column i holds the scores of component i+1 for this ROI.
            component_scores = X_roi[:, i]
            mean = np.mean(component_scores)
            sd = np.std(component_scores)

            summary_rows.append({
                'dataset': dataset_name,
                'ROI': roi,
                'component_num': i + 1,
                'mean': mean,
                'sd': sd,
            })

            # Scalars are broadcast to the length of this ROI's own score
            # vector (not ROI 1's, as in the earlier copy-pasted sections).
            participant_frames.append(pd.DataFrame({
                'dataset': dataset_name,
                'ROI': roi,
                'component_num': i + 1,
                'score': component_scores,
            }))

            print(f"---- PCA {i+1}\n MEAN: {mean}\n SD: {sd}\n")

results_df = pd.DataFrame(
    summary_rows,
    columns=['dataset', 'ROI', 'component_num', 'mean', 'sd'],
)
per_participant_results = pd.concat(participant_frames, ignore_index=True)

---- DATASET: train-----------
(96, 9, 181)
(96, 13, 181)
(96, 11, 181)
(96, 17, 181)
---- ROI 1
---- PCA 1
 MEAN: 1.8953602928889747e-05
 SD: 1.0562940025190903e-05

---- PCA 2
 MEAN: 1.1845588662262889e-05
 SD: 6.89601125647799e-06

---- PCA 3
 MEAN: 6.042230898537439e-06
 SD: 3.115857273079911e-06

---- PCA 4
 MEAN: 2.102309718395574e-06
 SD: 1.1593810121973813e-06

---- ROI 2
---- PCA 1
 MEAN: 2.4324090350905986e-05
 SD: 1.3607232178584808e-05

---- PCA 2
 MEAN: 1.2994118919941206e-05
 SD: 7.699727379038759e-06

---- PCA 3
 MEAN: 6.090104292145985e-06
 SD: 3.269377585271518e-06

---- PCA 4
 MEAN: 2.4620134524185194e-06
 SD: 1.400189878581368e-06

---- ROI 3
---- PCA 1
 MEAN: 4.428561129222968e-05
 SD: 1.909256875971781e-05

---- PCA 2
 MEAN: 1.1260718840475782e-05
 SD: 5.460446614857752e-06

---- PCA 3
 MEAN: 6.568368335157186e-06
 SD: 3.296254396448207e-06

---- PCA 4
 MEAN: 2.7183267475472265e-06
 SD: 1.5557452591412552e-06

---- ROI 4
---- PCA 1
 MEAN: 5.625392113278409e-05
 SD:

In [112]:
# Label every row with the ERP its ROI belongs to: ROIs 1-2 are computed with
# the ERN pipelines, ROIs 3-4 with the Pe pipelines. Deriving the label from
# the `ROI` column (rather than hard-coding it by row position) keeps it
# correct for any row order and makes re-running this cell idempotent.
ROI_TO_ERP = {1: 'ERN', 2: 'ERN', 3: 'Pe', 4: 'Pe'}
results_df['erp'] = results_df['ROI'].map(ROI_TO_ERP)

In [113]:
# Preview the first rows of the per-component statistics table.
results_df.head()

Unnamed: 0,dataset,ROI,component_num,mean,sd,erp
0,train,1,1,1.9e-05,1.1e-05,ERN
1,train,1,2,1.2e-05,7e-06,ERN
2,train,1,3,6e-06,3e-06,ERN
3,train,1,4,2e-06,1e-06,ERN
4,train,2,1,2.4e-05,1.4e-05,ERN


In [None]:
# Display only: set_index returns a new frame and the result is not assigned,
# so `results_df` itself keeps its default integer index.
results_df.set_index(['dataset', 'ROI', 'component_num'])

In [128]:
# Save the per-component statistics table to CSV (the index is written as the first column).
results_df.to_csv('../public_data/results/components_scores/scores_per_component_stats.csv')

In [44]:
# Keep an independent copy of the PCA-based per-participant scores:
# `per_participant_results` is re-created by the ERP analysis further down.
per_participant_results_PCA = per_participant_results.copy()
per_participant_results_PCA.head()

Unnamed: 0,dataset,ROI,component_num,score
0,train,1,1,1.3e-05
1,train,1,1,1.7e-05
2,train,1,1,3.7e-05
3,train,1,1,7e-06
4,train,1,1,1.4e-05


## ERN/Pe ERP analyses

### Compute ERN and Pe components scores

In [45]:
# Load the pickled ERN and Pe wave-analysis results. This rebinds `results_ern`
# and `results_pe`, replacing the PCA-analysis results loaded earlier.
# NOTE(review): unpickling can execute arbitrary code — only load trusted files.
results_ern = pd.read_pickle("../public_data/results/models_pickles/ern_waves_analysis_train_cv_lin.pkl")
results_pe = pd.read_pickle("../public_data/results/models_pickles/pe_waves_analysis_train_cv_lin.pkl")

#### 1. Extract data pre-processing pipelines for ERN and Pe

In [46]:
# ERN: pre-processing pipeline, and the best estimator with its final step dropped.
# NOTE(review): `[0]` selects by index label here, whereas earlier cells use
# positional `.iloc[...]` — confirm label 0 is the intended row.
ern_pipeline = results_ern.pre_processed_pipeline[0]
ern_features_pipeline = results_ern.best_estimator[0][:-1]

# Pe: same extraction from the Pe results.
pe_pipeline = results_pe.pre_processed_pipeline[0]
pe_features_pipeline = results_pe.best_estimator[0][:-1]

In [47]:
# Display the Pe pre-processing pipeline.
pe_pipeline

Pipeline(steps=[('channels_extraction', PickChannels(channels_list=['Cz'])),
                ('trim', EpochTrim(tmax=0.35, tmin=0.15)),
                ('average', Evoked()),
                ('extract_averaged_data', ExtractData())])

#### 2. Read the data

The train data

In [48]:
# Paths of the pickled training epochs and of the scales CSV
# (the CSV path is only defined here, not read in this cell).
df_name = "go_nogo_100_600_df_3-5_all_scales"
pickled_data_filename = "../data/responses_100_600_pickled/" + df_name + ".pkl"
info_filename = "../data/scales/all_scales.csv"

# Fail loudly when the pickle is missing: merely printing a message would let
# later cells continue with a stale `epochs_df` from an earlier cell.
if not os.path.isfile(pickled_data_filename):
    raise FileNotFoundError(f"Pickled file not found: {pickled_data_filename}")

print("Pickled file found. Loading pickled data...")
epochs_df = pd.read_pickle(pickled_data_filename)
print("Done")

Pickled file found. Loading pickled data...
Done


In [49]:
# Bind the freshly loaded training epochs to a dedicated name; `epochs_df` is
# reassigned when the test data is loaded. This is an alias, not a copy.
X_train_df = epochs_df

The test data

In [50]:
# Paths of the pickled test epochs and of the scales CSV
# (the CSV path is only defined here, not read in this cell).
df_name = 'go_nogo_100_600_test_df_3-5_all_scales'
pickled_data_filename = "../data/responses_100_600_pickled/" + df_name + ".pkl"
info_filename = "../data/scales/all_scales.csv"

# Fail loudly when the pickle is missing: merely printing a message would leave
# `epochs_df` pointing at the previously loaded (training) data, which would
# then be silently used as the test set.
if not os.path.isfile(pickled_data_filename):
    raise FileNotFoundError(f"Pickled file not found: {pickled_data_filename}")

print("Pickled file found. Loading pickled data...")
epochs_df = pd.read_pickle(pickled_data_filename)
print("Done")

Pickled file found. Loading pickled data...
Done


In [51]:
# Bind the freshly loaded test epochs to a dedicated name (alias, not a copy).
X_test_df = epochs_df

#### 3. Apply pipelines to data and compute statistics

In [52]:
# Compute the ERN and Pe scores for the train and test sets. Produces:
#   results_df               - one row per (dataset, component) with mean and SD
#   per_participant_results  - one row per score, columns ERN / Pe / dataset
# Rows/frames are collected in lists and assembled once at the end
# (DataFrame.append no longer exists in pandas >= 2.0).
summary_rows = []
participant_frames = []

for dataset_name, dataset in {"train": X_train_df, "test": X_test_df}.items():
    # ERN — the pipeline works on its own deep copy of the input frame.
    X_df_copy = pd.DataFrame(copy.deepcopy(dataset.to_dict()))
    X_ern_mid = ern_pipeline.transform(X_df_copy)
    X_ern = ern_features_pipeline.transform(X_ern_mid)

    # Pe — again on a fresh deep copy.
    X_df_copy = pd.DataFrame(copy.deepcopy(dataset.to_dict()))
    X_pe_mid = pe_pipeline.transform(X_df_copy)
    X_pe = pe_features_pipeline.transform(X_pe_mid)

    for component, scores in (("ERN", X_ern), ("Pe", X_pe)):
        summary_rows.append({
            'dataset': dataset_name,
            'component': component,
            'mean': np.mean(scores),
            'sd': np.std(scores),
        })

    # The scalar dataset name is broadcast to the length of the score columns.
    participant_frames.append(pd.DataFrame({
        "ERN": X_ern.flatten(),
        "Pe": X_pe.flatten(),
        "dataset": dataset_name,
    }))

results_df = pd.DataFrame(summary_rows, columns=['dataset', 'component', 'mean', 'sd'])
per_participant_results = pd.concat(participant_frames, ignore_index=True)

IN PE RETURN SHAPE: (206, 1, 27)
(96, 1, 27)
IN PE RETURN SHAPE: (206, 1, 53)
(96, 1, 53)
IN PE RETURN SHAPE: (178, 1, 27)
(34, 1, 27)
IN PE RETURN SHAPE: (178, 1, 53)
(34, 1, 53)


In [53]:
# Display the ERN/Pe mean and SD per dataset.
results_df

Unnamed: 0,dataset,component,mean,sd
0,train,ERN,-5e-06,5e-06
1,train,Pe,1.5e-05,7e-06
2,test,ERN,-4e-06,4e-06
3,test,Pe,1.4e-05,8e-06


In [73]:
# Save the ERN/Pe summary statistics to CSV (the index is written as the first column).
results_df.to_csv('../public_data/results/components_scores/scores_wave_analysis_stats.csv')

In [55]:
# Keep an independent copy of the ERP-based per-participant scores under a
# dedicated name, then preview it.
per_participant_results_ERP = per_participant_results.copy()
per_participant_results_ERP.head()

Unnamed: 0,ERN,Pe,dataset
0,-4.188265e-06,2.7e-05,train
1,-3.915253e-06,1.2e-05,train
2,-1.794473e-05,5e-06,train
3,3.809618e-07,7e-06,train
4,-2.832309e-06,8e-06,train


In [75]:
# Save the ERP-based per-participant scores to CSV (the index is written as the first column).
per_participant_results.to_csv('../public_data/results/components_scores/scores_wave_analysis_per_participant.csv')

### Test differences between train and test PCA-based and ERP-based scores

- PCA-based scores

In [119]:
# Preview the PCA-based per-participant scores used in the tests below.
per_participant_results_PCA.head()

Unnamed: 0,dataset,ROI,component_num,score
0,train,1,1,1.3e-05
1,train,1,1,1.7e-05
2,train,1,1,3.7e-05
3,train,1,1,7e-06
4,train,1,1,1.4e-05


In [120]:
# Permutation t-tests (train vs test) for every ROI x PCA-component score.
grouped_results = per_participant_results_PCA.groupby(by=["dataset", 'ROI', 'component_num'])

# Rows are collected in a list and turned into a frame once, instead of
# concatenating onto a growing DataFrame on every iteration.
test_rows = []

for roi in range(1, 5):
    for component_num in range(1, 5):
        X_train = grouped_results.get_group(('train', roi, component_num))['score'].to_numpy()
        X_test = grouped_results.get_group(('test', roi, component_num))['score'].to_numpy()

        # Seed the permutation resampling so the reported p-values are
        # reproducible across runs.
        t_stat, p_val = scipy.stats.ttest_ind(
            X_train, X_test, permutations=1000, random_state=0
        )

        test_rows.append({
            'ROI': roi,
            'component': component_num,
            't_stats': t_stat,
            'p_value': p_val,
        })

PCA_scores_differences_results = pd.DataFrame(
    test_rows, columns=['ROI', 'component', 't_stats', 'p_value']
)

In [121]:
# Index the test results by (ROI, component) and display them.
# Re-running this cell fails because those columns have already been moved into the index.
PCA_scores_differences_results = PCA_scores_differences_results.set_index(['ROI', 'component'])
PCA_scores_differences_results

Unnamed: 0_level_0,Unnamed: 1_level_0,t_stats,p_value
ROI,component,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,0.287685,0.77
1,2,2.296702,0.02
1,3,0.133513,0.881
1,4,1.111342,0.283
2,1,0.395409,0.663
2,2,2.204175,0.031
2,3,-0.066567,0.946
2,4,1.541497,0.119
3,1,0.891862,0.386
3,2,1.437118,0.152


In [85]:
# Save the PCA-based train/test comparison results to CSV.
PCA_scores_differences_results.to_csv('../public_data/results/components_scores/train_test_PCA_statistical_tests_results.csv')

- ERP-based scores

In [90]:
# Preview the ERP-based per-participant scores used in the tests below.
per_participant_results_ERP.head()

Unnamed: 0,ERN,Pe,dataset
0,-4.188265e-06,2.7e-05,train
1,-3.915253e-06,1.2e-05,train
2,-1.794473e-05,5e-06,train
3,3.809618e-07,7e-06,train
4,-2.832309e-06,8e-06,train


In [91]:
# Permutation t-tests (train vs test) for the ERN and Pe scores.
# Group by the plain column name: grouping by a one-element list while calling
# get_group with a scalar key is deprecated in recent pandas versions.
grouped_results = per_participant_results_ERP.groupby("dataset")
train_scores = grouped_results.get_group('train')
test_scores = grouped_results.get_group('test')

# Rows are collected in a list and turned into a frame once, instead of
# concatenating onto a growing DataFrame on every iteration.
test_rows = []

for component in ['ERN', 'Pe']:
    X_train = train_scores[component].to_numpy()
    X_test = test_scores[component].to_numpy()

    # Seed the permutation resampling so the reported p-values are
    # reproducible across runs.
    t_stat, p_val = scipy.stats.ttest_ind(
        X_train, X_test, permutations=1000, random_state=0
    )

    test_rows.append({
        'component': component,
        't_stats': t_stat,
        'p_value': p_val,
    })

ERP_scores_differences_results = pd.DataFrame(
    test_rows, columns=['component', 't_stats', 'p_value']
)

In [92]:
# Index the test results by component and display them.
# Re-running this cell fails because that column has already been moved into the index.
ERP_scores_differences_results = ERP_scores_differences_results.set_index(['component'])
ERP_scores_differences_results

Unnamed: 0_level_0,t_stats,p_value
component,Unnamed: 1_level_1,Unnamed: 2_level_1
ERN,-0.553042,0.595
Pe,0.980282,0.337


In [93]:
# Save the ERP-based train/test comparison results to CSV.
ERP_scores_differences_results.to_csv('../public_data/results/components_scores/train_test_ERP_statistical_tests_results.csv')