# Behavioral data analysis

### Imports

In [None]:
import os
import sys
import re
import glob
import ast
import os.path as op
import pickle
import mne
import scipy
import numpy as np
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
import networkx as nx
import scipy as sp

from collections import defaultdict

from copy import deepcopy
import copy

import pygsp

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA, FastICA
from sklearn.utils import resample
from sklearn.covariance import GraphicalLassoCV
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.linear_model import LinearRegression
from scipy.interpolate import UnivariateSpline
from scipy.integrate import simpson
from scipy.stats import boxcox
from sklearn.metrics import auc


from scipy.stats import pearsonr

from autoreject import AutoReject

from transformers import *

import warnings
warnings.filterwarnings("ignore")

---
## Load data

Load the EEG data and the questionnaire data. By default, `create_df_data` loads every column from the given CSV file; to load only some of them, pass a list of the desired column labels as `info`.

In [None]:
# paths TODO
# Parent directory of the current working directory.
dir_path = op.dirname(op.abspath(""))

In [None]:
# Start and end of the segments (passed to mne.Epochs as tmin / tmax)
tmin = -0.1
tmax = 0.6
signal_frequency = 256
# Seed handed to AutoReject in load_epochs_from_file
random_state = 0

In [None]:
def create_df_data(
    test_participants=False,
    test_epochs=False,
    info_filename=None,
    info="all",
    personal=True,
):
    """Load epochs for all selected participants into a single DataFrame.

    Header files matching ``data/responses_100_600_sonata/*.vhdr`` (relative
    to the parent of the current working directory) are sorted and split
    deterministically (``random_state=0``) into 70% train and 30% test
    participants. By default the train participants are loaded.

    Participants with fewer than 5 epochs in either condition
    (``error_response`` / ``correct_response``) are skipped.

    No per-participant epoch split is performed by this function: each
    participant contributes one row holding all of their epochs.

    Parameters
    ----------
    test_participants : bool
        If True, load the held-out 30% of participants; otherwise load the
        70% training participants.
    test_epochs : bool
        Currently unused; accepted for backward compatibility.
    info_filename : str | None
        Path to a .csv file with additional per-participant data.
        If None, no additional columns are added.
    info : list of str | "all"
        Columns of the info file to load. If 'all', load every column.
    personal : bool
        Currently unused; accepted for backward compatibility.

    Returns
    -------
    go_nogo_data_df : pandas.DataFrame
        One row per retained participant, as built by
        ``create_df_from_epochs``. Empty if no participant is retained.

    Raises
    ------
    ValueError
        If no header file is found, or if a header file name does not
        contain ``GNG-<digits>``.
    """
    print(os.path.abspath(""))
    dir_path = os.path.dirname(os.path.abspath(""))
    print(dir_path)
    # NOTE(review): the recordings directory is hard-coded, so every caller
    # (including the Study 1 cells) reads the same set of files - confirm.
    header_files_glob = os.path.join(dir_path, "data/responses_100_600_sonata/*.vhdr")
    # Sorting keeps the split reproducible regardless of filesystem order.
    header_files = sorted(glob.glob(header_files_glob))
    if not header_files:
        raise ValueError(f"No .vhdr files found matching {header_files_glob}")

    # hold out 30% of participants for testing
    h_train, h_test = train_test_split(header_files, test_size=0.3, random_state=0)

    print(f"train size: {len(h_train)} ; test size: {len(h_test)}")

    selected_files = h_test if test_participants else h_train

    participant_frames = []
    for file in selected_files:
        # compute participant's id from the file name before the costly load
        id_match = re.match(r".*GNG-(\d+).*", file)
        if id_match is None:
            raise ValueError(f"Cannot extract participant id from file name: {file}")
        participant_id = id_match.group(1)

        # load eeg data for given participant
        participant_epochs = load_epochs_from_file(file)

        n_error = len(participant_epochs["error_response"])
        n_correct = len(participant_epochs["correct_response"])

        # exclude those participants who have too few samples
        if n_error < 5 or n_correct < 5:
            continue

        # construct dataframe for participant with: id|epochs|additional info...
        participant_frames.append(
            create_df_from_epochs(
                participant_id, participant_epochs, info_filename, info
            )
        )
        print(participant_id)

    if not participant_frames:
        return pd.DataFrame()

    # DataFrame.append was removed in pandas 2.0; concatenate once instead.
    return pd.concat(participant_frames, ignore_index=True)

In [None]:
def create_df_from_epochs(id, participant_epochs, info_filename, info):
    """Create a one-row DataFrame for a single participant.

    The row always holds the columns ``id`` and ``epoch``. When
    ``info_filename`` is given, the row of that .csv file whose ``Demo_kod``
    equals ``id`` is joined to it (``Demo_kod`` is read as a string column).
    If no row matches, the info columns are present but filled with NaN.

    Parameters
    ----------
    id : str
        participant's id extracted from filename
    participant_epochs : object
        the participant's epochs; stored as-is in the ``epoch`` column
    info_filename : str | None
        path to .csv file with additional data; None adds no extra columns
    info : list of str | str
        column names to load from the info file (``Demo_kod`` is always
        loaded). If 'all', load all columns. Any other single string is
        treated as one column name.

    Returns
    -------
    participant_df : pandas.DataFrame
        a single-row frame indexed by a default RangeIndex
    """
    info_df = pd.DataFrame()

    # get additional info from file
    if info_filename is not None:
        read_kwargs = {"dtype": {"Demo_kod": object}}
        if isinstance(info, str):
            requested = None if info == "all" else [info]
        else:
            # accept any iterable of labels (list, tuple, array)
            requested = list(info)
        if requested is not None:
            read_kwargs["usecols"] = ["Demo_kod"] + [
                label for label in requested if label != "Demo_kod"
            ]

        scales_df = pd.read_csv(info_filename, **read_kwargs)
        info_df = scales_df.loc[scales_df["Demo_kod"] == id].reset_index(drop=True)

    epoch_df = pd.DataFrame({"id": [id], "epoch": [participant_epochs]})
    # Left join on the positional index: only the first matching info row
    # (index 0) is attached. DataFrame.append was removed in pandas 2.0, so
    # the joined frame is returned directly.
    return epoch_df.join(info_df).reset_index(drop=True)

In [None]:
def load_epochs_from_file(file, reject_bad_segments="auto", mask=None):
    """Load response-locked epochs from a BrainVision header file.

    Response markers are merged into two conditions, ``correct_response``
    (event id 0) and ``error_response`` (event id 1), epochs are cut with the
    notebook-level ``tmin`` / ``tmax`` and then cleaned with AutoReject
    (seeded with the notebook-level ``random_state``).

    NOTE: ``tmin``, ``tmax`` and ``random_state`` are read from globals at
    call time, so the result depends on their current values.

    Args:
        file: path to a header file (.vhdr); the marker file is expected next
            to it with the same base name and a ``.vmrk`` extension.
        reject_bad_segments: currently unused; AutoReject is always applied.
        mask: currently unused.

    Returns:
        mne Epochs as returned by ``AutoReject.fit_transform``.

    """
    # Import the BrainVision data into an MNE Raw object
    raw = mne.io.read_raw_brainvision(file)

    # Read the event information as MNE annotations and attach it to the raw
    # object so the events can be reconstructed from it
    annot_file = os.path.splitext(file)[0] + ".vmrk"
    raw.set_annotations(mne.read_annotations(annot_file))

    # Map with response markers only.
    # (An earlier, commented-out mapping used codes 10004-10011 for the same
    # marker names.)
    event_dict = {
        'Stimulus/RE*ex*1_n*1_c_1*R*FB': 10003,
        'Stimulus/RE*ex*1_n*1_c_1*R*FG': 10004,
        'Stimulus/RE*ex*1_n*1_c_2*R': 10005,
        'Stimulus/RE*ex*1_n*2_c_1*R': 10006,
        'Stimulus/RE*ex*2_n*1_c_1*R': 10007,
        'Stimulus/RE*ex*2_n*2_c_1*R*FB': 10008,
        'Stimulus/RE*ex*2_n*2_c_1*R*FG': 10009,
        'Stimulus/RE*ex*2_n*2_c_2*R': 10010,
    }

    # Map for merged correct/error response markers
    merged_event_dict = {"correct_response": 0, "error_response": 1}
    correct_codes = [10003, 10004, 10008, 10009]
    error_codes = [10005, 10006, 10007, 10010]

    # Reconstruct the original events from Raw object
    events, event_ids = mne.events_from_annotations(raw, event_id=event_dict)

    # Merge correct/error response events
    merged_events = mne.merge_events(
        events,
        correct_codes,
        merged_event_dict["correct_response"],
        replace_events=True,
    )
    merged_events = mne.merge_events(
        merged_events,
        error_codes,
        merged_event_dict["error_response"],
        replace_events=True,
    )

    # Read epochs; no baseline correction, annotations do not reject epochs
    epochs = mne.Epochs(
        raw=raw,
        events=merged_events,
        event_id=merged_event_dict,
        tmin=tmin,
        tmax=tmax,
        baseline=None,
        reject_by_annotation=False,
        preload=True,
    )

    ar = AutoReject(random_state=random_state, n_jobs=10, verbose=0)
    epochs_ar, reject_log = ar.fit_transform(epochs, return_log=True)

    return epochs_ar

#### Read the data

In [None]:
def change_column_names(data_df):
    """Return a copy of ``data_df`` with questionnaire columns renamed.

    Long column headers are mapped to short labels. Both header variants
    found in the mapping (with a numeric prefix such as ``"16-..."`` and
    without it) map to the same short label, so frames using either variant
    end up with the same column names. Columns that are not listed are left
    unchanged; ``data_df`` itself is not modified.

    NOTE: if a frame contains both variants of one header, the result has
    duplicate column names.

    Parameters
    ----------
    data_df : pandas.DataFrame
        frame whose columns should be renamed

    Returns
    -------
    pandas.DataFrame
        the renamed frame
    """
    # Trailing "# mean" / "# sum" notes record how the score is aggregated.
    columns_dict = {
        # --- headers with a numeric prefix ---
        "Demo_kod": "ID",
        "16-Rumination Full Scale": "RRQ",  # mean
        "05-DASS-21 Anxiety scale": "DASS-21 Anx",  # mean
        "05-DASS-21 Stress scale": "DASS-21 Stress",  # mean
        "05-DASS-21 Depression scale": "DASS-21 Dep",  # mean
        "04-STAI Trait MEAN": "STAI-T_M",  # mean
        "04-STAI STATE 1-2 DIFFERENCE": "STAI-S Diff",  # sum
        "04-STAI Trait SUM": "STAI-T",  # sum
        "07-BIS": "BIS",  # mean
        "14-Obsessive-Compulsive WASHING": "WASH",  # mean
        "14-Obsessive-Compulsive OBSESSING": "OBSESS",  # mean
        "14-Obsessive-Compulsive HOARDING": "HOARD",  # mean
        "14-Obsessive-Compulsive ORDERING": "ORD",  # mean
        "14-Obsessive-Compulsive CHECKING": "CHECK",  # mean
        "14-Obsessive-Compulsive NEUTRALIZING": "NEU",  # mean
        # "14-Obsessive-Compulsive FULL": "OCI-R",
        "18-Thought Suppression Inventory": "WBSI",  # mean
        "28-Intolerance of Uncertainty - Prospective Anxiety": "IUS-P",  # mean
        "28-Intolerance of Uncertainty - Inhibitory Anxiety": "IUS-I",  # mean
        "06-Self-Esteem Scale_SES Rosenberga": "SES",  # mean
        "07-BAS Dzialanie": 'BAS_D',  # mean
        "07-BAS Poszukiwanie przyjemnosci": 'BAS_PRZY',  # mean
        "07-BAS Wrazliwosc na nagrode": 'BAS_NAG',  # mean
        "22-Nonforgiveness - Full Scale": 'NONFOR',
        "27-Indecisiveness Scale_Frost": 'INDEC_F',  # mean
        "03-SP (Punishment Sensitivity)": 'PUN',  # sum
        "03-SR (Reward Sensitivity)": 'REW',  # sum
        "15-Obsessional Beliefs - Inflated responsibility for harm": 'HARM',  # mean
        "15-Obsessional Beliefs - Importance/Control of Thoughts": 'T-CTR',  # mean
        "15-Obsessional Beliefs - Overestimation of threat": "OT",  # mean
        "15-Obsessional Beliefs - Perfectionism/ Intolerance of uncertainty": 'OB_PERF',  # mean
        "17-Perfectionism CMDA": 'CMDA',  # mean
        "17-Perfect PS-Personal Standards (7 items mean)": 'PS',  # mean
        "19-Guilt sensitivity": 'G_SE',  # mean
        "31-NFC Nietolerancja wieloznaczności-FULL": 'AMB',  # mean
        "31-NFC Preferowanie przewidywalności-FULL": 'PRED',  # mean
        "32-High standards from Maximization Full Scale": 'STAND',  # mean
        "Wiek": "Age",
        "Płeć": "Sex",
        "Ręczność": "Handness",

        # --- behavioral counts (shared by both header variants) ---
        "number_error": "uninhibited response",  # sum
        "number_inhibited": "inhibited response",  # sum

        # --- headers without a numeric prefix ---
        "Rumination Full Scale": "RRQ",
        "DASS-21 Anxiety scale 0-SUM": "DASS-21 Anx",  # sum
        "DASS-21 Stress scale 0-SUM": "DASS-21 Stress",  # sum
        "DASS-21 Depression scale 0-SUM": "DASS-21 Dep",  # sum
        "STAI STATE 1-2 DIFFERENCE": "STAI-S Diff",  # sum
        "STAI Trait SUM": "STAI-T",  # sum
        "BIS": "BIS",  # mean
        "Obsessive-Compulsive WASHING": "WASH",  # mean
        "Obsessive-Compulsive OBSESSING": "OBSESS",  # mean
        "Obsessive-Compulsive HOARDING": "HOARD",  # mean
        "Obsessive-Compulsive ORDERING": "ORD",  # mean
        "Obsessive-Compulsive CHECKING": "CHECK",  # mean
        "Obsessive-Compulsive NEUTRALIZING": "NEU",  # mean
        # "14-Obsessive-Compulsive FULL": "OCI-R",
        "Thought Suppression Inventory": "WBSI",  # mean
        "Intolerance of Uncertainty - Prospective Anxiety": "IUS-P",  # mean
        "Intolerance of Uncertainty - Inhibitory Anxiety": "IUS-I",  # mean
        "Self-Esteem Scale_SES Rosenberga MEAN": "SES",  # mean
        "BAS Dzialanie": 'BAS_D',  # mean # drive
        "BAS Poszukiwanie przyjemnosci": 'BAS_PRZY',  # mean # fun seeking
        "BAS Wrazliwosc na nagrode": 'BAS_NAG',  # mean # responsiveness
        # "22-Nonforgiveness - Full Scale": 'NONFOR',
        "Indecisiveness Scale_Frost": 'INDEC_F',  # mean
        "SP (Punishment Sensitivity)": 'PUN',  # sum
        "SR (Reward Sensitivity)": 'REW',  # sum
        "Obsessional Beliefs - Inflated responsibility for harm": 'HARM',  # mean
        "Obsessional Beliefs - Importance/Control of Thoughts": 'T-CTR',  # mean
        "Obsessional Beliefs - Overestimation of threat": "OT",  # mean
        "Obsessional Beliefs - Perfectionism/ Intolerance of uncertainty": 'OB_PERF',  # mean
        # "17-Perfectionism CMDA": 'CMDA',
        "Perfect PS-Personal Standards (7 items mean)": 'PS',  # mean
        "Guilt sensitivity": 'G_SE',  # mean
        "Nietolerancja wieloznaczności-FULL": 'AMB',  # mean
        "Preferowanie przewidywalności-FULL": 'PRED',  # mean
        "High standards from Maximization Full Scale": 'STAND',  # mean
    }

    return data_df.rename(columns=columns_dict)

- read Study 1 train data

In [None]:
# %%capture
df_name = "GNG_autoreject_3_5"
# df_name = "GNG_reject_auto_3-5"
pickled_data_filename = "../data/" + df_name + ".pkl"
# NOTE(review): this Study 1 cell points at the Sonata scales file, and
# create_df_data reads the hard-coded "responses_100_600_sonata" directory,
# so the fallback branch would rebuild this frame from that data - confirm.
info_filename = "../data/scales/Sonata_scales.csv"


# Check if data is already loaded
if os.path.isfile(pickled_data_filename):
    print("Pickled file found. Loading pickled data...")
    epochs_train_opus_df = change_column_names(pd.read_pickle(pickled_data_filename))
    print("Done")
else:
    print("Pickled file not found. Loading data...")
    epochs_train_opus_df = change_column_names(
        create_df_data(
            test_participants=False, info="all", personal=False, info_filename=info_filename
        )
    )
    # Save under pickled_data_filename directly: an ad-hoc `.name` attribute
    # does not survive DataFrame.rename(), so it cannot carry the file name.
    epochs_train_opus_df.to_pickle(pickled_data_filename)
    print("Done. Pickle file created")

In [None]:
# Display the (rows, columns) dimensions of the Study 1 train frame.
epochs_train_opus_df.shape

In [None]:
# Divide the STAI-T score by 20.
# NOTE: not idempotent - re-running this cell divides the column again.
stai_t_scores = epochs_train_opus_df['STAI-T']
epochs_train_opus_df['STAI-T'] = stai_t_scores.apply(lambda score: score/20)

- read Study 2 train data

In [None]:
# %%capture
df_name = "GNG_autoreject_sonata_3_5_stai"
# df_name = "GNG_reject_auto_3-5"
pickled_data_filename = "../data/" + df_name + ".pkl"
info_filename = "../data/scales/Sonata_scales.csv"


# Check if data is already loaded
if os.path.isfile(pickled_data_filename):
    print("Pickled file found. Loading pickled data...")
    epochs_train_sonata_df = change_column_names(pd.read_pickle(pickled_data_filename))
    print("Done")
else:
    print("Pickled file not found. Loading data...")
    epochs_train_sonata_df = change_column_names(
        create_df_data(
            test_participants=False, info="all", personal=False, info_filename=info_filename
        )
    )
    # Save under pickled_data_filename directly: an ad-hoc `.name` attribute
    # does not survive DataFrame.rename(), so it cannot carry the file name.
    epochs_train_sonata_df.to_pickle(pickled_data_filename)
    print("Done. Pickle file created")

In [None]:
# Display the (rows, columns) dimensions of the Study 2 train frame.
epochs_train_sonata_df.shape

Adjust some questionnaire data so that the two datasets can be reconciled

In [None]:
# Recode the answer 'Osoba niebinarna' (Polish: "non-binary person") as 0.5.
is_nonbinary = epochs_train_sonata_df['Sex'] == 'Osoba niebinarna'
epochs_train_sonata_df.loc[is_nonbinary, 'Sex'] = 0.5

In [None]:
# normalize DASS-21 subscales scores to mean: (x + 7) / 7
# (presumably 7 items per subscale, scored from 0 - TODO confirm)
for dass_column in ('DASS-21 Stress', 'DASS-21 Anx', 'DASS-21 Dep'):
    epochs_train_sonata_df[dass_column] = epochs_train_sonata_df[dass_column].apply(lambda x: (x + 7)/7)

In [None]:
# normalize the STAI trait score (STAI-T) to mean (divide by amount of items in subscale)
# NOTE: not idempotent - re-running this cell divides the column again.
stai_t_scores = epochs_train_sonata_df['STAI-T']
epochs_train_sonata_df['STAI-T'] = stai_t_scores.apply(lambda score: score/20)

In [None]:
# Turn the literal string 'None' in STAI-S Diff into a real missing value,
# then cast the whole column to float (missing entries become NaN).
missing_mask = epochs_train_sonata_df['STAI-S Diff'] == 'None'
epochs_train_sonata_df.loc[missing_mask, 'STAI-S Diff'] = None
# Builtin float: the `np.float` alias was removed in NumPy 1.24.
epochs_train_sonata_df['STAI-S Diff'] = np.array(epochs_train_sonata_df['STAI-S Diff'].to_list()).astype(float)

In [None]:
# Replace missing STAI-S Diff values with the column mean of this frame.
stai_s_diff_mean = epochs_train_sonata_df['STAI-S Diff'].mean()
epochs_train_sonata_df['STAI-S Diff'] = epochs_train_sonata_df['STAI-S Diff'].fillna(stai_s_diff_mean)

#### Read data for external testing

- read Study 1 test

In [None]:
# %%capture
df_name = "GNG_autoreject_3_5_test_performance"
pickled_data_filename = "../data/" + df_name + ".pkl"
# info_filename = "../data/Demographic_Questionnaires_Behavioral_Results_N=163.csv"
# info_filename = "../data/scales/all_scales.csv"
# NOTE(review): this Study 1 cell points at the Sonata scales file, and
# create_df_data reads the hard-coded "responses_100_600_sonata" directory,
# so the fallback branch would rebuild this frame from that data - confirm.
info_filename = "../data/scales/Sonata_scales.csv"


# Check if data is already loaded
if os.path.isfile(pickled_data_filename):
    print("Pickled file found. Loading pickled data...")
    epochs_test_opus_df = change_column_names(pd.read_pickle(pickled_data_filename))
    print("Done")
else:
    print("Pickled file not found. Loading data...")
    epochs_test_opus_df = change_column_names(
        create_df_data(
            test_participants=True, info="all", personal=False, info_filename=info_filename
        )
    )
    # Save under pickled_data_filename directly: an ad-hoc `.name` attribute
    # does not survive DataFrame.rename(), so it cannot carry the file name.
    epochs_test_opus_df.to_pickle(pickled_data_filename)
    print("Done. Pickle file created")

In [None]:
# Display the (rows, columns) dimensions of the Study 1 test frame.
epochs_test_opus_df.shape

In [None]:
# Divide the STAI-T score by 20.
# NOTE: not idempotent - re-running this cell divides the column again.
stai_t_scores = epochs_test_opus_df['STAI-T']
epochs_test_opus_df['STAI-T'] = stai_t_scores.apply(lambda score: score/20)

- read Study 2 test

In [None]:
# %%capture
df_name = "GNG_autoreject_sonata_3_5_test_stai"
pickled_data_filename = "../data/" + df_name + ".pkl"
# info_filename = "../data/Demographic_Questionnaires_Behavioral_Results_N=163.csv"
# info_filename = "../data/scales/all_scales.csv"
info_filename = "../data/scales/Sonata_scales.csv"


# Check if data is already loaded
if os.path.isfile(pickled_data_filename):
    print("Pickled file found. Loading pickled data...")
    epochs_test_sonata_df = change_column_names(pd.read_pickle(pickled_data_filename))
    print("Done")
else:
    print("Pickled file not found. Loading data...")
    epochs_test_sonata_df = change_column_names(
        create_df_data(
            test_participants=True, info="all", personal=False, info_filename=info_filename
        )
    )
    # Save under pickled_data_filename directly: an ad-hoc `.name` attribute
    # does not survive DataFrame.rename(), so it cannot carry the file name.
    epochs_test_sonata_df.to_pickle(pickled_data_filename)
    print("Done. Pickle file created")

In [None]:
# Display the (rows, columns) dimensions of the Study 2 test frame.
epochs_test_sonata_df.shape

Adjust some questionnaire data so that the two datasets can be reconciled

In [None]:
# Recode the answer 'Osoba niebinarna' (Polish: "non-binary person") as 0.5.
is_nonbinary = epochs_test_sonata_df['Sex'] == 'Osoba niebinarna'
epochs_test_sonata_df.loc[is_nonbinary, 'Sex'] = 0.5

In [None]:
# normalize DASS-21 subscales scores to mean: (x + 7) / 7
# (presumably 7 items per subscale, scored from 0 - TODO confirm)
for dass_column in ('DASS-21 Stress', 'DASS-21 Anx', 'DASS-21 Dep'):
    epochs_test_sonata_df[dass_column] = epochs_test_sonata_df[dass_column].apply(lambda x: (x + 7)/7)

In [None]:
# normalize the STAI trait score (STAI-T) to mean (divide by amount of items in subscale)
# NOTE: not idempotent - re-running this cell divides the column again.
stai_t_scores = epochs_test_sonata_df['STAI-T']
epochs_test_sonata_df['STAI-T'] = stai_t_scores.apply(lambda score: score/20)

In [None]:
# Turn the literal string 'None' in STAI-S Diff into a real missing value,
# then cast the whole column to float (missing entries become NaN).
missing_mask = epochs_test_sonata_df['STAI-S Diff'] == 'None'
epochs_test_sonata_df.loc[missing_mask, 'STAI-S Diff'] = None
# Builtin float: the `np.float` alias was removed in NumPy 1.24.
epochs_test_sonata_df['STAI-S Diff'] = np.array(epochs_test_sonata_df['STAI-S Diff'].to_list()).astype(float)

In [None]:
# Replace missing STAI-S Diff values with the column mean of this frame.
stai_s_diff_mean = epochs_test_sonata_df['STAI-S Diff'].mean()
epochs_test_sonata_df['STAI-S Diff'] = epochs_test_sonata_df['STAI-S Diff'].fillna(stai_s_diff_mean)

## Merge dataframes

In [None]:
opus_columns_list = epochs_train_opus_df.columns.to_list()
sonata_columns_list = epochs_train_sonata_df.columns.to_list()

# Columns present in both train frames, each listed once and kept in the
# Study 1 column order. A plain set intersection gives an order that can
# change between interpreter runs (string hashing is randomized).
sonata_columns = set(sonata_columns_list)
columns = [column for column in dict.fromkeys(opus_columns_list) if column in sonata_columns]

In [None]:
# Restrict all four frames to the shared column list so they can be stacked.
epochs_train_opus_df = epochs_train_opus_df[columns]
epochs_test_opus_df = epochs_test_opus_df[columns]

epochs_train_sonata_df = epochs_train_sonata_df[columns]
epochs_test_sonata_df = epochs_test_sonata_df[columns]

Create train and test data

In [None]:
# Stack Study 2 (sonata) on top of Study 1 (opus) with a fresh 0..n-1 index.
train_frames = [epochs_train_sonata_df, epochs_train_opus_df]
test_frames = [epochs_test_sonata_df, epochs_test_opus_df]

epochs_train_df = pd.concat(train_frames, ignore_index=True)
epochs_test_df = pd.concat(test_frames, ignore_index=True)

## Basic sample info

In [None]:
# Whole sample (train followed by test), with Sex converted to a numeric column.
all_frames = [epochs_train_df, epochs_test_df]
data_df = pd.concat(all_frames, ignore_index=True)
data_df["Sex"] = pd.to_numeric(data_df["Sex"])
data_df.describe()

In [None]:
# Number of participants per value of the Sex column.
data_df["Sex"].value_counts()

## Behavioral data

In [None]:
# Behavioral statistics of both studies; ID is read as a string column.
stats_frames = [
    pd.read_csv(stats_path, dtype={'ID': object})
    for stats_path in ("../data/behavioral/stats_opus.csv", "../data/behavioral/stats_sonata.csv")
]
stats_opus_df, stats_sonata_df = stats_frames

stats_df = pd.concat(stats_frames, ignore_index=True)

In [None]:
# Participant ids that ended up in the train and the test EEG frames.
ID_train_list = epochs_train_df["id"].to_list()
ID_test_list = epochs_test_df["id"].to_list()

In [None]:
# Behavioral rows of the train / test participants. Explicit copies, because
# a 'performance' column is assigned to these frames further down and they
# must not be treated as slices of stats_df.
train_df = stats_df[stats_df['ID'].isin(ID_train_list)].copy()
test_df = stats_df[stats_df['ID'].isin(ID_test_list)].copy()

### The train dataset stats

Extract RT of correct and error trials

In [None]:
# Per-participant mean reaction times and trial counts (train set).
error_rt = train_df["mean_error_RT"].to_numpy()
correct_rt = train_df["mean_hit_RT"].to_numpy()
error_trials_num = train_df["number_error"].to_numpy()
# correct trials = fast hits + slow hits
correct_trials_num = (train_df["number_fast_hit"] + train_df["number_slow_hit"]).to_numpy()

In [None]:
# Mean and SD of the number of erroneous / correct responses (train set).
print(
    "X train average number of:\n"
    f" erroneous responses: {error_trials_num.mean()} SD = {error_trials_num.std()}\n"
    f" correct responses: {correct_trials_num.mean()} SD = {correct_trials_num.std()}"
)

In [None]:
# Mean and SD of the reaction times for erroneous / correct responses (train set).
print(
    "X train average RT for:\n"
    f" erroneous responses: {error_rt.mean()} SD = {error_rt.std()}\n"
    f" correct responses: {correct_rt.mean()} SD = {correct_rt.std()}"
)

In [None]:
# Paired t-test: error RT vs. correct RT within participants (train set).
paired_result = scipy.stats.ttest_rel(error_rt, correct_rt)
t_value, p_value = paired_result
degrees_of_freedom = train_df.shape[0] - 1
print(f"t({degrees_of_freedom}) = {t_value}, p = {p_value}")

Average number of trials included in the analysis

In [None]:
# Number of epochs per participant and condition (train set).
train_epochs = epochs_train_df['epoch']
error_len = train_epochs.map(lambda epochs: len(epochs['error_response'].get_data())).to_numpy()
correct_len = train_epochs.map(lambda epochs: len(epochs['correct_response'].get_data())).to_numpy()

In [None]:
# Mean and SD of the per-participant epoch counts (train set).
error_mean, error_sd = np.mean(error_len), np.std(error_len)
correct_mean, correct_sd = np.mean(correct_len), np.std(correct_len)
print(f"AVG number of incorrect responses in the train set: {error_mean} (SD={error_sd})")
print(f"AVG number of correct responses in the train set: {correct_mean} (SD={correct_sd})")

Average performance

In [None]:
# performance = inhibited / (inhibited + error), per participant (train set)
inhibited_count = train_df['number_inhibited']
inhibited_plus_error = inhibited_count + train_df['number_error']
train_df['performance'] = inhibited_count / inhibited_plus_error
performance = train_df['performance'].to_numpy()

In [None]:
# `performance` was computed from train_df just above, so label it as train.
print(f"AVG performance in the train set: {np.mean(performance)} (SD={np.std(performance)})")

### The test dataset stats

Extract RT of correct and error trials

In [None]:
# Per-participant mean reaction times and trial counts (test set).
error_rt = test_df["mean_error_RT"].to_numpy()
correct_rt = test_df["mean_hit_RT"].to_numpy()
error_trials_num = test_df["number_error"].to_numpy()
# correct trials = fast hits + slow hits
correct_trials_num = (test_df["number_fast_hit"] + test_df["number_slow_hit"]).to_numpy()

In [None]:
# Mean and SD of the number of erroneous / correct responses (test set).
print(
    "X test average number of:\n"
    f" erroneous responses: {error_trials_num.mean()} SD = {error_trials_num.std()}\n"
    f" correct responses: {correct_trials_num.mean()} SD = {correct_trials_num.std()}"
)

In [None]:
# Mean and SD of the reaction times for erroneous / correct responses (test set).
print(
    "X test average RT for:\n"
    f" erroneous responses: {error_rt.mean()} SD = {error_rt.std()}\n"
    f" correct responses: {correct_rt.mean()} SD = {correct_rt.std()}"
)

In [None]:
# Paired t-test: error RT vs. correct RT within participants (test set).
scipy.stats.ttest_rel(error_rt, correct_rt)

Average number of trials included in the analysis

In [None]:
# Number of epochs per participant and condition (test set).
test_epochs = epochs_test_df['epoch']
error_len = test_epochs.map(lambda epochs: len(epochs['error_response'].get_data())).to_numpy()
correct_len = test_epochs.map(lambda epochs: len(epochs['correct_response'].get_data())).to_numpy()

In [None]:
# Mean and SD of the per-participant epoch counts (test set).
error_mean, error_sd = np.mean(error_len), np.std(error_len)
correct_mean, correct_sd = np.mean(correct_len), np.std(correct_len)
print(f"AVG number of incorrect responses in the test set: {error_mean} (SD={error_sd})")
print(f"AVG number of correct responses in the test set: {correct_mean} (SD={correct_sd})")

Average performance

In [None]:
# performance = inhibited / (inhibited + error), per participant (test set)
inhibited_count = test_df['number_inhibited']
inhibited_plus_error = inhibited_count + test_df['number_error']
test_df['performance'] = inhibited_count / inhibited_plus_error
performance = test_df['performance'].to_numpy()

In [None]:
# Mean and SD of the per-participant performance (test set).
performance_mean = np.mean(performance)
performance_sd = np.std(performance)
print(f"AVG performance in the test set: {performance_mean} (SD={performance_sd})")

## EEG features

### Amplitude

In [None]:
# Channels of interest used by the amplitude pipelines.
roi = ["Fz"]

In [None]:
def _mean_amplitude_steps(evoked_step):
    """Return the shared pipeline steps, with `evoked_step` as the averaging stage."""
    return [
        ("channels_extraction", PickChannels(channels_list=roi)),
        ("trim", EpochTrim(tmin=0, tmax=0.1)),
        ("average", evoked_step),
        ('extract_averaged_data', ExtractData()),
        ("mean_amplitude", AverageSignal()),
    ]


# ERN: Evoked() with its default condition; CRN: the 'correct_response' condition.
ern_pipeline = Pipeline(_mean_amplitude_steps(Evoked()))

crn_pipeline = Pipeline(_mean_amplitude_steps(Evoked(condition='correct_response')))

- training set

Difference in the 0-100 ms window between conditions

In [None]:
# Work on a deep copy of the frame contents (presumably so the transformers
# cannot alter the objects stored in epochs_train_df - TODO confirm).
train_records = deepcopy(epochs_train_df.to_dict())
epochs_df_copy = pd.DataFrame(train_records)

preprocessed_X_ern_train = ern_pipeline.fit_transform(epochs_df_copy)

In [None]:
# Fresh deep copy of the train frame contents for the CRN pipeline.
train_records = deepcopy(epochs_train_df.to_dict())
epochs_df_copy = pd.DataFrame(train_records)

preprocessed_X_crn_train = crn_pipeline.fit_transform(epochs_df_copy)

In [None]:
# These are the train-set amplitudes, so label them as such.
# Values are multiplied by 1e6 before averaging (presumably V -> microvolts).
ern_train_scaled = preprocessed_X_ern_train*1000000
crn_train_scaled = preprocessed_X_crn_train*1000000
print(f"AVG ERN amplitude in the train set: {np.mean(ern_train_scaled)} (SD={np.std(ern_train_scaled)})")
print(f"AVG CRN amplitude in the train set: {np.mean(crn_train_scaled)} (SD={np.std(crn_train_scaled)})")

In [None]:
# Paired t-test: ERN vs. CRN amplitude within participants (train set).
scipy.stats.ttest_rel(preprocessed_X_ern_train, preprocessed_X_crn_train)

- testing set

Difference in the 0-100 ms window between conditions

In [None]:
# Work on a deep copy of the frame contents (presumably so the transformers
# cannot alter the objects stored in epochs_test_df - TODO confirm).
test_records = deepcopy(epochs_test_df.to_dict())
epochs_df_copy = pd.DataFrame(test_records)

preprocessed_X_ern_test = ern_pipeline.fit_transform(epochs_df_copy)

In [None]:
# Fresh deep copy of the test frame contents for the CRN pipeline.
test_records = deepcopy(epochs_test_df.to_dict())
epochs_df_copy = pd.DataFrame(test_records)

preprocessed_X_crn_test = crn_pipeline.fit_transform(epochs_df_copy)

In [None]:
# Values are multiplied by 1e6 before averaging (presumably V -> microvolts).
ern_test_scaled = preprocessed_X_ern_test*1000000
crn_test_scaled = preprocessed_X_crn_test*1000000
print(f"AVG ERN amplitude in the test set: {np.mean(ern_test_scaled)} (SD={np.std(ern_test_scaled)})")
print(f"AVG CRN amplitude in the test set: {np.mean(crn_test_scaled)} (SD={np.std(crn_test_scaled)})")

In [None]:
# Paired t-test: ERN vs. CRN amplitude within participants (test set).
scipy.stats.ttest_rel(preprocessed_X_ern_test, preprocessed_X_crn_test)

#### Difference between training and testing sets in amplitudes of ERN and CRN

In [None]:
# Independent-samples t-test: ERN amplitude, train vs. test participants.
scipy.stats.ttest_ind(preprocessed_X_ern_train, preprocessed_X_ern_test)

In [None]:
# Independent-samples t-test: CRN amplitude, train vs. test participants.
scipy.stats.ttest_ind(preprocessed_X_crn_train, preprocessed_X_crn_test)

## Questionnaires, covariates, and ERP descriptive stats

In [None]:
# Switch for the cells below: True -> describe the test set, False -> the train set.
test = True
if test:
    dataset, epochs_df = 'test', epochs_test_df
else:
    dataset, epochs_df = 'train', epochs_train_df

ERN

- ROI: Fz
- time window: 0 - 100 ms
- mean amplitude

In [None]:
# Channels of interest for the ERN amplitude.
roi = ["Fz"]

In [None]:
# Deep copy of the selected frame's contents for the pipeline to work on.
epochs_df_copy = pd.DataFrame(deepcopy(epochs_df.to_dict()))

# Mean amplitude at the ROI in the 0-0.1 window, Evoked() with its default condition.
ern_steps = [
    ("channels_extraction", PickChannels(channels_list=roi)),
    ("trim", EpochTrim(tmin=0, tmax=0.1)),
    ("average", Evoked()),
    ('extract_averaged_data', ExtractData()),
    ("mean_amplitude", AverageSignal()),
]
ern_pipeline = Pipeline(ern_steps).fit(epochs_df_copy)

# One row per participant, remaining axes flattened.
ern_amplitudes = ern_pipeline.transform(epochs_df_copy)
preprocessed_X_ern = ern_amplitudes.reshape(ern_amplitudes.shape[0], -1)

In [None]:
# Display the dimensions of the ERN feature array.
preprocessed_X_ern.shape

CRN

- ROI: Fz
- time window: 0 - 100 ms
- mean amplitude

In [None]:
# Channels of interest for the CRN amplitude.
roi = ["Fz"]

In [None]:
# Deep copy of the selected frame's contents for the pipeline to work on.
epochs_df_copy = pd.DataFrame(deepcopy(epochs_df.to_dict()))

# Mean amplitude at the ROI in the 0-0.1 window for the 'correct_response' condition.
crn_steps = [
    ("channels_extraction", PickChannels(channels_list=roi)),
    ("trim", EpochTrim(tmin=0, tmax=0.1)),
    ("average", Evoked(condition='correct_response')),
    ('extract_averaged_data', ExtractData()),
    ("mean_amplitude", AverageSignal()),
]
crn_pipeline = Pipeline(crn_steps).fit(epochs_df_copy)

# One row per participant, remaining axes flattened.
crn_amplitudes = crn_pipeline.transform(epochs_df_copy)
preprocessed_X_crn = crn_amplitudes.reshape(crn_amplitudes.shape[0], -1)

In [None]:
# Display the dimensions of the CRN feature array.
preprocessed_X_crn.shape

Fractional area latency - ERN

In [None]:
def fractional_negative_area_latency(evoked, fraction=0.5, tmin=0.0, tmax=0.5, threshold = 0.0):
    """Estimate the fractional-area latency of the negative part of a signal.

    Every sample below ``threshold`` contributes its absolute value to the
    area; all other samples contribute zero. The area is integrated with
    Simpson's rule over a time axis evenly spaced between ``tmin`` and
    ``tmax``. The returned sample is the last point of the first leading
    segment whose area exceeds ``fraction`` of the total area.

    Parameters
    ----------
    evoked : object
        Anything exposing ``get_data()`` that returns an array; the data are
        flattened, so presumably a single-channel ``mne.Evoked`` is expected.
    fraction : float
        Fraction of the total area that defines the latency point.
    tmin, tmax : float
        Start and end of the time axis assigned to the samples.
    threshold : float
        Samples strictly below this value count towards the area.

    Returns
    -------
    tuple
        ``(sample_index, latency)``, or ``(None, None)`` when the thresholded
        signal has zero area.
    """
    subject_data = evoked.get_data()
    y = subject_data.flatten()

    # NOTE(review): the time axis is rebuilt from tmin/tmax instead of being
    # read from the evoked object, so the data must span exactly [tmin, tmax].
    x = np.linspace(tmin, tmax, subject_data.shape[-1])

    # Rectified signal: |y| where y is below the threshold, zero elsewhere.
    y_negative = np.where(y < threshold, np.abs(y), 0.0)

    # `x` is passed by keyword: recent SciPy releases no longer accept it
    # positionally.
    area = abs(simpson(y_negative, x=x))

    if area == 0.0:
        print('No area detected')
        return (None, None)

    fractional_area = area * fraction

    # Grow the leading segment one sample at a time until its area exceeds the
    # requested fraction. The loop is bounded by the number of samples, so it
    # also terminates when fraction >= 1 (the area can then never be exceeded).
    n_samples = y_negative.shape[0]
    fractional_area_index = 0
    for end in range(2, n_samples + 1):
        partial_area = abs(simpson(y_negative[:end], x=x[:end]))
        if not (partial_area <= fractional_area):
            break
        fractional_area_index = end

    # With fraction >= 1 the whole signal is consumed; fall back to the last
    # valid sample instead of indexing past the end of the time axis.
    fractional_area_index = min(fractional_area_index, n_samples - 1)

    return (fractional_area_index, x[fractional_area_index])

Parameters

In [None]:
roi = ['Fz']
tmin = -0.05
tmax = 0.2
threshold = 1*1e-6

Estimate fractional area latency

In [None]:
epochs_df_copy = pd.DataFrame(copy.deepcopy(epochs_df.to_dict()))

In [None]:
# Build the evoked response for condition='error_response' at the ROI channels,
# trimmed to the [tmin, tmax] window set in the parameters cell.
X = Pipeline([
    ("channels_extraction",PickChannels(channels_list=roi)),
    ("trim", EpochTrim(tmin=tmin, tmax=tmax)),
    ("average", Evoked(condition='error_response')),
]).fit_transform(epochs_df_copy)

# Keep only the 'evoked' column as a two-dimensional array with one column.
X = X[['evoked']].to_numpy()

In [None]:
# One (sample index, latency) pair per row of X, computed with the parameters
# defined in the cells above.
fractional_latencies = [
    fractional_negative_area_latency(
        evoked_row[0],
        fraction=0.5,
        tmin=tmin,
        tmax=tmax,
        threshold=threshold,
    )
    for evoked_row in X
]

In [None]:
# Keep the latency (second element of every pair) and store it as a column vector.
ern_latency_values = [pair[1] for pair in fractional_latencies]
fractional_latencies_ern = np.array(ern_latency_values).reshape(-1, 1)
fractional_latencies_ern.shape

Fractional area latency - CRN

Parameters: threshold at $2 \mu V$

In [None]:
roi = ['Fz']
tmin = -0.05
tmax = 0.2
threshold = 2*1e-6 # with this threshold we do not capture all participants: 1 = 7 people; 2 = everyone

Estimate fractional area latency

In [None]:
epochs_df_copy = pd.DataFrame(copy.deepcopy(epochs_df.to_dict()))

In [None]:
# Build the evoked response for condition='correct_response' at the ROI channels,
# trimmed to the [tmin, tmax] window set in the parameters cell.
X = Pipeline([
    ("channels_extraction",PickChannels(channels_list=roi)),
    ("trim", EpochTrim(tmin=tmin, tmax=tmax)),
    ("average", Evoked(condition='correct_response')),
]).fit_transform(epochs_df_copy)

# Keep only the 'evoked' column as a two-dimensional array with one column.
X = X[['evoked']].to_numpy()

In [None]:
# One (sample index, latency) pair per row of X, computed with the parameters
# defined in the cells above.
fractional_latencies = [
    fractional_negative_area_latency(
        evoked_row[0],
        fraction=0.5,
        tmin=tmin,
        tmax=tmax,
        threshold=threshold,
    )
    for evoked_row in X
]

In [None]:
# Keep the latency (second element of every pair) and store it as a column vector.
crn_latency_values = [pair[1] for pair in fractional_latencies]
fractional_latencies_crn_2uV = np.array(crn_latency_values).reshape(-1, 1)
fractional_latencies_crn_2uV.shape

---

#### Extract anxiety-related questionnaires scores

Questionnaires to include in analysis

In [None]:
rumination = "RRQ"
dass_anxiety = "DASS-21 Anx"
dass_stress = "DASS-21 Stress"
dass_dep = "DASS-21 Dep"
stai_t = "STAI-T" 
stai_s_diff = "STAI-S Diff" 
uninhibited_responses = "uninhibited response"
inhibited_responses = "inhibited response"
bis = "BIS"
bas_dzialanie = "BAS_D"
bas_przyjemnosc = "BAS_PRZY"
bas_nagroda = "BAS_NAG"
washing = "WASH"
obsessing = "OBSESS"
hoarding = "HOARD"
ordering = "ORD"
checking = "CHECK"
neutralizing = "NEU"
oci_r_full = "OCI-R"
threat = "OT"
thought_suppression = "WBSI"
indecisivness = "INDEC_F"
IU_prospecitve = "IUS-P"
IU_inhibitory = "IUS-I"
self_esteem = "SES"
punishment_sensitivity = "PUN"
reward_sensitivity = "REW"
harm_responsibility = "HARM"
thought_control = "T-CTR"
perfectionism_IU = "OB_PERF"
# perfectionism_cmda = "17-Perfectionism CMDA"
perfectionism_ps = "PS"
guilt_sensitivity = "G_SE"
intolerance_ambiguity = "AMB"
predictability = "PRED"
high_standards = "STAND"

In [None]:
scales = [
    rumination,
    # dass_anxiety,
    dass_stress,
    dass_dep,
    stai_t,
    stai_s_diff,
    uninhibited_responses,
    inhibited_responses,
    bis,
    bas_dzialanie,
    bas_przyjemnosc,
    bas_nagroda,
    washing,
    obsessing,
    hoarding,
    ordering,
    checking,
    neutralizing,
    # oci_r_full,
    threat,
    thought_suppression,
    indecisivness,
    punishment_sensitivity,
    reward_sensitivity,
    harm_responsibility,
    guilt_sensitivity,
    thought_control,
    perfectionism_IU,
    # perfectionism_cmda,
    perfectionism_ps,
    intolerance_ambiguity,
    predictability,
    high_standards,
    IU_prospecitve,
    IU_inhibitory,
    self_esteem,
]

In [None]:
# Take an explicit copy of the selected scale columns: later cells assign new
# values and columns to this frame, which must not be done on a slice of
# epochs_df (pandas flags that as setting values on a copy of a slice).
questionnaires_scores_df = epochs_df[scales].copy()
questionnaires_scores_df

Fill in missing values taken from an external file (TODO: automate this step)

In [None]:
questionnaires_scores_df[questionnaires_scores_df.isna().any(axis=1)]

In [None]:
# Manually fill the two response counts for index label 102; this is only done
# when the test set is loaded.
if test:
    # NOTE(review): the label 102 is hard-coded. Assigning through .at to a label
    # that is not in the index would presumably add a new row instead of failing -
    # confirm that 102 is present in the loaded data.
    questionnaires_scores_df.at[102, 'uninhibited response'] = 14.0
    questionnaires_scores_df.at[102, 'inhibited response'] = 98.0
else:
    print('None to fill')

In [None]:
questionnaires_scores_df[questionnaires_scores_df.isna().any(axis=1)]

Create performance metric based on inhibited and uninhibited responses

In [None]:
# Performance = inhibited responses divided by uninhibited responses; the two
# source columns are dropped once the ratio has been added.
performance_ratio = (
    questionnaires_scores_df['inhibited response']
    / questionnaires_scores_df['uninhibited response']
)
questionnaires_scores_df = (
    questionnaires_scores_df
    .assign(performance=performance_ratio)
    .drop(columns=['inhibited response', 'uninhibited response'])
)

Inspect data

In [None]:
questionnaires_scores_df[questionnaires_scores_df.isna().any(axis=1)]

In [None]:
with pd.option_context('display.max_colwidth', None, 'display.max_columns', None):
    display(questionnaires_scores_df.describe())

#### Demographic data

In [None]:
age = "Age"
sex = "Sex"
handness = "Handness"

In [None]:
scales = [
    age,
    # sex,
    handness
]

In [None]:
demographical_scores_df =  epochs_df[scales].astype(float)
demographical_scores_df = demographical_scores_df.rename(columns={'Handness': 'Handedness'})

In [None]:
demographical_scores_df.describe()

In [None]:
# demographical_scores_df['Sex'].value_counts()

#### Concatenate questionnaire, covariates, and EEG features

In [None]:
if test:
    # Latencies that could not be estimated are stored as None, which makes the
    # array object-typed. Casting to float turns them into NaN and keeps the
    # column numeric; the gaps are then filled with the mean of the valid values.
    crn_latencies_df = pd.DataFrame(fractional_latencies_crn_2uV).astype(float)
    crn_latencies_df = crn_latencies_df.fillna(value=np.nanmean(crn_latencies_df.to_numpy()))
    fractional_latencies_crn_2uV = crn_latencies_df.to_numpy()

In [None]:
# EEG features converted to microvolts (amplitudes) and milliseconds (latencies).
# The frame reuses the index of the questionnaire scores: pd.concat(axis=1) aligns
# rows on index labels, so a fresh 0..n-1 index would misalign (and pad with NaN)
# whenever the participant index is not a plain range. A length mismatch now
# raises immediately instead of being silently padded.
eeg_features_df = pd.DataFrame(
    {
        'ERN': preprocessed_X_ern.flatten() * 1000000,
        'ERN lat': fractional_latencies_ern.flatten() * 1000,
        'CRN': preprocessed_X_crn.flatten() * 1000000,
        'CRN lat': fractional_latencies_crn_2uV.flatten() * 1000,
    },
    index=questionnaires_scores_df.index,
)

results_df = pd.concat([questionnaires_scores_df, demographical_scores_df, eeg_features_df], axis=1)
# results_df.to_pickle(f"../data/behavioral/all_variables_{dataset}.pkl")
results_df.head()

In [None]:
# Short display names for every column of results_df.
column_names_mapping = {
    "STAI-T": "STAI-T",
    "STAI-S Diff": 'STAI-S Diff',
    "DASS-21 Stress": "DASS-21 Stress",
    "DASS-21 Dep": "DASS-21 Dep",
    "RRQ": "RRQ",
    "BIS": "BIS",
    'BAS_D': "BAS Drive",
    'BAS_PRZY': "BAS R-R",
    'BAS_NAG': "BAS F-S",
    'PUN': "Punishment",
    'REW': "Reward",
    "CHECK": "Checking",
    "HOARD": "Hoarding",
    "OBSESS": "Obsessing",
    "ORD": "Ordering",
    'NEU': 'Neutralizing',
    "WASH": "Washing",
    'INDEC_F': "Indecisivness",
    "WBSI": "WBSI",
    "IUS-P": "IUS-P",
    "IUS-I": "IUS-I",
    'HARM': "Harm-R",
    "OT": "OT",
    'T-CTR': "T-CTR",
    'OB_PERF': "Perfectionism",
    'PS': "PS",
    'G_SE': "Guilt-S",
    'AMB': "Ambiguity-A",
    'PRED': "Predictability",
    'STAND': "H-Standards",
    "SES": "SES",
    "Age": "Age",
    # The demographic frame already renames 'Handness' to 'Handedness', so the
    # key has to use the corrected spelling to match an existing column.
    "Handedness": "Handedness",
    'performance': "Performance",
    'ERN': "ERN",
    'ERN lat': 'ERN latency',
    'CRN': "CRN",
    'CRN lat': 'CRN latency',
}

In [None]:
# Descriptive statistics of all variables (renamed, rounded to two decimals,
# one row per variable), shown in full and written to CSV.
described_variables_df = (
    results_df
    .rename(columns=column_names_mapping)
    .describe()
    .round(decimals=2)
    .transpose()
)

with pd.option_context('display.max_colwidth', None, 'display.max_columns', None):
    display(described_variables_df)
    described_variables_df.to_csv(f'../data/behavioral/all_variables_{dataset}_describe.csv')

In [None]:
results_df_renamed = results_df.rename(columns=column_names_mapping)

In [None]:
df = results_df_renamed.copy()

# Every listed scale is rescaled as (score - offset) / divisor.
# NOTE(review): presumably this maps each questionnaire onto a 0-1 range given
# its response format - confirm the offsets and divisors against the scales.
rescaling_parameters = {
    "DASS-21 Stress": (1, 3),
    "DASS-21 Dep": (1, 3),
    "STAI-T": (1, 3),
    "BIS": (1, 3),
    "BAS Drive": (1, 3),
    "BAS R-R": (1, 3),
    "BAS F-S": (1, 3),
    "RRQ": (1, 4),
    "WBSI": (1, 4),
    "OT": (1, 6),
    "IUS-P": (1, 4),
    "IUS-I": (1, 4),
    "Checking": (1, 4),
    "Hoarding": (1, 4),
    "Obsessing": (1, 4),
    "Ordering": (1, 4),
    "Neutralizing": (1, 4),
    "Washing": (1, 4),
    "SES": (1, 3),
    'Punishment': (0, 21),
    'Reward': (0, 21),
    'Indecisivness': (1, 4),
    'Harm-R': (1, 6),
    'T-CTR': (1, 6),
    'Perfectionism': (1, 6),
    'PS': (1, 4),
    'Guilt-S': (1, 4),
    'Ambiguity-A': (1, 5),
    'Predictability': (1, 5),
    'H-Standards': (1, 6),
}

for scale_column, (offset, divisor) in rescaling_parameters.items():
    df[scale_column] = (df[scale_column] - offset) / divisor

In [None]:
with pd.option_context('display.max_colwidth', None, 'display.max_columns', None):
    display(df.describe().round(decimals=2).transpose())
    # df.describe().round(decimals=2).transpose().to_csv(f'../data/behavioral/all_variables_{dataset}_describe_normalized.csv')

## Calculate differences between testing and training sets

In [None]:
# `df` holds the training set only when the notebook was run with test = False.
# Guarding on the flag stops a run with test = True from storing test data under
# the training name, which would compare the test set with itself below.
if not test:
    df_training = df.copy()

In [None]:
# `df` holds the test set only when the notebook was run with test = True.
# Guarding on the flag stops a run with test = False from storing training data
# under the testing name, which would compare the training set with itself below.
if test:
    df_testing = df.copy()

In [None]:
# Independent-samples t-test between the training and testing sets for every
# variable; variables with p < 0.05 are flagged.
for column in df_training.columns:
    training_values = df_training[column].to_numpy().astype(float)
    testing_values = df_testing[column].to_numpy().astype(float)

    _, p_val = scipy.stats.ttest_ind(training_values, testing_values)
    print(f'---: \n  {column}     \n  p-value: {p_val}')
    if p_val < 0.05:
        print('Different')

## Distributions of scores

In [None]:
# Long format of the training set: one record per (participant, variable) pair,
# visited row by row and, within a row, column by column.
transformed_data_training = [
    {'score': score, 'scale': scale_name, 'dataset': 'training'}
    for _, participant_row in df_training.iterrows()
    for scale_name, score in participant_row.items()
]

transformed_data_training_df = pd.DataFrame(transformed_data_training)

In [None]:
transformed_data_training_df

In [None]:
# Long format of the testing set: one record per (participant, variable) pair,
# visited row by row and, within a row, column by column.
transformed_data_testing = [
    {'score': score, 'scale': scale_name, 'dataset': 'testing'}
    for _, participant_row in df_testing.iterrows()
    for scale_name, score in participant_row.items()
]

transformed_data_testing_df = pd.DataFrame(transformed_data_testing)

In [None]:
transformed_data_testing_df

In [None]:
variables_df = pd.concat([transformed_data_training_df, transformed_data_testing_df], ignore_index=True)
variables_df.head()

In [None]:
# set(variables_df_renamed['scale'].to_numpy())

Rename scales

In [None]:
# Full, human-readable names used as panel titles in the distribution plot.
scales_names_mapping = {
    "STAI-T": "Anxiety trait",
    "STAI-S Diff": 'Affective load',
    "DASS-21 Stress": "Stress",
    "DASS-21 Dep": "Depression",
    "RRQ": "Rumination",
    "BIS": "Behavioral inhibition",
    'BAS Drive': "Drive BAS",
    'BAS R-R': "Reward responsiveness BAS",
    'BAS F-S': "Fun-seeking BAS",
    'Punishment': "Punishment sensitivity",
    'Reward': "Reward sensitivity",
    "Checking": "Checking",
    "Hoarding": "Hoarding",
    "Obsessing": "Obsessing",
    "Ordering": "Ordering",
    'Neutralizing': 'Neutralizing',
    "Washing": "Washing",
    'Indecisivness': "Indecisiveness",
    "WBSI": "Thought supression",
    "IUS-P": "Prospective IU",
    "IUS-I": "Inhibitory IU",
    'Harm-R': "Inflated harm responsibility",
    "OT": "Threat overestimation",
    'T-CTR': "Importance of thought control",
    'Perfectionism': "Perfectionism/IU",
    'PS': "Personal standards",
    'Guilt-S': "Guilt sensitivity",
    "Ambiguity-A": "Avoidance of ambiguity",
    'Predictability': "Need for predictability",
    'H-Standards': "High standards",
    "SES": "Self-esteem",
    "Age": "Age",
    # The demographic column is stored as 'Handedness' (it is renamed from
    # 'Handness' when the demographic frame is built), so the key uses that spelling.
    "Handedness": "Handedness",
    'Performance': "Performance",
    'ERN': "ERN amplitude",
    'ERN latency': 'ERN latency',
    'CRN': "CRN amplitude",
    'CRN latency': 'CRN latency',
}
variables_df_renamed = variables_df.copy()
variables_df_renamed['scale'] = variables_df_renamed['scale'].replace(scales_names_mapping)

In [None]:
# Panel order of the distribution figure (one panel per variable).
order = [
    'Rumination',
    'Stress',
    'Depression',
    'Anxiety trait',
    'Affective load',
    'Behavioral inhibition',
    'Obsessing',
    'Hoarding',
    'Ordering',
    'Checking',
    'Washing',
    'Neutralizing',
    'Thought supression',
    'Prospective IU',
    'Inhibitory IU',
    'Self-esteem',
    'Drive BAS',
    'Fun-seeking BAS',
    'Reward responsiveness BAS',
    'Indecisiveness',
    'Punishment sensitivity',
    'Reward sensitivity',
    'Inflated harm responsibility',
    'Importance of thought control',
    'Threat overestimation',
    'Perfectionism/IU',
    'Personal standards',
    'Guilt sensitivity',
    'Avoidance of ambiguity',
    'Need for predictability',
    'High standards',
    'Age',
    'Handedness',
    'Performance',
    'ERN latency',
    'CRN latency',
    'ERN amplitude',
    'CRN amplitude',
]

sns.set_style("ticks")

cm = 1/2.54  # centimetres to inches
dpi = 500

# Figure-wide matplotlib settings.
plt.rcParams.update({
    'figure.dpi': dpi,
    'font.family': "Times New Roman",
    'ytick.labelsize': 8,
    'xtick.labelsize': 8,
    'axes.labelsize': 8,
    'axes.edgecolor': ".15",
    'axes.linewidth': 0.5,
})

# First two colorblind-palette colours for the two datasets, plus gray.
colorblind_palette = sns.color_palette("colorblind")
colors = [colorblind_palette[0], colorblind_palette[1], 'gray']
sns.set_palette(colors)

# One histogram (with KDE) per variable, coloured by dataset.
g = sns.FacetGrid(
    variables_df_renamed,
    col='scale',
    col_wrap=5,
    sharex=False,
    sharey=False,
    despine=False,
    height=3.5*cm,
    aspect=1.5,
    legend_out=False,
    hue='dataset',
    col_order=order,
)

g.map_dataframe(
    sns.histplot,
    x="score",
    kde=True,
    cbar_kws={'saturation': 0.9},
)

g.add_legend()

fig = g.fig
fig.set_size_inches(19*cm, 30*cm)
fig.subplots_adjust(wspace=.2, hspace=.55)

g.set_ylabels(label=None, clear_inner=True)
g.set_titles(template="{col_name}")

# Put the legend below the grid, without title or frame.
sns.move_legend(
    g,
    "lower center",
    bbox_to_anchor=(.5, -0.04),
    ncol=2,
    title=None,
    frameon=False,
)
plt.setp(g._legend.get_texts(), fontsize=9)

plt.show()
fig.savefig('../data/scales_density/train_test_scales_density', bbox_inches='tight', pad_inches=0.01)

## Demographic data of the whole sample

In [None]:
# Questionnaire/demographic tables of the two samples (Opus and Sonata).
# NOTE(review): 'Demo_kod' is forced to object dtype for the Sonata file only.
# The same Opus CSV is re-read further down with dtype={'Demo_kod': object}, and
# the codes of both frames are later matched against `ids` with isin(); if the
# Opus codes are parsed as numbers here they may not match - confirm.
opus_demo_df =  pd.read_csv('../data/scales/all_scales_with_rt.csv')
sonata_demo_df = pd.read_csv('../data/scales/Sonata_scales.csv', dtype={'Demo_kod': object})

In [None]:
opus_demo_df.shape

Gender

In [None]:
display(opus_demo_df.groupby('Płeć').describe())
display(sonata_demo_df.groupby('Płeć').describe())

Age

In [None]:
age_df = pd.concat([opus_demo_df[['Wiek']], sonata_demo_df[['Wiek']]], ignore_index=True)
display(age_df.describe())

Education

In [None]:
# The two samples store years of education under different column names.
# Concatenating them as frames would create two half-empty columns, each
# described separately, so the values are pooled into one 'Education' column.
education_years = pd.concat(
    [
        opus_demo_df['Dotychczasowa liczba lat edukacji'],
        sonata_demo_df['Twoja dotychczasowa liczba lat edukacji (w pełnych latach)'],
    ],
    ignore_index=True,
)
edu_df = education_years.rename('Education').to_frame()
# Entries that cannot be parsed as numbers become NaN and are therefore
# excluded from the statistics rather than turning the column non-numeric.
edu_df['Education'] = pd.to_numeric(edu_df['Education'], errors='coerce')
display(edu_df.describe())

#### Opus rejected individuals

In [None]:
opus_q_ids = pd.read_csv('../data/scales/all_scales_with_rt.csv',dtype={'Demo_kod': object})['Demo_kod'].to_numpy()
opus_q_ids

In [None]:
dir_ = '../data/responses_100_600/'

# Participant code captured by the pattern from every .vhdr path, in sorted order.
opus_eeg_ids = np.array([
    re.search('GNG_(.*)-64', header_path).group(1)
    for header_path in sorted(glob.glob(dir_ + "*.vhdr"))
])

In [None]:
opus_eeg_ids

In [None]:
list(set(opus_q_ids).difference(opus_eeg_ids))

#### Sonata rejected individuals

In [None]:
sonata_q_ids = pd.read_csv('../data/scales/Sonata_scales.csv', dtype={'Demo_kod': object})['Demo_kod'].to_numpy()
sonata_q_ids

In [None]:
dir_ = '../data/responses_100_600_sonata/'

# Participant code captured by the pattern from every .vhdr path, in sorted order.
sonata_eeg_ids = np.array([
    re.search('.*-GNG-(.*)_', header_path).group(1)
    for header_path in sorted(glob.glob(dir_ + "*.vhdr"))
])

In [None]:
sonata_eeg_ids

In [None]:
list(set(sonata_q_ids).difference(sonata_eeg_ids))

## Demographic information about the loaded sample

In [None]:
ids = ID_train_list + ID_test_list

In [None]:
# Years of education per participant code, with a common 'Education' column
# name for both samples.
opus_education_column = 'Dotychczasowa liczba lat edukacji'
sonata_education_column = 'Twoja dotychczasowa liczba lat edukacji (w pełnych latach)'

opus_edu_df = opus_demo_df[[opus_education_column, 'Demo_kod']].rename(
    columns={opus_education_column: 'Education'}
)
sonata_edu_df = sonata_demo_df[[sonata_education_column, 'Demo_kod']].rename(
    columns={sonata_education_column: 'Education'}
)

edu_df = pd.concat([opus_edu_df, sonata_edu_df], ignore_index=True)

# Keep only participants whose code is among the loaded train/test IDs.
is_loaded_participant = edu_df['Demo_kod'].isin(ids)
edu_loaded_sample_df = edu_df[is_loaded_participant]

In [None]:
# Work on an independent copy: the frame is a filtered selection of edu_df, and
# chained assignment (frame['col'].iloc[i] = value) is not guaranteed to write
# through - under pandas Copy-on-Write it never does.
edu_loaded_sample_df = edu_loaded_sample_df.copy()

# Manually correct the entry at position 47 before the strict numeric conversion.
# The column is cast to object first so the float can be stored next to the raw
# text values.
education_values = edu_loaded_sample_df['Education'].astype(object)
education_values.iloc[47] = 13.5
edu_loaded_sample_df['Education'] = pd.to_numeric(education_values)
edu_loaded_sample_df.describe()

## Calculate Cronbach's alpha for questionnaires

In [None]:
dir_ = f"../data/scales/net_scales/"

scales = []

for file in sorted(glob.glob(dir_ + "*.csv")):
    print(file)
    scale_df = pd.read_csv(file)
    scales.append(scale_df)

    columns_to_drop = [col for col in scale_df.columns if ('kod' in col.lower()) or ('ID' in col)]
    scale_df.drop(columns=columns_to_drop, inplace=True)
    scale_df = scale_df.apply(pd.to_numeric, errors='ignore')

    print(pg.cronbach_alpha(data=scale_df))