## Behavioral data summary

Imports

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import ttest_ind, ttest_rel
import matplotlib.pyplot as plt
import os
import json

## Read pickled data (full dataset)

In [2]:
data_df = pd.read_pickle('../data/full_sample_sonata_dataset.pkl')
data_df.head()

Unnamed: 0,trial number,ID,block_type,trial_type,cue_name,target_name,response,rt,reaction,threshold_rt,...,feedback_show_time,feedback_type,event,drop_log,ne_FCz,ne_mean_FCz,ne_Fz,ne_mean_Fz,ne_Cz,ne_mean_Cz
0,1,FLA-169,experiment,congruent,,congruent_lll,l,0.268716,correct,0.42,...,0.8,feedback_good,2,(),-1.392086e-05,-5e-06,-1.047375e-05,-6.15665e-06,-8.70229e-06,-7.819865e-07
1,2,FLA-169,experiment,congruent,,congruent_lll,l,0.309487,correct,0.344358,...,0.8,feedback_good,2,(),-2.783326e-06,9e-06,-5.018437e-07,3.766289e-06,-5.263218e-07,8.307203e-06
2,3,FLA-169,experiment,incongruent,,incongruent_rlr,l,0.397239,correct,0.309487,...,0.8,feedback_bad,0,(),-3.684688e-06,3e-06,-3.4494e-06,-1.556821e-06,-1.47444e-06,4.761666e-06
3,4,FLA-169,experiment,congruent,,congruent_rrr,r,0.309485,correct,0.353363,...,0.8,feedback_good,2,(),7.47574e-07,6e-06,7.906369e-07,4.928352e-06,2.810129e-06,6.71282e-06
4,5,FLA-169,experiment,incongruent,,incongruent_lrl,r,0.386673,correct,0.309487,...,0.8,feedback_bad,0,(),-5.791032e-06,-2e-06,-2.980749e-06,-9.579916e-07,-2.809617e-06,-8.727314e-07


#### Prepare dataframe

In [3]:
# rename trial number column
columns_name_mapping = {'trial number': 'trial_number'}
data_df = data_df.rename(columns=columns_name_mapping)

# cast eeg to uV from V
data_df.loc[:, data_df.columns.str.contains('ne')] *= 1000000

# add column with numerical indicator of accuracy
data_df['acc'] = np.select(
    [data_df['reaction'] == 'incorrect', data_df['reaction'] == 'correct', pd.isna(data_df['reaction'])],
    [-1, 1, np.nan],
    default=np.nan
)

# add column with numerical indicator of condition
data_df['condition'] = np.select(
    [data_df['trial_type'] == 'congruent', data_df['trial_type'] == 'incongruent', pd.isna(data_df['trial_type'])],
    [1, -1, np.nan],
    default=np.nan
)

# add column with condition index
data_df['condition_index'] = np.select(
    [data_df['trial_type'] == 'congruent', data_df['trial_type'] == 'incongruent', pd.isna(data_df['trial_type'])],
    [1, 2, np.nan],
    default=np.nan
)

# add column with pre accuracy and pre eeg info
columns_to_shift = ['acc', 'ne_Fz', 'ne_FCz', 'ne_Cz', 'ne_mean_Fz', 'ne_mean_FCz', 'ne_mean_Cz']
for col in columns_to_shift:
    data_df[f'pre_{col}'] = data_df[col].shift(1)

# add column with log RT
data_df['log_rt'] = np.log(data_df['rt'])

## Data cleaning

### 1. Mark bad participants - zero clean trials at channel

In [4]:
channel = 'FCz'
cleaned_data_df = data_df.copy(deep=True)

ids = np.unique(data_df['ID'])
n_clear_trails = 1

for participant_id in ids:
    participant_data = data_df[data_df['ID'] == participant_id][f'ne_{channel}'].to_numpy()
    good_trials = np.count_nonzero(~np.isnan(participant_data.flatten()))
    
    if good_trials < n_clear_trails:
        print(f"Participant {participant_id} has {good_trials} good trial. Rejecting")
        cleaned_data_df = cleaned_data_df[cleaned_data_df['ID'] != participant_id]        
    
cleaned_data_df = cleaned_data_df.reset_index()

Participant FLA-222 has 0 good trial. Rejecting


### 2. Mark participants who have less than 6 error trials

In [5]:
ids = np.unique(cleaned_data_df['ID'])
n_error = 6

for participant_id in ids:
    participant_data = cleaned_data_df[cleaned_data_df['ID'] == participant_id]
    error_trials = participant_data[(participant_data['reaction'] == 'incorrect') & 
                                    (participant_data['ne_FCz'].notna())]
    
    if len(error_trials) < n_error:
        print(f"Participant {participant_id} has {len(error_trials)} erroneous trial. Rejecting")
        cleaned_data_df = cleaned_data_df[cleaned_data_df['ID'] != participant_id]

cleaned_data_df = cleaned_data_df.reset_index()

Participant FLA-020 has 1 erroneous trial. Rejecting
Participant FLA-124 has 5 erroneous trial. Rejecting


### 3. Apply trial rejections
- Select **X** trials that follow **CCXP** sequence
- Remove **P** trails that exceed +/- 3 *SD* of log(RT)

In [6]:
# Function to check if rt exceeds 3*std
def log_threshold(row, participant_means, participant_std):
    participant_id = row['ID']
    mean_log_rt = participant_means[participant_id]
    std_log_rt = participant_std[participant_id]
    upper_bound = mean_log_rt + 3*std_log_rt
    lower_bound = mean_log_rt - 3*std_log_rt
    
    return (row['log_rt'] > upper_bound or row['log_rt'] < lower_bound), np.exp(upper_bound), np.exp(lower_bound)

In [7]:
cleaned_data_df_copy = cleaned_data_df.copy()
final_df = pd.DataFrame()

# create mapping IDs to ordinal 1-N number
id_mapping = {old_id: new_id for new_id, old_id in enumerate(cleaned_data_df_copy['ID'].unique(), start=1)}

# 1. Mark and remove trials where rt exceed 1s
cleaned_data_df_copy['rt_greater_than_1'] = cleaned_data_df_copy.apply(
    lambda row: (True if row['rt'] > 1 else False),
    axis=1
)
cleaned_data_df_copy.loc[cleaned_data_df_copy['rt_greater_than_1'] == True, ['rt', 'reaction']] = np.nan

# 2. Mark trials that exceed log rt threshold - participant-wise

# Calculate the mean and std of log_rt for each participant
participant_stats = cleaned_data_df_copy.groupby('ID')['log_rt'].agg(['mean', 'std']).rename(columns={'mean': 'log_rt_mean', 'std': 'log_rt_std'})
cleaned_data_df_copy = cleaned_data_df_copy.merge(participant_stats, on='ID')

# Define the thresholds
upper_threshold = cleaned_data_df_copy['log_rt_mean'] + 3*cleaned_data_df_copy['log_rt_std']
lower_threshold = cleaned_data_df_copy['log_rt_mean'] - 3*cleaned_data_df_copy['log_rt_std']

# Mark trials that exceed log rt threshold - participant-wise
cleaned_data_df_copy['log_rt_exceed_ub'] = cleaned_data_df_copy['log_rt'] > upper_threshold
cleaned_data_df_copy['log_rt_exceed_lb'] = cleaned_data_df_copy['log_rt'] < lower_threshold
cleaned_data_df_copy['log_rt_exceed_threshold'] = cleaned_data_df_copy['log_rt_exceed_ub'] | cleaned_data_df_copy['log_rt_exceed_lb']

# 3. Mark trials that exceed rt threshold - globally
global_log_rt = cleaned_data_df_copy['log_rt'].to_numpy()
rt_upper_bound = np.nanmean(global_log_rt) + 3*np.nanstd(global_log_rt)
rt_lower_bound = np.nanmean(global_log_rt) - 3*np.nanstd(global_log_rt)
print(f"Group-level RT upper bound: {np.exp(rt_upper_bound)}, RT lower bound: {np.exp(rt_lower_bound)}")

cleaned_data_df_copy['global_log_rt_exceed_threshold'] = cleaned_data_df_copy.apply(
    lambda row: (True if np.log(row['rt']) > rt_upper_bound or np.log(row['rt']) < rt_lower_bound else False),
    axis=1
)

# 4. Mark if trial P is in a clean CCX(P) sequence
for idx, row in cleaned_data_df_copy.iterrows():
    if idx-3 >= 0:
        if ((cleaned_data_df_copy.iloc[idx-3]['reaction'] == 'correct') and 
            (cleaned_data_df_copy.iloc[idx-2]['reaction'] == 'correct') and 
            (not pd.isna(cleaned_data_df_copy.iloc[idx-1]['ne_Fz'])) and 
            (not pd.isna(cleaned_data_df_copy.iloc[idx-1]['reaction'])) and
            (not cleaned_data_df_copy.iloc[idx]['global_log_rt_exceed_threshold']) and
            (not pd.isna(cleaned_data_df_copy.iloc[idx]['rt']))):
                is_in_sequence = True
        else:
            is_in_sequence = False
    else:
        is_in_sequence = False

    # add trial to final dataframe and mark if it is P in a clean CCX(P) sequence
    row_data = pd.DataFrame({
        'trial_number': [row['trial_number']],
        'ID': int(row['ID'].split("-")[-1]),
        'participant_index': [id_mapping[row['ID']]],
        'condition': row['condition'],
        'condition_index': [row['condition_index']],
        'rt': row['rt'],
        'acc': row['acc'],
        'ne_Fz': row['ne_Fz'],
        'ne_FCz': row['ne_FCz'],
        'ne_mean_Fz': row['ne_mean_Fz'],
        'ne_mean_FCz': row['ne_mean_FCz'],
        'y':  row['rt'] *  row['acc'],
        'pre_ne_Fz': row['pre_ne_Fz'],
        'pre_ne_FCz': row['pre_ne_FCz'],
        'pre_ne_mean_Fz': row['pre_ne_mean_Fz'],
        'pre_ne_mean_FCz': row['pre_ne_mean_FCz'],
        'pre_acc': row['pre_acc'],
        'rt_greater_than_1': row['rt_greater_than_1'],
        'log_rt_exceed_threshold': row['global_log_rt_exceed_threshold'],
        'is_in_sequence': is_in_sequence,
    })

    final_df = pd.concat([final_df, row_data], ignore_index=True)

Group-level RT upper bound: 0.6253680985158315, RT lower bound: 0.1189696203517977


In [8]:
final_df

Unnamed: 0,trial_number,ID,participant_index,condition,condition_index,rt,acc,ne_Fz,ne_FCz,ne_mean_Fz,ne_mean_FCz,y,pre_ne_Fz,pre_ne_FCz,pre_ne_mean_Fz,pre_ne_mean_FCz,pre_acc,rt_greater_than_1,log_rt_exceed_threshold,is_in_sequence
0,1,169,1,1.0,1.0,0.268716,1.0,-10.473746,-13.920862,-6.156650,-5.222685,0.268716,,,,,,False,False,False
1,2,169,1,1.0,1.0,0.309487,1.0,-0.501844,-2.783326,3.766289,9.476191,0.309487,-10.473746,-13.920862,-6.156650,-5.222685,1.0,False,False,False
2,3,169,1,-1.0,2.0,0.397239,1.0,-3.449400,-3.684688,-1.556821,2.632504,0.397239,-0.501844,-2.783326,3.766289,9.476191,1.0,False,False,False
3,4,169,1,1.0,1.0,0.309485,1.0,0.790637,0.747574,4.928352,6.436701,0.309485,-3.449400,-3.684688,-1.556821,2.632504,1.0,False,False,True
4,5,169,1,-1.0,2.0,0.386673,1.0,-2.980749,-5.791032,-0.957992,-1.689976,0.386673,0.790637,0.747574,4.928352,6.436701,1.0,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66595,296,170,222,-1.0,2.0,0.260115,-1.0,-9.048478,-18.203901,-4.820889,-10.670520,-0.260115,-0.221143,1.032869,1.291188,3.063216,1.0,False,False,True
66596,297,170,222,1.0,1.0,0.292400,1.0,-11.867870,-14.539917,-4.864914,-4.860802,0.292400,-9.048478,-18.203901,-4.820889,-10.670520,-1.0,False,False,True
66597,298,170,222,-1.0,2.0,0.400546,1.0,-2.590526,-1.705890,1.400951,2.956393,0.400546,-11.867870,-14.539917,-4.864914,-4.860802,1.0,False,False,False
66598,299,170,222,1.0,1.0,0.250364,1.0,-4.050993,-1.294921,3.614124,6.908545,0.250364,-2.590526,-1.705890,1.400951,2.956393,1.0,False,False,False


### 4. Center and standardize eeg signal

In [29]:
def standardize(series):
    return (series - series.mean()) / series.std()

# standardize all columns that contains 'ne'
ne_columns = final_df.columns[final_df.columns.str.contains('ne')]

# Standardize all 'ne' columns within each participant group
for col in ne_columns:
    standardized_col_name = f'{col}_standardized'
    final_df[standardized_col_name] = final_df.groupby('ID')[col].transform(standardize)

### 5. Remove trials not in a sequence

In [13]:
clean_df = final_df.copy().dropna()
clean_df_only_sequence = clean_df[clean_df['is_in_sequence'] == True]

# save the dataframe
# clean_df_only_sequence.to_pickle('../data/sonata_data_standardized.pkl')
# clean_df_only_sequence.to_csv('../data/sonata_data_standardized.csv')

In [14]:
clean_df_only_sequence.shape

(50419, 20)

## Check RTs and number of trials in the final sample


- error vs correct RT

In [15]:
display(clean_df_only_sequence[['rt']].describe())
display(clean_df_only_sequence.groupby('acc')['rt'].describe())

ttest_ind(
    clean_df_only_sequence[clean_df_only_sequence['acc'] == -1]['rt'],
    clean_df_only_sequence[clean_df_only_sequence['acc'] == 1]['rt'],
    nan_policy='omit'
)

Unnamed: 0,rt
count,50419.0
mean,0.284504
std,0.072523
min,0.119373
25%,0.236488
50%,0.267513
75%,0.323863
max,0.62518


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
acc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
-1.0,5538.0,0.233429,0.04676,0.119373,0.208695,0.229692,0.252928,0.622543
1.0,44881.0,0.290806,0.072644,0.119548,0.241376,0.271757,0.333375,0.62518


TtestResult(statistic=-57.33057725043296, pvalue=0.0, df=50417.0)

- incongruent vs congruent

In [41]:
display(clean_df_only_sequence.groupby('condition')['rt'].describe())

ttest_ind(
    clean_df_only_sequence[clean_df_only_sequence['condition'] == -1]['rt'],
    clean_df_only_sequence[clean_df_only_sequence['condition'] == 1]['rt'],
    nan_policy='omit'
)

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
condition,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
-1.0,15141.0,0.324791,0.087509,0.119373,0.24696,0.333999,0.380049,0.62518
1.0,35278.0,0.267213,0.056875,0.119548,0.234085,0.257746,0.289836,0.623425


TtestResult(statistic=87.73194176052912, pvalue=0.0, df=50417.0)

**With aggregation within participants**

- error vs correct RT

In [16]:
participant_means = clean_df_only_sequence.groupby(['ID'])[['rt']].mean().reset_index()
display(participant_means[['rt']].describe())

participant_means = clean_df_only_sequence.groupby(['ID', 'acc'])['rt'].mean().reset_index()
display(participant_means.groupby('acc')['rt'].describe())

ttest_rel(
    participant_means[participant_means['acc'] == -1]['rt'].to_numpy(),
    participant_means[participant_means['acc'] == 1]['rt'].to_numpy(),
    nan_policy='omit'
)

Unnamed: 0,rt
count,222.0
mean,0.282482
std,0.032518
min,0.185684
25%,0.264951
50%,0.278432
75%,0.301589
max,0.440957


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
acc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
-1.0,222.0,0.236826,0.02799,0.17619,0.221423,0.234223,0.24752,0.483357
1.0,222.0,0.288001,0.032214,0.189244,0.270777,0.284826,0.306078,0.444949


TtestResult(statistic=-39.98778796467779, pvalue=3.749224705127767e-103, df=221)

- incongruent vs congruent RT

In [17]:
participant_means = clean_df_only_sequence.groupby(['ID', 'condition'])['rt'].mean().reset_index()
display(participant_means.groupby('condition')['rt'].describe())

ttest_rel(
    participant_means[participant_means['condition'] == -1]['rt'].to_numpy(),
    participant_means[participant_means['condition'] == 1]['rt'].to_numpy(),
    nan_policy='omit'
)

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
condition,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
-1.0,222.0,0.321418,0.041257,0.18433,0.297411,0.322165,0.345328,0.469574
1.0,222.0,0.265819,0.030395,0.186242,0.247501,0.261671,0.281904,0.427454


TtestResult(statistic=38.08427859078629, pvalue=4.617025562240653e-99, df=221)

- Number of trials in the analysis

In [19]:
# Count the number of errors and correct responses for each participant
error_correct_counts = clean_df_only_sequence.groupby(['ID', 'acc']).size().unstack(fill_value=0)
error_correct_counts.columns = ['errors', 'correct']

average_counts = error_correct_counts.describe()
display(average_counts)

display(clean_df_only_sequence.groupby(['ID'])[['rt']].count().describe())

Unnamed: 0,errors,correct
count,222.0,222.0
mean,24.945946,202.166667
std,8.261931,37.463637
min,5.0,107.0
25%,19.25,180.0
50%,25.0,207.5
75%,31.0,227.75
max,42.0,276.0


Unnamed: 0,rt
count,222.0
mean,227.112613
std,30.666516
min,147.0
25%,209.25
50%,233.0
75%,249.0
max,284.0


## Check ERN and CRN amplitudes in the final sample

- ERN vs CRN

In [20]:
display(clean_df_only_sequence.groupby('acc')['ne_FCz'].describe())

ttest_ind(
    clean_df_only_sequence[clean_df_only_sequence['acc'] == -1]['ne_FCz'],
    clean_df_only_sequence[clean_df_only_sequence['acc'] == 1]['ne_FCz'],
    nan_policy='omit'
)

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
acc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
-1.0,5538.0,-14.962835,12.899613,-78.514792,-22.974905,-13.931843,-5.833872,25.421979
1.0,44881.0,-6.548844,9.895709,-76.552934,-12.613723,-5.932269,0.141336,37.36886


TtestResult(statistic=-57.53063352674305, pvalue=0.0, df=50417.0)

**With aggregation within participants**

- error vs correct

In [25]:
participant_means = clean_df_only_sequence.groupby(['ID'])['ne_FCz'].mean().reset_index()
display(participant_means[['ne_FCz']].describe())

participant_means = clean_df_only_sequence.groupby(['ID', 'acc'])['ne_FCz'].mean().reset_index()
display(participant_means.groupby('acc')['ne_FCz'].describe())

ttest_rel(
    participant_means[participant_means['acc'] == -1]['ne_FCz'].to_numpy(),
    participant_means[participant_means['acc'] == 1]['ne_FCz'].to_numpy(),
    nan_policy='omit'
)

Unnamed: 0,ne_FCz
count,222.0
mean,-7.417716
std,4.14868
min,-29.68376
25%,-9.460045
50%,-7.051939
75%,-4.52573
max,0.548734


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
acc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
-1.0,222.0,-15.443821,7.800369,-43.821807,-20.206744,-14.523435,-9.81754,-1.846447
1.0,222.0,-6.405409,4.111894,-28.00814,-8.640027,-6.148234,-3.785061,2.51698


TtestResult(statistic=-20.80908826533481, pvalue=5.629619349966054e-54, df=221)

#### Plot amplitudes

In [None]:
data_df_epochs = pd.read_pickle('../data/full_sample_sonata_epochs_dataset.pkl')
epochs_df = pd.merge(data_df_epochs[['epochs', 'ID', 'trial_number']], final_df, on=['ID', 'trial_number'], how='right')