# Current dataset - Data preparation

Imports

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import truncnorm, beta, norm, uniform
import matplotlib.pyplot as plt
import os
import json

### 1. Read EEG and behavioral data

In [None]:
paradigm = 'FLA'
case = 'RE'

data_path = f'../data/beh_eeg_{paradigm}/{paradigm}/preprocessed/wavelets_th-045/'

# Keep only the participant pickles (skips stray entries such as '.DS_Store',
# which would otherwise resolve to a non-existent '<data_path>.pkl').
# os.listdir returns entries in arbitrary order, so sort them to make the
# row order of data_df reproducible across machines.
id_list = sorted(
    item.split('.')[0]
    for item in os.listdir(data_path)
    if item.endswith('.pkl')
)

participant_frames = []

for participant_id in id_list:
    # File stems look like '<prefix>_<ID>'; the part after the first '_' is the ID
    participant_label = participant_id.split("_")[1]

    # NOTE: pickle files execute arbitrary code on load - only read trusted data
    participant_data_df = pd.read_pickle(f'{data_path}{participant_id}.pkl')
    participant_data_df['ID'] = participant_label

    # Create a DataFrame with the complete range of trial indices (1 to 300)
    complete_trials = pd.DataFrame({
        'trial number': range(1, 301),
        'ID': [participant_label] * 300,
    })

    # Left merge keeps all 300 trials; trials missing from the recording
    # end up with NaN in every data column
    merged_df = pd.merge(complete_trials, participant_data_df, on=['trial number', 'ID'], how='left')
    participant_frames.append(merged_df)

# Concatenate once instead of growing data_df inside the loop (quadratic copying)
data_df = pd.concat(participant_frames, ignore_index=True) if participant_frames else pd.DataFrame()

### 2. Featurize ERN: peak (minimum) and mean amplitude at FCz and Fz
- EEG feature columns are filled with NaN for bad trials (missing trial, or channel listed in the trial's drop log)

In [None]:
# For every channel, summarise the tmin=0.0 .. tmax=0.1 window of each epoch by
# its minimum (`ne_<channel>`) and by its mean (`ne_mean_<channel>`).
# A row gets NaN when it has no drop_log entry or when its drop_log contains
# the channel name.
for channel in ('FCz', 'Fz'):
    for column_prefix, reducer in (('ne', np.min), ('ne_mean', np.mean)):
        data_df[f'{column_prefix}_{channel}'] = data_df.apply(
            lambda row: (
                np.nan
                if pd.isna(row['drop_log']) or channel in row['drop_log']
                else reducer(row['epoch'].get_data(picks=channel, tmin=0.0, tmax=0.1).flatten())
            ),
            axis=1,
        )

In [None]:
# Remove the column holding the epoch objects so the DataFrame displays cleanly
columns_to_drop = ['epoch']
data_df = data_df.drop(columns_to_drop, axis=1)

Save full sample

In [None]:
# Persist the full sample before any participant is excluded.
# The relative path is resolved against the current working directory.
data_df.to_pickle('full_sample_current_dataset.pkl')

### 3. Reject bad participants (fewer than 6 good trials)

In [None]:
# Work on a deep copy so the full sample in `data_df` stays untouched
cleared_data_df = data_df.copy(deep=True)

ids = np.unique(data_df['ID'])
n_clear_trails = 6  # minimum number of trials with a non-NaN ne_FCz value

rejected_ids = []
for participant_id in ids:
    participant_data = data_df[data_df['ID'] == participant_id]['ne_FCz'].to_numpy()
    good_trials = np.count_nonzero(~np.isnan(participant_data.flatten()))

    if good_trials < n_clear_trails:
        print(f"Participant {participant_id} has {good_trials} good trial. Rejecting")
        rejected_ids.append(participant_id)

# Remove all rejected participants in one pass
cleared_data_df = cleared_data_df[~cleared_data_df['ID'].isin(rejected_ids)]

# drop=True discards the old index instead of storing it as an extra 'index'
# column, which would otherwise pile up (and eventually raise) on re-runs
cleared_data_df = cleared_data_df.reset_index(drop=True)
cleared_ids = np.unique(cleared_data_df['ID'])

### 4. Reject participants who have fewer than 6 error trials

In [None]:
ids = np.unique(cleared_data_df['ID'])
n_error = 6  # minimum number of incorrect trials with a non-NaN ne_FCz value

rejected_ids = []
for participant_id in ids:
    participant_data = cleared_data_df[cleared_data_df['ID'] == participant_id]
    # An error trial only counts when its EEG feature is available
    error_trials = participant_data[(participant_data['reaction'] == 'incorrect') &
                                    (participant_data['ne_FCz'].notna())]

    if len(error_trials) < n_error:
        print(f"Participant {participant_id} has {len(error_trials)} erroneous trial. Rejecting")
        rejected_ids.append(participant_id)

# Remove all rejected participants in one pass
cleared_data_df = cleared_data_df[~cleared_data_df['ID'].isin(rejected_ids)]

# drop=True discards the old index instead of storing it as an extra column
# ('index' / 'level_0'), which would otherwise pile up (and eventually raise)
# on re-runs
cleared_data_df = cleared_data_df.reset_index(drop=True)
cleared_ids = np.unique(cleared_data_df['ID'])

Rename columns

In [None]:
# Give the trial counter an identifier-friendly name (no space)
columns_name_mapping = {'trial number': 'trial_number'}
cleared_data_df = cleared_data_df.rename(columns_name_mapping, axis='columns')
cleared_data_df.head()

### 5. Apply trial selection

In [None]:
def log_threshold(row, participant_means, participant_std):
    """Check whether a trial's log-RT lies outside mean +/- 3*std of its participant.

    Parameters
    ----------
    row : mapping
        Must provide 'ID' and 'log_rt'.
    participant_means, participant_std : mapping
        Per-participant mean and standard deviation of log-RT, keyed by ID.

    Returns
    -------
    tuple
        ``(exceeds, upper, lower)``: whether 'log_rt' is strictly above the
        upper or strictly below the lower bound, followed by the
        exponentiated upper and lower bounds.
    """
    pid = row['ID']
    center = participant_means[pid]
    half_width = 3*participant_std[pid]

    upper_bound = center + half_width
    lower_bound = center - half_width

    trial_log_rt = row['log_rt']
    exceeds = trial_log_rt > upper_bound or trial_log_rt < lower_bound

    return exceeds, np.exp(upper_bound), np.exp(lower_bound)

final_df = pd.DataFrame()

# create mapping IDs to ordinal 1-N number (order of first appearance)
id_mapping = {old_id: new_id for new_id, old_id in enumerate(cleared_data_df['ID'].unique(), start=1)}

# cast eeg to uV from V
# NOTE: this scales in place, so re-running the cell without rebuilding
# cleared_data_df multiplies the values again
eeg_columns = ['ne_Fz', 'ne_mean_Fz', 'ne_FCz', 'ne_mean_FCz']
cleared_data_df[eeg_columns] = cleared_data_df[eeg_columns] * 1000000

# add column with numerical indicator of accuracy
# (incorrect -> -1, correct -> 1, anything else incl. missing -> NaN)
is_incorrect = cleared_data_df['reaction'] == 'incorrect'
is_correct = cleared_data_df['reaction'] == 'correct'
cleared_data_df['acc'] = np.select([is_incorrect, is_correct], [-1, 1], default=np.nan)

# add column with numerical indicator of condition
# (congruent -> 1, incongruent -> -1, anything else incl. missing -> NaN)
is_congruent = cleared_data_df['trial_type'] == 'congruent'
is_incongruent = cleared_data_df['trial_type'] == 'incongruent'
cleared_data_df['condition'] = np.select([is_congruent, is_incongruent], [1, -1], default=np.nan)

# add column with condition index
# (congruent -> 1, incongruent -> 2, anything else incl. missing -> NaN)
cleared_data_df['condition_index'] = np.select([is_congruent, is_incongruent], [1, 2], default=np.nan)

# add column with pre accuracy and pre eeg info
# Shift within each participant: a plain shift over the whole frame would
# copy the last trial of one participant into the first trial of the next.
pre_columns = {
    'pre_acc': 'acc',
    'pre_ne_Fz': 'ne_Fz',
    'pre_ne_FCz': 'ne_FCz',
    'pre_ne_mean_Fz': 'ne_mean_Fz',
    'pre_ne_mean_FCz': 'ne_mean_FCz',
}
previous_trial = cleared_data_df.groupby('ID', sort=False)[list(pre_columns.values())].shift(1)
for pre_name, source_name in pre_columns.items():
    cleared_data_df[pre_name] = previous_trial[source_name]

# mark trials where rt exceed 1s (NaN rt compares False, as before)
cleared_data_df['rt_greater_than_1'] = cleared_data_df['rt'] > 1

# Calculate the mean(log(rt)) and std(log(rt)) for each participant
cleared_data_df['log_rt'] = np.log(cleared_data_df['rt'])
log_rt_by_participant = cleared_data_df.groupby('ID')['log_rt']
participant_means = log_rt_by_participant.mean()
participant_std = log_rt_by_participant.std()

# mark trials that exceed rt threshold - participant-wise
# log_threshold returns (flag, exp(upper bound), exp(lower bound)) per row
cleared_data_df['log_rt_exceed_threshold'], cleared_data_df['log_rt_exceed_ub'], cleared_data_df['log_rt_exceed_lb'] = zip(*cleared_data_df.apply(log_threshold, axis=1, args=(participant_means, participant_std)))


# Group-level bounds: mean +/- 3*std of log(rt) over all trials, ignoring NaN
global_log_rt = cleared_data_df['log_rt'].to_numpy()
global_log_rt_mean = np.nanmean(global_log_rt)
global_log_rt_std = np.nanstd(global_log_rt)
rt_upper_bound = global_log_rt_mean + 3*global_log_rt_std
rt_lower_bound = global_log_rt_mean - 3*global_log_rt_std

print(f"Group-level RT upper bound: {np.exp(rt_upper_bound)}, RT lower bound: {np.exp(rt_lower_bound)}")

# mark trials that exceed rt threshold - globally
# (vectorized; trials with NaN rt are not flagged)
cleared_data_df['global_log_rt_exceed_threshold'] = (
    (cleared_data_df['log_rt'] > rt_upper_bound) | (cleared_data_df['log_rt'] < rt_lower_bound)
)

# Set 'rt' and 'reaction' to NaN if rt exceed 1s
# (done last, so the thresholds above are still based on all recorded RTs)
cleared_data_df.loc[cleared_data_df['rt_greater_than_1'], ['rt', 'reaction']] = np.nan

# mark if trial P is in CCX(P) sequence:
#   - trials P-3 and P-2 were answered correctly,
#   - trial P-1 (X) has a reaction and a non-NaN ne_Fz value,
#   - trial P has an rt and is not flagged by the global log(rt) threshold.
# Lags are taken within each participant so a sequence can never span the
# boundary between two participants; the first three trials of a participant
# are therefore never in a sequence.
trials_by_participant = cleared_data_df.groupby('ID', sort=False)
reaction_back = {lag: trials_by_participant['reaction'].shift(lag) for lag in (1, 2, 3)}

is_in_sequence = (
    (reaction_back[3] == 'correct')
    & (reaction_back[2] == 'correct')
    & trials_by_participant['ne_Fz'].shift(1).notna()
    & reaction_back[1].notna()
    & ~cleared_data_df['global_log_rt_exceed_threshold'].astype(bool)
    & cleared_data_df['rt'].notna()
)

# Build the final frame column-wise in one go (instead of concatenating one
# single-row DataFrame per trial, which copies the whole frame every time)
final_df = pd.DataFrame({
    'trial_number': cleared_data_df['trial_number'],
    # numeric part after the last '-' of the ID string
    'ID': cleared_data_df['ID'].map(lambda participant: int(participant.split("-")[-1])),
    'participant_index': cleared_data_df['ID'].map(id_mapping),
    'condition': cleared_data_df['condition'],
    'condition_index': cleared_data_df['condition_index'],
    'rt': cleared_data_df['rt'],
    'acc': cleared_data_df['acc'],
    'ne_Fz': cleared_data_df['ne_Fz'],
    'ne_FCz': cleared_data_df['ne_FCz'],
    'ne_mean_Fz': cleared_data_df['ne_mean_Fz'],
    'ne_mean_FCz': cleared_data_df['ne_mean_FCz'],
    # signed rt: negative for incorrect, positive for correct responses
    'y': cleared_data_df['rt'] * cleared_data_df['acc'],
    'pre_ne_Fz': cleared_data_df['pre_ne_Fz'],
    'pre_ne_FCz': cleared_data_df['pre_ne_FCz'],
    'pre_ne_mean_Fz': cleared_data_df['pre_ne_mean_Fz'],
    'pre_ne_mean_FCz': cleared_data_df['pre_ne_mean_FCz'],
    'pre_acc': cleared_data_df['pre_acc'],
    'rt_greater_than_1': cleared_data_df['rt_greater_than_1'],
    # note: this column carries the *global* threshold flag
    'log_rt_exceed_threshold': cleared_data_df['global_log_rt_exceed_threshold'],
    'is_in_sequence': is_in_sequence,
}).reset_index(drop=True)

final_df.head()

### 6. Center and standardize EEG signal participant-wise

In [None]:
def standardize(series):
    """Return *series* centred on its mean and divided by its standard deviation."""
    centered = series - series.mean()
    spread = series.std()
    return centered / spread

# z-score the previous-trial FCz amplitude separately within each participant
pre_ne_by_participant = final_df.groupby('ID')['pre_ne_FCz']
final_df['pre_ne_FCz_standarized'] = pre_ne_by_participant.transform(standardize)

# sanity check: per-participant summary statistics of the standardized column
final_df.groupby('ID').describe()['pre_ne_FCz_standarized']

### 7. Create json file for Stan

Leave only selected trials

In [None]:
# drop every trial that has a NaN in any column (dropna returns a new frame)
df_no_nans = final_df.dropna()

# keep only the trials flagged as closing a CCXP sequence
in_sequence_mask = df_no_nans['is_in_sequence'].eq(True)
df_only_sequence = df_no_nans.loc[in_sequence_mask]

Save data to pkl and csv

In [None]:
# save the selected trials as pickle and as CSV
# (to_csv also writes the DataFrame index as the first, unnamed column)
df_only_sequence.to_pickle('../data/current_dataset/sonata_data_standardized.pkl')
df_only_sequence.to_csv('../data/current_dataset/sonata_data_standardized.csv')

Create json file for Stan

In [None]:
import warnings

y = df_only_sequence['y'].to_list()
condition = df_only_sequence['condition'].to_list()
pre_acc = df_only_sequence['pre_acc'].to_list()
pre_ne = df_only_sequence['pre_ne_FCz_standarized'].to_list()
participant_index = df_only_sequence['participant_index'].to_list()

# Explicit array for element-wise comparisons (instead of relying on
# `list == numpy scalar` being broadcast by numpy)
participant_array = np.asarray(participant_index)
unique_participants = np.unique(participant_array)

n_participants = len(unique_participants)
n_conditions = len(np.unique(condition))

# The participant indices were assigned before trial selection; if a
# participant lost all trials, the remaining indices are no longer 1..N
# although n_participants is N.
if n_participants and (unique_participants[0] != 1 or unique_participants[-1] != n_participants):
    warnings.warn(
        "participant_index values are not the contiguous range 1..n_participants; "
        "indexing by participant will be inconsistent with n_participants"
    )

# For each participant: [first, last] position of their trials, 1-based and inclusive
participants_trials_slices = []
for index in unique_participants:
    indices = np.where(participant_array == index)[0]
    start_index = int(indices[0] + 1)
    end_index = int(indices[-1] + 1)
    # A [start, end] pair only describes the participant if their rows are adjacent
    if end_index - start_index + 1 != len(indices):
        warnings.warn(f"Trials of participant index {index} are not contiguous; slice is inexact")
    participants_trials_slices.append([start_index, end_index])

# json
data_2d = {
    "N": len(y),
    "participants_trials_slices": participants_trials_slices,
    "n_conditions": n_conditions,
    "n_participants": n_participants,
    "y": y,
    "condition": condition,
    'pre_ne': pre_ne,
    'pre_acc': pre_acc,
    "participant": participant_index
}

Save data to json

In [None]:
# Serialize the Stan input dictionary to JSON.
# NOTE(review): this path starts with "../../data/", whereas the input data and
# the pickle/CSV exports in this notebook use "../data/" - confirm which
# location is intended.
with open("../../data/current_dataset/sonata_data_standardized.json", "w") as outfile: 
    json.dump(data_2d, outfile)

---

## Checks

### 1. Analyse the impact of trial selection

In [None]:
# Trials flagged by the participant-wise and by the global log(rt) threshold
participant_flagged = cleared_data_df[cleared_data_df['log_rt_exceed_threshold'] == True]
global_flagged = cleared_data_df[cleared_data_df['global_log_rt_exceed_threshold'] == True]

# Per-participant number of flagged trials that still have a non-NaN rt;
# participants without any such trial do not appear in these counts
participant_bad_counts = participant_flagged.groupby('ID')['rt'].count()
global_log_bad_trials = global_flagged.groupby('ID')['rt'].count()

mean_log_bad_trials = np.mean(participant_bad_counts.to_numpy().flatten())
mean_global_log_bad_trials = np.mean(global_log_bad_trials.to_numpy().flatten())

print(f'Average number of log(rt) exceeding the threshold per participant: {mean_log_bad_trials}')
print(f'Average number of log(rt) exceeding the threshold per participant with global th: {mean_global_log_bad_trials}')

In [None]:
# `final_df` is the trial-level frame built above; the previously referenced
# `test_df` is not defined anywhere in this notebook.

# Number of trials with a valid rt per participant and accuracy, CCXP trials only
sequence_trials = final_df[final_df['is_in_sequence'] == True]
grouped_data = sequence_trials.groupby(['ID', 'acc'])['rt'].count()
acc_level = grouped_data.index.get_level_values('acc')
group_with_error = grouped_data[acc_level == -1]
group_with_correct = grouped_data[acc_level == 1]

print(f'Average number of trials per participant with CCXP:\n   error: {np.mean(group_with_error.reset_index()["rt"].to_numpy())}\n   correct:{np.mean(group_with_correct.reset_index()["rt"].to_numpy())}')

# Same counts without restricting to CCXP sequences
grouped_data_no_seq = final_df.groupby(['ID', 'acc'])['rt'].count()
acc_level_no_seq = grouped_data_no_seq.index.get_level_values('acc')
group_with_error_no_seq = grouped_data_no_seq[acc_level_no_seq == -1]
group_with_correct_no_seq = grouped_data_no_seq[acc_level_no_seq == 1]

print(f'Average number of trials per participant without CCXP:\n   error: {np.mean(group_with_error_no_seq.reset_index()["rt"].to_numpy())}\n   correct:{np.mean(group_with_correct_no_seq.reset_index()["rt"].to_numpy())}')

In [None]:
# Per-participant RT bounds: these two columns hold the exponentiated lower /
# upper log(rt) bounds returned by log_threshold, so the group mean simply
# displays each participant's bounds.
cleared_data_df.groupby('ID')[['log_rt_exceed_lb','log_rt_exceed_ub' ]].mean()

#### Display thresholds

- participant-wise threshold, e.g., ID = 170

In [None]:
# RT distribution of one participant with the mean +/- 3*sd bounds of log(rt),
# transformed back to the RT scale
participant_rt = cleared_data_df[cleared_data_df['ID'] == 'FLA-170']['rt'].to_numpy()
participant_log_rt = np.log(participant_rt)

sns.histplot(participant_rt)

m = np.nanmean(participant_log_rt)
sd = np.nanstd(participant_log_rt)
ub = np.exp(m + 3*sd)
lb = np.exp(m - 3*sd)

# bounds as vertical lines, labelled with their rounded values
plt.axvline(x=ub, c='red')
plt.axvline(x=lb, c='red')
plt.annotate(f'{round(lb,2)}', (lb, 45))
plt.annotate(f'{round(ub,2)}', (ub, 45))

plt.show()

- global threshold

In [None]:
# RT distribution over all participants with the mean +/- 3*sd bounds of
# log(rt), transformed back to the RT scale
all_rt = cleared_data_df['rt'].to_numpy()
all_log_rt = np.log(all_rt)

sns.histplot(all_rt)

m = np.nanmean(all_log_rt)
sd = np.nanstd(all_log_rt)
ub = np.exp(m + 3*sd)
lb = np.exp(m - 3*sd)

# bounds as vertical lines, labelled with their rounded values
plt.axvline(x=ub, c='red')
plt.axvline(x=lb, c='red')
plt.annotate(f'{round(lb,2)}', (lb, 45))
plt.annotate(f'{round(ub,2)}', (ub, 45))

plt.show()

### 2. Post-error adaptation in RT

Group-level

In [None]:
# RT histogram split by (condition, previous-trial accuracy).
# Uses `final_df`, the trial-level frame built above; the previously
# referenced `test_df` is not defined anywhere in this notebook.
condition_by_pre_acc = final_df[['condition', 'pre_acc']].apply(tuple, axis=1)

sns.histplot(
    final_df,
    x='rt',
    hue=condition_by_pre_acc,
)

Participant-level

In [None]:
# One RT histogram (with KDE) per participant, three panels per row,
# each panel with its own x and y axis.
g = sns.FacetGrid(
    cleared_data_df.sort_values(['ID']),
    col="ID",
    col_wrap=3,
    sharex=False,
    sharey=False,
    aspect=2,
)

# NOTE(review): no cell in this notebook creates a 'pre_response' column -
# presumably it comes from the loaded participant files; if not, 'pre_acc'
# may be the intended hue. TODO confirm.
g.map_dataframe(
    sns.histplot,
    x="rt",
    hue='pre_response',
    kde=True,
    palette='colorblind'
)

In [None]:
# Uses `final_df`, the trial-level frame built above; the previously
# referenced `test_df` is not defined anywhere in this notebook.
post_error = final_df['pre_acc'] == -1
post_correct = final_df['pre_acc'] == 1
incongruent = final_df['condition'] == -1
congruent = final_df['condition'] == 1

# np.mean on a pandas Series skips NaN rt values
print(f"Mean post error RT: {np.mean(final_df[post_error]['rt'])}")
print(f"Mean post correct RT: {np.mean(final_df[post_correct]['rt'])}")

print(f"Mean incongruent post error RT: {np.mean(final_df[post_error & incongruent]['rt'])}")
print(f"Mean incongruent post correct RT: {np.mean(final_df[post_correct & incongruent]['rt'])}")

print(f"Mean congruent post error RT: {np.mean(final_df[post_error & congruent]['rt'])}")
print(f"Mean congruent post correct RT: {np.mean(final_df[post_correct & congruent]['rt'])}")

In [None]:
from scipy.stats import ttest_ind

# Uses `final_df`, the trial-level frame built above; the previously
# referenced `test_df` is not defined anywhere in this notebook.
post_error = final_df['pre_acc'] == -1
post_correct = final_df['pre_acc'] == 1
incongruent = final_df['condition'] == -1
congruent = final_df['condition'] == 1


def _valid_rt(mask):
    """Return the non-NaN RTs of the selected trials (NaN would make the t-test NaN)."""
    return final_df.loc[mask, 'rt'].dropna()


print(f"post error vs post correct : {ttest_ind(_valid_rt(post_error), _valid_rt(post_correct))}")
print(f"post error vs post correct in incongruent: {ttest_ind(_valid_rt(post_error & incongruent), _valid_rt(post_correct & incongruent))}")

print(f"post error vs post correct in congruent: {ttest_ind(_valid_rt(post_error & congruent), _valid_rt(post_correct & congruent))}")


#### Test post-response adaptation and links with pre-trial accuracy and brain signal

In [None]:
import statsmodels.formula.api as smf

# Uses `final_df`, the trial-level frame built above; the previously
# referenced `test_df` is not defined anywhere in this notebook.
# Only complete trials (no NaN in any column) enter the regression.
test_df2 = final_df.dropna().copy()

# rescale rt by 1000 - presumably seconds to milliseconds; TODO confirm
test_df2['rt'] = test_df2['rt']*1000

# grand-mean centre the previous-trial FCz amplitude so the main effects are
# evaluated at its mean
test_df2['pre_ne_FCz_centered'] = test_df2['pre_ne_FCz'] - np.mean(test_df2['pre_ne_FCz'])

# rt regressed on previous-trial amplitude, previous-trial accuracy and their interaction
mod = smf.ols(formula='rt ~ pre_ne_FCz_centered*pre_acc', data=test_df2)
res = mod.fit()

print(res.summary())