# Feature extraction

Imports

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
import os
import re

TODO:

To exclude:
1) percent inhibition on Stop trials less than 25% or greater than 75%;
2) percent correct responding on Go trials less than 60%;
3) percent errors (i.e., incorrect direction) on Go trials greater than 10%; and
4) SSRT estimate that was negative or less than 50 ms. Although SSRT is the primary indicator of task performance, mean and standard deviation of RT on Go trials, percent inhibition on stop trials, and percent errors on Go trials were also examined.

## Read self-report data

In [17]:
questionnaires_path = '../data/questionnaire/questionnaires_impulsivity'
questionnaires_df = pd.read_csv(
    f"{questionnaires_path}.csv", 
    dtype={"ID": "object"},
    sep=";"
)

Center and normalize questionnaire data

In [18]:
def center_and_normalize(df, exclude_col):
    # Select columns to scale
    cols_to_scale = [col for col in df.columns if col != exclude_col]
    
    # Initialize the scaler
    scaler = StandardScaler()
    
    # Fit and transform the data
    df[cols_to_scale] = scaler.fit_transform(df[cols_to_scale])
    
    return df

In [71]:
questionnaires_df_scaled = center_and_normalize(questionnaires_df, 'ID')
questionnaires_df_scaled

Unnamed: 0,ID,BIS,AAC-FULL,Dickman-Dysfunctional Impulsivity,Dickman-Functional Impulsivity,Dickman-Impulsivity
0,000,0.584046,-0.712524,2.326424,1.012745,2.096673
1,001,1.172450,-0.316936,-0.768497,-0.927586,-1.092494
2,002,-0.592763,0.322986,0.846244,0.285121,0.706523
3,003,-1.965706,1.102527,-1.037621,1.376558,0.297656
4,004,-0.004359,0.578955,-0.768497,0.163850,-0.356533
...,...,...,...,...,...,...
220,220,1.368585,-0.188952,0.980806,0.406392,0.870070
221,221,0.780180,-1.224462,0.307997,-0.685045,-0.274759
222,222,0.584046,0.067017,-0.499374,1.012745,0.379429
223,223,-1.181167,0.718574,0.846244,0.042580,0.542976


In [94]:
scales = ['ID', 'BIS', 'AAC-FULL', 'Dickman-Impulsivity']
questionnaires_df_scaled = questionnaires_df_scaled[scales]
questionnaires_df_scaled

Unnamed: 0,ID,BIS,AAC-FULL,Dickman-Impulsivity
0,000,0.584046,-0.712524,2.096673
1,001,1.172450,-0.316936,-1.092494
2,002,-0.592763,0.322986,0.706523
3,003,-1.965706,1.102527,0.297656
4,004,-0.004359,0.578955,-0.356533
...,...,...,...,...
220,220,1.368585,-0.188952,0.870070
221,221,0.780180,-1.224462,-0.274759
222,222,0.584046,0.067017,0.379429
223,223,-1.181167,0.718574,0.542976


## Read  behavioral data

In [289]:
def read_participants_data(path, filter_correct=True, filter_short=False, th=0.15):
    # List to store extracted IDs
    ids = []
    behavioral_summary_df = pd.DataFrame()
    participants_all_data = pd.DataFrame()
    
    # Pattern to match filenames
    pattern = re.compile(r'beh_SST-(\d{3})')
    
    # List all files in the directory
    for filename in os.listdir(path):
        match = pattern.search(filename)
        if match:
            ids.append(match.group(1))
    ids = sorted(ids)

    for index, id in enumerate(ids):
        # read participant's behavioral data
        participant_beh_df = pd.read_csv(
            f"{path}beh_SST-{id}.csv", 
        )
        
        # filter go trials with stop
        participant_go_nostop_df = participant_beh_df[
            (participant_beh_df['ST_type'].isna()) 
        ]
        
        # create participant-level single trial dataframe
        participant_single_trial_df = pd.DataFrame()
        participant_single_trial_df['go_condition'] = participant_go_nostop_df['GO_name']
        participant_single_trial_df['rt'] = participant_go_nostop_df['RE_time']
        participant_single_trial_df['response_type'] = participant_beh_df.apply(
            lambda row: 1 if row['RE_key'] == row['RE_true'] else 0,
            axis=1
        )
        participant_single_trial_df['omission'] = (
            participant_single_trial_df['rt'] > 1.0) | (participant_single_trial_df['rt'].isna()
        )
        participant_single_trial_df['ID'] = id
        participant_single_trial_df['index'] = index + 1

        if filter_correct:
            participant_single_trial_df = participant_single_trial_df[
                participant_single_trial_df['response_type'] == 'correct'
            ]
        if filter_short:
            participant_single_trial_df = participant_single_trial_df[
                participant_single_trial_df['rt'] > th
            ]
        
        # calculate left-right ratio
        left_count = participant_single_trial_df['go_condition'].str.upper().value_counts().get('LEFT', 0)
        right_count = participant_single_trial_df['go_condition'].str.upper().value_counts().get('RIGHT', 0)
        left_right_ratio = right_count

        if (participant_single_trial_df['rt'] > 1.0).any():
            print(f"Participant {id} has {(participant_single_trial_df['rt'] > 1.0).sum()} > 1000ms trials (omisions)")

        if (participant_single_trial_df['rt'] < .1).any():
            print(f"Participant {id} has {(participant_single_trial_df['rt'] < .1).sum()} < 100ms trials (fast error)")

        accuracy_sum = participant_single_trial_df[participant_single_trial_df['omission'] == False]['response_type'].sum()
        # create a summary
        participant_data = pd.DataFrame({
            'ID': [id],
            'rt_mean': [participant_single_trial_df['rt'].mean()],
            'rt_sd': [(participant_single_trial_df['rt']).var()],
            'right_count': [right_count],
            'left_count': [left_count],
            'accuracy_sum': [accuracy_sum],
            'accuracy_sum_with_omissions': participant_single_trial_df['response_type'].sum(),
            'omissions': (participant_single_trial_df['rt'] > 1.0).sum(),
            'num_trials': [len(participant_single_trial_df)],
            'accuracy': accuracy_sum/[len(participant_single_trial_df)],
        })
        
        
        behavioral_summary_df = pd.concat([behavioral_summary_df, participant_data], ignore_index=True)
        participants_all_data = pd.concat([participants_all_data, participant_single_trial_df], ignore_index=True)

    return behavioral_summary_df, participants_all_data

In [290]:
behavioral_summary_df, participants_data_df = read_participants_data(
    '../data/behavioral/', 
    filter_correct=False,
    filter_short=True,
    th = 0.15
)
omissions = behavioral_summary_df[behavioral_summary_df['omissions'] > 0]

print(f"\n{len(omissions)} participanst has in average {omissions['omissions'].mean()} ommisions.")

Participant 030 has 1 > 1000ms trials (omisions)
Participant 036 has 1 > 1000ms trials (omisions)
Participant 119 has 1 > 1000ms trials (omisions)
Participant 174 has 1 > 1000ms trials (omisions)
Participant 176 has 1 > 1000ms trials (omisions)

5 participanst has in average 1.0 ommisions.


In [291]:
participants_data_df

Unnamed: 0,go_condition,rt,response_type,omission,ID,index
0,left,0.483555,1,False,000,1
1,right,0.300505,1,False,000,1
2,right,0.350223,1,False,000,1
3,left,0.300410,1,False,000,1
4,right,0.267293,1,False,000,1
...,...,...,...,...,...,...
72638,right,0.215547,1,False,224,225
72639,right,0.340811,1,False,224,225
72640,right,0.271382,1,False,224,225
72641,right,0.410525,1,False,224,225


## Read EEG data

In [292]:
eeg_data = pd.DataFrame()

## Concat data

In [293]:
data_summary_df = behavioral_summary_df.merge(questionnaires_df_scaled, on='ID')
data_summary_df

Unnamed: 0,ID,rt_mean,rt_sd,right_count,left_count,accuracy_sum,accuracy_sum_with_omissions,omissions,num_trials,accuracy,BIS,AAC-FULL,Dickman-Impulsivity
0,000,0.330821,0.003720,162,158,316,316,0,320,0.987500,0.584046,-0.712524,2.096673
1,001,0.425735,0.023621,161,161,322,322,0,322,1.000000,1.172450,-0.316936,-1.092494
2,002,0.383269,0.012237,162,161,314,314,0,323,0.972136,-0.592763,0.322986,0.706523
3,003,0.430480,0.017669,162,161,323,323,0,323,1.000000,-1.965706,1.102527,0.297656
4,004,0.364595,0.003932,163,162,321,321,0,325,0.987692,-0.004359,0.578955,-0.356533
...,...,...,...,...,...,...,...,...,...,...,...,...,...
220,220,0.393663,0.009792,163,162,325,325,0,325,1.000000,1.368585,-0.188952,0.870070
221,221,0.404798,0.012802,160,159,311,311,0,319,0.974922,0.780180,-1.224462,-0.274759
222,222,0.368094,0.007844,163,160,321,321,0,323,0.993808,0.584046,0.067017,0.379429
223,223,0.441018,0.014917,163,160,322,322,0,323,0.996904,-1.181167,0.718574,0.542976


Save the data

In [294]:
data_summary_df.to_csv('../data/data_summary.csv')

In [295]:
data_participants_df = participants_data_df.merge(questionnaires_df_scaled, on='ID')
data_participants_df

Unnamed: 0,go_condition,rt,response_type,omission,ID,index,BIS,AAC-FULL,Dickman-Impulsivity
0,left,0.483555,1,False,000,1,0.584046,-0.712524,2.096673
1,right,0.300505,1,False,000,1,0.584046,-0.712524,2.096673
2,right,0.350223,1,False,000,1,0.584046,-0.712524,2.096673
3,left,0.300410,1,False,000,1,0.584046,-0.712524,2.096673
4,right,0.267293,1,False,000,1,0.584046,-0.712524,2.096673
...,...,...,...,...,...,...,...,...,...
72638,right,0.215547,1,False,224,225,-0.592763,-0.840509,-0.928947
72639,right,0.340811,1,False,224,225,-0.592763,-0.840509,-0.928947
72640,right,0.271382,1,False,224,225,-0.592763,-0.840509,-0.928947
72641,right,0.410525,1,False,224,225,-0.592763,-0.840509,-0.928947


In [296]:
data_participants_df.to_csv('../data/data_participants.csv')

## Explore the data

### Correlation matrix

In [119]:
columns = data_df.columns.tolist()
columns.remove('ID')

# Compute the correlation matrix for the selected columns
correlation_matrix = data_df[columns].corr()
display(correlation_matrix)

Unnamed: 0,rt_mean,rt_sd,left_right_ratio,num_trials,BIS,AAC-FULL,Dickman-Impulsivity
rt_mean,1.0,0.807383,-0.14252,-0.159829,-0.047045,-0.047679,-0.062499
rt_sd,0.807383,1.0,-0.240754,-0.243306,-0.058964,-0.093548,0.00546
left_right_ratio,-0.14252,-0.240754,1.0,0.857043,0.052728,-0.04191,-0.140133
num_trials,-0.159829,-0.243306,0.857043,1.0,0.070657,-0.014892,-0.163932
BIS,-0.047045,-0.058964,0.052728,0.070657,1.0,-0.444691,-0.337296
AAC-FULL,-0.047679,-0.093548,-0.04191,-0.014892,-0.444691,1.0,0.272078
Dickman-Impulsivity,-0.062499,0.00546,-0.140133,-0.163932,-0.337296,0.272078,1.0
