In [5]:
import numpy as np
import pandas as pd
import os
from pathlib import Path
import janitor as jn
import warnings

# Evaluation of comprehension question responses during experiment (from PsychoPy)

In [64]:
# Suppress every Python warning for the rest of the session.
# The intended target is pandas' chained-assignment warning, but this filter is not
# selective: it hides all other warnings as well.
# NOTE(review): the original author expected the code to keep working under pandas 3.0 -
# confirm on that version; delete the line below to see the warnings again.
warnings.simplefilter('ignore')

## Function to recode cumulative button press responses

In [68]:
def custom_diff(df, columns):
    """Recode cumulative button-click counters into per-row increments.

    For every name in ``columns`` a new column is added to ``df``. The new name
    is the source name with ``'button_'`` removed and ``'numClicks'`` replaced
    by ``'numClicksDiff'`` (``'button_v.numClicks'`` -> ``'v.numClicksDiff'``).

    The new column holds, row by row:
      * first row: the source value, unchanged;
      * rows whose source value is 0: 0;
      * every other row: source value minus the source value of the previous row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns listed in ``columns``. It is modified in
        place (the new columns are added to it).
    columns : iterable of str
        Names of the cumulative counter columns to recode.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the new columns added.
    """
    for column in columns:
        # Prepare new column name
        new_column_name = column.replace('button_', '').replace('numClicks', 'numClicksDiff')

        counts = df[column]

        # Difference to the previous row; the first row is compared against 0,
        # which leaves its value unchanged.
        increments = counts - counts.shift(1, fill_value=0)

        # A count of 0 is kept as 0 rather than turned into a negative difference.
        increments = increments.mask(counts == 0, 0)

        # Assign the whole column at once. Writing cell by cell through
        # df[name].iloc[i] = ... is chained assignment, which does not update
        # df when pandas Copy-on-Write is active.
        df[new_column_name] = increments

    return df

## Single file analysis (Ana's preference)

In [104]:
# Set path - this should contain all files to be analyzed
# Can be from multiple participants (4+ files), or from one, which will be 4 files (2 blocks x 2 sessions)
# Output will be in 1:1 ratio (for each input file, one output file)

inputdir = "C:/Users/annas/OneDrive/Desktop/BCBL/Preprod_2 (Jupyter)/data/compquest"

# Create list of files (searched recursively).
# Sorted so that the processing order is the same on every run and every machine;
# rglob on its own returns files in whatever order the filesystem reports them.
all_filepaths = sorted(Path(inputdir).rglob("*.csv"))

In [105]:
for filepath in all_filepaths:
    # Read the PsychoPy CSV export
    # Drop column with instructions as it will throw a tokenizing error with sep = comma
    # i.e. instructions contain a comma, so the reader will attempt to split the data in more columns than exist for the rest of data
    df = pd.read_csv(filepath, usecols=lambda x: x != "text_instructions")

    # Drop all NaN rows and practice trials (question must be present) and
    # only keep rows with comprehension questions (words must be empty).
    # .copy() makes an independent frame, so the column assignments below do not
    # write into a slice of the raw data.
    df = df[df['question'].notna() & df['words'].isna()].copy()

    # Re-code button responses (change cumulative to regular binary)
    df = custom_diff(df, ['button_v.numClicks', 'button_f.numClicks', 'button_idontknow.numClicks'])

    pressed_true = df['v.numClicksDiff'] == 1
    pressed_false = df['f.numClicksDiff'] == 1

    # Re-code button response into one column.
    # np.select takes the first matching condition, same as the if/else chain it replaces,
    # and also works when the frame has no rows.
    df['button_response'] = np.select([pressed_true, pressed_false],
                                      ['True', 'False'],
                                      default='Idk')

    # Determine response: 'NaN' when there is no expected answer, 'Correct' when the
    # pressed button matches the expected answer, 'Incorrect' otherwise
    df['response'] = np.select([df['answer'].isna(),
                                pressed_true & df['answer'].eq(True),
                                pressed_false & df['answer'].eq(False)],
                               ['NaN', 'Correct', 'Correct'],
                               default='Incorrect')

    # Sanity check in case of an error, to know which file was last successfully processed
    print("Processing file:", filepath)

    # Select columns of interest
    df = df.loc[:, ['Subject ID', 'Language', 'Block', 'List', 'item', 'question', 'answer', 'button_response', 'response']]

    # Get original file name (without directory and extension)
    excel_file_name = filepath.stem

    # Concatenate it with _compquest to name the output and save as Excel sheet
    # (written to the current working directory)
    df.to_excel(f'{excel_file_name}_compquest.xlsx', index = False)

Processing file: C:\Users\annas\OneDrive\Desktop\BCBL\Preprod_2 (Jupyter)\data\compquest\10026_1_French_TongueTapping_A1.csv
Processing file: C:\Users\annas\OneDrive\Desktop\BCBL\Preprod_2 (Jupyter)\data\compquest\10026_1_Spanish_TongueTapping_A1.csv
Processing file: C:\Users\annas\OneDrive\Desktop\BCBL\Preprod_2 (Jupyter)\data\compquest\10026_2_Spanish_SyllableProduction_B1.csv
Processing file: C:\Users\annas\OneDrive\Desktop\BCBL\Preprod_2 (Jupyter)\data\compquest\10026_4_French_SyllableProduction_B1.csv
Processing file: C:\Users\annas\OneDrive\Desktop\BCBL\Preprod_2 (Jupyter)\data\compquest\11903_1_Spanish_TongueTapping_A2.csv
Processing file: C:\Users\annas\OneDrive\Desktop\BCBL\Preprod_2 (Jupyter)\data\compquest\11903_2_Spanish_SyllableProduction_B2.csv
Processing file: C:\Users\annas\OneDrive\Desktop\BCBL\Preprod_2 (Jupyter)\data\compquest\4849_1_French_TongueTapping_A2.csv
Processing file: C:\Users\annas\OneDrive\Desktop\BCBL\Preprod_2 (Jupyter)\data\compquest\4849_2_French_Syll

## 'Mass' analysis

In [None]:
# Set path - this should contain all files to be analyzed
# Can be from multiple participants (4+ files), or from one, which will be 4 files (2 blocks x 2 sessions)
# Output will be 1 file
inputdir = "C:/Users/annas/OneDrive/Desktop/BCBL/Preprod_2 (Jupyter)/data/compquest"

# Create list of files (searched recursively).
# Sorted so that the row order of the merged output, and the Subject ID that names the
# output file, do not depend on the order in which the filesystem reports the files.
all_filepaths = sorted(Path(inputdir).rglob("*.csv"))

# Initialize an empty list to store dataframes
df_list = []

In [None]:
# Start from an empty list every time this cell runs, so that re-running it
# does not append every file a second time
df_list = []

for filepath in all_filepaths:
    # Read the PsychoPy CSV export
    # Drop column with instructions as it will throw a tokenizing error with sep = comma
    # i.e. instructions contain a comma, so the reader will attempt to split the data in more columns than exist for the rest of data
    df = pd.read_csv(filepath, usecols=lambda x: x != "text_instructions")

    # Drop all NaN rows and practice trials (question must be present) and
    # only keep rows with comprehension questions (words must be empty).
    # .copy() makes an independent frame, so the column assignments below do not
    # write into a slice of the raw data.
    df = df[df['question'].notna() & df['words'].isna()].copy()

    # Re-code button responses (change cumulative to regular binary)
    df = custom_diff(df, ['button_v.numClicks', 'button_f.numClicks', 'button_idontknow.numClicks'])

    pressed_true = df['v.numClicksDiff'] == 1
    pressed_false = df['f.numClicksDiff'] == 1

    # Re-code button response into one column.
    # np.select takes the first matching condition, same as the if/else chain it replaces,
    # and also works when the frame has no rows.
    df['button_response'] = np.select([pressed_true, pressed_false],
                                      ['True', 'False'],
                                      default='Idk')

    # Determine response: 'NaN' when there is no expected answer, 'Correct' when the
    # pressed button matches the expected answer, 'Incorrect' otherwise
    df['response'] = np.select([df['answer'].isna(),
                                pressed_true & df['answer'].eq(True),
                                pressed_false & df['answer'].eq(False)],
                               ['NaN', 'Correct', 'Correct'],
                               default='Incorrect')

    # Select columns of interest
    df = df.loc[:, ['Subject ID', 'Language', 'Block', 'List', 'item', 'question', 'answer', 'button_response', 'response']]

    # Append the dataframe to the list
    df_list.append(df)

# Fail with a readable message instead of pandas' "No objects to concatenate"
if not df_list:
    raise ValueError("No .csv files were found - check that inputdir points to the data folder")

# Combine all dataframes in the list into one dataframe
merged_df = pd.concat(df_list, ignore_index=True)

if merged_df.empty:
    raise ValueError("The input files contain no comprehension-question rows - nothing to export")

# Output
# Name file according to the first cell in Subject ID column.
# The value is looked up before building the file name: nesting the same quote type
# inside an f-string is a syntax error before Python 3.12.
# Note: if the folder holds several participants, the file is still named after the first one only.
first_subject_id = merged_df.loc[merged_df.index[0], 'Subject ID']
merged_df.to_excel(f'{first_subject_id}_compquest.xlsx', index = False)

## Misc

In [None]:
# Outtakes: extended ver. of function
def custom_diff(df, columns):
    """Recode cumulative click counters, leaving rows before the first 1 untouched.

    Extended variant of the recoding function. NOTE: it has the same name as the
    ``custom_diff`` defined earlier in this notebook, so running this cell
    replaces that definition for the rest of the session.

    For every name in ``columns`` a new column is added to ``df``; the new name
    is the source name with ``'button_'`` removed and ``'numClicks'`` replaced
    by ``'numClicksDiff'``. The new column holds, row by row:
      * rows before the first source value equal to 1: the source value, unchanged;
      * the first row whose source value is 1: that value (1);
      * later rows whose source value is 0: 0;
      * all other later rows: source value minus the source value of the previous row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns listed in ``columns``. Modified in place.
    columns : iterable of str
        Names of the cumulative counter columns to recode.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the new columns added.
    """
    for column in columns:
        # Prepare new column name
        new_column_name = column.replace('button_', '').replace('numClicks', 'numClicksDiff')

        counts = df[column]
        is_one = counts == 1

        # Running number of 1's seen so far (current row included)
        ones_so_far = is_one.cumsum()

        # Rows strictly after the first occurrence of 1
        first_one = is_one & (ones_so_far == 1)
        after_first_one = (ones_so_far > 0) & ~first_one

        # Difference to the previous row, with 0's kept as 0
        increments = (counts - counts.shift(1)).mask(counts == 0, 0)

        # Use the difference only after the first 1; keep the source value elsewhere.
        # The column is assigned in one step: writing cell by cell through
        # df[name].iloc[i] = ... is chained assignment, which does not update
        # df when pandas Copy-on-Write is active.
        df[new_column_name] = increments.where(after_first_one, counts)

    return df

In [None]:
# Outtakes: practice trial manipulations

    # NOTE(review): unused fragment kept for reference. It is indented as if it belonged
    # inside a per-file loop body and expects a DataFrame named `df`; it will not run as a
    # stand-alone cell.

    # Distinguish practice and experimental trials
    # NOTE(review): `case_when` is not a built-in DataFrame method in every pandas version;
    # presumably it is provided by pyjanitor (imported as `jn` at the top) - confirm.
    # Adds a 'trial' column: 'Practice' where rp_sentence is present,
    # 'Experiment' where sentence is present.
    df = df.case_when(
        df['rp_sentence'].notna(), 'Practice', 
        df['sentence'].notna(), 'Experiment',
        column_name = 'trial'
    )

    # Combine columns
    # For each pair, take the practice-trial (rp_) value where present and
    # fall back to the experimental column otherwise.
    df['question'] = df['rp_sentence_comprehensionq'].combine_first(df['question'])
    df['words'] = df['rp_words'].combine_first(df['words'])
    df['button_v.numClicks'] = df['button_rp_v.numClicks'].combine_first(df['button_v.numClicks'])
    df['button_f.numClicks'] = df['button_rp_f.numClicks'].combine_first(df['button_f.numClicks'])
    # NOTE(review): unlike the lines above, the result goes into a new column
    # 'button_idk.numClicks' rather than back into 'button_idontknow.numClicks', which is
    # the name passed to custom_diff elsewhere in this notebook - confirm which is intended.
    df['button_idk.numClicks'] = df['button_rp_idk.numClicks'].combine_first(df['button_idontknow.numClicks'])