In [43]:
import numpy as np
import pandas as pd
import os
from pathlib import Path
import janitor as jn

In [44]:
# Set path - this should contain all files to be analyzed
inputdir = "C:/Users/annas/OneDrive/Desktop/BCBL/Preprod_2 (Pilot)/data"

# Create list of files
all_filepaths = list(Path(inputdir).rglob("*.csv"))

# Initialize an empty list to store dataframes
df_list = []

In [45]:
def custom_diff(df, columns):

    for column in columns:
        # Prepare new column names
        new_column_name = column.replace('button_', '').replace('numClicks', 'numClicksDiff')
        # Initialize new column with zeros
        df[new_column_name] = 0

        for i in range(len(df)):
            if i == 0:
                # Keep first value unchanged...
                df[new_column_name].iloc[i] = df[column].iloc[i]
            elif df[column].iloc[i] == 0:
                # ... as well as any 0's - we want only differences if count of button presses increases
                df[new_column_name].iloc[i] = df[column].iloc[i]
            else:
                # For all other values, calculate the difference to previous row
                df[new_column_name].iloc[i] = df[column].iloc[i] - df[column].iloc[i-1]
                
    return df

In [46]:
for filepath in all_filepaths:
    # Read the Excel file
    # Drop column with instructions as it will throw a tokenizing error with sep = comma
    # i.e instructions contain a comma, so the reader will attempt to split the data in more columns than exist for the rest of data
    df = pd.read_csv(filepath, usecols=lambda x: x != "text_instructions")

    # Drop remaining row with instructions
    df = df[df['item'].apply(lambda x: str(x).isalnum())]

    # Drop all NaN rows and practice trials
    df = df[df['question'].apply(lambda x: pd.notna(x))]

    # Only keep rows with comprehension questions
    df = df[df['words'].isna()]

    # Re-code button responses (change cumulative to regular binary)
    df = custom_diff(df, ['button_v.numClicks', 'button_f.numClicks', 'button_idontknow.numClicks'])

    # Re-code button response into one column
    df['button_response'] = df.apply(lambda x: 'True' if (x['v.numClicksDiff'] == 1)
                                      else 'False' if (x['f.numClicksDiff'] == 1)
                                      else 'Idk', axis=1)

    # Determine response
    df['response'] = df.apply(lambda x: 'NaN' if (pd.isna(x['answer']))
                                      else 'Correct' if (x['v.numClicksDiff'] == 1 and x['answer'] == True)
                                      else 'Correct' if (x['f.numClicksDiff'] == 1 and x['answer'] == False)
                                      else 'Incorrect', axis=1)


    # Select columns of interest - clean
    df = df[['Subject ID', 'Language', 'Block', 'List', 'item', 'question', 'answer', 'button_response', 'response', ]]
    
    # Append the dataframe to the list
    df_list.append(df)

# Combine all dataframes in the list into one dataframe
merged_df = pd.concat(df_list, ignore_index=True)

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df[new_column_name].iloc[i] = df[column].iloc[i]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[new_column_

In [47]:
merged_df

Unnamed: 0,Subject ID,Language,Block,List,item,question,answer,button_response,response
0,cristina,French,TongueTapping,A1,7,Elle a probablement besoin de lunettes.,True,True,Correct
1,cristina,French,TongueTapping,A1,90,Elle a acheté un livre.,False,False,Correct
2,cristina,French,TongueTapping,A1,96,Il a trouvé un tigre.,False,False,Correct
3,cristina,French,TongueTapping,A1,54,"À l'automne, les routes étaient fermées.",False,False,Correct
4,cristina,French,TongueTapping,A1,10,Elle n'était pas sortie de la maison.,False,False,Correct
...,...,...,...,...,...,...,...,...,...
115,cristina,French,SyllableProduction,B1,20,La moto allait tomber en panne d'essence.,False,Idk,Incorrect
116,cristina,French,SyllableProduction,B1,22,Le bâtiment n'a que deux étages.,False,False,Correct
117,cristina,French,SyllableProduction,B1,23,L'endroit était très proche.,True,True,Correct
118,cristina,French,SyllableProduction,B1,56,"Avant, elle cuisinait assise.",False,False,Correct


In [None]:
# Custom_diff: extended ver.
def custom_diff(df, columns):

    for column in columns:
        # Prepare new column names
        new_column_name = column.replace('button_', '').replace('numClicks', 'numClicksDiff')
        # Initialize new column with zeros
        df[new_column_name] = 0

        found_first_one = False  # Flag to identify the first occurrence of 1
        for i in range(len(df)):
            if df[column].iloc[i] == 1 and not found_first_one:
                # Keep the first occurrence of 1 unchanged
                df[new_column_name].iloc[i] = df[column].iloc[i]
                found_first_one = True
            elif found_first_one:
                if df[column].iloc[i] == 0:
                    # If the row contains 0, keep it as 0
                    df[new_column_name].iloc[i] = 0
                else:
                    # Calculate the difference with the previous row
                    df[new_column_name].iloc[i] = df[column].iloc[i] - df[column].iloc[i-1]
            else:
                # Before the first occurrence of 1, keep the values unchanged
                df[new_column_name].iloc[i] = df[column].iloc[i]
                
    return df

In [None]:
# Outtakes: practice trial manipulations

    # Distinguish practice and experimental trials
    df = df.case_when(
        df['rp_sentence'].notna(), 'Practice', 
        df['sentence'].notna(), 'Experiment',
        column_name = 'trial'
    )

    # Combine columns
    df['question'] = df['rp_sentence_comprehensionq'].combine_first(df['question'])
    df['words'] = df['rp_words'].combine_first(df['words'])
    df['button_v.numClicks'] = df['button_rp_v.numClicks'].combine_first(df['button_v.numClicks'])
    df['button_f.numClicks'] = df['button_rp_f.numClicks'].combine_first(df['button_f.numClicks'])
    df['button_idk.numClicks'] = df['button_rp_idk.numClicks'].combine_first(df['button_idontknow.numClicks'])