In [1]:
import pandas as pd
import numpy as np
import os
from IPython.display import display

## 1. Load and glance into raw data

In [2]:
# Locate the raw data file.
PARENT_DIR = 'monster task data'
DATA_DIR   = 'strategic'
RAW_DIR    = os.path.join(PARENT_DIR, DATA_DIR)

# os.listdir returns entries in arbitrary order, so sort them to make the
# choice below reproducible across machines and runs.
FILES = sorted(os.listdir(RAW_DIR))

# Keep regular, non-hidden files only (skips e.g. .DS_Store and
# .ipynb_checkpoints). Files whose name contains 'extra' are deliberately
# skipped; they are not loaded in this notebook.
CANDIDATES = [
    f for f in FILES
    if 'extra' not in f
    and not f.startswith('.')
    and os.path.isfile(os.path.join(RAW_DIR, f))
]
if not CANDIDATES:
    raise FileNotFoundError(
        "No data file (name without 'extra') found in {!r}".format(RAW_DIR)
    )
FILENAME = CANDIDATES[0]
PATH = os.path.join(RAW_DIR, FILENAME)

# Read into a pandas DataFrame and preview the first rows.
df = pd.read_csv(PATH)
display(df.head())

Unnamed: 0,participant:assignmentId,condition,state,trial,blockTrial,trialStartTime,monster,family,category,preferredFood,choice,correct,rt
0,A10249252O9I20MRSOBVF:3QFUFYSY9ZU71A0RA11WIB75...,1,train,1,1,1273,Squid_4_5,Squid,categoryIgnore1D,oranges,oranges,True,5954
1,A10249252O9I20MRSOBVF:3QFUFYSY9ZU71A0RA11WIB75...,1,train,2,2,8079,Squid_6_3,Squid,categoryIgnore1D,oranges,bananas,False,3075
2,A10249252O9I20MRSOBVF:3QFUFYSY9ZU71A0RA11WIB75...,1,train,3,3,12431,Squid_6_6,Squid,categoryIgnore1D,oranges,oranges,True,1201
3,A10249252O9I20MRSOBVF:3QFUFYSY9ZU71A0RA11WIB75...,1,train,4,4,14423,Squid_6_2,Squid,categoryIgnore1D,oranges,oranges,True,989
4,A10249252O9I20MRSOBVF:3QFUFYSY9ZU71A0RA11WIB75...,1,train,5,5,16188,Squid_1_2,Squid,categoryIgnore1D,bananas,bananas,True,1870


## 2. Preprocess each subject

Here we prepare the dataframe for analysis:
1. Create a hierarchical index:
    - sup_index: subject id (sid) converted from raw **participant:assignmentId**
    - sub_index: trial number
2. Add a new column encoding switch trials
3. Reformat the **blockTrial** column to be the number of trials played in the new block (simply subtract 1 from the column)
4. Parse the **monster** column into two columns encoding dimensions 1 and 2
5. Convert non-numerical values to numbers:
    - **category** (`str` to `int`):
        - category1D -> 1
        - categoryIgnore1D -> 2
        - category2D -> 3
        - categoryRandom -> 4
    - **family** (`str` to `int`):
        - Bear -> 1
        - Bunny -> 2
        - GreenMonster -> 3
        - Squid -> 4
    - **state** (`str` to `int`):
        - train -> 0
        - free -> 1  
        - test -> 2
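    - **correct** (`bool` to `int`):
        - False -> 0
        - True -> 1
    - **choice** (`str` to `int`):
        - bananas -> 1, broccoli -> 2, carrot -> 3, grilled_cheese -> 4
        - oranges -> 5, pancakes -> 6, tacos -> 7, waffles -> 8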
6. Clean up by removing unwanted columns

Finally we rearrange the columns for better presentation:
- columns 1 to 3 contain experiment variables: condition, stage, blockTrial
- columns 4 to 7 contain stimulus variables: family, dimension 1, dimension 2, category (difficulty)
- columns 8 to 11 contain response variables: choice, correct, switch, rt

The resulting data is exclusively numeric, so converting it into a NumPy array for calculations is trivial.

In [3]:
# Guard against hidden state: this cell consumes the raw frame and rebinds
# `df`, so it cannot be run twice in a row.
if 'participant:assignmentId' not in df.columns:
    raise RuntimeError("`df` no longer holds the raw data; re-run the loading cell first.")

# Label -> integer codes, one explicit table per column.
CATEGORY_CODES = {'category1D': 1, 'categoryIgnore1D': 2, 'category2D': 3, 'categoryRandom': 4}
STATE_CODES    = {'train': 0, 'free': 1, 'test': 2}
FAMILY_CODES   = {'Bear': 1, 'Bunny': 2, 'GreenMonster': 3, 'Squid': 4}
FOOD_CODES     = {'bananas': 1, 'broccoli': 2, 'carrot': 3, 'grilled_cheese': 4,
                  'oranges': 5, 'pancakes': 6, 'tacos': 7, 'waffles': 8}


def encode_labels(values, codes):
    """Return `values` with every label found in `codes` swapped for its code.

    Parameters
    ----------
    values : pandas.Series
        Column of text labels.
    codes : dict
        Mapping from label to integer code.

    Returns
    -------
    pandas.Series
        Same index as `values`. Labels absent from `codes` (NaN included)
        pass through unchanged, matching the frame-wide
        ``DataFrame.replace`` call this helper stands in for.
    """
    return values.map(lambda label: codes.get(label, label))


# Convert participant IDs to numbers: 0, 1, 2, ... in order of first appearance
sids = df['participant:assignmentId'].unique()
sid_lookup = {raw_id: number for number, raw_id in enumerate(sids)}

# Parse the monster name "<family>_<d1>_<d2>" once; pieces 1 and 2 are the dimensions
monster_parts = df['monster'].str.split('_', expand=True)

# Columns that are not part of the cleaned table
to_remove = ['trialStartTime', 'monster', 'preferredFood']

# Final column layout: experiment | stimulus | response
new_order = ['condition','stage','blockTrial',   'family','D1','D2','category',   'choice','correct','switch','rt']

df = (
    df
    .assign(
        sid=df['participant:assignmentId'].map(sid_lookup),
        # Switch trials are the first trial of a block (raw blockTrial == 1) ...
        switch=(df['blockTrial'] == 1).astype(int),
        # ... after which blockTrial becomes "trials already played in this block"
        blockTrial=df['blockTrial'] - 1,
        # The split yields strings; convert so the dimensions are numeric too
        D1=pd.to_numeric(monster_parts[1]),
        D2=pd.to_numeric(monster_parts[2]),
        # Convert text categories, state, family and food choice to numbers
        category=encode_labels(df['category'], CATEGORY_CODES),
        state=encode_labels(df['state'], STATE_CODES),
        family=encode_labels(df['family'], FAMILY_CODES),
        choice=encode_labels(df['choice'], FOOD_CODES),
        # Convert boolean correct to int (for presentation)
        correct=df['correct'].astype(int),
    )
    .drop(columns=to_remove)
    .rename(columns={'state': 'stage'})
    # Hierarchical index: subject id, then trial number
    .set_index(['sid', 'trial'])
    # Rearranging also discards any remaining raw columns not listed in new_order
    .loc[:, new_order]
)

## 2+. Display new format

In [4]:
# Preview the first five rows of the reformatted table.
display(df.head(n=5))

Unnamed: 0_level_0,Unnamed: 1_level_0,condition,stage,blockTrial,family,D1,D2,category,choice,correct,switch,rt
sid,trial,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,1,1,0,0,4,4,5,2,5,1,1,5954
0,2,1,0,1,4,6,3,2,1,0,0,3075
0,3,1,0,2,4,6,6,2,5,1,0,1201
0,4,1,0,3,4,6,2,2,5,1,0,989
0,5,1,0,4,4,1,2,2,1,1,0,1870


## 3. Save clean(er) data to `clean_data` directory

In [5]:
# Mirror the input layout under clean_data/: clean_data/<DATA_DIR>/<FILENAME>
PARENT_SAVE_DIR = 'clean_data'
SAVE_DIR = os.path.join(PARENT_SAVE_DIR, DATA_DIR)
SAVE_PATH = os.path.join(SAVE_DIR, FILENAME)

# makedirs (not mkdir) so the parent 'clean_data' directory is created as well;
# exist_ok makes re-running the cell harmless.
os.makedirs(SAVE_DIR, exist_ok=True)

CSV_OPTIONS = dict(
    sep = ',',
    na_rep = '',
    header = True,
    index = True,
    index_label = ['sid','trial'],
    mode = 'w',
)

# pandas 1.5 renamed `line_terminator` to `lineterminator` and pandas 2.0
# removed the old spelling. Try the current name first and fall back for
# older installations; an unknown keyword raises TypeError before anything
# is written, so the retry is safe.
try:
    df.to_csv(SAVE_PATH, lineterminator='\n', **CSV_OPTIONS)
except TypeError:
    df.to_csv(SAVE_PATH, line_terminator='\n', **CSV_OPTIONS)