# Data structure
The subjects are forced to choose from two slot machines in first 4 trial. There are two experiments conditions:
1. horizon 1: the subjects choose freely for 1 step.
2. horizon 6: the subjects choose freely for 6 steps.

Columns:
- block: the block number
- age:
- game: there are multiple games in each block
- gameLength: the length of the game (horizon 1 or horizon 6)
- uc: uncertainty condition ( the number of times they choose option 2 (right I believe) in the forced trials)
- m1: the mean of the first slot machine
- m2: the mean of the second slot machine
- r{1~10}: the reward the subject got in each trial
- c{1~10}: the choice the subject made in each trial (the first 4 trials are forced)
- rt{1~10}: the reaction time of the subject in each trial
When below attributes are not NaN, this version of task consists of repeated games. i.e. identical forced choice trials and forced choice outcomes
- gID: the unique game id - so in a dataset with 160 games there will be 80 unique games - gID should go from 1 to 80 -
gID allows you to easily identify which games are repeats of each other.
- repeatNumber: either 1 or 2 - 1 when it's the first time they see that game, 2 when it's the second

In [16]:
import os
import time

import pandas as pd
import sys

sys.path.insert(0, os.path.abspath('..'))

In [17]:
path = "."
wanted_files = {
    "LIFESPAN_SZcontrols.csv",
    "LIFESPAN_agingAZ_all.csv",
    "LIFESPAN_sommerville.csv",
    "LIFESPAN_students.csv",
    "BATTERY_HorizonTaskFinal.csv",
    "LIFESPAN_Harms.csv",
    "LIFESPAN_Smith.csv",
    # "GOTTLIEB.csv", # online
}

files = [os.path.join(path, f) for f in os.listdir(path) 
         if f in wanted_files]
files

['.\\BATTERY_HorizonTaskFinal.csv',
 '.\\LIFESPAN_agingAZ_all.csv',
 '.\\LIFESPAN_Harms.csv',
 '.\\LIFESPAN_Smith.csv',
 '.\\LIFESPAN_sommerville.csv',
 '.\\LIFESPAN_students.csv',
 '.\\LIFESPAN_SZcontrols.csv']

In [18]:
idx = 0
raw = pd.read_csv(files[idx])
raw

Unnamed: 0,expt_name,subjectID,age,gender,subjectNumber,block,game,gameLength,uc,m1,...,rt1,rt2,rt3,rt4,rt5,rt6,rt7,rt8,rt9,rt10
0,TMS_behavior_3_20221019T110622.mat,EE003,22,1,1,1,1,10,2,60,...,0.950881,0.431133,1.157232,0.635978,4.137748,3.377787,6.534047,1.464503,11.477123,0.741042
1,TMS_behavior_3_20221019T110622.mat,EE003,22,1,1,1,2,10,3,60,...,3.159910,0.232624,0.371810,0.633875,4.499044,2.850453,2.353010,0.949093,4.791729,3.558566
2,TMS_behavior_3_20221019T110622.mat,EE003,22,1,1,1,3,10,2,28,...,2.052776,1.928397,0.373421,0.434766,1.901052,0.591958,0.814195,2.319797,0.875428,0.870943
3,TMS_behavior_3_20221019T110622.mat,EE003,22,1,1,1,4,5,1,52,...,0.810913,3.123280,0.334356,0.421026,2.146979,,,,,
4,TMS_behavior_3_20221019T110622.mat,EE003,22,1,1,1,5,5,2,40,...,0.627831,1.552514,0.451944,0.470202,10.413191,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30157,HORIZONBATTERY_behavior_684_20240624T094645.mat,EE684,71,3,236,4,124,10,3,28,...,0.666315,0.613335,0.946425,1.409739,3.054431,1.137758,0.601992,0.391527,0.229531,0.817063
30158,HORIZONBATTERY_behavior_684_20240624T094645.mat,EE684,71,3,236,4,125,10,2,60,...,0.739431,0.843717,1.655635,1.107467,1.557049,0.804084,0.826136,0.310140,0.447232,0.314356
30159,HORIZONBATTERY_behavior_684_20240624T094645.mat,EE684,71,3,236,4,126,10,1,80,...,0.561553,0.664907,0.780418,3.202240,1.359235,1.110035,0.783055,0.477570,0.391676,0.582062
30160,HORIZONBATTERY_behavior_684_20240624T094645.mat,EE684,71,3,236,4,127,10,2,60,...,0.657896,0.709347,1.239233,1.083144,1.261894,2.088720,1.122555,0.666807,0.397595,0.418682


In [19]:
rename_cols = {
    "subjectID": "subject",
    "gameLength": "game_length",
    "uc": "uncertainty",
    "m1": "value_option0",
    "m2": "value_option1",
    # "gID": "game_ID",
    # "repeatNumber": "repeat_number",
    "game": "block",
}

id_vars=["expt_name", "subject", "age", "gender", "block", "game_length",  "file_name", "value_option0", "value_option1", "uncertainty"]

max_trials = 10
reward_col_idx = ["r"+str(i) for i in range(1, max_trials+1)]
choice_col_idx = ["c"+str(i) for i in range(1, max_trials+1)]
rt_col_idx = ["rt"+str(i) for i in range(1, max_trials+1)]
rt_col_idx

['rt1', 'rt2', 'rt3', 'rt4', 'rt5', 'rt6', 'rt7', 'rt8', 'rt9', 'rt10']

In [20]:
# unique_ID = f'horizon_{time.time()}'
# from ruff_cm.utils import hash_string
unique_ID = "horizon_1688262697.533951"

# print(unique_ID)

In [21]:
def preprocess(data):
    data.rename(columns=rename_cols, inplace=True)

    # melt data
    reward = pd.melt(data, id_vars=id_vars, value_vars=reward_col_idx, var_name="trial", value_name="reward")
    response_time = pd.melt(data, id_vars=id_vars, value_vars=rt_col_idx, var_name="trial", value_name="RT")
    choice = pd.melt(data, id_vars=id_vars, value_vars=choice_col_idx, var_name="trial", value_name="choice")
    data = pd.concat([reward, response_time["RT"], choice["choice"]], axis=1)  # use reward as id_vars
    data["trial"] = data["trial"].apply(lambda x: int(x.replace("r", "")))  # so we remove prefix "r" in trial number
    # data["RT"] = data["RT"]   # in s
    
    data["forced"] = data["trial"].apply(lambda x: 1 if x <= 4 else 0)  # 1: forced trials, 0: free trials
    data["mask"] = data["forced"].copy()  # mask is the same as forced
    data["subject"] = data["expt_name"].astype(str) + data["subject"].astype(str)
    # data["subject"] = data["subject"].apply(lambda x: hash_string(x, unique_ID))

    # drop NaN row for horizon 6 condition
    data = data.dropna(subset=["reward", "RT", "choice"])

    # convert data type, invalid parsing will be set as NaN.
    data["age"] = pd.to_numeric(data["age"], errors='coerce', downcast='integer')
    data["gender"] = pd.to_numeric(data["gender"], errors='coerce', downcast='integer')
    # data["game_ID"] = pd.to_numeric(data["game_ID"], errors='coerce', downcast='integer')
    # data["repeat_number"] = pd.to_numeric(data["repeat_number"], errors='coerce', downcast='integer')

    data["choice"] = data["choice"].astype(int) - 1  # choice 0 or 1
    data["reward"] = data["reward"].astype(int)
    # data["game_length"] = data["game_length"].astype(int)
    data["horizon"] = data["game_length"].astype(int) - 4  # [5, 10] -> [1, 6]
    data["uncertainty"] = data["uncertainty"].astype(int) - 2  # [1, 2, 3] -> [-1, 0, 1]
    data["value_option0"] = data["value_option0"].astype(int)
    data["value_option1"] = data["value_option1"].astype(int)
    data["block"] = data["block"].astype(int) - 1
    data["trial"] = data["trial"].astype(int) - 1

    data.fillna(-99, inplace=True)  # fill NaN with -99
    data["age"] = data["age"].astype(int)  # convert to int
    data["gender"] = data["gender"].astype(int)  # convert to int
    # data["game_ID"] = data["game_ID"].astype(int)  # convert to int
    # data["repeat_number"] = data["repeat_number"].astype(int)  # convert to int
    return data

In [22]:
data = pd.DataFrame()

for file in files:
    print(f"Processing file: {file}")
    raw = pd.read_csv(file)
    raw.drop(columns=["block", "gID", "repeatNumber"], inplace=True)
    raw["file_name"] = os.path.basename(file)
    study_data = preprocess(raw)
    data = pd.concat([data, study_data], axis=0)

data.sort_values(by=["expt_name", "subject", "block", "trial"], inplace=True)
data["subject"] = data["subject"].astype("category").cat.codes
data.drop(columns=["expt_name", "game_length"], inplace=True)
data

Processing file: .\BATTERY_HorizonTaskFinal.csv
Processing file: .\LIFESPAN_agingAZ_all.csv
Processing file: .\LIFESPAN_Harms.csv
Processing file: .\LIFESPAN_Smith.csv
Processing file: .\LIFESPAN_sommerville.csv
Processing file: .\LIFESPAN_students.csv
Processing file: .\LIFESPAN_SZcontrols.csv


Unnamed: 0,subject,age,gender,block,file_name,value_option0,value_option1,uncertainty,trial,reward,RT,choice,forced,mask,horizon
27520,0,18,-99,0,LIFESPAN_students.csv,40,36,1,0,42,184.143661,0,1,1,6
89959,0,18,-99,0,LIFESPAN_students.csv,40,36,1,1,45,4.635614,1,1,1,6
152398,0,18,-99,0,LIFESPAN_students.csv,40,36,1,2,42,1.122532,1,1,1,6
214837,0,18,-99,0,LIFESPAN_students.csv,40,36,1,3,18,2.833540,1,1,1,6
277276,0,18,-99,0,LIFESPAN_students.csv,40,36,1,4,36,2.738855,0,0,0,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
323234,1391,-99,-99,159,LIFESPAN_students.csv,40,28,0,5,45,0.078302,1,0,0,6
385673,1391,-99,-99,159,LIFESPAN_students.csv,40,28,0,6,33,0.088727,0,0,0,6
448112,1391,-99,-99,159,LIFESPAN_students.csv,40,28,0,7,45,0.208424,0,0,0,6
510551,1391,-99,-99,159,LIFESPAN_students.csv,40,28,0,8,39,0.109955,0,0,0,6


In [23]:
# raw = pd.read_csv(file)
# raw

In [24]:
# data.to_csv("./exp.csv", index=False)

## Reject bad subjects

In [25]:
c1 = []
c2 = []
c3 = []
c4 = []
subjects_to_drop = []


for sub in data['subject'].unique():
    sub_data_all = data[data['subject'] == sub]
    sub_data = sub_data_all[sub_data_all['forced'] == 0]
    mean_reward = sub_data['reward'].mean()
    choice_autocorr = sub_data['choice'].autocorr(lag=1)
    rt = sub_data['RT']

    criterion1 = (len(rt[rt < 0.03]) / len(sub_data)) > 0.2
    criterion2 = mean_reward < 52
    criterion3 = choice_autocorr > 0.85 or choice_autocorr < 0.15
    criterion4 = abs(0.5 - (sub_data['choice'] == 1).astype(float).mean()) > 0.15
    criteria_sum = int(criterion1) + int(criterion2) + int(criterion3) + int(criterion4)
    if criterion1:
        c1.append(sub)
    if criterion2:
        c2.append(sub)
    if criterion3:
        c3.append(sub)
    if criterion4:
        c4.append(sub)
    if criteria_sum >= 3:
        subjects_to_drop.append(sub)

In [26]:
print(len(c1), len(c2), len(c3), len(c4))

190 236 332 42


In [27]:
print(len(data['subject'].unique()))
data = data[~data['subject'].isin(subjects_to_drop)]
print(len(data['subject'].unique()))

1392
1352


In [28]:
data.to_csv("./exp_all_filenames.csv", index=False)