# Data structure
In each game, the subjects make forced choices between two slot machines for the first 4 trials. There are two experimental conditions:
1. horizon 1: the subjects choose freely for 1 step.
2. horizon 6: the subjects choose freely for 6 steps.

Columns:
- block: the block number
- age: the age of the subject (in years)
- game: there are multiple games in each block
- gameLength: the length of the game (horizon 1 or horizon 6)
- uc: uncertainty condition (the number of times option 2 is chosen in the forced trials; option 2 is presumably the right-hand machine, to be confirmed)
- m1: the mean of the first slot machine
- m2: the mean of the second slot machine
- r{1~10}: the reward the subject got in each trial
- c{1~10}: the choice the subject made in each trial (the first 4 trials are forced)
- rt{1~10}: the reaction time of the subject in each trial
When the attributes below are not NaN, this version of the task contains repeated games, i.e. games with identical forced-choice trials and identical forced-choice outcomes:
- gID: the unique game id. For example, in a dataset with 160 games there will be 80 unique games, so gID should go from 1 to 80.
gID allows you to easily identify which games are repeats of each other.
- repeatNumber: either 1 or 2 - 1 when it's the first time they see that game, 2 when it's the second

In [43]:
import os
import time

import pandas as pd
import sys

sys.path.insert(0, os.path.abspath('..'))

In [44]:
# Directory that holds the raw CSV exports, and the subset of them to process.
# Un-comment a name to include that dataset.
path = "."
wanted_files = {
    # "BATTERY_HorizonTaskFinal.csv",
    # "LIFESPAN_Harms.csv",
    # "LIFESPAN_Smith.csv",
    "GOTTLIEB.csv",
}

# os.listdir() returns entries in arbitrary order, so sort the matches: this keeps
# `files[idx]` and the order in which datasets are concatenated identical across
# runs and operating systems.
files = sorted(
    os.path.join(path, f) for f in os.listdir(path)
    if f in wanted_files
)
files

['.\\GOTTLIEB.csv']

In [45]:
# Preview one of the selected files in its original wide layout
# (one row per game, one column per trial-level measurement).
idx = 0  # position of the file to preview within `files`
raw = pd.read_csv(filepath_or_buffer=files[idx])
raw

Unnamed: 0,expt_name,subjectID,age,ageGroup,gender,genderText,education,subjectNumber,block,game,...,rt1,rt2,rt3,rt4,rt5,rt6,rt7,rt8,rt9,rt10
0,horizon_31GECDVAAM8H85SL37SZS8HBCON666_A4ZCS7Y...,A4ZCS7YTJS0DF,63.0,61-65,1.0,female,college,1,,1,...,1953.3805,1568.3188,1184.9870,2635.0035,901.6503,,,,,
1,horizon_31GECDVAAM8H85SL37SZS8HBCON666_A4ZCS7Y...,A4ZCS7YTJS0DF,63.0,61-65,1.0,female,college,1,,2,...,883.3246,835.0115,934.9790,1884.9959,4418.3453,1035.0187,666.6861,3151.6226,1185.0118,918.3300
2,horizon_31GECDVAAM8H85SL37SZS8HBCON666_A4ZCS7Y...,A4ZCS7YTJS0DF,63.0,61-65,1.0,female,college,1,,3,...,1166.6223,1418.3513,833.3139,2918.3260,901.6094,,,,,
3,horizon_31GECDVAAM8H85SL37SZS8HBCON666_A4ZCS7Y...,A4ZCS7YTJS0DF,63.0,61-65,1.0,female,college,1,,4,...,1033.3192,849.9720,1301.5922,3118.2715,2185.0424,1918.3634,3018.1861,1568.3512,901.6707,950.0144
4,horizon_31GECDVAAM8H85SL37SZS8HBCON666_A4ZCS7Y...,A4ZCS7YTJS0DF,63.0,61-65,1.0,female,college,1,,5,...,949.9674,885.0107,818.3631,2434.9241,1983.3619,1235.0213,2484.9609,2535.0606,851.6448,584.9999
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
109114,horizon_3ZZAYRN1J9DDZ991ZGMMKIVGNF6TOF_A3R7RQC...,A3R7RQCSWU4848,43.0,41-45,1.0,female,college,642,4.0,156,...,599.8992,516.8944,683.0779,599.9543,1034.0029,933.6416,466.9264,533.6191,383.6485,
109115,horizon_3ZZAYRN1J9DDZ991ZGMMKIVGNF6TOF_A3R7RQC...,A3R7RQCSWU4848,43.0,41-45,1.0,female,college,642,4.0,157,...,666.2068,599.3662,783.8645,533.9554,,,,,,
109116,horizon_3ZZAYRN1J9DDZ991ZGMMKIVGNF6TOF_A3R7RQC...,A3R7RQCSWU4848,43.0,41-45,1.0,female,college,642,4.0,158,...,683.5966,733.3335,749.4404,1316.8962,,,,,,
109117,horizon_3ZZAYRN1J9DDZ991ZGMMKIVGNF6TOF_A3R7RQC...,A3R7RQCSWU4848,43.0,41-45,1.0,female,college,642,4.0,159,...,684.0670,599.9653,632.3177,749.5613,749.7587,383.4958,316.8175,433.7874,450.5013,


In [46]:
# Raw column name -> column name used in the processed table.
# ("gID" -> "game_ID" and "repeatNumber" -> "repeat_number" are currently not carried over.)
rename_cols = {
    "subjectID": "subject",
    "gameLength": "game_length",
    "uc": "uncertainty",
    "m1": "value_option0",
    "m2": "value_option1",
    "game": "block",
}

# Game-level columns that are repeated on every trial row after melting.
id_vars = [
    "expt_name",
    "subject",
    "age",
    "gender",
    "block",
    "game_length",
    "value_option0",
    "value_option1",
    "uncertainty",
]

# Per-trial column names in the wide table: r1..r10, c1..c10, rt1..rt10.
max_trials = 10
reward_col_idx = [f"r{i}" for i in range(1, max_trials + 1)]
choice_col_idx = [f"c{i}" for i in range(1, max_trials + 1)]
rt_col_idx = [f"rt{i}" for i in range(1, max_trials + 1)]
rt_col_idx

['rt1', 'rt2', 'rt3', 'rt4', 'rt5', 'rt6', 'rt7', 'rt8', 'rt9', 'rt10']

In [47]:
# Fixed identifier string, frozen from an earlier run (it has the form
# f'horizon_{time.time()}'). Its only consumer is the subject-hashing step in
# preprocess(), which is currently commented out, so the value is unused for now.
unique_ID = "horizon_1688262697.533951"

In [48]:
def preprocess(data, n_forced=4):
    """Convert one wide-format horizon-task table into a one-row-per-trial table.

    Relies on the module-level configuration `rename_cols`, `id_vars`,
    `reward_col_idx`, `rt_col_idx` and `choice_col_idx`.

    Parameters
    ----------
    data : pandas.DataFrame
        Wide table with one row per game. It must not already contain a column
        whose name is a rename target (e.g. drop the raw ``block`` column first,
        because ``game`` is renamed to ``block``). The frame is NOT modified.
    n_forced : int, default 4
        Number of forced-choice trials at the start of every game. Used to build
        the ``forced``/``mask`` flags and to derive ``horizon`` from the game length.

    Returns
    -------
    pandas.DataFrame
        Long table with the `id_vars` columns plus ``trial``, ``reward``, ``RT``,
        ``choice``, ``forced``, ``mask`` and ``horizon``. ``trial``, ``block`` and
        ``choice`` are zero-based; any remaining NaN is replaced by -99.

    Raises
    ------
    ValueError
        From ``astype(int)`` if a game-level column that is cast to int
        (game length, uncertainty, option values, block) contains NaN.
    """
    # Rename into a new frame so the caller's table is left untouched.
    wide = data.rename(columns=rename_cols)

    # Melt rewards, reaction times and choices separately. The three melts share
    # the same id_vars and trial order, so their rows line up one-to-one and can be
    # glued together column-wise (the reward melt supplies the id columns).
    melt_kwargs = dict(id_vars=id_vars, var_name="trial")
    reward = pd.melt(wide, value_vars=reward_col_idx, value_name="reward", **melt_kwargs)
    response_time = pd.melt(wide, value_vars=rt_col_idx, value_name="RT", **melt_kwargs)
    choice = pd.melt(wide, value_vars=choice_col_idx, value_name="choice", **melt_kwargs)
    long = pd.concat([reward, response_time["RT"], choice["choice"]], axis=1)

    # "r7" -> 7: the trial label comes from the reward column names.
    long["trial"] = long["trial"].str.replace("r", "", regex=False).astype(int)

    long["forced"] = (long["trial"] <= n_forced).astype(int)  # 1: forced trials, 0: free trials
    long["mask"] = long["forced"].copy()  # mask is the same as forced
    long["subject"] = long["expt_name"].astype(str) + long["subject"].astype(str)

    # Drop trials that were never played or are incomplete: shorter games leave the
    # later trial columns empty. Take an explicit copy so the assignments below
    # write to an independent frame instead of a slice (no SettingWithCopyWarning).
    long = long.dropna(subset=["reward", "RT", "choice"]).copy()

    # Demographics may be missing or unparsable: coerce to NaN here, fill further down.
    long["age"] = pd.to_numeric(long["age"], errors="coerce", downcast="integer")
    long["gender"] = pd.to_numeric(long["gender"], errors="coerce", downcast="integer")

    long["choice"] = long["choice"].astype(int) - 1  # choice 0 or 1
    long["reward"] = long["reward"].astype(int)
    long["horizon"] = long["game_length"].astype(int) - n_forced  # [5, 10] -> [1, 6]
    long["uncertainty"] = long["uncertainty"].astype(int) - 2  # [1, 2, 3] -> [-1, 0, 1]
    long["value_option0"] = long["value_option0"].astype(int)
    long["value_option1"] = long["value_option1"].astype(int)
    long["block"] = long["block"].astype(int) - 1  # zero-based
    long["trial"] = long["trial"].astype(int) - 1  # zero-based

    # -99 is the sentinel for missing values; only after filling can the
    # demographic columns be stored as plain integers.
    long = long.fillna(-99)
    long["age"] = long["age"].astype(int)
    long["gender"] = long["gender"].astype(int)
    return long

In [49]:
# Preprocess every selected file, then stack the per-study tables in a single
# concat. Collecting into a list avoids re-copying the accumulated frame on every
# iteration and avoids concatenating with an empty placeholder DataFrame.
study_frames = []
for file in files:
    raw = pd.read_csv(file)
    # The raw `block` column must be removed before preprocess() renames
    # `game` -> `block`. `gID` / `repeatNumber` are not used downstream; tolerate
    # files that do not ship them instead of failing with a KeyError.
    raw = raw.drop(columns=["block", "gID", "repeatNumber"], errors="ignore")
    study_frames.append(preprocess(raw))

if not study_frames:
    raise ValueError("No input files were found; check `path` and `wanted_files`.")

data = pd.concat(study_frames, axis=0)
data = data.sort_values(by=["expt_name", "subject", "block", "trial"])
# Replace the long subject strings by compact integer codes.
data["subject"] = data["subject"].astype("category").cat.codes
data = data.drop(columns=["expt_name", "game_length"])
data

Unnamed: 0,subject,age,gender,block,value_option0,value_option1,uncertainty,trial,reward,RT,choice,forced,mask,horizon
7947,0,33,1,0,28,6,1,0,42,3535.2478,0,1,1,1
7952,0,33,1,0,48,12,0,0,62,1033.4077,0,1,1,6
117066,0,33,1,0,28,6,1,1,41,1650.1077,0,1,1,1
117071,0,33,1,0,48,12,0,1,66,783.3887,1,1,1,6
226185,0,33,1,0,28,6,1,2,34,1283.4204,0,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
654713,635,43,1,159,7,25,0,5,18,1050.3259,1,0,0,6
763832,635,43,1,159,7,25,0,6,37,533.1462,0,0,0,6
872951,635,43,1,159,7,25,0,7,39,350.1696,1,0,0,6
982070,635,43,1,159,7,25,0,8,28,466.6264,1,0,0,6


In [50]:
# Re-read the untouched wide table for comparison with the processed `data`.
# Index `files` directly instead of reusing the `file` loop variable left over from
# the loading cell; `files[-1]` is the file that loop finished on.
raw = pd.read_csv(files[-1])
raw

Unnamed: 0,expt_name,subjectID,age,ageGroup,gender,genderText,education,subjectNumber,block,game,...,rt1,rt2,rt3,rt4,rt5,rt6,rt7,rt8,rt9,rt10
0,horizon_31GECDVAAM8H85SL37SZS8HBCON666_A4ZCS7Y...,A4ZCS7YTJS0DF,63.0,61-65,1.0,female,college,1,,1,...,1953.3805,1568.3188,1184.9870,2635.0035,901.6503,,,,,
1,horizon_31GECDVAAM8H85SL37SZS8HBCON666_A4ZCS7Y...,A4ZCS7YTJS0DF,63.0,61-65,1.0,female,college,1,,2,...,883.3246,835.0115,934.9790,1884.9959,4418.3453,1035.0187,666.6861,3151.6226,1185.0118,918.3300
2,horizon_31GECDVAAM8H85SL37SZS8HBCON666_A4ZCS7Y...,A4ZCS7YTJS0DF,63.0,61-65,1.0,female,college,1,,3,...,1166.6223,1418.3513,833.3139,2918.3260,901.6094,,,,,
3,horizon_31GECDVAAM8H85SL37SZS8HBCON666_A4ZCS7Y...,A4ZCS7YTJS0DF,63.0,61-65,1.0,female,college,1,,4,...,1033.3192,849.9720,1301.5922,3118.2715,2185.0424,1918.3634,3018.1861,1568.3512,901.6707,950.0144
4,horizon_31GECDVAAM8H85SL37SZS8HBCON666_A4ZCS7Y...,A4ZCS7YTJS0DF,63.0,61-65,1.0,female,college,1,,5,...,949.9674,885.0107,818.3631,2434.9241,1983.3619,1235.0213,2484.9609,2535.0606,851.6448,584.9999
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
109114,horizon_3ZZAYRN1J9DDZ991ZGMMKIVGNF6TOF_A3R7RQC...,A3R7RQCSWU4848,43.0,41-45,1.0,female,college,642,4.0,156,...,599.8992,516.8944,683.0779,599.9543,1034.0029,933.6416,466.9264,533.6191,383.6485,
109115,horizon_3ZZAYRN1J9DDZ991ZGMMKIVGNF6TOF_A3R7RQC...,A3R7RQCSWU4848,43.0,41-45,1.0,female,college,642,4.0,157,...,666.2068,599.3662,783.8645,533.9554,,,,,,
109116,horizon_3ZZAYRN1J9DDZ991ZGMMKIVGNF6TOF_A3R7RQC...,A3R7RQCSWU4848,43.0,41-45,1.0,female,college,642,4.0,158,...,683.5966,733.3335,749.4404,1316.8962,,,,,,
109117,horizon_3ZZAYRN1J9DDZ991ZGMMKIVGNF6TOF_A3R7RQC...,A3R7RQCSWU4848,43.0,41-45,1.0,female,college,642,4.0,159,...,684.0670,599.9653,632.3177,749.5613,749.7587,383.4958,316.8175,433.7874,450.5013,


In [51]:
# data.to_csv("./exp.csv", index=False)

## Reject bad subjects

In [52]:
# Subjects flagged by each individual criterion, and the final exclusion list.
c1 = []  # too many implausibly fast responses
c2 = []  # low mean reward
c3 = []  # extreme lag-1 autocorrelation of choices
c4 = []  # strong preference for one option
subjects_to_drop = []

# RT in this dataset is stored in milliseconds (the raw table shows values such as
# 1953.38), so the 0.03 s "too fast" cut-off must be expressed in ms. Comparing
# against 0.03 directly meant this criterion could never fire.
# NOTE(review): confirm the unit before enabling datasets from other sources.
FAST_RT_MS = 30

# All criteria are evaluated on free-choice trials only. Grouping once replaces a
# full-table filter per subject; sort=False keeps subjects in order of appearance.
# A subject without any free-choice trial forms no group and is therefore never
# flagged (previously this case raised ZeroDivisionError).
free_trials = data[data['forced'] == 0]

for sub, sub_data in free_trials.groupby('subject', sort=False):
    mean_reward = sub_data['reward'].mean()
    # NaN when the choice sequence is constant or too short; both comparisons
    # below are then False, so criterion 3 does not fire for such subjects.
    choice_autocorr = sub_data['choice'].autocorr(lag=1)
    rt = sub_data['RT']

    criterion1 = (rt < FAST_RT_MS).mean() > 0.2  # > 20% of free trials too fast
    criterion2 = mean_reward < 52
    criterion3 = choice_autocorr > 0.85 or choice_autocorr < 0.15
    criterion4 = abs(0.5 - (sub_data['choice'] == 1).astype(float).mean()) > 0.15
    criteria_sum = int(criterion1) + int(criterion2) + int(criterion3) + int(criterion4)
    if criterion1:
        c1.append(sub)
    if criterion2:
        c2.append(sub)
    if criterion3:
        c3.append(sub)
    if criterion4:
        c4.append(sub)
    # A subject is excluded only when at least three of the four criteria agree.
    if criteria_sum >= 3:
        subjects_to_drop.append(sub)

In [53]:
# Number of subjects flagged by criteria 1-4, in that order.
print(*map(len, (c1, c2, c3, c4)))

0 102 114 11


In [54]:
# Subject count before and after removing the rejected subjects.
print(data['subject'].nunique())
data = data.loc[~data['subject'].isin(subjects_to_drop)]
print(data['subject'].nunique())

636
636


In [55]:
# Write the cleaned trial-level table; the row index carries no information, so omit it.
data.to_csv(path_or_buf="./exp_online_0919.csv", index=False)