# **GBE Data Preprocessing**

In [1]:
import scipy
import pandas as pd
import numpy as np
import math

In [2]:
# load matlab dataset

mat = scipy.io.loadmat('../Data/Rutledge_GBE_risk_data_TOD.mat')

array = mat['subjData']

inner = array[0]

table = {
    'participant': [],
    'age': [],
    'gender': [],
    'attempt number': [],
    'time of day': [],
    'design version': [],
    'trial number': [],
    'screen side': [],
    'safe value': [],
    'winning value': [],
    'losing value': [],
    'choice made': [],
    'trial outcome': [],
    'choice latency': [],
    'happiness rating': [],
    'cursor starting point': [],
    'answer time taken': [],
    'spinner time taken': [],
    'spinner angle': [],
}

participant_number = 0

# load all participants
while participant_number < 47067:  

    times_completed = 0

    person = inner[participant_number]

    data = person['data']

    while len(data[0]) > times_completed:

        current_trial = 0

        while current_trial < 30:

            table['participant'].append(person['id'][0][0])
            table['age'].append(person['age'][0][0])
            table['gender'].append(person['isFemale'][0][0])
            table['attempt number'].append(times_completed + 1)
            table['time of day'].append(person['timeOfDay'][0][times_completed])
            table['design version'].append(person['designVersion'][0][times_completed])
            table['trial number'].append(data[0][times_completed][current_trial][0])
            table['screen side'].append(data[0][times_completed][current_trial][1])
            table['safe value'].append(data[0][times_completed][current_trial][2])
            table['winning value'].append(data[0][times_completed][current_trial][3])
            table['losing value'].append(data[0][times_completed][current_trial][4])
            table['choice made'].append(data[0][times_completed][current_trial][6])
            table['trial outcome'].append(data[0][times_completed][current_trial][7])
            table['choice latency'].append(data[0][times_completed][current_trial][8])
            table['happiness rating'].append(data[0][times_completed][current_trial][9])
            table['cursor starting point'].append(data[0][times_completed][current_trial][10])
            table['answer time taken'].append(data[0][times_completed][current_trial][11])
            table['spinner time taken'].append(data[0][times_completed][current_trial][13])
            table['spinner angle'].append(data[0][times_completed][current_trial][14])

            current_trial += 1

        times_completed += 1

    participant_number += 1

#data frame for trial results (provided values only, no calculations)
df = pd.DataFrame(table)
display(df)


Unnamed: 0,participant,age,gender,attempt number,time of day,design version,trial number,screen side,safe value,winning value,losing value,choice made,trial outcome,choice latency,happiness rating,cursor starting point,answer time taken,spinner time taken,spinner angle
0,1,3,0,1,0.812998,1,1.0,1.0,-55.0,0.0,-220.0,1.0,-220.0,18.239,70.0,50.0,6.957,9.305,0.044
1,1,3,0,1,0.812998,1,2.0,1.0,45.0,122.0,0.0,0.0,45.0,5.023,,,,,
2,1,3,0,1,0.812998,1,3.0,1.0,55.0,132.0,0.0,1.0,132.0,4.317,82.0,50.0,4.646,9.675,3.977
3,1,3,0,1,0.812998,1,4.0,1.0,-35.0,0.0,-70.0,0.0,-35.0,6.357,,,,,
4,1,3,0,1,0.812998,1,5.0,1.0,-55.0,0.0,-120.0,0.0,-55.0,9.397,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2731735,47067,1,0,1,0.706412,5,26.0,1.0,0.0,40.0,-37.0,1.0,-37.0,7.180,,,,4.001,1.049
2731736,47067,1,0,1,0.706412,5,27.0,1.0,0.0,75.0,-90.0,0.0,0.0,2.000,,,,,
2731737,47067,1,0,1,0.706412,5,28.0,1.0,35.0,131.0,0.0,1.0,131.0,1.633,56.0,56.0,1.069,4.149,5.650
2731738,47067,1,0,1,0.706412,5,29.0,1.0,-55.0,0.0,-105.0,0.0,-55.0,8.980,,,,,


In [3]:
# use only version 1 of the app
df2 = df[df['design version'] < 2]

# use only a participant's first attempt
df2 = df2[df2['attempt number'] < 2]

# add value signaling terms (CR, EV, PE)
certain_val = []
expected_val = []
prediction_error = []
for index in df2.index:
    if df2['choice made'][index] == 0:
        certain_val.append(df2['safe value'][index])
        expected_val.append(0)
    else:
        certain_val.append(0)
        expected_val.append((df2['winning value'][index] + df2['losing value'][index]) / 2)

df2.insert(19, 'certain reward', certain_val)
df2.insert(20, 'expected value', expected_val)

for index in df2.index:
    if df2['choice made'][index] == 1:
        prediction_error.append(df2['trial outcome'][index] - df2['expected value'][index])
    else:
        prediction_error.append(0)

df2.insert(21, 'prediction error', prediction_error)

print('Number of participants: ', len(df2)/30)

df2 = df2[['participant', 'certain reward', 'expected value', 'prediction error', 'happiness rating']]

df2.to_csv('../Data/GBE_Data.csv', index=False)

Number of participants:  16337.0


In [4]:
# remove participants with limited data variability

save = pd.read_csv('../Data/GBE_Data.csv')
means =[]
keep_list = []
exclusions1 = 0
exclusions2 = 0

for i in range(0, 50000):
 p = save[save['participant']==i].copy()
 if len(p) > 0:
     p.reset_index(inplace=True)
     H = p['happiness rating'] 
     H = H[~np.isnan(H)]
     save_val = -1
     count = 1
     good = True
     means.append(np.mean(H))

     # remove participants with more than 6 consecutive equivalent happiness ratings
     for num in H:
          if count > 6:       
               good = False
               break
          elif num == save_val:
               count += 1
          else:
               count = 1
               save_val = num
               
     if good == True:
          keep_list.append(p)
     else: 
          exclusions1 += 1

print(len(keep_list))

keep_list2 = []

mean = np.mean(means)
sd = np.std(means)

# remove those with participants with mean happiness scores 2.5 standard deviations from the average
for i, p in enumerate(keep_list):
    p_mean = np.nanmean(p['happiness rating'])
    if p_mean > mean + 2.5 * sd or p_mean < mean - 2.5 * sd:
          exclusions2 += 1
    else:
          keep_list2.append(p)

print('Valid participants: ', len(keep_list2), 'Low happiness variability exclusions: ', str(exclusions1), 'Extreme mean happiness exclusions: ', str(exclusions2))

out = pd.concat(keep_list2, ignore_index=True)
out.reset_index(inplace=True)

if 'level_0' in out.columns:
    out = out.drop(columns='level_0')
if 'index' in out.columns:
    out = out.drop(columns='index')

out.to_csv('../Data/Lmfit_GBE_Data_Cleaned.csv', index=False)

14758
Valid participants:  14630 Low happiness variability exclusions:  1579 Extreme mean happiness exclusions:  128


In [5]:
# add weighted summation values for CR, EV, PE and linear/cubic attraction terms for SINDy

df = pd.read_csv('../Data/Lmfit_GBE_Data_Cleaned.csv')

n = 30
list_df = [df[i:i+n].copy() for i in range(0, len(df), n)]
new_list = []

for subframe in list_df:
    start = subframe.index[0]
    baseline = np.mean(subframe['happiness rating'])
    CR_gamma4 = []
    CR_gamma6 = []
    CR_gamma8 = []
    EV_gamma4 = []
    EV_gamma6 = []
    EV_gamma8 = []
    PE_gamma4 = []
    PE_gamma6 = []
    PE_gamma8 = []
    linear = []
    cubic = []
    for i in subframe.index:
        sumCR4 = 0
        sumCR6 = 0
        sumCR8 = 0
        sumEV4 = 0
        sumEV6 = 0
        sumEV8 = 0
        sumPE4 = 0
        sumPE6 = 0
        sumPE8 = 0
        for j in range(start, i+1):
            sumCR4 += subframe['certain reward'][j] * 0.4**(i-j)
            sumCR6 += subframe['certain reward'][j] * 0.6**(i-j)
            sumCR8 += subframe['certain reward'][j] * 0.8**(i-j)
            sumEV4 += subframe['expected value'][j] * 0.4**(i-j)
            sumEV6 += subframe['expected value'][j] * 0.6**(i-j)
            sumEV8 += subframe['expected value'][j] * 0.8**(i-j)
            sumPE4 += subframe['prediction error'][j] * 0.4**(i-j)
            sumPE6 += subframe['prediction error'][j] * 0.6**(i-j)
            sumPE8 += subframe['prediction error'][j] * 0.8**(i-j) 
        CR_gamma4.append(sumCR4)
        CR_gamma6.append(sumCR6)
        CR_gamma8.append(sumCR8)
        EV_gamma4.append(sumEV4)
        EV_gamma6.append(sumEV6)
        EV_gamma8.append(sumEV8)
        PE_gamma4.append(sumPE4)
        PE_gamma6.append(sumPE6)
        PE_gamma8.append(sumPE8)
        if math.isnan(subframe['happiness rating'][i]) == False:
            linear.append(baseline - subframe['happiness rating'][i])
            cubic.append(pow((baseline - subframe['happiness rating'][i]), 3)*0.01)
        else:
            linear.append(np.nan)
            cubic.append(np.nan)
    subframe['CR4'] = CR_gamma4
    subframe['CR6'] = CR_gamma6
    subframe['CR8'] = CR_gamma8
    subframe['EV4'] = EV_gamma4
    subframe['EV6'] = EV_gamma6
    subframe['EV8'] = EV_gamma8
    subframe['PE4'] = PE_gamma4
    subframe['PE6'] = PE_gamma6
    subframe['PE8'] = PE_gamma8
    subframe['LA'] = linear
    subframe['CA'] = cubic

    # approximate derivatives for CR, EV, PE
    dCR_gamma4 = []
    dCR_gamma6 = []
    dCR_gamma8 = []
    dEV_gamma4 = []
    dEV_gamma6 = []
    dEV_gamma8 = []
    dPE_gamma4 = []
    dPE_gamma6 = []
    dPE_gamma8 = []
    dCR_gamma4.append(0)
    dCR_gamma6.append(0)
    dCR_gamma8.append(0)
    dEV_gamma4.append(0)
    dEV_gamma6.append(0)
    dEV_gamma8.append(0)
    dPE_gamma4.append(0)
    dPE_gamma6.append(0)
    dPE_gamma8.append(0)
    for i in subframe.index:
        if (i-1) in subframe.index:
            dCR_gamma4.append(subframe['CR4'][i]-subframe['CR4'][i-1])
            dCR_gamma6.append(subframe['CR6'][i]-subframe['CR6'][i-1])
            dCR_gamma8.append(subframe['CR8'][i]-subframe['CR8'][i-1])
            dEV_gamma4.append(subframe['EV4'][i]-subframe['EV4'][i-1])
            dEV_gamma6.append(subframe['EV6'][i]-subframe['EV6'][i-1])
            dEV_gamma8.append(subframe['EV8'][i]-subframe['EV8'][i-1])
            dPE_gamma4.append(subframe['PE4'][i]-subframe['PE4'][i-1])
            dPE_gamma6.append(subframe['PE6'][i]-subframe['PE6'][i-1])
            dPE_gamma8.append(subframe['PE8'][i]-subframe['PE8'][i-1])
    subframe['dCR4'] = dCR_gamma4
    subframe['dCR6'] = dCR_gamma6
    subframe['dCR8'] = dCR_gamma8
    subframe['dEV4'] = dEV_gamma4
    subframe['dEV6'] = dEV_gamma6
    subframe['dEV8'] = dEV_gamma8
    subframe['dPE4'] = dPE_gamma4
    subframe['dPE6'] = dPE_gamma6
    subframe['dPE8'] = dPE_gamma8
    subframe.dropna(subset=['happiness rating'], inplace=True)
    subframe.reset_index(drop=True)

    # shift dataframe entries to align with SINDy format
    subframe[['dCR4','dCR6','dCR8','dEV4','dEV6','dEV8', 'dPE4', 'dPE6', 'dPE8']] = subframe[['dCR4','dCR6','dCR8','dEV4','dEV6','dEV8', 'dPE4', 'dPE6', 'dPE8']].shift(-1).fillna(0)
    new_list.append(subframe)
out = pd.concat(new_list, ignore_index=True)
if 'level_0' in out.columns:
    out = out.drop(columns='level_0')
if 'index' in out.columns:
    out = out.drop(columns='index')
out.to_csv('../Data/GBE_Data_SINDy.csv', index=False)