In [1]:
import pandas as pd
import numpy as np

# data processing

In [2]:
# get csv
# df_long = pd.read_csv("data/exp_TMS.csv")


# files = [
#     "data/LIFESPAN_SZcontrols.csv", 
#     "data/LIFESPAN_agingAZ_all.csv", 
#     "data/LIFESPAN_sommerville.csv", 
#     "data/LIFESPAN_students.csv",
#     "data/TMS_horizonTask.csv",
# ]

# files = [
#     "data/exp_TMS.csv", 
#     "data/exp_all.csv", 
# ]
# df = pd.concat((pd.read_csv(f) for f in files), ignore_index=True)

# Pooled dataset. Subject IDs are forced to strings so they share a dtype with
# the suffixed TMS IDs below.
# NOTE(review): presumably this file already carries a per-row file_name column
# (none is assigned here) — confirm.
df_all = pd.read_csv("data/exp_all_filenames.csv").assign(
    subject=lambda frame: frame["subject"].astype(str)
)

# TMS dataset: tag every row with its source file and append "_TMS" to the
# subject ID (presumably to keep TMS IDs distinct from the pooled IDs — confirm).
# NOTE(review): the displayed table shows IDs such as "1032.0_TMS", which
# suggests `subject` is read as float here; verify that is intended.
df_TMS = pd.read_csv("data/exp_TMS.csv").assign(
    file_name="exp_TMS.csv",
    subject=lambda frame: frame["subject"].astype(str) + "_TMS",
)

# Stack both sources into a single frame with a fresh 0..n-1 row index.
df = pd.concat([df_all, df_TMS], ignore_index=True)

In [3]:
# Trial numbers are later formatted into column names (r0, c0, rt0, ...), so
# cast them to int first; then rename the value_option columns to match the
# original format (m1, m2).
df = df.astype({"trial": int}).rename(
    columns={"value_option0": "m1", "value_option1": "m2"}
)

# Redefine pivoting function
def pivot_feature(df, feature, prefix, index_cols=None):
    """Spread one per-trial column into one column per trial.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding a ``trial`` column, the ``feature`` column and every
        column named in ``index_cols``.
    feature : str
        Name of the column whose values fill the wide table.
    prefix : str
        Text prepended to each trial value to build the output column names
        (e.g. prefix ``"r"`` and trial ``0`` give ``"r0"``).
    index_cols : sequence of str, optional
        Columns that identify one output row. Defaults to
        ``["subject", "block", "m1", "m2", "uncertainty", "horizon"]``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``index_cols`` with one ``<prefix><trial>`` column per
        distinct trial value.

    Raises
    ------
    KeyError
        If any required column is absent from ``df``.

    Notes
    -----
    ``pivot_table`` aggregates with the mean by default, so rows sharing the
    same key columns and trial are averaged rather than rejected.
    """
    if index_cols is None:
        index_cols = ["subject", "block", "m1", "m2", "uncertainty", "horizon"]
    index_cols = list(index_cols)

    # Fail early with a message naming every missing column at once.
    required = index_cols + ["trial", feature]
    missing = [col for col in required if col not in df.columns]
    if missing:
        raise KeyError(f"pivot_feature: missing column(s) {missing}")

    wide = df.pivot_table(
        index=index_cols,
        columns="trial",
        values=feature,
    )
    return wide.rename(columns=lambda trial: f"{prefix}{trial}")

# Columns that together identify one row of the wide table.
_GAME_KEYS = ["subject", "block", "m1", "m2", "uncertainty", "horizon"]

# One wide table per per-trial measure: reward -> r*, choice -> c*, RT -> rt*.
reward_wide, choice_wide, rt_wide = (
    pivot_feature(df, feature, prefix)
    for feature, prefix in (("reward", "r"), ("choice", "c"), ("RT", "rt"))
)

# Keep the first row of every key combination and index it by the same keys
# so it lines up with the pivoted tables.
# NOTE(review): this first row still carries the per-trial columns (trial,
# reward, RT, choice, forced, mask), so df_wide exposes those columns with
# values taken from that single row only.
df_static = df.drop_duplicates(subset=_GAME_KEYS).set_index(_GAME_KEYS)

# Join everything side by side on the shared key index, then flatten the index
# back into ordinary columns.
df_wide = pd.concat(
    [df_static, reward_wide, choice_wide, rt_wide],
    axis=1,
).reset_index()

df_wide.columns

Index(['subject', 'block', 'm1', 'm2', 'uncertainty', 'horizon', 'age',
       'gender', 'file_name', 'trial', 'reward', 'RT', 'choice', 'forced',
       'mask', 'r0', 'r1', 'r2', 'r3', 'r4', 'r5', 'r6', 'r7', 'r8', 'r9',
       'c0', 'c1', 'c2', 'c3', 'c4', 'c5', 'c6', 'c7', 'c8', 'c9', 'rt0',
       'rt1', 'rt2', 'rt3', 'rt4', 'rt5', 'rt6', 'rt7', 'rt8', 'rt9'],
      dtype='object')

In [4]:
# Display the wide table: one row per unique
# (subject, block, m1, m2, uncertainty, horizon) combination.
df_wide

Unnamed: 0,subject,block,m1,m2,uncertainty,horizon,age,gender,file_name,trial,...,rt0,rt1,rt2,rt3,rt4,rt5,rt6,rt7,rt8,rt9
0,0,0,40,36,1,6,18,-99,LIFESPAN_students.csv,0,...,184.143661,4.635614,1.122532,2.833540,2.738855,5.263696,5.763985,1.940346,0.937745,2.472841
1,0,1,60,48,0,6,18,-99,LIFESPAN_students.csv,0,...,1.184903,1.013671,1.305989,8.874533,2.708196,2.426142,1.115581,1.370487,1.103393,1.046302
2,0,2,40,36,0,6,18,-99,LIFESPAN_students.csv,0,...,0.832413,0.468084,0.566725,0.445963,1.810017,1.325654,0.892086,0.780645,1.881181,0.576540
3,0,3,40,48,0,1,18,-99,LIFESPAN_students.csv,0,...,0.830383,0.553779,0.817720,1.686227,0.910277,,,,,
4,0,4,40,10,0,1,18,-99,LIFESPAN_students.csv,0,...,0.663673,1.049479,0.692706,3.581879,4.616839,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
170980,1032.0_TMS,155,39,35,-1,6,21,1,exp_TMS.csv,0,...,-99.000000,-99.000000,-99.000000,-99.000000,0.250419,0.250419,0.417128,0.283480,0.450473,0.316901
170981,1032.0_TMS,156,59,51,0,1,21,1,exp_TMS.csv,0,...,-99.000000,-99.000000,-99.000000,-99.000000,0.383629,,,,,
170982,1032.0_TMS,157,63,63,0,1,21,1,exp_TMS.csv,0,...,-99.000000,-99.000000,-99.000000,-99.000000,0.217306,,,,,
170983,1032.0_TMS,158,41,31,-1,1,21,1,exp_TMS.csv,0,...,-99.000000,-99.000000,-99.000000,-99.000000,0.150339,,,,,


In [5]:
# Switch to the gameLength / uc column names.
# Note: from this cell on `df` refers to the wide table, not the per-trial
# frame that was pivoted earlier.
df = df_wide.rename({"horizon": "gameLength", "uncertainty": "uc"}, axis="columns")

df.columns

Index(['subject', 'block', 'm1', 'm2', 'uc', 'gameLength', 'age', 'gender',
       'file_name', 'trial', 'reward', 'RT', 'choice', 'forced', 'mask', 'r0',
       'r1', 'r2', 'r3', 'r4', 'r5', 'r6', 'r7', 'r8', 'r9', 'c0', 'c1', 'c2',
       'c3', 'c4', 'c5', 'c6', 'c7', 'c8', 'c9', 'rt0', 'rt1', 'rt2', 'rt3',
       'rt4', 'rt5', 'rt6', 'rt7', 'rt8', 'rt9'],
      dtype='object')

In [6]:
# df.to_csv("data/my_horizon_data_all_filenames.csv", index=False)

In [7]:
# Number of distinct (non-missing) subject IDs in the wide table.
num_subjects = len(df["subject"].dropna().unique())
print("Number of unique subjects:", num_subjects)

Number of unique subjects: 1384


In [12]:
# ---- Step 1: one row per (file, subject), keeping the first gender value ----
participants = (
    df.groupby(["file_name", "subject"])[["gender"]]
    .first()
    .reset_index()
)

# ---- Step 2: per-file counts ----
# Gender code 1 is counted as female and 0 as male; any other code adds to
# total_participants only.
summary = (
    participants.groupby("file_name")
    .agg(
        total_participants=("subject", "nunique"),
        num_female=("gender", lambda codes: codes.eq(1).sum()),
        num_male=("gender", lambda codes: codes.eq(0).sum()),
    )
    .reset_index()
)

# ---- Step 3: TOTAL row = column-wise sum of the per-file counts ----
total_row = pd.DataFrame(
    {
        "file_name": ["TOTAL"],
        **{
            col: [summary[col].sum()]
            for col in ("total_participants", "num_female", "num_male")
        },
    }
)

# ---- Step 4: put the TOTAL row underneath the per-file rows ----
summary_with_total = pd.concat([summary, total_row], ignore_index=True)

print(summary_with_total)



                      file_name  total_participants  num_female  num_male
0  BATTERY_HorizonTaskFinal.csv                 234          76       130
1            LIFESPAN_Harms.csv                 122          40        72
2       LIFESPAN_SZcontrols.csv                  38          11        25
3            LIFESPAN_Smith.csv                 418         303       115
4      LIFESPAN_agingAZ_all.csv                 113          53        58
5      LIFESPAN_sommerville.csv                  88          46        42
6         LIFESPAN_students.csv                 339         100       226
7                   exp_TMS.csv                  32          18        13
8                         TOTAL                1384         647       681


In [13]:
# A row is valid when its age is not the -99 placeholder and its gender code is
# 0 or 1. A subject is kept when at least one of its rows is valid, and then
# all of that subject's rows are kept.
age_known = df["age"].ne(-99)
gender_known = df["gender"].isin([0, 1])
valid_subjects = df.loc[age_known & gender_known, "subject"].unique()

# Copy so that later edits to df_clean never touch df.
df_clean = df.loc[df["subject"].isin(valid_subjects)].copy()

In [14]:
# ---- Step 1: one row per (file, subject) with its first age and gender ----
participants_clean = (
    df_clean.groupby(["file_name", "subject"])[["age", "gender"]]
    .first()
    .reset_index()
)

# ---- Step 2: age range, mean and SD within each file ----
age_summary = (
    participants_clean.groupby("file_name")["age"]
    .agg(age_min="min", age_max="max", age_mean="mean", age_sd="std")
    .reset_index()
)

# ---- Step 3: TOTAL stats over all participants pooled together ----
# Mean and SD are computed on the pooled ages, not by averaging the per-file
# values, so files with more participants weigh more.
pooled_ages = participants_clean["age"]
total_age_min, total_age_max = pooled_ages.min(), pooled_ages.max()
total_age_mean = pooled_ages.mean()
total_age_sd = pooled_ages.std()

# Note: this rebinds `total_row`, which the participant-count cell also defines.
total_row = pd.DataFrame(
    {
        "file_name": ["TOTAL"],
        "age_min": [total_age_min],
        "age_max": [total_age_max],
        "age_mean": [total_age_mean],
        "age_sd": [total_age_sd],
    }
)

# ---- Step 4: put the TOTAL row underneath the per-file rows ----
age_summary_with_total = pd.concat([age_summary, total_row], ignore_index=True)

print(age_summary_with_total)


                      file_name  age_min  age_max   age_mean     age_sd
0  BATTERY_HorizonTaskFinal.csv       18       74  44.567961  24.002291
1            LIFESPAN_Harms.csv       10       38  15.866071   5.343170
2       LIFESPAN_SZcontrols.csv       20       52  36.083333  10.434490
3            LIFESPAN_Smith.csv       18       40  23.715311   5.600000
4      LIFESPAN_agingAZ_all.csv       18       74  53.018018  23.006316
5      LIFESPAN_sommerville.csv       12       17  14.897727   1.742239
6         LIFESPAN_students.csv       17       50  19.560127   2.336635
7                   exp_TMS.csv       19       32  22.161290   3.643229
8                         TOTAL       10       74  27.491654  17.200268
