In [1]:
"""

EATING DISORDERS MONTECATINI

Purpose: Create input data for HDDMrl.

Esclusion criterion: 

- Participants are excluded if they not meet the threshold of having a mean  
feedback greater than 0.5 in at least one of the two stim conditions.

Groups:

- The RI group includes the RI students, the AN-R and the BN-R participants.
- The HC group is randomly selected from the HC pool of students with the 
constraint that both the food and neutral conditions must be completed.

Written by Corrado Caudek (2023-07-02)

"""

import datetime

now = datetime.datetime.now()
print("Current date and time : ")
print(now.strftime("%Y-%m-%d %H:%M:%S"))


Current date and time : 
2023-07-02 11:31:27


In [2]:
%matplotlib inline 

import os, time, csv, sys
import glob

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import hddm

# Data management
pd.options.display.max_colwidth = 100

%config InlineBackend.figure_format='retina'

print("The hddm version is", hddm.__version__)

  from .autonotebook import tqdm as notebook_tqdm


The hddm version is 0.9.8


In [3]:
# Set display options to show all rows and columns
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

# Reset display options to their default values
# pd.reset_option('display.max_rows')
# pd.reset_option('display.max_columns')

In [4]:
!pwd

/Users/corrado/_repositories/eating_disorders_23/src/python/PRL/01_get_hddmrl_params


In [5]:
# load raw data
data = hddm.load_csv(
    "/Users/corrado/_repositories/eating_disorders_23/data/processed/prl/input_for_hddmrl/hddm_input_v3.csv"
)
print("The original data pool includes %d participants." % data["subj_code"].nunique())


The original data pool includes 302 participants.


In [6]:
data.columns

Index(['subj_idx', 'response', 'stim', 'rt', 'trial', 'split_by', 'feedback',
       'diag_cat', 'subj_code', 'q_init'],
      dtype='object')

In [7]:
data["diag_cat"] = pd.Categorical(data["diag_cat"])
data["diag_cat"].value_counts()

HC      62080
AN      11040
RI       7520
BN       3520
AN_R     3040
BN_R     1600
Name: diag_cat, dtype: int64

Recode group membership.

In [8]:
dicat_mapping = {
    "AN": "AN",
    "AN_R": "RI",
    "BN": "BN",
    "BN_R": "RI",
    "HC": "HC",
    "RI": "RI",
}
data = data.assign(diag_cat=data["diag_cat"].map(dicat_mapping))

Remove "BN" group.

In [9]:
data = data.drop(data[data["diag_cat"] == "BN"].index)
data.groupby("diag_cat")["subj_code"].nunique()

diag_cat
AN     37
HC    212
RI     41
Name: subj_code, dtype: int64

Compute mean feedback as a function of subj_code, diag_cat, and stim.

In [10]:
data.groupby(["diag_cat", "subj_code", "stim"])["feedback"].agg(["mean", "count"])

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,mean,count
diag_cat,subj_code,stim,Unnamed: 3_level_1,Unnamed: 4_level_1
AN,am_gu_1999_02_11_937_f,food,0.50625,160
AN,am_gu_1999_02_11_937_f,neutral,0.56875,160
AN,an_de_1998_11_10_289_f,food,0.48125,160
AN,an_de_1998_11_10_289_f,neutral,0.6125,160
AN,ar_ce_2005_04_20_937_f,food,0.625,160
AN,ar_ce_2005_04_20_937_f,neutral,0.625,160
AN,as_ga_2005_06_15_329_f,food,0.5,160
AN,as_ga_2005_06_15_329_f,neutral,0.575,160
AN,au_ru_1998_09_21_806_f,food,0.5,160
AN,au_ru_1998_09_21_806_f,neutral,0.50625,160


Create a list of subj_code in the AN and RI groups in which mean feedback is lower
than 0.5 in both food and neutral conditions. Remove these subjects from the `data` DataFrame .

In [11]:
# List of subj_code levels to remove
subj_code_levels_to_remove = [
    "sa_ta_2003_11_14_150_f",
    "ar_co_1996_12_27_348_f",
    "cl_pe_2001_09_26_424_f",
    "ga_fi_2000_12_23_825_f",
    "il_fu_2002_12_30_306_f",
    "ma_za_2002_02_28_051_f",
]

# Remove rows where subj_code is in the specified levels
data = data.drop(data[data["subj_code"].isin(subj_code_levels_to_remove)].index)
# Count number of subject in each group
data.groupby("diag_cat")["subj_code"].nunique()

diag_cat
AN     36
HC    212
RI     36
Name: subj_code, dtype: int64

Random selection of 40 subjects from the HC group. 
Two constraints are used: 
1. mean feedback must be greater than 0.5 in both food and stim conditions; 
2. there are 160 trials in both food and stim conditions.

In [12]:
# Filter the rows where diag_cat is "HC"
hc_rows = data[data["diag_cat"] == "HC"]

# Group the filtered rows by subj_code and stim, count trials, and filter for 160 trials 
# in both food and neutral conditions
subj_code_counts = hc_rows.groupby(["subj_code", "stim"]).size().unstack(fill_value=0)
valid_subj_codes = subj_code_counts[
    (subj_code_counts["food"] == 160) & (subj_code_counts["neutral"] == 160)
].index

# Filter valid_subj_codes based on mean feedback greater than 0.5 in both food and 
# stim conditions
valid_subj_codes = valid_subj_codes[
    (
        hc_rows[
            hc_rows["subj_code"].isin(valid_subj_codes) & (hc_rows["stim"] == "food")
        ]["feedback"].mean()
        > 0.5
    )
    & (
        hc_rows[
            hc_rows["subj_code"].isin(valid_subj_codes) & (hc_rows["stim"] == "neutral")
        ]["feedback"].mean()
        > 0.5
    )
]

In [13]:
# Convert valid_subj_codes into a numpy unidimensional array
valid_subj_codes = valid_subj_codes.flatten().astype(str)

# Filter valid_subj_codes to select values ending with 'f'
filtered_hc_subj_codes = valid_subj_codes[np.char.endswith(valid_subj_codes, "f")]

# Set the random seed
np.random.seed(42)

# Randomly select 40 elements from the array
random_selection_hc_subj_codes = np.random.choice(filtered_hc_subj_codes, size=40, replace=False)

# Filter the rows of data where subj_code is in random_selection_hc_subj_codes or 
# diag_cat is "AN" or "RI"
df = data[
    data["subj_code"].isin(random_selection_hc_subj_codes) | (data["diag_cat"].isin(["AN", "RI"]))
]

df["diag_cat"].value_counts()

HC    12800
AN    10720
RI    10720
Name: diag_cat, dtype: int64

In [14]:
df.groupby("diag_cat")["subj_code"].nunique()

diag_cat
AN    36
HC    40
RI    36
Name: subj_code, dtype: int64

Use the `pd.factorize()` function to create a new column `subj_idx` that maps the values in the `subj_code` column to integers starting from 1.

In [15]:
df.loc[:, "subj_idx"] = pd.factorize(df["subj_code"])[0] + 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


In [16]:
df["subj_idx"].unique()

array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
        14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,
        27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,
        40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,
        53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,
        66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,
        79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,
        92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103, 104,
       105, 106, 107, 108, 109, 110, 111, 112])

For [structuring]([Title](https://nbviewer.org/github/hddm-devs/hddm/blob/master/hddm/examples/demo_RLHDDMtutorial.ipynb)) the DataFrame:

1. Sort trials in ascending order within subject and condition, to ensure proper updating of expected rewards.
2. Include a column called 'split_by' which identifies the different task conditions (as integers), to ensure reward updating will work properly for each condition without mixing values learned from one trial type to another.
3. Code the response column with [stimulus-coding] (http://ski.clps.brown.edu/hddm_docs/howto.html#code-subject-responses). 
4. feedback-column. This should be the reward received for the chosen option on each trial.
5. q_init. The function will set all initial q-values according to the first value in q_init. 

In [17]:
# Sort the DataFrame by 'subj_code', 'stim', and 'trial' in ascending order
df_sorted = df.sort_values(by=["subj_code", "stim", "trial"], ascending=True)
df_sorted[150:170]

Unnamed: 0,subj_idx,response,stim,rt,trial,split_by,feedback,diag_cat,subj_code,q_init
69740,94,0,food,1.148,151,0,0,HC,al_lo_2001_02_10_286_f,0.5
69742,94,1,food,0.509,152,0,1,HC,al_lo_2001_02_10_286_f,0.5
69744,94,1,food,0.369,153,0,1,HC,al_lo_2001_02_10_286_f,0.5
69746,94,0,food,0.791,154,0,1,HC,al_lo_2001_02_10_286_f,0.5
69748,94,1,food,0.578,155,0,1,HC,al_lo_2001_02_10_286_f,0.5
69750,94,1,food,1.358,156,0,0,HC,al_lo_2001_02_10_286_f,0.5
69752,94,1,food,0.341,157,0,1,HC,al_lo_2001_02_10_286_f,0.5
69754,94,0,food,0.514,158,0,0,HC,al_lo_2001_02_10_286_f,0.5
69756,94,0,food,1.1,159,0,0,HC,al_lo_2001_02_10_286_f,0.5
69758,94,1,food,0.414,160,0,1,HC,al_lo_2001_02_10_286_f,0.5


## Save DataFrame to a CSV file

In [18]:
path = "/Users/corrado/_repositories/eating_disorders_23/data/processed/prl/input_for_hddmrl/three_groups/ed_prl_data.csv"
df.to_csv(path, index=False)