# Python Session 1

## We will practice cleaning some Food choice task data

We are going to generate data from 20 individuals to practice our skills. In the task, participants rate 50 foods for healthiness, tastiness and choice. We are simulating this data below.



In [40]:
import pandas as pd
import numpy as np
import random

In [41]:
import random
import pandas as pd

# Define blocks and trial structure
blocks = ['health', 'taste', 'choice']
trials_per_block = 75
participants = range(1, 21)

# Generate 50 unique foods
base_foods = [
    'apple', 'banana', 'burger', 'carrot', 'donut', 'eggs', 'fries', 'grapes', 'ice cream', 'kale',
    'pizza', 'yogurt', 'spinach', 'steak', 'candy', 'popcorn', 'mango', 'nuts', 'cheese', 'chicken',
    'broccoli', 'chocolate', 'granola', 'lettuce', 'pasta', 'salmon', 'tofu', 'soda', 'rice', 'beans',
    'cucumber', 'peach', 'bacon', 'cereal', 'toast', 'avocado', 'beef', 'peanut butter', 'cake', 'milk',
    'watermelon', 'pear', 'turkey', 'onion rings', 'oatmeal', 'cranberries', 'syrup', 'waffles', 'cookie', 'shrimp'
]
assert len(base_foods) == 50

# Assign fat and sugar levels randomly
food_properties = {}
for food in base_foods:
    fat = random.choices(['high', 'low'], weights=[0.4, 0.6])[0]
    sugar = random.choices(['high', 'low'], weights=[0.5, 0.5])[0]
    food_properties[food] = {'fat': fat, 'sugar': sugar}

# Generate trials
all_trials = []

for participant in participants:
    for block in blocks:
        for trial_num in range(1, trials_per_block + 1):
            food = random.choice(base_foods)
            rt_missing = random.random() < 0.02  # 2% chance of missing RT
            reaction_time = None if rt_missing else round(random.uniform(0.5, 4.0), 2)
            rating = None if reaction_time is None else random.randint(1, 10)

            fat = food_properties[food]['fat']
            sugar = food_properties[food]['sugar']

            trial = {
                'participant': participant,
                'block': block,
                'trial_number': trial_num,
                'food': food,
                'reaction_time': reaction_time,
                'rating': rating,
                'fat': fat,
                'sugar': sugar
            }
            all_trials.append(trial)

# Create DataFrame
df = pd.DataFrame(all_trials)

# Validate logic: rating is only missing if RT is missing
assert all(df[df['rating'].isna()]['reaction_time'].isna())

The data are stored in a dataframe object, which we have called df
To access items in the dataframe, we need to type "df"

In [42]:
#If we want to see the data, we can just type
df

Unnamed: 0,participant,block,trial_number,food,reaction_time,rating,fat,sugar
0,1,health,1,turkey,0.61,5.0,high,low
1,1,health,2,syrup,0.70,6.0,low,high
2,1,health,3,donut,1.08,3.0,high,high
3,1,health,4,waffles,1.72,7.0,high,low
4,1,health,5,fries,2.65,5.0,low,low
...,...,...,...,...,...,...,...,...
4495,20,choice,71,granola,1.40,3.0,low,low
4496,20,choice,72,oatmeal,2.89,4.0,high,low
4497,20,choice,73,beef,3.22,6.0,high,low
4498,20,choice,74,chicken,2.55,3.0,low,high


In [43]:
# To see anything in df we will need to reference df first
df.columns

Index(['participant', 'block', 'trial_number', 'food', 'reaction_time',
       'rating', 'fat', 'sugar'],
      dtype='object')

In [44]:
# We can also look at the values of columns
# All of these will access the food column
df.food
df['food']
df.iloc[:,3]

Unnamed: 0,food
0,turkey
1,syrup
2,donut
3,waffles
4,fries
...,...
4495,granola
4496,oatmeal
4497,beef
4498,chicken


In [45]:
# Try here with RT

In [46]:
# To analyze this data, we will first need to remove any missing trials
# let's find the missing values
df.reaction_time[df.reaction_time.isna()==True]

Unnamed: 0,reaction_time
6,
128,
152,
187,
258,
...,...
4165,
4169,
4266,
4470,


In [47]:
df.reaction_time[df.reaction_time > 3]

# What would we change to see RTs < 2 only?

Unnamed: 0,reaction_time
13,3.96
16,3.56
19,3.91
26,3.71
27,3.10
...,...
4487,3.86
4491,3.68
4493,3.21
4497,3.22


In [48]:
# make a new data frame with no missing values
df1 = df[df.reaction_time.isna()==True]

In [49]:
# Now we want to perform some calculations on this data-set
# let's start by summarizing, for one person the health rating

# Filter for participant 1 and the 'health' block
participant_id = 1
health_block = df[(df['participant'] == participant_id) & (df['block'] == 'health')]

# Remove missing ratings (i.e., where RT was missing)
valid_ratings = health_block['rating'].dropna()

# Calculate the average health rating
average_health_rating = valid_ratings.mean()

print(f"Participant {participant_id}'s average health rating: {average_health_rating:.2f}")


Participant 1's average health rating: 6.20


In [50]:
#Try for health only for low and high-fat

In [51]:
#Now let's create a new dataframe and store each persons average RT and rating for high and low fat foods

# Group by participant, block, and fat level
summary_df = (
    df
    .dropna(subset=['rating', 'reaction_time'])  # Exclude trials with missing values
    .groupby(['participant', 'block', 'fat'])
    .agg(
        average_rating=('rating', 'mean'),
        average_reaction_time=('reaction_time', 'mean'),
        trial_count=('rating', 'count')  # Optional: to see how many valid trials per group
    )
    .reset_index()
)

print(summary_df.head())


   participant   block   fat  average_rating  average_reaction_time  \
0            1  choice  high        5.297297               2.702432   
1            1  choice   low        5.500000               2.106944   
2            1  health  high        6.800000               2.031429   
3            1  health   low        5.666667               2.317179   
4            1   taste  high        5.900000               1.863667   

   trial_count  
0           37  
1           36  
2           35  
3           39  
4           30  


In [52]:
# Pivot to wide format
wide_df = summary_df.pivot_table(
    index='participant',
    columns=['block', 'fat'],
    values=['average_rating', 'average_reaction_time']
)




In [53]:
# Step 3: Flatten column names
wide_df.columns = [f'{stat}_{block}_{fat}' for stat, block, fat in wide_df.columns]
wide_df = wide_df.reset_index()

print(wide_df.head())

   participant  average_rating_choice_high  average_rating_choice_low  \
0            1                    5.297297                   5.500000   
1            2                    5.000000                   5.980392   
2            3                    4.866667                   6.121951   
3            4                    4.677419                   4.977273   
4            5                    5.793103                   5.413043   

   average_rating_health_high  average_rating_health_low  \
0                    6.800000                   5.666667   
1                    5.882353                   5.236842   
2                    5.272727                   5.264151   
3                    5.392857                   6.106383   
4                    5.545455                   5.414634   

   average_rating_taste_high  average_rating_taste_low  \
0                   5.900000                  4.840909   
1                   4.608696                  5.961538   
2                   5.8620

In [54]:
# Here try and simulate a different dataset - a monetary choice task where the participant
# selects between an immediate vs delayed reward. Compare the RT between when the participant
# chooses the immediate vs delayed option

In [55]:
# navigate to the directory
data=pd.read_csv("https://raw.githubusercontent.com/CaitlinLloyd/Psychology_Programming2025/refs/heads/main/Data/DelayDisc_example.csv")

In [56]:
# figure out whether left or right column is delayed (1 is left, 2 is right)
data['delayed_opt']= "none"
data.loc[data['delay_left'] < data['delay_right'],'delayed_opt'] ==2
data.loc[data['delay_left'] > data['delay_right'],'delayed_opt'] ==1

Unnamed: 0,delayed_opt
0,False
1,False
2,False
3,False
4,False
...,...
109,False
111,False
112,False
114,False


In [57]:
# Now summarize the RT for each person when they chose delayed vs chose sooner reward



In [61]:
## Here calculate the average earnings per person and the number of times they chose delayed vs sooner

## Upload solution to Github

# Calculate average amount selected and count of delayed/less-delayed choices

for participant in range(1, 3):
    p_data = data[data['participant'] == participant]

    p_data['delayed_chosen'] = (p_data['choice'] == p_data['delayed_opt']).astype(int)

    p_data['chosen_amount'] = p_data.apply(
        lambda row: row['money_right'] if row['choice'] == 1 else row['money_left'],
        axis=1
    )

    avg_amount = p_data['chosen_amount'].mean()

    delayed_count = p_data['delayed_chosen'].sum()
    less_delayed_count = len(p_data) - delayed_count

    print(f"Participant {participant}:")
    print(f"  Average amount selected: {avg_amount:.2f}")
    print(f"  More-delayed choices: {delayed_count}")
    print(f"  Less-delayed choices: {less_delayed_count}\n")


Participant 1:
  Average amount selected: 18.62
  More-delayed choices: 0
  Less-delayed choices: 60

Participant 2:
  Average amount selected: 21.76
  More-delayed choices: 0
  Less-delayed choices: 60



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  p_data['delayed_chosen'] = (p_data['choice'] == p_data['delayed_opt']).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  p_data['chosen_amount'] = p_data.apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  p_data['delayed_chosen'] = (p_data['choice'] == p_data['delayed_opt']).astype(int)

## Extra
## These are hard exercises - not homework, for extra practice

In [None]:
# Hard

# Here simulate your own Delay Discounting Task and calculate some average metrics

In [None]:
# Very hard
# One outcome of interest is the discount rate, k, which denotes extent to which someone discounts
# value of delayed rewards (higher values = less patient)

# Here you can use chatGPT to get the formula for k - see whether you can calculate for each person
# in your dataset