In [1]:
import os
import pickle
import pandas as pd
import matplotlib.pyplot as plt
from sqlalchemy import create_engine
import json
import cv2
from tqdm import tqdm
from matplotlib.animation import FuncAnimation
from IPython.display import display, HTML
import numpy as np
from matplotlib.gridspec import GridSpec
from matplotlib.patches import Rectangle

# Load the postprocess_redgreen_human_data.py file
from postprocess_redgreen_human_data import *

##############################################################
# DEAR EXPERIMENTER, DEFINE ALL VARIABLES BELOW IN THIS CELL #
##############################################################

# ----------------------------- experiment parameters -----------------------------
# Expected number of participants.
expected_num_participants = 59
# Frames per second given to participants.
participant_FPS = 30

# ------------------------------- input/output files ------------------------------
# NOTE: this is the file name of the human data pkl file you will extract.
human_data_pkl_file_name = "human_data_v1.pkl"

# Every input path is resolved against the current working directory.
working_dir = os.getcwd()
db_path = os.path.join(working_dir, "cogsci_2025_human_raw_database.db")
demographic_path = os.path.join(working_dir, "cogsci_2025_human_demographics.csv")
path_to_data = os.path.join(working_dir, "trial_data", "pilot_final")

## Extract, preprocess and save data into pkl file

In [2]:
# Run the human-data extraction on the database file and the trial-data folder.
(
    session_df,
    trial_df,
    keystate_df,
    rgplot_df,
    valid_trial_ids,
    global_trial_names,
) = extract_human_data(db_path, path_to_data)

# Run the occlusion extraction on the trial-data folder at the participants' frame rate.
(
    occlusion_durations,
    occlusion_frames,
    continuous_occlusion_periods,
    all_periods_seconds,
) = extract_occlusion_data(path_to_data, participant_FPS)

# Extract raw trial data
# Trial folders are the sub-directories of `path_to_data` whose name starts with 'E'.
# Plain files that happen to start with 'E' are skipped: they cannot contain a
# 'data.npz' and would otherwise make np.load fail with a confusing path error.
entries = os.listdir(path_to_data)
e_folders = sorted(
    entry for entry in entries
    if entry.startswith('E') and os.path.isdir(os.path.join(path_to_data, entry))
)
e_paths = [os.path.join(path_to_data, entry, 'data.npz') for entry in e_folders]

# SECURITY NOTE: allow_pickle=True unpickles object arrays and can therefore execute
# arbitrary code; only point this at trial data you trust.
# NOTE(review): the NpzFile objects are kept open (not closed) because they are read
# lazily below and may be read again further down the notebook -- confirm, and call
# .close() on each of them once they are no longer needed.
data_npzs = [np.load(e_path, allow_pickle=True) for e_path in e_paths]

# print balance of red and green trials
# Read each trial's outcome once (every NpzFile lookup re-reads the array from disk).
# A missing 'rg_outcome' entry raises a KeyError naming the key.
rg_outcomes = [npz["rg_outcome"].item() for npz in data_npzs]
reds = [outcome == 'red' for outcome in rg_outcomes]
greens = [outcome == 'green' for outcome in rg_outcomes]
print("Number of red trials: ", sum(reds))
print("Number of green trials: ", sum(greens))

# Extract position data
# position_data maps trial folder name -> {step (int): {'x': ..., 'y': ...}}.
position_data = {}
for folder, npz in zip(e_folders, data_npzs):
    if "step_data" in npz.files:
        # 'step_data' is stored as a 0-d object array wrapping a dict; .item() unwraps it.
        step_data = npz["step_data"].item()
    else:
        # The previous `.get("step_data", {}).item()` crashed here, because the `{}`
        # fallback has no `.item()`; fall back to empty position data and say so.
        print(f"Warning: no 'step_data' found for {folder}; storing empty position data.")
        step_data = {}
    position_data[folder] = {
        int(step): {'x': state['x'], 'y': state['y']}
        for step, state in step_data.items()
    }

# Save to a pickle file
# Collect every extracted object under its key first, then serialize the bundle in one call.
pickle_payload = {
    "Session": session_df,
    "Trial": trial_df,
    "KeyState": keystate_df,
    'occlusion_durations': occlusion_durations,
    'continuous_occlusion_periods': continuous_occlusion_periods,
    'occlusion_frames': occlusion_frames,
    'all_periods_seconds': all_periods_seconds,
    'position_data': position_data,
    'redgreen_df': rgplot_df,
}
with open(human_data_pkl_file_name, "wb") as pkl_file:
    pickle.dump(pickle_payload, pkl_file)


100%|██████████| 54/54 [01:00<00:00,  1.13s/it]

Summary Statistics of Occlusion Durations:
mean_duration: 4.72
median_duration: 4.07
max_duration: 8.13
min_duration: 2.90
total_scenes_with_occlusion: 34.00

Occlusion Durations (in seconds):
{'E1': 2.9, 'E11': 3.8666666666666667, 'E13': 3.566666666666667, 'E17': 4.8, 'E19': 3.6666666666666665, 'E21': 5.633333333333334, 'E23': 3.3333333333333335, 'E25': 7.2, 'E27': 5.3, 'E29': 6.866666666666666, 'E3': 2.9, 'E31': 5.9, 'E32': 3.8, 'E33': 6.3, 'E34': 6.066666666666666, 'E35': 3.8666666666666667, 'E36': 4.233333333333333, 'E37': 3.2, 'E38': 6.5, 'E39': 4.933333333333334, 'E40': 4.8, 'E41': 3.9, 'E42': 3.3333333333333335, 'E43': 7.133333333333334, 'E44': 3.3333333333333335, 'E45': 5.466666666666667, 'E46': 3.433333333333333, 'E47': 3.2333333333333334, 'E48': 5.2, 'E49': 3.066666666666667, 'E5': 3.066666666666667, 'E50': 8.133333333333333, 'E7': 8.133333333333333, 'E9': 3.433333333333333}

Summary Statistics of Continuous Occlusion Periods:
mean_continuous_duration: 4.22
median_continuous_




## Check duplicates, print demographic info and see score distribution per participant

In [None]:
###########################################################################################
# Unit test to ensure no duplicated trials and all trials have the appropriate data count #
###########################################################################################
duplicates = find_duplicate_completed_trials(trial_df)
if duplicates.empty:
    print("No duplicate completed trials found.")
else:
    print("Duplicate completed trials found:")
    print(duplicates)

trial_counts = count_completed_trials_by_global_name(trial_df)
# The expected count per trial is half the participant count, using integer division.
# NOTE(review): with an odd participant count (e.g. 59) this floors to 29 -- confirm
# that this is the intended expectation for every trial.
expected_count_per_trial = expected_num_participants // 2
count_mismatch = trial_counts['count'] != expected_count_per_trial
# Cell output: the rows whose count differs from the expectation (empty when all match).
trial_counts[count_mismatch]

In [None]:
####################
# Demographic Data #
####################
# Hands the session table and the demographics CSV path to print_demo_data.
# NOTE(review): presumably this prints a demographic summary of the participants;
# the helper is not defined in this notebook -- confirm against its implementation.
print_demo_data(session_df, demographic_path)

In [None]:
#######################################################################
# Distribution of scores identified in CURRENT valid participant data #
#######################################################################
# Hands the trial table to plot_scores_distribution.
# NOTE(review): presumably this draws the score distribution per participant;
# the helper is not defined in this notebook -- confirm against its implementation.
plot_scores_distribution(trial_df)