In [1]:
import os
import pickle
import pandas as pd
import matplotlib.pyplot as plt
from sqlalchemy import create_engine
import json
import cv2
from tqdm import tqdm
from matplotlib.animation import FuncAnimation
from IPython.display import display, HTML
import numpy as np
from matplotlib.gridspec import GridSpec
from matplotlib.patches import Rectangle

# Load the postprocess_redgreen_human_data.py file
from postprocess_redgreen_human_data import *

##############################################################
# DEAR EXPERIMENTER, DEFINE ALL VARIABLES BELOW IN THIS CELL #
##############################################################

###################################################################################
expected_num_participants = 1 # expected number of participants
participant_FPS = 30 # frames per second given to participants
# NOTE: this is the file name of the human data pkl file you will extract
human_data_pkl_file_name = "processed_ecog_dec2025_data.pkl"
# db_path = os.path.join(os.getcwd(), "cogsci_2025_human_raw_database.db")
db_path = "/Users/arijitdasgupta/Desktop/projects/red_green_projects/ecog_data/ecog_stimuli_v6_ecog_dec14_v1_redgreen.db"
# demographic_path = os.path.join(os.getcwd(), "cogsci_2025_human_demographics.csv")
# path_to_data = os.path.join(os.getcwd(), "trial_data", "pilot_final")
path_to_data = "/Users/arijitdasgupta/Desktop/projects/red_green_projects/ecog_data/ecog_stimuli_v6"
# Trial prefixes - should match those in run_redgreen_experiment.py
FAM_TRIAL_PREFIXES = ['F']  # Familiarization trial prefixes
EXP_TRIAL_PREFIXES = ['CC_control', 'CC_surprise', 'UC_positive', 'UC_negative']  # Experimental trial prefixes
#################################################################################
# Session filtering options
allow_incomplete_sessions = True  # Set to True to include incomplete sessions
session_ids = [4]  # List of session IDs to include (None for all sessions)
#################################################################################
#################################################################################

## Extract, preprocess and save data into pkl file

In [2]:
session_df, trial_df, keystate_df, rgplot_df, valid_trial_ids, global_trial_names = extract_human_data(
    db_path, path_to_data, 
    exp_trial_prefixes=EXP_TRIAL_PREFIXES, 
    fam_trial_prefixes=FAM_TRIAL_PREFIXES,
    allow_incomplete_sessions=allow_incomplete_sessions,
    session_ids=session_ids
)

keystate_by_trial = save_human_data_by_trial(trial_df, keystate_df, path_to_data)

Found 1 sessions (allow_incomplete=True, session_ids=[4])
Found 35 completed experimental trials from selected sessions
Sample trial names from database: ['UC_positive_nosurprise_15', 'CC_surprise_12', 'UC_negative_nosurprise_9', 'CC_control_9', 'CC_surprise_1']
Found 60 trial folders matching prefixes ['CC_control', 'CC_surprise', 'UC_positive', 'UC_negative']
Sample folder names: ['CC_control_1', 'CC_control_10', 'CC_control_11', 'CC_control_12', 'CC_control_13']
Created rg_outcome_df with 60 rows
After merging trial_df with rg_outcome_df: 35 rows (was 35)
After merging keystate_df: 16955 rows
After merging rgplot_df: 16955 rows
Final result: 1 sessions, 35 trials, 16955 keystates, 16955 rgplot rows
Saved human data as CSV files in /Users/arijitdasgupta/Desktop/projects/red_green_projects/ecog_data/ecog_stimuli_v6


## Check duplicates, print demographic info and see score distribution per participant

In [4]:
###########################################################################################
# Unit test to ensure no duplicated trials and all trials have the appropriate data count #
##########################################################################################
duplicates = find_duplicate_completed_trials(trial_df)
if not duplicates.empty:
    print("Duplicate completed trials found:")
    print(duplicates)
else:
    print("No duplicate completed trials found.")
trial_counts = count_completed_trials_by_global_name(trial_df)
trial_counts[trial_counts['count'] != expected_num_participants//2]

No duplicate completed trials found.


Unnamed: 0,global_trial_name,count
0,CC_control_10,1
1,CC_control_12,1
2,CC_control_14,1
3,CC_control_2,1
4,CC_control_4,1
5,CC_control_6,1
6,CC_control_7,1
7,CC_control_8,1
8,CC_control_9,1
9,CC_surprise_1,1


In [None]:
# Check for duplicate frame and trial_id combinations
duplicate_frame_trial = keystate_df.duplicated(subset=['frame', 'trial_id']).sum()
if duplicate_frame_trial > 0:
    print(f"Found {duplicate_frame_trial} duplicate rows with same frame and trial_id")
    # Show the duplicates
    duplicates = keystate_df[keystate_df.duplicated(subset=['frame', 'trial_id'], keep=False)]
    print("Duplicate entries:")
    print(duplicates.sort_values(['trial_id', 'frame']))
else:
    print("No duplicate frame and trial_id combinations found.")

keystate_df

In [None]:
####################
# Demographic Data #
####################
print_demo_data(session_df, demographic_path)

In [None]:
#######################################################################
# Distribution of scores identified in CURRENT valid participant data #
#######################################################################
plot_scores_distribution(trial_df)