In [1]:
# ZuCo v1 & v2 Unified Data Loader

**Objective:** This notebook loads, processes, and unifies word-level eye-tracking (ET) and EEG data from the ZuCo v1 and v2 datasets. It is designed to create a clean, modeling-ready dataset that harmonizes the different data structures of the two versions.
</VSCode.Cell>
<VSCode.Cell language="markdown">
### 1. Setup and Configuration
Import necessary libraries (pandas, numpy, scipy, h5py, joblib). Define global paths for raw data input and processed data output. Include helper functions for logging and token normalization.
</VSCode.Cell>
<VSCode.Cell language="python">
import os, time, warnings, re, json
from pathlib import Path
import numpy as np
import pandas as pd
from joblib import Parallel, delayed

# Optional dependencies: a missing backend only disables the matching loader
# (signalled through the _HAS_* flags) instead of failing the whole notebook.
try:
    from scipy.io import loadmat
except ImportError:
    _HAS_SCIPY = False
    warnings.warn("scipy not found, .mat file processing will be skipped.")
else:
    _HAS_SCIPY = True

try:
    import h5py
except ImportError:
    _HAS_H5PY = False
    warnings.warn("h5py not found, v7.3 .mat file processing will be skipped.")
else:
    _HAS_H5PY = True

# --- Global paths and configuration ---
# All locations are derived from the directory the kernel was started in:
# raw and processed data live under the *parent* of that directory, reports
# live inside it.
BASE_DIR = Path.cwd()
RAW_V1 = BASE_DIR.parent / 'data/raw_public/zuco/v1'
RAW_V2 = BASE_DIR.parent / 'data/raw_public/zuco/v2'
PROC = BASE_DIR.parent / 'data/processed'
RPTS = BASE_DIR / 'reports'
# Output directories are created eagerly (running this cell has filesystem side effects).
PROC.mkdir(parents=True, exist_ok=True)
RPTS.mkdir(parents=True, exist_ok=True)
# Maps str(version_path) -> list of wordbounds*.mat paths found under it;
# filled lazily by _find_wordbounds_file so the directory tree is scanned once per version.
WORDBOUNDS_CACHE = {}

# --- Helper functions ---
def heartbeat(m):
    """Print message `m` prefixed with the current local time as ``[HH:MM:SS]``."""
    stamp = time.strftime('%H:%M:%S')
    print(f"[{stamp}] {m}")

def norm_token(s):
    """Normalise a token for matching: lower-case it and drop every run of
    non-word characters and underscores.

    Anything that is not a ``str`` is handed back untouched.
    """
    if not isinstance(s, str):
        return s
    lowered = s.lower()
    return re.sub(r"[\W_]+", "", lowered)

# Progress marker: reaching this line means the setup cell ran to completion.
heartbeat('Libraries and paths are set up.')
</VSCode.Cell>
<VSCode.Cell language="markdown">
### 2. Define Data Processing Functions
Define all functions required for processing the data. This includes functions to read MATLAB .mat files (both standard and v7.3 HDF5), extract word-level text and timing data for both ZuCo v1 and v2, process individual subject sessions, and a main function to build the dataset in parallel using joblib.
</VSCode.Cell>
<VSCode.Cell language="python">
def _get_v2_word_texts(subject_id, task_name_long, version_path):
    """Read the word strings stored in a ZuCo v2 ``results*.mat`` file (HDF5 format).

    The file is looked up at
    ``<version_path>/<task_name_long>/Matlab files/results<subject_id>_<short>.mat``,
    where ``<short>`` is the part of ``task_name_long`` after the last ``' - '``.

    Returns:
        A flat list of strings, one per object reference found in
        ``sentenceData/word``. An empty list is returned when h5py is not
        available, the file does not exist, or the expected keys are absent.
        If reading fails part-way, the error is printed and whatever was
        decoded up to that point is returned.
    """
    if not _HAS_H5PY:
        return []

    short_name = task_name_long.split(' - ')[-1]
    results_path = (
        version_path
        / task_name_long
        / 'Matlab files'
        / f"results{subject_id}_{short_name}.mat"
    )
    if not results_path.exists():
        return []

    decoded = []
    try:
        with h5py.File(results_path, 'r') as f:
            has_words = 'sentenceData' in f and 'word' in f['sentenceData']
            if not has_words:
                return []

            for ref_row in f['sentenceData']['word']:
                for ref in ref_row:
                    # Each referenced object is iterated row by row and the first
                    # item of every row is treated as a character code.
                    # NOTE(review): assumes the reference points at a char array
                    # stored as integer codes — confirm against a real file.
                    target = f[ref]
                    decoded.append(''.join(chr(code[0]) for code in target))
    except Exception as e:
        print(f"Error loading v2 word texts from {results_path}: {e}")
    return decoded

def _find_wordbounds_file(version_path, subject_id, session_id):
    """Locate the wordbounds ``.mat`` file for one subject/session.

    The directory tree under ``version_path`` is scanned once for
    ``wordbounds*.mat`` and the result is cached in ``WORDBOUNDS_CACHE``.
    For each candidate session id the following file names are accepted:
    ``wordbounds_<id>.mat``, ``wordbounds_<id>_<subject>.mat`` and
    ``wordbounds_<task>_<subject>.mat`` (``<task>`` is ``session_id`` with
    trailing digits removed).

    Candidate ids are tried in a fixed order: the exact ``session_id`` first,
    then the ``SR``->``SNR`` and ``SNR``->``SR`` spellings.

    Args:
        version_path: ``pathlib.Path`` of the dataset version root.
        subject_id: subject identifier used in file names.
        session_id: session identifier used in file names (e.g. task + number).

    Returns:
        The first matching ``Path``, or ``None`` when nothing matches.
    """
    cache_key = str(version_path)
    if cache_key not in WORDBOUNDS_CACHE:
        # Sorted so that, when several files match, the winner does not depend
        # on the order in which the filesystem happens to enumerate them.
        WORDBOUNDS_CACHE[cache_key] = sorted(version_path.rglob('wordbounds*.mat'))

    task_short = re.sub(r'\d+$', '', session_id)

    # Ordered and de-duplicated. A set would make the search order depend on
    # string hashing, which differs between interpreter processes, so parallel
    # workers could pick different files for the same session.
    alt_ids = list(dict.fromkeys([
        session_id,
        session_id.replace("SR", "SNR"),
        session_id.replace("SNR", "SR"),
    ]))

    for s_id in alt_ids:
        accepted_names = {
            f"wordbounds_{s_id}.mat",
            f"wordbounds_{s_id}_{subject_id}.mat",
            f"wordbounds_{task_short}_{subject_id}.mat",
        }
        for f_path in WORDBOUNDS_CACHE[cache_key]:
            if f_path.name in accepted_names:
                return f_path
    return None

def _process_session(eyedata, wb_data, tb_data, subject_id, task, version, word_texts=None):
    """Core logic to process fixations against word and sentence boundaries."""
    rows = []
    word_idx_global = 0
    min_latency = np.min(eyedata.fixations[:, 0])

    for sent_idx, sent_data in enumerate(wb_data):
        if version == 'v1':
            words_in_sent = sent_data.content if wb_data.dtype == 'O' and hasattr(sent_data, 'content') else sent_data
            n_words = len(words_in_sent)
        else: # v2
            n_words = len(sent_data)

        if tb_data is None or sent_idx >= len(tb_data):
            continue

        sent_start, sent_end = tb_data[sent_idx][0], tb_data[sent_idx][1]
        duration = (sent_end - sent_start) / n_words if n_words > 0 else 0

        for word_idx_in_sent in range(n_words):
            start_t = sent_start + (word_idx_in_sent * duration) + min_latency
            end_t = start_t + duration
            
            if version == 'v1':
                content = f'word_{word_idx_global}'
            else: # v2
                if word_texts and word_idx_global < len(word_texts):
                    content = word_texts[word_idx_global]
                else:
                    continue # Skip if we've run out of text
            
            fixations = eyedata.fixations[(eyedata.fixations[:, 0] >= start_t) & (eyedata.fixations[:, 0] < end_t)]
            
            ffd = fixations[0, 2] if len(fixations) > 0 else 0
            trt = np.sum(fixations[:, 2]) if len(fixations) > 0 else 0
            gd = np.sum(fixations[:-1, 2]) if len(fixations) > 1 else ffd
            
            rows.append({'Subject': subject_id, 'Task': task, 'Dataset': version, 'Word': content, 'FFD': ffd, 'GD': gd, 'TRT': trt})
            word_idx_global += 1
            
    return rows

def _process_subject_session(subject_id, task_name, session_id, version_path_str):
    """Load the files for one subject/session and turn them into word-level rows.

    Designed to run inside a parallel worker: every expected failure is
    reported as a short error *string* rather than raised, so one bad session
    cannot abort the whole batch.

    Args:
        subject_id: subject identifier (also the name of the subject folder).
        task_name: name of the task folder directly under the version root.
        session_id: session identifier embedded in the ET file name.
        version_path_str: dataset version root as a string; its last path
            component (e.g. ``'v1'`` / ``'v2'``) is used as the dataset label.

    Returns:
        A ``pandas.DataFrame`` with one row per word on success, otherwise a
        ``str`` describing why the session was skipped.
    """
    version_path = Path(version_path_str)
    if not _HAS_SCIPY:
        return "scipy not installed"

    et_path = version_path / task_name / 'Preprocessed' / subject_id / f"{subject_id}_{session_id}_corrected_ET.mat"
    if not et_path.exists():
        return f"ET file not found: {et_path.name}"

    wb_path = _find_wordbounds_file(version_path, subject_id, session_id)
    if not wb_path:
        return f"Wordbounds file not found for {subject_id}/{session_id}"

    try:
        et_mat = loadmat(et_path, squeeze_me=True, struct_as_record=False)
        wb_mat = loadmat(wb_path, squeeze_me=True, struct_as_record=False)
    except Exception as e:
        return f"Could not load .mat file: {e}"

    eyedata = et_mat.get('eyeevent')
    wb_data = wb_mat.get('wordbounds')
    tb_data = wb_mat.get('textbounds')

    # Explicit None checks: truth-testing a loaded array can raise or misreport.
    if eyedata is None or not hasattr(eyedata, 'fixations') or wb_data is None:
        return "Invalid .mat structure"

    # Unwrap a struct that stores the table in a `data` field. Every ndarray
    # also has a `.data` attribute (its raw buffer), so arrays must be left
    # alone or the fixation table would be replaced by that buffer.
    fixations = eyedata.fixations
    if not isinstance(fixations, np.ndarray) and hasattr(fixations, 'data'):
        eyedata.fixations = fixations.data

    version = version_path.name
    task_short = task_name.split(' - ')[-1].split('-')[-1].strip()

    word_texts = None
    if version == 'v2':
        word_texts = _get_v2_word_texts(subject_id, task_name, version_path)
        if not word_texts:
            return "V2 word texts not found."

    try:
        rows = _process_session(eyedata, wb_data, tb_data, subject_id, task_short, version, word_texts)
    except Exception as e:
        # Keep the "error string" contract for unexpected data layouts as well.
        return f"Session processing failed: {type(e).__name__}: {e}"

    if not rows:
        return "Failed to extract any valid word-level data."
    return pd.DataFrame(rows)

def build_dataset_parallel(version_path):
    """Find every ``*_corrected_ET.mat`` under ``version_path`` and process the
    sessions with joblib (``n_jobs=-1``).

    The subject and session ids are parsed from the file name
    (``<subject>_<session>_corrected_ET.mat``); the task is the name of the
    nearest ancestor directory whose name contains ``"task"`` (case-insensitive).
    Files for which either cannot be determined are ignored.

    Returns:
        ``(df, errors)`` where ``df`` is the concatenation of all successfully
        processed sessions (an empty DataFrame if there were none) and
        ``errors`` is the list of error strings returned by failed sessions.
    """
    et_name_re = re.compile(r'(\w+)_(\w+)_corrected_ET\.mat')
    resolved_root = str(version_path.resolve())

    jobs = []
    for et_file in version_path.rglob('*_corrected_ET.mat'):
        parsed = et_name_re.match(et_file.name)
        if not parsed:
            continue
        subject, session = parsed.groups()

        task_dir = None
        for ancestor in et_file.parents:
            if 'task' in ancestor.name.lower():
                task_dir = ancestor.name
                break
        if not task_dir:
            continue

        jobs.append(delayed(_process_subject_session)(subject, task_dir, session, resolved_root))

    if not jobs:
        return pd.DataFrame(), []

    heartbeat(f"Processing {len(jobs)} sessions from {version_path.name}...")
    outcomes = Parallel(n_jobs=-1)(jobs)

    # Workers hand back either a DataFrame (success) or a str (reason for failure).
    dfs, failures = [], []
    for outcome in outcomes:
        if isinstance(outcome, pd.DataFrame):
            dfs.append(outcome)
        elif isinstance(outcome, str):
            failures.append(outcome)

    if not dfs:
        return pd.DataFrame(), failures

    df = pd.concat(dfs, ignore_index=True)
    heartbeat(f"Completed {version_path.name}. Got {len(df)} rows from {len(dfs)} sessions.")
    return df, failures

# Progress marker: all processing functions above are now defined in the kernel.
heartbeat('Data processing functions are defined.')
</VSCode.Cell>
<VSCode.Cell language="markdown">
### 3. Process ZuCo v1 and v2 Datasets
Execute the parallel processing function for both the ZuCo v1 and v2 raw data directories. This will process all subject sessions and create two separate pandas DataFrames, one for each dataset version, while reporting progress and any errors encountered.
</VSCode.Cell>
<VSCode.Cell language="python">
# --- Main Execution: build v1, then v2 (sessions within each version run in parallel) ---
df_v1, errors_v1 = build_dataset_parallel(RAW_V1)
df_v2, errors_v2 = build_dataset_parallel(RAW_V2)

# --- Report Errors ---
# One line per distinct error message, with the number of sessions that produced it.
all_errors = {'v1': errors_v1, 'v2': errors_v2}
for version in all_errors:
    errors = all_errors[version]
    if not errors:
        continue
    heartbeat(f"Encountered {len(errors)} errors in {version}:")
    error_counts = pd.Series(errors).value_counts()
    for err, count in error_counts.items():
        print(f"  - [{count} sessions] {err}")
</VSCode.Cell>
<VSCode.Cell language="markdown">
### 4. Unify, Clean, and Save Datasets
Concatenate the v1 and v2 DataFrames into a single unified dataset. Apply a canonicalization function to normalize tokens and data types. Remove any duplicate rows. Save the individual v1, v2, and the final unified DataFrames to CSV files in the processed data directory.
</VSCode.Cell>
<VSCode.Cell language="python">
def canonicalize(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of ``df`` with a normalised-token column.

    Adds ``token_norm`` (``Word`` cast to ``str`` and passed through
    ``norm_token``) and casts ``Subject``, ``Dataset`` and ``Task`` to ``str``.
    An empty frame is returned as-is.
    """
    if df.empty:
        return df
    normalised_tokens = df['Word'].astype(str).map(norm_token)
    with_tokens = df.assign(token_norm=normalised_tokens)
    return with_tokens.astype({'Subject': str, 'Dataset': str, 'Task': str})

# --- Unify, Canonicalize, and Save ---
# pd.concat raises ValueError when given no frames, so fall back to an empty
# DataFrame when neither version produced rows; the warning branch below then
# handles that case instead of the cell crashing.
non_empty_frames = [d for d in (df_v1, df_v2) if not d.empty]
all_df = pd.concat(non_empty_frames, ignore_index=True) if non_empty_frames else pd.DataFrame()

if all_df.empty:
    warnings.warn('No ZuCo data was processed. Check raw data paths and .mat files.')
else:
    all_df = canonicalize(all_df)
    # Drop duplicates that might arise from processing errors.
    # NOTE(review): rows carry no session or word-position column, so two
    # genuinely distinct observations that share Subject/Task/Word and have
    # identical FFD/GD/TRT (e.g. the same word left unfixated twice) cannot be
    # told apart from duplicates and are removed as well. The count is logged
    # below so the size of this effect is visible — confirm it is acceptable.
    key_cols = ['Subject', 'Task', 'Word', 'FFD', 'GD', 'TRT']
    n_before_dedup = len(all_df)
    all_df = all_df.drop_duplicates(subset=key_cols).reset_index(drop=True)
    heartbeat(f"Dropped {n_before_dedup - len(all_df)} rows as duplicates on {key_cols}.")
    heartbeat(f"Unified and canonicalized {len(all_df)} rows for {all_df['Subject'].nunique()} subjects.")

    # Save outputs. The per-version files hold the rows exactly as produced by
    # the loader; only the unified file is canonicalized and de-duplicated.
    v1_out = PROC / 'zuco_v1_unified.csv'
    v2_out = PROC / 'zuco_v2_unified.csv'
    uni_out = PROC / 'zuco_unified_et_eeg.csv'

    if not df_v1.empty:
        df_v1.to_csv(v1_out, index=False)
        heartbeat(f"Saved v1 data to {v1_out}")
    if not df_v2.empty:
        df_v2.to_csv(v2_out, index=False)
        heartbeat(f"Saved v2 data to {v2_out}")

    all_df.to_csv(uni_out, index=False)
    heartbeat(f"Saved unified data to {uni_out}")
</VSCode.Cell>
<VSCode.Cell language="markdown">
### 5. Generate Quality Assurance Report
Create a dictionary containing QA metrics: row and subject counts per dataset version, non-null coverage of the eye-tracking features (FFD, GD, TRT; the loader currently emits no EEG columns), a per-version error summary, and existence checks for the output files. Save this report as a JSON file for verification and record-keeping.
</VSCode.Cell>
<VSCode.Cell language="python">
# --- QA Report ---
# Collects output-file status, row/subject counts, ET column coverage and the
# error tallies into one dict, writes it as JSON and echoes it to the cell output.
qa_report = {}
has_data = 'all_df' in locals() and not all_df.empty
if not has_data:
    warnings.warn("No data processed, skipping QA report.")
else:
    v1_out = PROC / 'zuco_v1_unified.csv'
    v2_out = PROC / 'zuco_v2_unified.csv'
    uni_out = PROC / 'zuco_unified_et_eeg.csv'

    report_time = time.strftime('%Y-%m-%d %H:%M:%S')
    file_status = {
        'v1_output': str(v1_out), 'v1_exists': v1_out.exists(),
        'v2_output': str(v2_out), 'v2_exists': v2_out.exists(),
        'unified_output': str(uni_out), 'unified_exists': uni_out.exists(),
    }
    row_counts = {'v1': len(df_v1), 'v2': len(df_v2), 'total': len(all_df)}
    subject_counts = {
        'v1': 0 if df_v1.empty else df_v1['Subject'].nunique(),
        'v2': 0 if df_v2.empty else df_v2['Subject'].nunique(),
        'total': all_df['Subject'].nunique(),
    }
    # Share of non-null values per ET metric, formatted as a percentage string.
    et_coverage = {c: f"{all_df[c].notna().mean()*100:.1f}%" for c in ['FFD','GD','TRT'] if c in all_df}
    error_summary = {
        'v1_errors': pd.Series(errors_v1).value_counts().to_dict() if errors_v1 else {},
        'v2_errors': pd.Series(errors_v2).value_counts().to_dict() if errors_v2 else {},
    }

    qa_report = {
        'timestamp': report_time,
        'files': file_status,
        'row_counts': row_counts,
        'subject_counts': subject_counts,
        'et_coverage_pct': et_coverage,
        'error_summary': error_summary,
    }

    qa_path = RPTS / 'zuco_loader_qa_report.json'
    with open(qa_path, 'w') as fh:
        json.dump(qa_report, fh, indent=2)

    heartbeat(f"Saved QA report to {qa_path}")
    print("--- QA Report ---")
    print(json.dumps(qa_report, indent=2))
</VSCode.Cell>
<VSCode.Cell language="markdown">
### 6. Final Data Integrity Check
Load the final unified CSV file that was just saved. Display the first few rows (`.head()`), the shape of the DataFrame, a summary of null values, and descriptive statistics for the key numerical columns to visually verify the output.
</VSCode.Cell>
<VSCode.Cell language="python">
# --- Integrity Check ---
# Re-reads the unified CSV from disk (rather than reusing the in-memory frame)
# so that what is inspected is exactly what was written.
final_output_path = PROC / 'zuco_unified_et_eeg.csv'

if not final_output_path.exists():
    warnings.warn(f"Final output file not found at {final_output_path}. Cannot perform integrity check.")
else:
    heartbeat(f"Loading final unified data from {final_output_path} for integrity check...")
    check_df = pd.read_csv(final_output_path)

    print("\n--- Data Head ---")
    print(check_df.head())

    n_rows, n_cols = check_df.shape
    print("\n--- Data Shape ---")
    print(f"Rows: {n_rows}, Columns: {n_cols}")

    print("\n--- Null Value Summary ---")
    print(check_df.isnull().sum())

    print("\n--- Descriptive Statistics for ET Metrics ---")
    et_cols = [col for col in ('FFD', 'GD', 'TRT') if col in check_df.columns]
    if not et_cols:
        print("No ET columns found to describe.")
    else:
        print(check_df[et_cols].describe())

    heartbeat("Integrity check complete.")
</VSCode.Cell>
<VSCode.Cell language="markdown">
### 7. (Optional) Data Recovery for Corrupted Files
Provide a shell script cell using `wget` and `unzip` to re-download and extract the ZuCo v2 Task 2 (TSR) data from its official source. This step is for troubleshooting in case the local `.mat` files are corrupted, which can cause HDF5 loading errors.
</VSCode.Cell>
<VSCode.Cell language="bash">
# This cell is optional. Run it if you suspect the ZuCo v2 data files are corrupted.
# It will download and overwrite the task 2 data.

# Stop at the first failing command (and on unset variables), so a failed
# download is never followed by an unzip of a partial archive or by the
# "complete" message.
set -euo pipefail

# Define paths (relative to the notebook's working directory)
ZIP_FILE="../data/raw_public/zuco/v2/task2-TSR.zip"
EXTRACT_DIR="../data/raw_public/zuco/v2/"

# Create directory if it doesn't exist
mkdir -p "$EXTRACT_DIR"

# Download the zip file from OSF
echo "Downloading ZuCo v2 Task 2 data from OSF..."
wget -O "$ZIP_FILE" "https://osf.io/download/5e6b614f87b00100093a2199/"

# Unzip the file, overwriting existing files
echo "Extracting data... This will overwrite existing files."
unzip -o "$ZIP_FILE" -d "$EXTRACT_DIR"

echo "Data recovery process complete. Please re-run the notebook from the top."


SyntaxError: invalid syntax (2861286127.py, line 3)