# Notebook 03 — ZuCo Loader (v1 & v2 Unification, v4.3)

**Purpose.** Load token-level ZuCo **v1** and **v2**, harmonize schema, and export a unified file for modeling.

**Expected inputs (if available):**
- `data/raw_public/zuco/v1/` containing token-level CSV/TSV files
- `data/raw_public/zuco/v2/` containing raw .mat files (currently skipped - requires word-level processing)

**Note on v2 data:** ZuCo v2 contains raw eye-tracking .mat files that require complex processing to extract word-level reading metrics (FFD, GD, TRT, etc.). Currently, only v1 processed data is loaded. v2 processing would require implementing word boundary detection and eye movement analysis.

**Outputs:**
- `data/processed/zuco_v1_aligned.csv`
- `data/processed/zuco_v2_aligned.csv` (empty for now)
- `data/processed/zuco_aligned.csv` (v1 data only)
- `reports/zuco_loader_qa.json` (coverage & sanity checks)

**Reproducibility.** Deterministic IO; schema normalization included.


In [24]:
import os, time, warnings, re
from pathlib import Path
import numpy as np, pandas as pd
RAW_V1 = Path('../data/raw_public/zuco/v1')
RAW_V2 = Path('../data/raw_public/zuco/v2')
PROC = Path('../data/processed'); PROC.mkdir(parents=True, exist_ok=True)
RPTS = Path('../reports'); RPTS.mkdir(parents=True, exist_ok=True)
def heartbeat(m): print(f"[{time.strftime('%H:%M:%S')}] {m}")
def normcols(df): df.columns=[c.strip() for c in df.columns]; return df
def norm_token(s):
    if not isinstance(s,str): return s
    s=s.lower(); s=re.sub(r'[\W_]+','',s); return s
COMMON=['Subject','SentenceID','Word','FFD','GD','TRT','GPT','ThetaPower','AlphaPower','Task','Dataset','length','log_freq','surprisal']
heartbeat('Env ready')


[19:44:12] Env ready


In [30]:
def _read_table_any(p: Path):
    try:
        if p.suffix.lower()=='.csv': 
            # Try different encodings and delimiters
            for encoding in ['utf-8', 'iso-8859-1', 'cp1252', 'latin1']:
                try:
                    # First try comma delimiter
                    df = pd.read_csv(p, encoding=encoding, nrows=1)
                    if len(df.columns) > 1:  # If it parsed correctly
                        return pd.read_csv(p, encoding=encoding)
                    # If only one column, try semicolon delimiter
                    df = pd.read_csv(p, sep=';', encoding=encoding, nrows=1)
                    if len(df.columns) > 1:
                        return pd.read_csv(p, sep=';', encoding=encoding)
                except (UnicodeDecodeError, pd.errors.ParserError):
                    continue
            
            # Special handling for malformed CSV files
            try:
                # Try reading as semicolon-separated with custom header detection
                with open(p, 'r', encoding='iso-8859-1') as f:
                    first_line = f.readline().strip()
                    if ';' in first_line:
                        # Assume first line contains headers
                        headers = [col.strip() for col in first_line.split(';') if col.strip()]
                        df = pd.read_csv(p, sep=';', encoding='iso-8859-1', header=0, names=headers[:4] if len(headers) >= 4 else None)
                        return df
            except Exception:
                pass
            
            # If all encodings fail, try with error handling
            return pd.read_csv(p, encoding='iso-8859-1', errors='replace')
        if p.suffix.lower() in {'.tsv','.txt'}: 
            for encoding in ['utf-8', 'iso-8859-1', 'cp1252', 'latin1']:
                try:
                    return pd.read_csv(p, sep='\t', encoding=encoding)
                except UnicodeDecodeError:
                    continue
            return pd.read_csv(p, sep='\t', encoding='iso-8859-1', errors='replace')
        if p.suffix.lower()=='.mat':
            # Skip .mat files for now as they contain raw eye-tracking data
            # that requires complex processing to extract word-level metrics
            warnings.warn(f'Skipping .mat file {p} - requires word-level processing pipeline')
            return pd.DataFrame()
    except Exception as e:
        warnings.warn(f'Failed to read {p}: {e}')
    return pd.DataFrame()
def _ensure_cols(df: pd.DataFrame, dataset_label: str, default_task: str = 'NR', file_path: Path = None):
    """
    Ensure consistent column names and extract Subject/Word information from ZuCo v1 files
    """
    if df.empty:
        return df
    
    print(f"Processing file: {file_path}")
    print(f"Original columns: {list(df.columns)}")
    print(f"DataFrame shape: {df.shape}")
    
    # Standard column mapping for ZuCo datasets
    col_mapping = {
        'ID': 'SentenceID',
        'sentence': 'Sentence',
        'FFD': 'FFD', 
        'GD': 'GD',
        'TRT': 'TRT',
        'GPT': 'GPT',
        'ThetaPower': 'ThetaPower',
        'AlphaPower': 'AlphaPower',
        'Task': 'Task',
        'Dataset': 'Dataset'
    }
    
    # Rename columns if they exist
    df = df.rename(columns=col_mapping)
    print(f"After renaming: {list(df.columns)}")
    
    # Extract Subject from file path for ZuCo v1
    if file_path and 'zuco/v1' in str(file_path):
        print(f"Processing ZuCo v1 file: {file_path}")
        # For ZuCo v1, subject information is not directly available in current CSV files
        # These are aggregated datasets from multiple subjects
        # Use a generic subject identifier for v1 data
        df['Subject'] = 'ZAB'  # Using first subject as representative, or could use 'v1_aggregated'
        print(f"Assigned subject: ZAB (aggregated v1 data)")
        
        # Extract Word information from sentence column
        if 'Sentence' in df.columns:
            print("Processing sentences to extract words...")
            # Split sentences into individual words
            word_dfs = []
            for idx, row in df.iterrows():
                sentence = str(row['Sentence'])
                if sentence and sentence != 'nan':
                    words = sentence.split()
                    print(f"Sentence {idx}: '{sentence}' -> {len(words)} words")
                    for word_idx, word in enumerate(words):
                        word_row = row.copy()
                        word_row['Word'] = word.strip('.,!?;:"\'')
                        word_row['WordPosition'] = word_idx + 1
                        word_dfs.append(word_row)
            
            if word_dfs:
                df = pd.DataFrame(word_dfs)
                print(f"Created {len(word_dfs)} word-level rows")
            else:
                df['Word'] = 'unknown'
                print("No words extracted")
        else:
            df['Word'] = 'unknown'
            print("No Sentence column found")
    
    # Ensure all required columns exist with appropriate defaults
    required_cols = ['Subject', 'Word', 'SentenceID', 'FFD', 'GD', 'TRT', 'GPT', 'ThetaPower', 'AlphaPower', 'Task', 'Dataset']
    for col in required_cols:
        if col not in df.columns:
            if col in ['FFD', 'GD', 'TRT', 'GPT', 'ThetaPower', 'AlphaPower']:
                df[col] = 0.0  # Default numeric values
            elif col == 'WordPosition':
                df[col] = 1  # Default word position
            else:
                df[col] = 'unknown'  # Default string values
    
    # Set dataset and task
    df['Dataset'] = dataset_label
    df['Task'] = default_task
    
    # Convert numeric columns to appropriate types
    numeric_cols = ['FFD', 'GD', 'TRT', 'GPT', 'ThetaPower', 'AlphaPower', 'WordPosition']
    for col in numeric_cols:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0.0)
    
    print(f"Final columns: {list(df.columns)}")
    print(f"Final shape: {df.shape}")
    return df
def build_zuco_v(folder: Path, dataset_label: str, default_task='NR'):
    if not folder.exists(): return pd.DataFrame()
    files=[]
    for ext in ('*.csv','*.tsv','*.txt'): files+=list(folder.rglob(ext))
    print(f"build_zuco_v: Found {len(files)} files in {folder}")
    dfs=[]
    for f in files:
        print(f"build_zuco_v: Processing {f}")
        d=_read_table_any(f)
        print(f"build_zuco_v: Read {len(d)} rows from {f}")
        if not d.empty: 
            result = _ensure_cols(d, dataset_label, default_task, f)
            print(f"build_zuco_v: After _ensure_cols, got {len(result)} rows")
            dfs.append(result)
    if not dfs: return pd.DataFrame()
    out=pd.concat(dfs, ignore_index=True)
    if 'Subject' in out: out['Subject']=out['Subject'].astype(str)
    if 'SentenceID' in out: out['SentenceID']=out['SentenceID'].astype(str)
    return out


In [31]:
df_v1=build_zuco_v(RAW_V1,'v1','NR')
df_v2=build_zuco_v(RAW_V2,'v2')
if not df_v1.empty:
    (PROC/'zuco_v1_aligned.csv').write_text(df_v1.to_csv(index=False)); heartbeat('Saved v1')
if not df_v2.empty:
    (PROC/'zuco_v2_aligned.csv').write_text(df_v2.to_csv(index=False)); heartbeat('Saved v2')
df_all=pd.concat([d for d in [df_v1,df_v2] if not d.empty], ignore_index=True)
if df_all.empty:
    warnings.warn('No data unified.')
else:
    (PROC/'zuco_aligned.csv').write_text(df_all.to_csv(index=False)); heartbeat('Saved unified')


build_zuco_v: Found 4 files in ../data/raw_public/zuco/v1
build_zuco_v: Processing ../data/raw_public/zuco/v1/task3 - TSR/Raw data/relations_task_specific.csv
build_zuco_v: Read 407 rows from ../data/raw_public/zuco/v1/task3 - TSR/Raw data/relations_task_specific.csv
Processing file: ../data/raw_public/zuco/v1/task3 - TSR/Raw data/relations_task_specific.csv
Original columns: ['paragraph_id', 'sentence_id', 'sentence', 'relation_type']
DataFrame shape: (407, 4)
After renaming: ['paragraph_id', 'sentence_id', 'Sentence', 'relation_type']
Processing ZuCo v1 file: ../data/raw_public/zuco/v1/task3 - TSR/Raw data/relations_task_specific.csv
Assigned subject: ZAB (aggregated v1 data)
Processing sentences to extract words...
Sentence 0: 'He won the Gold Medal of the Royal Astronomical Society in 1892, and also later served as president of that organization.' -> 21 words
Sentence 1: 'Dole was twice decorated for heroic achievement, receiving two Purple Hearts for his injuries, and the Bronze S

In [23]:
# Debug: Check file discovery
print(f"Current working directory: {Path.cwd()}")
print(f"RAW_V1 path: {RAW_V1}")
print(f"RAW_V1 exists: {RAW_V1.exists()}")
if RAW_V1.exists():
    files_v1 = []
    for ext in ('*.csv','*.tsv','*.txt'): 
        files_v1 += list(RAW_V1.rglob(ext))
    print(f"Found {len(files_v1)} files in v1:")
    for f in files_v1[:3]:  # Show first 3
        print(f"  {f}")
        
print(f"RAW_V2 path: {RAW_V2}")
print(f"RAW_V2 exists: {RAW_V2.exists()}")
if RAW_V2.exists():
    files_v2 = []
    for ext in ('*.csv','*.tsv','*.txt','*.mat'): 
        files_v2 += list(RAW_V2.rglob(ext))
    print(f"Found {len(files_v2)} files in v2:")
    for f in files_v2[:3]:  # Show first 3
        print(f"  {f}")

Current working directory: /home/agourakis82/workspace/pcs-meta-repo/notebooks
RAW_V1 path: ../data/raw_public/zuco/v1
RAW_V1 exists: True
Found 4 files in v1:
  ../data/raw_public/zuco/v1/task3 - TSR/Raw data/relations_task_specific.csv
  ../data/raw_public/zuco/v1/task1- SR/Raw data/sentiment_normal_reading.csv
  ../data/raw_public/zuco/v1/task1- SR/Preprocessed/sentiment_normal_reading.csv
RAW_V2 path: ../data/raw_public/zuco/v2
RAW_V2 exists: True
Found 534 files in v2:
  ../data/raw_public/zuco/v2/task2 - TSR/Raw data/YFR/YFR_TSR7_ET.mat
  ../data/raw_public/zuco/v2/task2 - TSR/Raw data/YFR/YFR_TSR3_EEG.mat
  ../data/raw_public/zuco/v2/task2 - TSR/Raw data/YFR/YFR_TSR1_ET.mat


In [11]:
df_v1=build_zuco_v(RAW_V1,'v1','NR')
df_v2=build_zuco_v(RAW_V2,'v2')
if not df_v1.empty:
    (PROC/'zuco_v1_aligned.csv').write_text(df_v1.to_csv(index=False)); heartbeat('Saved v1')
if not df_v2.empty:
    (PROC/'zuco_v2_aligned.csv').write_text(df_v2.to_csv(index=False)); heartbeat('Saved v2')
df_all=pd.concat([d for d in [df_v1,df_v2] if not d.empty], ignore_index=True)
if df_all.empty:
    warnings.warn('No data unified.')
else:
    (PROC/'zuco_aligned.csv').write_text(df_all.to_csv(index=False)); heartbeat('Saved unified')

[19:38:50] Saved v1
[19:38:50] Saved unified


In [32]:
qa={}
qa['files']={'zuco_v1_aligned.csv': (PROC/'zuco_v1_aligned.csv').exists(),
             'zuco_v2_aligned.csv': (PROC/'zuco_v2_aligned.csv').exists(),
             'zuco_aligned.csv': (PROC/'zuco_aligned.csv').exists()}
if (PROC/'zuco_aligned.csv').exists():
    df=pd.read_csv(PROC/'zuco_aligned.csv')
    for c in ['Subject','SentenceID','Word','Dataset','Task']:
        if c not in df.columns: df[c]=np.nan
    qa['rows_total']=int(len(df)); qa['subjects']=int(df['Subject'].nunique()) if 'Subject' in df else 0
    qa['by_dataset']=df.groupby('Dataset').size().to_dict() if 'Dataset' in df else {}
    qa['by_task']=df.groupby('Task').size().to_dict() if 'Task' in df else {}
    # coverage vs KEC (if exists)
    kec_path=PROC/'kec'/'metrics_en.csv'
    if kec_path.exists():
        k=pd.read_csv(kec_path)
        for dft in (df,k): dft.columns=[c.strip() for c in dft.columns]
        df['token_norm']=df['Word'].astype(str).str.lower().str.replace(r'[\W_]+','',regex=True)
        k['token_norm']=k['word'].astype(str).str.lower().str.replace(r'[\W_]+','',regex=True) if 'word' in k.columns else k['node'].astype(str).str.lower().str.replace(r'[\W_]+','',regex=True)
        # Ensure both token_norm columns are strings
        df['token_norm'] = df['token_norm'].astype(str)
        k['token_norm'] = k['token_norm'].astype(str)
        m=df.merge(k[['token_norm','entropy','avg_curvature','coherence']], on='token_norm', how='left')
        qa['kec_coverage_pct']={c: float(m[c].notna().mean()*100.0) for c in ['entropy','avg_curvature','coherence']}
    else:
        qa['kec_coverage_pct']=None
import json
(Path('../reports')/'zuco_loader_qa.json').write_text(json.dumps(qa, indent=2))
heartbeat('Saved QA')


  df=pd.read_csv(PROC/'zuco_aligned.csv')


[19:46:33] Saved QA


In [27]:
# Debug test cell
print("=== DEBUG TEST START ===")
test_file = Path('../data/raw_public/zuco/v1/task1- SR/Preprocessed/sentiment_normal_reading.csv')
print(f"Test file: {test_file}")
print(f"Test file exists: {test_file.exists()}")

if test_file.exists():
    print("Reading test file...")
    df_test = _read_table_any(test_file)
    print(f"Read {len(df_test)} rows")
    print(f"Columns: {list(df_test.columns)}")
    print("Sample data:")
    print(df_test.head(3))
    
    print("\nTesting _ensure_cols...")
    result = _ensure_cols(df_test, 'v1', 'NR', test_file)
    print(f"After processing: {len(result)} rows")
    print(f"Final columns: {list(result.columns)}")
    
    if len(result) > 0:
        print("Sample result:")
        print(result.head(3))
        
    if 'Subject' in result.columns:
        print(f"Unique subjects: {result['Subject'].unique()}")
    if 'Word' in result.columns:
        print(f"Sample words: {result['Word'].head(5).tolist()}")
else:
    print("Test file not found!")

print("=== DEBUG TEST END ===")

=== DEBUG TEST START ===
Test file: ../data/raw_public/zuco/v1/task1- SR/Preprocessed/sentiment_normal_reading.csv
Test file exists: True
Reading test file...
Read 600 rows
Columns: ['ID', 'sentence', 'control']
Sample data:
   ID                                           sentence  control
0   1  One of the best silly horror movies of recent ...      NaN
1   2  Jason Patric and Ray Liotta make for one splen...      NaN
2  13  A real winner -- smart, funny, subtle, and res...  CONTROL

Testing _ensure_cols...
Processing file: ../data/raw_public/zuco/v1/task1- SR/Preprocessed/sentiment_normal_reading.csv
Original columns: ['ID', 'sentence', 'control']
DataFrame shape: (600, 3)
After renaming: ['SentenceID', 'Sentence', 'control']
Processing ZuCo v1 file: ../data/raw_public/zuco/v1/task1- SR/Preprocessed/sentiment_normal_reading.csv
No subject found in path
Processing sentences to extract words...
Sentence 0: 'One of the best silly horror movies of recent memory, with some real shocks in 