In [1]:
# Setup: install optional deps and print versions, then import loader
import sys, subprocess, importlib, platform
from pathlib import Path

REQS = ['pandas', 'numpy', 'joblib', 'scipy', 'h5py']
for pkg in REQS:
    try:
        importlib.import_module(pkg)
    except Exception:
        subprocess.run([sys.executable, '-m', 'pip', 'install', '--quiet', pkg], check=False)

import pandas as pd, numpy as np
import joblib, scipy, h5py
print(f"Python {platform.python_version()}")
print("pandas", pd.__version__)
print("numpy", np.__version__)
print("joblib", joblib.__version__)
print("scipy", scipy.__version__)
print("h5py", h5py.__version__)

# Paths (relative to repo)
BASE = Path.cwd()
RAW = BASE.parent / 'data' / 'raw_public' / 'zuco'
PROC = BASE.parent / 'data' / 'processed'
RPTS = BASE / 'reports'
PROC.mkdir(parents=True, exist_ok=True)
RPTS.mkdir(parents=True, exist_ok=True)

# Import loader
import importlib.util, sys
loader_path = BASE.parent / 'tools' / 'zuco_loader.py'
spec = importlib.util.spec_from_file_location('zuco_loader', loader_path)
zuco_loader = importlib.util.module_from_spec(spec)
sys.modules['zuco_loader'] = zuco_loader
assert spec and spec.loader
spec.loader.exec_module(zuco_loader)
print("Loaded zuco_loader from", loader_path)


Python 3.12.11
pandas 2.3.2
numpy 1.26.4
joblib 1.5.2
scipy 1.13.1
h5py 3.14.0
Loaded zuco_loader from /home/agourakis82/workspace/pcs-meta-repo/tools/zuco_loader.py
Loaded zuco_loader from /home/agourakis82/workspace/pcs-meta-repo/tools/zuco_loader.py


### Discover files under data/raw_public/zuco/{v1,v2}

In [2]:
disc = zuco_loader.discover(RAW)
print({k: {'et_mat': len(v.et_mat), 'eeg_tabular': len(v.eeg_tabular)} for k,v in disc.items()})
for ver, fd in disc.items():
    print(f"\n[{ver}] ET .mat files (up to 5 shown):")
    for p in fd.et_mat[:5]: print(" -", p.relative_to(RAW))
    print(f"[{ver}] EEG tabular files (up to 5 shown):")
    for p in fd.eeg_tabular[:5]: print(" -", p.relative_to(RAW))

{'v1': {'et_mat': 202, 'eeg_tabular': 4}, 'v2': {'et_mat': 133, 'eeg_tabular': 0}}

[v1] ET .mat files (up to 5 shown):
 - v1/Preprocessed/ZAB/ZAB_TSR1_corrected_ET.mat
 - v1/Preprocessed/ZAB/ZAB_TSR2_corrected_ET.mat
 - v1/Preprocessed/ZAB/ZAB_TSR3_corrected_ET.mat
 - v1/Preprocessed/ZAB/ZAB_TSR4_corrected_ET.mat
 - v1/Preprocessed/ZAB/ZAB_TSR5_corrected_ET.mat
[v1] EEG tabular files (up to 5 shown):
 - v1/Preprocessed/relations_task_specific.csv
 - v1/task1- SR/Preprocessed/sentiment_normal_reading.csv
 - v1/task1- SR/Raw data/sentiment_normal_reading.csv
 - v1/task3 - TSR/Raw data/relations_task_specific.csv

[v2] ET .mat files (up to 5 shown):
 - v2/task2 - TSR/Preprocessed/YAC/YAC_TSR1_corrected_ET.mat
 - v2/task2 - TSR/Preprocessed/YAC/YAC_TSR2_corrected_ET.mat
 - v2/task2 - TSR/Preprocessed/YAC/YAC_TSR3_corrected_ET.mat
 - v2/task2 - TSR/Preprocessed/YAC/YAC_TSR4_corrected_ET.mat
 - v2/task2 - TSR/Preprocessed/YAC/YAC_TSR5_corrected_ET.mat
[v2] EEG tabular files (up to 5 shown):

### Load ET and EEG tables and aggregate EEG per canonical keys

In [10]:
import importlib, sys
# Ensure zuco_loader has a proper spec before reload
try:
    zuco_loader
except NameError:
    import importlib.util
    loader_path = BASE.parent / 'tools' / 'zuco_loader.py'
    spec = importlib.util.spec_from_file_location('zuco_loader', loader_path)
    zuco_loader = importlib.util.module_from_spec(spec)
    sys.modules['zuco_loader'] = zuco_loader
    assert spec and spec.loader
    spec.loader.exec_module(zuco_loader)
else:
    if getattr(zuco_loader, '__spec__', None) is None:
        import importlib.util
        loader_path = BASE.parent / 'tools' / 'zuco_loader.py'
        spec = importlib.util.spec_from_file_location('zuco_loader', loader_path)
        zuco_loader.__spec__ = spec

import numpy as np, pandas as pd
import importlib
zuco_loader = importlib.reload(zuco_loader)

df_et = zuco_loader.load_et(RAW, n_jobs=1)
print("ET rows:", len(df_et))

# Load and aggregate EEG
agg_eeg = zuco_loader.load_eeg(RAW)
print("EEG rows (aggregated):", len(agg_eeg))

# Basic validations
required_keys = ['Dataset','Task','Subject','SentenceID','w_pos','token_norm']
for name, df in [('ET', df_et), ('EEG', agg_eeg)]:
    missing = [c for c in required_keys if c not in df.columns]
    if missing:
        print(f"WARN: {name} missing columns: {missing}")

ModuleNotFoundError: spec not found for the module 'zuco_loader'

### Merge ET + EEG and write outputs

In [4]:
# Merge ET + EEG, write outputs, QA, preview
import json
merged = zuco_loader.merge_et_eeg(df_et, agg_eeg)

# Enforce canonical columns order
col_order = ['Dataset','Task','Subject','SentenceID','w_pos','token','token_norm','FFD','GD','TRT','GPT','theta1','alpha1','beta1','gamma1']
for c in col_order:
    if c not in merged.columns:
        merged[c] = np.nan
merged = merged[col_order]

# Save outputs
out_csv = PROC / 'zuco_aligned.csv'
merged.to_csv(out_csv, index=False)
print('Saved:', out_csv)

# Build QA
qa = zuco_loader.qa_summary(merged, disc, errors={'v1': [], 'v2': []})
qa_path = RPTS / 'zuco_loader_qa.json'
qa_path.write_text(json.dumps(qa, indent=2))
print('QA saved:', qa_path)

# Preview
print('\n--- Preview (10 rows) ---')
print(merged.head(10))

# QA summary
print('\n--- QA Summary ---')
print(json.dumps({k: qa[k] for k in ['rows','subjects','by_dataset','by_task','et_coverage_pct','eeg_coverage_pct','files']}, indent=2))

Saved: /home/agourakis82/workspace/pcs-meta-repo/data/processed/zuco_aligned.csv
QA saved: /home/agourakis82/workspace/pcs-meta-repo/notebooks/reports/zuco_loader_qa.json

--- Preview (10 rows) ---
  Dataset Task Subject  SentenceID  w_pos token token_norm           FFD  \
0      v1  TSR     ZAB           1      1  None       None  1.402865e+14   
1      v1  TSR     ZAB           1      2  None       None  1.402865e+14   
2      v1  TSR     ZAB           1      3  None       None  1.402865e+14   
3      v1  TSR     ZAB           1      4  None       None  1.402865e+14   
4      v1  TSR     ZAB           1      5  None       None  1.402865e+14   
5      v1  TSR     ZAB           1      6  None       None  1.402865e+14   
6      v1  TSR     ZAB           1      7  None       None  1.402865e+14   
7      v1  TSR     ZAB           1      8  None       None  1.402865e+14   
8      v1  TSR     ZAB           1      9  None       None  1.402865e+14   
9      v1  TSR     ZAB           1     10 

### Fetch missing ZuCo files from OSF (conditional)
If required files are missing locally, attempt to download public archives from OSF and extract into `data/raw_public/zuco`.

In [5]:
# Conditional OSF fetch for ZuCo v2 results/ET if missing
import shutil, subprocess, sys, os
from pathlib import Path

RAW_BASE = RAW
V2_DIR = RAW_BASE / 'v2'
V1_DIR = RAW_BASE / 'v1'
V2_DIR.mkdir(parents=True, exist_ok=True)
V1_DIR.mkdir(parents=True, exist_ok=True)

# Simple checks for presence
need_v2_results = True
need_v2_et = True
for p in V2_DIR.rglob('results*.mat'):
    need_v2_results = False; break
for p in V2_DIR.rglob('*_corrected_ET.mat'):
    need_v2_et = False; break

print({'need_v2_results': need_v2_results, 'need_v2_et': need_v2_et})

# Try OSF direct links for task2 - TSR (public). If blocked, print instructions.
osf_tsr_zip = 'https://osf.io/download/5e6b614f87b00100093a2199/'
zip_path = V2_DIR / 'task2-TSR.zip'

if need_v2_results or need_v2_et:
    try:
        # Prefer wget if available
        subprocess.run(['wget', '-O', str(zip_path), osf_tsr_zip], check=False)
        if zip_path.exists() and zip_path.stat().st_size > 0:
            subprocess.run(['unzip', '-o', str(zip_path), '-d', str(V2_DIR)], check=False)
            print('Downloaded and extracted TS R archive from OSF.')
        else:
            print('WARN: Could not download OSF archive automatically. Please download manually and place under data/raw_public/zuco/v2')
    except Exception as e:
        print('Download error:', e)

# Re-run discovery after fetch
disc = zuco_loader.discover(RAW)
print({k: {'et_mat': len(v.et_mat), 'eeg_tabular': len(v.eeg_tabular)} for k,v in disc.items()})

{'need_v2_results': False, 'need_v2_et': False}
{'v1': {'et_mat': 202, 'eeg_tabular': 4}, 'v2': {'et_mat': 133, 'eeg_tabular': 0}}


### Download EEG tabular files from OSF (ZuCo 2.0)
Attempt to locate and download EEG word-level tabular files from the public OSF project for ZuCo 2.0 (project id: 2urht).

In [6]:
# Robust OSF crawl to fetch EEG tabular files from ZuCo 2.0
import importlib
from pathlib import Path

save_dir = RAW / 'v2' / 'eeg'
save_dir.mkdir(parents=True, exist_ok=True)

have_eeg = any(save_dir.rglob('*.csv')) or any(save_dir.rglob('*.tsv'))
print('EEG present under v2/eeg:', have_eeg)

if not have_eeg:
    try:
        osfclient = importlib.import_module('osfclient')
        from osfclient import OSF

        osf = OSF()
        project = osf.project('2urht')  # ZuCo 2.0

        def iter_storage_files(storage_or_folder):
            # Yield all File objects under a storage or folder recursively
            # Storage/Folder in osfclient expose .files() and .folders()
            try:
                files = list(storage_or_folder.files())
            except Exception:
                try:
                    files = list(storage_or_folder.files)
                except Exception:
                    files = []
            for f in files:
                yield f
            # Recurse into folders
            try:
                folders = list(storage_or_folder.folders())
            except Exception:
                try:
                    folders = list(storage_or_folder.folders)
                except Exception:
                    folders = []
            for folder in folders:
                yield from iter_storage_files(folder)

        # Enumerate storages
        storages = []
        try:
            storages = list(project.storages())
        except Exception:
            try:
                storages = list(project.storages)
            except Exception:
                storages = []
        if not storages:
            # Fallback to common slug
            try:
                storages = [project.storage('osfstorage')]
            except Exception:
                storages = []

        downloaded = 0
        for storage in storages:
            for file in iter_storage_files(storage):
                name = file.name.lower()
                if name.endswith('.csv') or name.endswith('.tsv'):
                    # Heuristic: only EEG band/word-level exports
                    if any(k in name for k in ['eeg', 'theta', 'alpha', 'beta', 'gamma', 'word']):
                        out = save_dir / file.name
                        try:
                            with open(out, 'wb') as f:
                                file.write_to(f)
                            downloaded += 1
                        except Exception:
                            continue
        print(f'Downloaded {downloaded} EEG tabular files via osfclient.')
    except Exception as e:
        print('OSF crawl failed. Error:', e)

# Re-run discovery to include new EEG files
disc = zuco_loader.discover(RAW)
print({k: {'et_mat': len(v.et_mat), 'eeg_tabular': len(v.eeg_tabular)} for k,v in disc.items()})

EEG present under v2/eeg: False


Downloaded 0 EEG tabular files via osfclient.
{'v1': {'et_mat': 202, 'eeg_tabular': 4}, 'v2': {'et_mat': 133, 'eeg_tabular': 0}}


In [7]:
# Try EEG fallback from processed file and re-run merge if EEG empty
if 'agg_eeg' in globals() and len(agg_eeg) == 0:
    fb = RAW.parent.parent / 'processed' / 'zuco_word_level_all_subjects.csv'
    if fb.exists():
        df_fb = pd.read_csv(fb, low_memory=False)
        print('EEG fallback file found:', fb)
        # Map to canonical
        band_map = {
            'ThetaPower': 'theta1', 'AlphaPower': 'alpha1', 'BetaPower': 'beta1', 'GammaPower': 'gamma1'
        }
        key_map = {
            'Dataset': 'Dataset', 'Task': 'Task', 'Subject': 'Subject',
            'SentenceID': 'SentenceID', 'Word': 'token', 'token_norm': 'token_norm',
        }
        avail = [c for c in list(key_map) if c in df_fb.columns]
        bands = [c for c in band_map if c in df_fb.columns]
        keep = avail + bands
        eeg_fb = df_fb[keep].rename(columns={**band_map, **key_map})
        # Ensure w_pos is present; if not, set NA
        if 'w_pos' not in eeg_fb.columns:
            eeg_fb['w_pos'] = pd.NA
        # Coerce key types to align with df_et
        for c in ['SentenceID','w_pos']:
            if c in eeg_fb.columns:
                eeg_fb[c] = pd.to_numeric(eeg_fb[c], errors='coerce').astype('Int64')
        for c in ['Dataset','Task','Subject','token','token_norm']:
            if c in eeg_fb.columns:
                eeg_fb[c] = eeg_fb[c].astype(str)
        agg_eeg = eeg_fb[['Dataset','Task','Subject','SentenceID','w_pos','token','token_norm','theta1','alpha1','beta1','gamma1']]
        print('EEG fallback rows:', len(agg_eeg))
        # Re-merge and save
        merged = zuco_loader.merge_et_eeg(df_et, agg_eeg)
        col_order = ['Dataset','Task','Subject','SentenceID','w_pos','token','token_norm','FFD','GD','TRT','GPT','theta1','alpha1','beta1','gamma1']
        for c in col_order:
            if c not in merged.columns:
                merged[c] = np.nan
        merged = merged[col_order]
        out_csv = PROC / 'zuco_aligned.csv'
        merged.to_csv(out_csv, index=False)
        print('Saved with EEG fallback:', out_csv)
        # QA
        qa = zuco_loader.qa_summary(merged, disc, errors={'v1': [], 'v2': []})
        qa_path = RPTS / 'zuco_loader_qa.json'
        qa_path.write_text(json.dumps(qa, indent=2))
        print('QA updated:', qa_path)

EEG fallback file found: /home/agourakis82/workspace/pcs-meta-repo/data/processed/zuco_word_level_all_subjects.csv
EEG fallback rows: 12446
Saved with EEG fallback: /home/agourakis82/workspace/pcs-meta-repo/data/processed/zuco_aligned.csv
QA updated: /home/agourakis82/workspace/pcs-meta-repo/notebooks/reports/zuco_loader_qa.json
