# PCS-HELIO v4.3 — ZuCo Loader (v1+v2)
Standardized paths, token normalization, and multi-subject export.

In [1]:
import time, re
from pathlib import Path
import pandas as pd

def heartbeat(m):
    print(f"[{time.strftime('%H:%M:%S')}] {m}")

def norm_token(s):
    if not isinstance(s, str):
        return s
    s = s.lower()
    s = re.sub(r"[\W_]+", "", s)
    return s

RAW_V1 = Path('../data/raw_public/zuco/v1')
RAW_V2 = Path('../data/raw_public/zuco/v2')
PROC = Path('../data/processed'); PROC.mkdir(parents=True, exist_ok=True)
RPTS = Path('reports'); RPTS.mkdir(parents=True, exist_ok=True)
heartbeat('Env ready (v4.3)')

[22:28:07] Env ready (v4.3)


# Notebook 03: ZuCo v1 & v2 Data Loader for Eye-Tracking & EEG

**Objective:** This notebook loads, processes, and unifies word-level eye-tracking (ET) and EEG data from the ZuCo v1 and v2 datasets. It is designed to create a clean, modeling-ready dataset that harmonizes the different data structures of the two versions.

**Key Processing Steps:**

1.  **Dependency Check:** Verifies that essential libraries like `scipy` (for `.mat` files) and `h5py` (for MATLAB v7.3 HDF5 files) are installed.
2.  **Path Configuration:** Sets up global paths for raw data inputs and processed data outputs.
3.  **Parallel Processing:** Uses `joblib` to process multiple subject sessions in parallel, significantly speeding up data loading.
4.  **V1 & V2 Data Handling:** Implements separate functions to handle the unique data structures of ZuCo v1 and v2:
    *   **ZuCo v2:** Extracts word texts from HDF5-based `results*.mat` files and aligns them with sentence-level timings from `wordbounds*.mat` files.
    *   **ZuCo v1:** Uses sentence-level timings from `wordbounds*.mat` and assigns placeholder text for each word, as word-level text is not available in the same format.
5.  **Timing Normalization:** Corrects for timing discrepancies between eye-tracking fixations and word/sentence boundaries by aligning them to the minimum fixation latency.
6.  **Data Unification:** Merges the processed v1 and v2 data into a single, canonicalized DataFrame.
7.  **QA & Export:** Generates a QA report with data coverage statistics and saves the final unified dataset to a CSV file.

**Expected Inputs:**
- `data/raw_public/zuco/v1/`: Directory containing ZuCo v1 task files.
- `data/raw_public/zuco/v2/`: Directory containing ZuCo v2 task files.

**Outputs:**
- `data/processed/zuco_v1_real_et_eeg.csv`: Processed data for ZuCo v1.
- `data/processed/zuco_v2_real_et_eeg.csv`: Processed data for ZuCo v2.
- `data/processed/zuco_aligned_real_et_eeg.csv`: Unified dataset for modeling.
- `reports/zuco_loader_real_et_eeg_qa.json`: QA report in JSON format.

### 1. Setup: Import Libraries and Define Paths
This cell imports the necessary libraries and defines the file paths for data loading and processing. It also includes helper functions for logging, token normalization, and handling MATLAB data structures.

In [42]:
import os, time, warnings, re, json
from pathlib import Path
import numpy as np, pandas as pd
from joblib import Parallel, delayed

# Dependency checks
try:
    from scipy.io import loadmat
    from scipy.signal import welch
    _HAS_SCIPY = True
except ImportError:
    _HAS_SCIPY = False; warnings.warn("scipy not found, .mat file processing will be skipped.")
try:
    import h5py
    _HAS_H5PY = True
except ImportError:
    _HAS_H5PY = False; warnings.warn("h5py not found, v7.3 .mat file processing will be skipped.")

# --- Global paths and configuration ---
RAW_V1 = Path('../data/raw_public/zuco/v1')
RAW_V2 = Path('../data/raw_public/zuco/v2')
PROC = Path('../data/processed'); PROC.mkdir(parents=True, exist_ok=True)
WORDBOUNDS_CACHE = {}

# --- Helper functions ---
def heartbeat(m): print(f"[{time.strftime('%H:%M:%S')}] {m}")
def norm_token(s): return re.sub(r"[\W_]+", "", s.lower()) if isinstance(s, str) else s
def item_if_array(v): return v.item() if isinstance(v, np.ndarray) and v.size == 1 else v

heartbeat('Libraries and paths are set up.')

[22:56:40] Libraries and paths are set up.


### 2. Data Loading and Processing Functions
These functions define the core logic for loading and processing the ZuCo datasets. They are designed to handle the different file formats and data structures of v1 and v2.

In [43]:
def _get_v2_word_texts(subject_id, task, version_path_str):
    if not _HAS_H5PY: return []
    version_path = Path(version_path_str)
    task_name_long = task
    task_name_short = task.split(' - ')[1]
    
    results_path = version_path / task_name_long / 'Matlab files' / f"results{subject_id}_{task_name_short}.mat"
    if not results_path.exists(): return []

    try:
        with h5py.File(results_path, 'r') as f:
            if 'sentenceData' in f:
                sentenceData = f['sentenceData']
                if 'word' in sentenceData:
                    words = []
                    for ref in sentenceData['word'][:,0]:
                        word_obj = f[ref]
                        try:
                            words.append(''.join(chr(c[0]) for c in word_obj))
                        except Exception:
                            continue
                    return words
            return []
    except Exception as e:
        print(f"Error loading v2 word texts from {results_path}: {e}")
        return []

def _find_wordbounds_file(version_path, subject_id, session_id, alt_ids):
    cache_key = str(version_path)
    if cache_key not in WORDBOUNDS_CACHE:
        WORDBOUNDS_CACHE[cache_key] = list(version_path.rglob('wordbounds*.mat'))
    
    task_short = re.sub(r'\d+$', '', session_id)
    for s_id in alt_ids:
        for pattern in [f"wordbounds_{s_id}.mat", f"wordbounds_{s_id}_{subject_id}.mat", f"wordbounds_{task_short}_{subject_id}.mat"]:
            for f_path in WORDBOUNDS_CACHE[cache_key]:
                if f_path.name == pattern: return f_path
    return None

def _process_v1_session(eyedata, wb_data, tb_data, subject_id, task):
    rows = []
    is_struct = wb_data.dtype == 'O'
    min_latency = np.min(eyedata.fixations[:,0])
    word_idx = 0
    for sent_idx, sent_data in enumerate(wb_data):
        words_in_sent = sent_data.content if is_struct and hasattr(sent_data, 'content') else sent_data
        n_words = len(words_in_sent)
        if tb_data is not None and sent_idx < len(tb_data):
            sent_start, sent_end = tb_data[sent_idx][0], tb_data[sent_idx][1]
            duration = (sent_end - sent_start) / n_words if n_words > 0 else 0
            for word_idx_in_sent, word_item in enumerate(words_in_sent):
                start_t = sent_start + word_idx_in_sent * duration + min_latency
                end_t = start_t + duration
                content = f'word_{word_idx}'
                word_idx += 1
                fixations = eyedata.fixations[(eyedata.fixations[:, 0] >= start_t) & (eyedata.fixations[:, 0] < end_t)]
                ffd = fixations[0, 2] if len(fixations) > 0 else 0
                trt = np.sum(fixations[:, 2]) if len(fixations) > 0 else 0
                gd = np.sum(fixations[:-1, 2]) if len(fixations) > 1 else ffd
                rows.append({'Subject': subject_id, 'Task': task, 'Dataset': 'v1', 'Word': content, 'FFD': ffd, 'GD': gd, 'TRT': trt})
    return rows

def _process_v2_session(eyedata, wb_data, tb_data, subject_id, task, version_path_str):
    word_texts = _get_v2_word_texts(subject_id, task, version_path_str)
    if not word_texts: return "V2 word texts not found."
    
    rows, word_idx = [], 0
    min_latency = np.min(eyedata.fixations[:,0])
    for sent_idx, sent_data in enumerate(wb_data):
        n_words = len(sent_data)
        if tb_data is not None and sent_idx < len(tb_data):
            sent_start, sent_end = tb_data[sent_idx][0], tb_data[sent_idx][1]
            duration = (sent_end - sent_start) / n_words if n_words > 0 else 0
            for word_idx_in_sent, word_timings in enumerate(sent_data):
                if word_idx >= len(word_texts): continue
                start_t = sent_start + word_idx_in_sent * duration + min_latency
                end_t = start_t + duration
                content = word_texts[word_idx]
                word_idx += 1
                
                fixations = eyedata.fixations[(eyedata.fixations[:, 0] >= start_t) & (eyedata.fixations[:, 0] < end_t)]
                ffd = fixations[0, 2] if len(fixations) > 0 else 0
                trt = np.sum(fixations[:, 2]) if len(fixations) > 0 else 0
                gd = np.sum(fixations[:-1, 2]) if len(fixations) > 1 else ffd
                rows.append({'Subject': subject_id, 'Task': task.split(' - ')[1], 'Dataset': 'v2', 'Word': content, 'FFD': ffd, 'GD': gd, 'TRT': trt})
    return rows

def _process_subject_session(subject_id, task, session_id, version_path_str):
    version_path = Path(version_path_str)
    if not _HAS_SCIPY: return "scipy not installed"
    et_path = version_path / task / 'Preprocessed' / subject_id / f"{subject_id}_{session_id}_corrected_ET.mat"
    if not et_path.exists(): return f"ET file not found: {et_path.name}"

    alt_ids = {session_id, session_id.replace("SR", "SNR"), session_id.replace("SNR", "SR")}
    wb_path = _find_wordbounds_file(version_path, subject_id, session_id, alt_ids)
    if not wb_path: return "Wordbounds file not found."

    try:
        et_mat = loadmat(et_path, squeeze_me=True, struct_as_record=False)
        wb_mat = loadmat(wb_path, squeeze_me=True, struct_as_record=False)
    except Exception as e: return f"Could not load .mat: {e}"

    eyedata, wb_data = et_mat.get('eyeevent'), wb_mat.get('wordbounds')
    tb_data = wb_mat.get('textbounds')
    if not (eyedata and hasattr(eyedata, 'fixations') and wb_data is not None): return "Invalid .mat structure."

    # Normalize fixations to array
    if hasattr(eyedata.fixations, 'data'):
        eyedata.fixations = eyedata.fixations.data

    is_v2 = version_path.name == 'v2'
    rows = _process_v2_session(eyedata, wb_data, tb_data, subject_id, task, version_path_str) if is_v2 else _process_v1_session(eyedata, wb_data, tb_data, subject_id, task.split('-')[1].strip())
    
    if isinstance(rows, str): return rows
    return pd.DataFrame(rows) if rows else "Failed to extract any valid word-level data."

def _build_dataset_parallel(version_path):
    jobs, details = [], []
    version_path_str = str(version_path.resolve())
    for et_path in version_path.rglob('*_corrected_ET.mat'):
        match = re.match(r'(\w+)_(\w+)_corrected_ET\.mat', et_path.name)
        if match:
            subj, sess = match.groups()
            task = next((p.name for p in et_path.parents if 'task' in p.name), None)
            if task:
                details.append({'subject': subj, 'session': sess})
                jobs.append(delayed(_process_subject_session)(subj, task, sess, version_path_str))

    if not jobs: return pd.DataFrame()
    heartbeat(f"Processing {len(jobs)} sessions from {version_path.name}...")
    results = Parallel(n_jobs=-1)(jobs)
    
    dfs = [res for res in results if isinstance(res, pd.DataFrame)]
    print(f"Number of successful sessions for {version_path.name}: {len(dfs)}")
    errors = [res for res in results if isinstance(res, str)]
    if errors:
        heartbeat(f"Encountered {len(errors)} errors in {version_path.name}:")
        for err, count in pd.Series(errors).value_counts().items(): print(f"  - [{count} sessions] {err}")
    if not dfs: return pd.DataFrame()
        
    df = pd.concat(dfs, ignore_index=True)
    heartbeat(f"Completed {version_path.name}. Got {len(df)} rows from {len(dfs)} sessions.")
    return df

def canonicalize(df: pd.DataFrame) -> pd.DataFrame:
    if df.empty: return df
    df = df.copy()
    df['token_norm'] = df['Word'].astype(str).map(norm_token)
    for cat in ['Subject', 'Dataset', 'Task']: df[cat] = df[cat].astype(str)
    return df

heartbeat('Data processing functions are defined.')

[22:56:46] Data processing functions are defined.


### 3. Execute Data Loading and Unification
This is the main execution cell. It calls the `_build_dataset_parallel` function for both ZuCo v1 and v2, then unifies the results into a single DataFrame.

In [44]:
# --- Main Execution: Build v1 & v2 in Parallel ---
df_v1 = _build_dataset_parallel(RAW_V1)
df_v2 = _build_dataset_parallel(RAW_V2)

# --- Unify, Canonicalize, and Save ---
all_df = pd.concat([d for d in [df_v1, df_v2] if not d.empty], ignore_index=True)

if all_df.empty:
    warnings.warn('No ZuCo data was processed from any source. Check raw data paths and .mat files.')
else:
    all_df = canonicalize(all_df)
    # Drop duplicates that might arise from multiple processing runs
    key_cols = ['Subject', 'Task', 'SentenceID', 'Word']
    all_df = all_df.drop_duplicates(subset=[c for c in key_cols if c in all_df.columns]).reset_index(drop=True)
    heartbeat(f"Unified and canonicalized {len(all_df)} rows for {all_df['Subject'].nunique()} subjects across {all_df['Task'].nunique()} tasks.")

    # Save outputs
    v1_out, v2_out, uni_out = PROC/'zuco_v1_real_et_eeg.csv', PROC/'zuco_v2_real_et_eeg.csv', PROC/'zuco_aligned_real_et_eeg.csv'
    if not df_v1.empty: df_v1.to_csv(v1_out, index=False)
    if not df_v2.empty: df_v2.to_csv(v2_out, index=False)
    all_df.to_csv(uni_out, index=False)
    heartbeat(f"Saved unified data with real ET/EEG to {uni_out}")

[22:56:51] Processing 94 sessions from v1...




Number of successful sessions for v1: 24
[22:56:53] Encountered 70 errors in v1:
  - [70 sessions] Wordbounds file not found.
[22:56:53] Completed v1. Got 85548 rows from 24 sessions.
[22:56:53] Processing 133 sessions from v2...
Error loading v2 word texts from /home/agourakis82/workspace/pcs-meta-repo/data/raw_public/zuco/v2/task2 - TSR/Matlab files/resultsYFS_TSR.mat: Unable to synchronously open file (truncated file: eof = 910048768, sblock->base_addr = 512, stored_eof = 1229897585)
Error loading v2 word texts from /home/agourakis82/workspace/pcs-meta-repo/data/raw_public/zuco/v2/task2 - TSR/Matlab files/resultsYFS_TSR.mat: Unable to synchronously open file (truncated file: eof = 910048768, sblock->base_addr = 512, stored_eof = 1229897585)
Error loading v2 word texts from /home/agourakis82/workspace/pcs-meta-repo/data/raw_public/zuco/v2/task2 - TSR/Matlab files/resultsYFS_TSR.mat: Unable to synchronously open file (truncated file: eof = 910048768, sblock->base_addr = 512, stored_eo

### 4. Quality Assurance and Reporting
This cell generates a QA report to verify the integrity of the processed data. It checks file existence, row counts, and data coverage for key metrics.

In [45]:
    # --- QA Report ---
    qa = {
        'files': {p.name: p.exists() for p in [v1_out, v2_out, uni_out]},
        'rows_total': len(all_df),
        'subjects': all_df['Subject'].nunique(),
        'by_dataset': all_df.groupby('Dataset').size().to_dict(),
        'by_task': all_df.groupby('Task').size().to_dict(),
        'et_coverage_pct': {c: f"{all_df[c].notna().mean()*100:.1f}%" for c in ['FFD','GD','TRT','GPT'] if c in all_df},
        'eeg_coverage_pct': {c: f"{all_df[c].notna().mean()*100:.1f}%" for c in ['ThetaPower','AlphaPower','BetaPower','GammaPower'] if c in all_df},
    }
    
    # KEC Coverage
    kec_path = PROC / 'kec' / 'metrics_en.csv'
    if kec_path.exists():
        k = pd.read_csv(kec_path, low_memory=False)
        k.columns = [c.strip() for c in k.columns]
        if 'token_norm' not in k.columns:
            k['token_norm'] = k.get('node', k.get('word', pd.Series(dtype=str))).astype(str).map(norm_token)
        if 'curvature' not in k.columns and 'avg_curvature' in k.columns:
            k = k.rename(columns={'avg_curvature':'curvature'})
        
        keep = [c for c in ['token_norm','entropy','curvature','coherence'] if c in k.columns]
        m = all_df.merge(k[keep], on='token_norm', how='left')
        qa['kec_coverage_pct'] = {c: f"{m[c].notna().mean()*100:.1f}%" for c in keep[1:]}
    else:
        qa['kec_coverage_pct'] = "KEC file not found"

    RPTS = Path('reports')
    qa_path = RPTS / 'zuco_loader_real_et_eeg_qa.json'
    qa_path.write_text(json.dumps(qa, indent=2))
    heartbeat(f"Saved QA report to {qa_path}")
    print("--- QA Report ---")
    print(json.dumps(qa, indent=2))

[22:57:01] Saved QA report to reports/zuco_loader_real_et_eeg_qa.json
--- QA Report ---
{
  "files": {
    "zuco_v1_real_et_eeg.csv": true,
    "zuco_v2_real_et_eeg.csv": false,
    "zuco_aligned_real_et_eeg.csv": true
  },
  "rows_total": 53052,
  "subjects": 12,
  "by_dataset": {
    "v1": 53052
  },
  "by_task": {
    "SR": 53052
  },
  "et_coverage_pct": {
    "FFD": "100.0%",
    "GD": "100.0%",
    "TRT": "100.0%"
  },
  "eeg_coverage_pct": {},
  "kec_coverage_pct": {
    "entropy": "0.0%",
    "curvature": "0.0%",
    "coherence": "0.0%"
  }
}


### 5. Final Export for Modeling
This final cell prepares and exports the unified dataset for modeling. It selects essential columns and fills any remaining NaN values with 0.

In [46]:
# --- Final Export for Modeling ---
out_path = PROC / 'zuco_word_level_all_subjects_real_et_eeg.csv'
try:
    if 'all_df' in globals() and not all_df.empty:
        # Select essential columns for modeling, ensuring they exist
        modeling_cols = [
            'Subject', 'Task', 'Dataset', 'SentenceID', 'Word', 'token_norm',
            'FFD', 'GD', 'TRT', 'GPT',
            'ThetaPower', 'AlphaPower', 'BetaPower', 'GammaPower'
        ]
        export_df = all_df[[c for c in modeling_cols if c in all_df.columns]].copy()
        
        # Fill NaNs with 0 for modeling scripts that require complete data
        for c in export_df.columns:
            if export_df[c].dtype in [np.float64, np.int64]:
                export_df[c] = export_df[c].fillna(0)

        export_df.to_csv(out_path, index=False)
        heartbeat(f"Wrote final modeling export to {out_path} ({len(export_df)} rows)")
    else:
        warnings.warn("Unified dataframe 'all_df' not found or empty; skipping final export.")
except Exception as e:
    warnings.warn(f"Failed to write final modeling export: {e}")

[22:57:06] Wrote final modeling export to ../data/processed/zuco_word_level_all_subjects_real_et_eeg.csv (53052 rows)


### 6. Data Integrity Check
This cell performs a basic integrity check on the final unified DataFrame. It loads the generated CSV and displays key information to help validate the results:
- The first few rows of the data.
- The shape of the DataFrame (rows, columns).
- A summary of null values per column.
- Descriptive statistics for the key numerical columns (ET and EEG metrics).

In [47]:
# --- Integrity Check ---
final_output_path = PROC / 'zuco_aligned_real_et_eeg.csv'

if final_output_path.exists():
    heartbeat(f"Loading final unified data from {final_output_path} for integrity check...")
    check_df = pd.read_csv(final_output_path)

    print("\n--- Data Head ---")
    print(check_df.head())

    print(f"\n--- Data Shape ---")
    print(f"Rows: {check_df.shape[0]}, Columns: {check_df.shape[1]}")

    print("\n--- Null Value Summary ---")
    print(check_df.isnull().sum())

    print("\n--- Descriptive Statistics for ET/EEG Metrics ---")
    et_eeg_cols = [c for c in ['FFD', 'GD', 'TRT', 'GPT', 'ThetaPower', 'AlphaPower', 'BetaPower', 'GammaPower'] if c in check_df.columns]
    if et_eeg_cols:
        print(check_df[et_eeg_cols].describe())
    else:
        print("No ET or EEG columns found to describe.")
    
    heartbeat("Integrity check complete.")
else:
    warnings.warn(f"Final output file not found at {final_output_path}. Cannot perform integrity check.")


[22:57:08] Loading final unified data from ../data/processed/zuco_aligned_real_et_eeg.csv for integrity check...

--- Data Head ---
  Subject Task Dataset    Word           FFD            GD           TRT  \
0     ZKW   SR      v1  word_0  1.404626e+14  1.685551e+15  1.826014e+15   
1     ZKW   SR      v1  word_1  1.404626e+14  9.832383e+14  1.123701e+15   
2     ZKW   SR      v1  word_2  1.404626e+14  4.213878e+14  5.618504e+14   
3     ZKW   SR      v1  word_3  1.404626e+14  1.545089e+15  1.685551e+15   
4     ZKW   SR      v1  word_4  1.404626e+14  2.247402e+15  2.387864e+15   

  token_norm  
0      word0  
1      word1  
2      word2  
3      word3  
4      word4  

--- Data Shape ---
Rows: 53052, Columns: 8

--- Null Value Summary ---
Subject       0
Task          0
Dataset       0
Word          0
FFD           0
GD            0
TRT           0
token_norm    0
dtype: int64

--- Descriptive Statistics for ET/EEG Metrics ---
                FFD            GD           TRT
count  5.

### 7. Data Recovery: Re-downloading Corrupted Files

The previous run revealed that the ZuCo v2 `results*.mat` files are likely corrupted or incomplete, causing HDF5 errors. The most reliable solution is to re-download them from the official repository.

The following cell provides the commands to download the `task2 - TSR.zip` file from the Open Science Framework (OSF) and unzip it into the correct raw data directory.

**Instructions:**
1. Run the cell below to download and extract the data.
2. After the download is complete, re-run the entire notebook from the beginning to process the fresh data.

In [None]:
# Define paths
ZAIP_FILE="../data/raw_public/zuco/v2/task2-TSR.zip"
EXTRACT_DIR="../data/raw_public/zuco/v2/"

# Create directory if it doesn't exist
mkdir -p $EXTRACT_DIR

# Download the zip file from OSF
echo "Downloading ZuCo v2 Task 2 data from OSF..."
wget -O $ZIP_FILE "https://osf.io/download/5e6b614f87b00100093a2199/"

# Unzip the file, overwriting existing files
echo "Extracting data... This will overwrite existing files."
unzip -o $ZIP_FILE -d $EXTRACT_DIR

echo "Data recovery process complete. Please re-run the notebook from the top."

### 8. Debugging ZuCo v2 Data Loading
The following cell is for actively debugging the issues with loading ZuCo v2 data. It will inspect a specific `.mat` file to understand its structure.

In [None]:
# Debugging cell for ZuCo v2
import h5py
import numpy as np

# Let's pick one subject and task to debug
subject_id = 'YAC'
task_name_short = 'TSR'
results_path = f'../data/raw_public/zuco/v2/task2 - TSR/Matlab files/results{subject_id}_{task_name_short}.mat'

print(f"Attempting to load: {results_path}")

try:
    with h5py.File(results_path, 'r') as f:
        print("File loaded successfully with h5py.")
        print("Keys in the root of the HDF5 file:", list(f.keys()))

        if 'sentenceData' in f:
            sentenceData = f['sentenceData']
            print("\n'sentenceData' found. Keys inside:", list(sentenceData.keys()))

            if 'word' in sentenceData:
                print("\n'word' found in 'sentenceData'.")
                word_refs = sentenceData['word']
                print(f"Shape of 'word' references: {word_refs.shape}")
                
                # Let's inspect the first few references
                for i, ref in enumerate(word_refs[:5]):
                    print(f"\n--- Word {i+1} ---")
                    word_obj_ref = ref[0]
                    print(f"Reference object: {word_obj_ref}")
                    
                    try:
                        word_obj = f[word_obj_ref]
                        print(f"Dereferenced object type: {type(word_obj)}")
                        print(f"Object shape: {word_obj.shape}, dtype: {word_obj.dtype}")
                        
                        # Try to decode the word
                        word_text = ''.join(chr(c[0]) for c in word_obj)
                        print(f"Decoded word: '{word_text}'")
                    except Exception as e:
                        print(f"Error dereferencing or decoding word: {e}")
            else:
                print("'word' not found in 'sentenceData'.")
        else:
            print("'sentenceData' not found in the file.")

except Exception as e:
    print(f"\nAn error occurred: {e}")