Here’s a self-contained Python script that does exactly what your 1.5.1–1.5.3 outline describes:

* **1.5.1** Load raw dataset from `CONFIG["PATHS"]["RAW_DATA"]`
* **1.5.2** Compute file hash and compare against a **version registry**
* **1.5.3** Log schema version info to `dataset_load_log.csv`

You can drop this into something like `scripts/1_5_dataset_loader.py` and tweak paths as needed.

```python
#!/usr/bin/env python
"""
1.5.x Dataset Load & Version Logging

1.5.1 Load Raw Dataset (CSV or Parquet)
1.5.2 Hash / Snapshot Validation (version registry)
1.5.3 Schema Version Logging (dataset_load_log.csv)
"""

from __future__ import annotations

import hashlib
import os
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, Any

import pandas as pd
import yaml  # pip install pyyaml


# -----------------------------------------------------------------------------
# Config loading
# -----------------------------------------------------------------------------

# parents[0] is the directory containing this file, so parents[1] is one level
# above it; paths read from the config are joined onto this root.
PROJECT_ROOT = Path(__file__).resolve().parents[1]

# Default YAML config location, used when load_config()/main() get no explicit path.
DEFAULT_CONFIG_PATH = PROJECT_ROOT / "config" / "project_config.yaml"

# Example config structure (for reference):
# PATHS:
#   RAW_DATA: "data/raw/WA_Fn-UseC_-Telco-Customer-Churn.csv"
#   ARTIFACTS: "artifacts"


def load_config(path: Path = DEFAULT_CONFIG_PATH) -> Dict[str, Any]:
    """Load the project configuration from a YAML file.

    Args:
        path: Location of the YAML config file.

    Returns:
        The parsed top-level mapping of the YAML document.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        ValueError: If the document is empty or its top level is not a mapping.
    """
    if not path.exists():
        raise FileNotFoundError(f"Config file not found at {path}")
    with open(path, "r", encoding="utf-8") as f:
        cfg = yaml.safe_load(f)
    # safe_load yields None for an empty document and a list/scalar for
    # non-mapping documents; fail here with a clear message instead of letting
    # a later cfg["PATHS"] lookup raise an opaque TypeError.
    if not isinstance(cfg, dict):
        raise ValueError(
            f"Config file at {path} must contain a top-level mapping, "
            f"got {type(cfg).__name__}"
        )
    return cfg


# -----------------------------------------------------------------------------
# Helper functions
# -----------------------------------------------------------------------------

def compute_file_hash(path: Path, algo: str = "sha256", chunk_size: int = 1 << 20) -> str:
    """Return the hex digest of the bytes stored at ``path``.

    The file is opened in binary mode and fed to a ``hashlib.new(algo)`` hasher
    in pieces of ``chunk_size`` bytes, so the whole file is never held in
    memory at once.
    """
    digest = hashlib.new(algo)
    with path.open("rb") as stream:
        # iter(callable, sentinel) keeps reading until read() returns b"" (EOF).
        for block in iter(lambda: stream.read(chunk_size), b""):
            digest.update(block)
    return digest.hexdigest()


def load_dataset(raw_path: Path) -> pd.DataFrame:
    """Read the raw dataset, picking the reader from the file extension.

    ``.csv`` goes through ``pd.read_csv``; ``.parquet`` and ``.pq`` go through
    ``pd.read_parquet``. The extension is compared case-insensitively.

    Raises:
        FileNotFoundError: If ``raw_path`` does not exist.
        ValueError: If the extension is none of the supported ones.
    """
    if not raw_path.exists():
        raise FileNotFoundError(f"RAW_DATA file not found at {raw_path}")

    extension = raw_path.suffix.lower()
    if extension == ".csv":
        return pd.read_csv(raw_path)
    if extension in (".parquet", ".pq"):
        return pd.read_parquet(raw_path)
    raise ValueError(f"Unsupported file type: {extension}")


# -----------------------------------------------------------------------------
# Version registry + load log handling
# -----------------------------------------------------------------------------

def get_artifact_paths(config: Dict[str, Any]) -> Dict[str, Path]:
    """Resolve the artifact directory and the two CSV files kept inside it.

    The directory is ``PROJECT_ROOT`` joined with ``config["PATHS"]["ARTIFACTS"]``
    (falling back to ``"artifacts"``) and is created if it does not exist yet.

    Returns:
        A dict with the keys ``artifacts_root``, ``registry_path`` and
        ``load_log_path``.
    """
    root = PROJECT_ROOT / config["PATHS"].get("ARTIFACTS", "artifacts")
    root.mkdir(parents=True, exist_ok=True)

    locations: Dict[str, Path] = {"artifacts_root": root}
    locations["registry_path"] = root / "dataset_version_registry.csv"
    locations["load_log_path"] = root / "dataset_load_log.csv"
    return locations


def upsert_version_registry(
    registry_path: Path,
    dataset_path: Path,
    file_hash: str,
    n_rows: int,
    n_cols: int,
) -> str:
    """
    1.5.2 Hash / Snapshot Validation

    - Compare current file hash to existing registry.
    - If seen before, reuse version_id and refresh last_seen_utc.
    - If new, create new version_id (max existing id + 1) and append a row.

    Args:
        registry_path: CSV file holding the version registry; created if missing.
        dataset_path: Path of the dataset, stored as a string in new rows.
        file_hash: Hex digest identifying the dataset contents.
        n_rows: Row count recorded for a newly registered version.
        n_cols: Column count recorded for a newly registered version.

    Returns:
        The version_id (as a string) associated with ``file_hash``.
    """
    # Timezone-aware replacement for the deprecated datetime.utcnow(); the
    # rendered text keeps the same "YYYY-MM-DDTHH:MM:SSZ" shape.
    now = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

    registry = None
    if registry_path.exists():
        try:
            # dtype=str disables type inference: a column of all-digit hashes
            # would otherwise be parsed as numbers (dropping leading zeros) and
            # never compare equal to the incoming string hash.
            registry = pd.read_csv(registry_path, dtype=str)
        except pd.errors.EmptyDataError:
            # Zero-byte file: treat it like a registry that does not exist yet.
            registry = None
    if registry is None:
        registry = pd.DataFrame(columns=[
            "version_id",
            "dataset_path",
            "file_hash",
            "first_seen_utc",
            "last_seen_utc",
            "n_rows",
            "n_cols",
        ])

    # If this hash already exists, reuse its version_id
    known = registry["file_hash"] == file_hash

    if known.any():
        version_id = str(registry.loc[known, "version_id"].iloc[0])
        # Update last_seen_utc
        registry.loc[known, "last_seen_utc"] = now
    else:
        # New version: increment version_id
        if registry.empty:
            next_id = 1
        else:
            # version_id is read as text; non-numeric entries count as 0
            max_id = pd.to_numeric(registry["version_id"], errors="coerce").fillna(0).max()
            next_id = int(max_id) + 1

        version_id = str(next_id)
        new_row = pd.DataFrame(
            {
                "version_id": [version_id],
                "dataset_path": [str(dataset_path)],
                "file_hash": [file_hash],
                "first_seen_utc": [now],
                "last_seen_utc": [now],
                "n_rows": [n_rows],
                "n_cols": [n_cols],
            }
        )
        registry = pd.concat([registry, new_row], ignore_index=True)

    # Atomic write: fill a sibling temp file, then swap it into place so a
    # reader never sees a half-written registry.
    tmp_path = registry_path.with_suffix(registry_path.suffix + ".tmp")
    registry.to_csv(tmp_path, index=False)
    os.replace(tmp_path, registry_path)

    return version_id


def append_dataset_load_log(
    load_log_path: Path,
    dataset_path: Path,
    version_id: str,
    file_hash: str,
    n_rows: int,
    n_cols: int,
) -> None:
    """
    1.5.3 Schema Version Logging

    Append a single row to dataset_load_log.csv with:
      - timestamp
      - dataset_path
      - version_id
      - file_hash
      - n_rows, n_cols

    Existing rows are preserved verbatim. If the existing file has a different
    set of columns, its column order is kept and any new columns are added at
    the end; cells without a value are left empty.
    """
    # Timezone-aware replacement for the deprecated datetime.utcnow(); the
    # rendered text keeps the same "YYYY-MM-DDTHH:MM:SSZ" shape.
    now = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

    log_row = pd.DataFrame(
        {
            "timestamp_utc": [now],
            "dataset_path": [str(dataset_path)],
            "version_id": [version_id],
            "file_hash": [file_hash],
            "n_rows": [n_rows],
            "n_cols": [n_cols],
        }
    )

    existing = None
    if load_log_path.exists():
        try:
            # dtype=str keeps earlier rows byte-faithful on rewrite; inferred
            # dtypes would e.g. strip leading zeros from an all-digit hash.
            existing = pd.read_csv(load_log_path, dtype=str)
        except pd.errors.EmptyDataError:
            # Zero-byte file: nothing to carry over.
            existing = None

    if existing is None:
        out = log_row
    else:
        # Ordered union: existing columns first, then columns only the new row
        # has. (Index.union would sort alphabetically when the schemas differ.)
        all_cols = list(existing.columns) + [
            col for col in log_row.columns if col not in existing.columns
        ]
        out = pd.concat(
            [existing.reindex(columns=all_cols),
             log_row.reindex(columns=all_cols)],
            ignore_index=True,
        )

    # Atomic write: fill a sibling temp file, then swap it into place.
    tmp_path = load_log_path.with_suffix(load_log_path.suffix + ".tmp")
    out.to_csv(tmp_path, index=False)
    os.replace(tmp_path, load_log_path)


# -----------------------------------------------------------------------------
# Main entrypoint
# -----------------------------------------------------------------------------

def main(config_path: Path = DEFAULT_CONFIG_PATH) -> pd.DataFrame:
    """Run steps 1.5.1-1.5.3: load the raw dataset, register its hash, log the load.

    Args:
        config_path: YAML config file; ``PATHS.RAW_DATA`` is resolved relative
            to ``PROJECT_ROOT``.

    Returns:
        The loaded dataset.
    """
    # Load config
    config = load_config(config_path)

    raw_path = PROJECT_ROOT / config["PATHS"]["RAW_DATA"]
    paths = get_artifact_paths(config)

    # The status messages previously contained mis-decoded bytes (mojibake);
    # they are restored to the intended emoji / arrow / multiplication sign.
    print(f"1.5.1) 📥 Loading raw dataset from: {raw_path}")
    df = load_dataset(raw_path)
    n_rows, n_cols = df.shape
    print(f"   → Loaded dataset shape: {n_rows} rows × {n_cols} cols")

    print("1.5.2) 🔐 Computing file hash and updating version registry")
    file_hash = compute_file_hash(raw_path, algo="sha256")
    version_id = upsert_version_registry(
        paths["registry_path"],
        raw_path,
        file_hash,
        n_rows,
        n_cols,
    )
    print(f"   → Dataset version_id: {version_id}")

    print("1.5.3) 🧾 Appending to dataset_load_log.csv")
    append_dataset_load_log(
        paths["load_log_path"],
        raw_path,
        version_id,
        file_hash,
        n_rows,
        n_cols,
    )
    print(f"   → Load log updated at: {paths['load_log_path']}")

    return df


if __name__ == "__main__":
    # Script entry point: run the 1.5.x steps against the default config path.
    df_loaded = main()
    # df_loaded is available here if you want to quickly inspect or debug
```

### How this maps to your outline

* **1.5.1 Load Raw Dataset**

  * `load_dataset(raw_path)` reads CSV/Parquet from `CONFIG["PATHS"]["RAW_DATA"]`.

* **1.5.2 Hash / Snapshot Validation**

  * `compute_file_hash(raw_path, "sha256")`
  * `upsert_version_registry(...)` reads/updates `dataset_version_registry.csv` and assigns a `version_id`.

* **1.5.3 Schema Version Logging**

  * `append_dataset_load_log(...)` appends a row to `dataset_load_log.csv` with timestamp, version, hash, n_rows, n_cols.

You can now plug this script into your Section 1.5 dependency chain and reference the outputs (`dataset_version_registry.csv`, `dataset_load_log.csv`) in later sections (e.g., your data contract / DQ reports).
