# 01 — Data Validation
Validate the schema, date continuity, missing values, duplicate rows, and sector/series coverage of the cleaned datasets in `data/processed`.


In [None]:
import csv
import math
import statistics
from datetime import datetime
from pathlib import Path

# Folder locations used by the notebook, relative to the working directory.
DATA_DIR = Path("data/processed")
REPORTS_DIR = Path("reports")
FIG_DIR = REPORTS_DIR.joinpath("figures")
# Create the figures folder (and any missing parents) up front;
# this is a no-op when the folder already exists.
FIG_DIR.mkdir(exist_ok=True, parents=True)

def read_csv(path):
    """Load a UTF-8 CSV file and return its data rows as a list of dicts.

    Each dict maps the header names to that row's cell values (as strings).
    """
    with open(path, newline="", encoding="utf-8") as handle:
        reader = csv.DictReader(handle)
        records = [record for record in reader]
    return records

def parse_date(date_str):
    """Convert a ``YYYY-MM-DD`` string into a ``datetime``.

    Raises ``ValueError`` (from ``datetime.strptime``) when the string does
    not match that format.
    """
    day_format = "%Y-%m-%d"
    parsed = datetime.strptime(date_str, day_format)
    return parsed

def month_range(start, end):
    """List the first day of every month from ``start``'s month through ``end``.

    Both arguments are ``datetime`` objects. The result is a list of
    ``YYYY-MM-DD`` strings, one per month, beginning with the first day of
    ``start``'s month and continuing while that first-of-month is not later
    than ``end``. It is empty when even the first one is later than ``end``.
    """
    labels = []
    cursor = datetime(start.year, start.month, 1)
    while cursor <= end:
        labels.append(cursor.strftime("%Y-%m-%d"))
        # divmod(month, 12) gives (1, 0) for December, which rolls over to
        # January of the next year, and (0, month) otherwise.
        year_carry, month_index = divmod(cursor.month, 12)
        cursor = datetime(cursor.year + year_carry, month_index + 1, 1)
    return labels


In [None]:
# Cleaned input datasets, keyed by a short dataset name.
files = {
    dataset: DATA_DIR / filename
    for dataset, filename in (
        ("supply", "electricity_supply_clean.csv"),
        ("consumption", "electricity_consumption_clean.csv"),
        ("ipi", "ipi_clean.csv"),
    )
}
# Show whether each expected file is present on disk.
for name, path in files.items():
    print(f"{name}: {path.exists()} -> {path}")


In [None]:
# Column layout each dataset is expected to have, in header order.
expected_schema = {
    "supply": ["date", "sector", "supply"],
    "consumption": ["date", "sector", "consumption"],
    "ipi": ["series", "date", "index", "index_sa"],
}

# Per-dataset validation results, keyed by the same names as `files`.
validation = {}
for name, path in files.items():
    rows = read_csv(path)
    # Columns come from the first data row, so a file without data rows
    # reports no columns and therefore fails the schema check.
    cols = list(rows[0].keys()) if rows else []
    # A cell is missing when it is None or blank after stripping whitespace.
    missing_cells = sum(1 for row in rows for v in row.values() if v is None or str(v).strip() == "")
    # Rows whose values repeat an earlier row in every column.
    duplicates = len(rows) - len({tuple(row[c] for c in cols) for row in rows}) if rows else 0
    # Distinct non-empty date strings, sorted as text (which is chronological
    # for the zero-padded YYYY-MM-DD form that parse_date expects).
    dates = sorted({row["date"] for row in rows if row.get("date")})
    continuity_ok = True
    missing_months = []
    if dates:
        expected_months = month_range(parse_date(dates[0]), parse_date(dates[-1]))
        # Build the lookup set once instead of once per expected month.
        present_dates = set(dates)
        missing_months = [d for d in expected_months if d not in present_dates]
        continuity_ok = not missing_months
    # Distinct `sector` labels, falling back to `series` when a row has no
    # `sector` key. csv.DictReader fills cells missing from a short row with
    # None; map that to "" so sorting (and any later string join) never mixes
    # None with str.
    sector_coverage = sorted({row.get("sector", row.get("series", "")) or "" for row in rows})
    validation[name] = {
        "rows": len(rows), "columns": cols, "schema_ok": cols == expected_schema[name],
        "missing_cells": missing_cells, "duplicates": duplicates,
        "date_start": dates[0] if dates else None, "date_end": dates[-1] if dates else None,
        "num_months": len(dates), "date_continuity_ok": continuity_ok,
        "missing_months": missing_months, "coverage": sector_coverage,
    }
# Bare expression so the notebook displays the results.
validation


In [None]:
# Build a Markdown report with one section per validated dataset.
summary_lines = ["# Validation Summary", ""]
for name, info in validation.items():
    section = [
        f"## {name.title()}",
        f"- Rows: {info['rows']}",
        f"- Schema matches expected: {info['schema_ok']}",
        f"- Missing cells: {info['missing_cells']}",
        f"- Duplicate rows: {info['duplicates']}",
        f"- Date range: {info['date_start']} to {info['date_end']} ({info['num_months']} months)",
        f"- Date continuity check passed: {info['date_continuity_ok']}",
    ]
    # Only list gaps when there are some, capped at the first ten.
    if not info['date_continuity_ok']:
        section.append(f"- Missing months sample: {info['missing_months'][:10]}")
    section.append(f"- Sector/series coverage: {', '.join(info['coverage'])}")
    section.append("")  # blank line between sections
    summary_lines.extend(section)

# Write the report to disk, then echo its first 14 lines as a preview.
REPORTS_DIR.mkdir(parents=True, exist_ok=True)
summary_path = REPORTS_DIR / "validation_summary.md"
report_text = "\n".join(summary_lines)
summary_path.write_text(report_text, encoding="utf-8")
print(f"Saved: {summary_path}")
preview = summary_lines[:14]
print("\n".join(preview))
