# 01 - Data Exploration & Preprocessing
Purpose: reproducible, Windows-friendly data ingestion and validation pipeline.
Principles: single responsibility functions, clear typing, deterministic outputs.

Usage:
 - Run cell-by-cell.
 - Outputs: `data/raw/iris.csv`, `data/processed/train.csv`, `data/processed/val.csv`, `data/processed/scaler.json`


In [None]:
"""
High-level imports and environment setup.
"""

# Standard library
from pathlib import Path
from typing import Tuple, Dict

# Third-party
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import json
import logging

# Logging plus the project directory layout, rooted at the current working directory.
logging.basicConfig(level=logging.INFO)

ROOT = Path.cwd()
DATA_DIR = ROOT / "data"
RAW_DIR, PROCESSED_DIR = DATA_DIR / "raw", DATA_DIR / "processed"

# Create both output folders up front; exist_ok makes re-running the cell harmless.
for _folder in (RAW_DIR, PROCESSED_DIR):
    _folder.mkdir(parents=True, exist_ok=True)


## 1.1 — Deterministic sample dataset (Iris)
We use Iris as a reproducible example dataset. In production, replace the loader with a GCS/S3 connector.


In [None]:
def load_sample_dataset() -> pd.DataFrame:
    """
    Load a reproducible example dataset (Iris).

    ``load_iris(as_frame=True).frame`` already holds the feature columns plus
    a ``target`` column, so it is returned directly (as a copy). Concatenating
    ``iris.target`` onto it again would yield a duplicate ``target`` column.

    Returns:
        pd.DataFrame: features + target
    """
    from sklearn.datasets import load_iris
    iris = load_iris(as_frame=True)
    # Copy so callers can mutate the result without touching the Bunch's frame.
    df = iris.frame.copy()
    logging.info("Loaded iris dataset with shape %s", df.shape)
    return df

# Load the sample frame, persist a raw copy under RAW_DIR, then preview the
# first rows (kept last so the notebook displays it as the cell output).
df = load_sample_dataset()
df.to_csv(RAW_DIR / "iris.csv", index=False)
df.head()


## 1.2 — Schema and validation utilities
Minimal deterministic schema check suitable for CI unit tests.


In [None]:
# Expected layout of the dataset: feature column names, the target column
# name, and the number of feature columns.
_FEATURE_COLS = [
    "sepal length (cm)",
    "sepal width (cm)",
    "petal length (cm)",
    "petal width (cm)",
]

SCHEMA = dict(
    feature_cols=_FEATURE_COLS,
    target_col="target",
    n_features=len(_FEATURE_COLS),
)

def validate_schema(df: pd.DataFrame, schema: Dict) -> None:
    """
    Check that ``df`` matches ``schema``.

    Args:
        df: frame to validate.
        schema: mapping with ``"feature_cols"`` (list of column names) and
            ``"target_col"`` (column name).

    Raises:
        ValueError: if any feature/target column is absent from ``df``, or if
            any feature column contains a NaN.
    """
    feature_names = schema["feature_cols"]
    required = feature_names + [schema["target_col"]]

    absent = [name for name in required if name not in df.columns]
    if absent:
        raise ValueError(f"Missing columns: {absent}")

    # Only the feature columns are inspected for NaNs; the target is not.
    features_have_nan = df[feature_names].isna().any().any()
    if features_have_nan:
        raise ValueError("NaNs present in feature columns")

    logging.info("Schema validation passed")


In [None]:
# Fail fast (ValueError) if the loaded frame does not match SCHEMA.
validate_schema(df, SCHEMA)


## 1.3 — Deterministic preprocessing pipeline
A single function that standardizes the features, returns the train/val splits, and writes them (plus the scaler statistics) to `data/processed/`.


In [None]:
def preprocess_and_split(
    df: pd.DataFrame,
    test_size: float = 0.2,
    random_seed: int = 42,
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Split ``df`` into train/validation frames and standardize the features.

    The split is stratified on the target column and seeded with
    ``random_seed``. Mean and standard deviation are computed on the train
    split only and applied to both splits, so no validation statistics leak
    into training.

    Side effects: writes ``scaler.json`` (the per-feature mean and std),
    ``train.csv`` and ``val.csv`` into ``PROCESSED_DIR``.

    Args:
        df: frame containing the columns named in ``SCHEMA``.
        test_size: passed to ``train_test_split`` as ``test_size``.
        random_seed: passed to ``train_test_split`` as ``random_state``.

    Returns:
        ``(train, val)``: scaled feature columns followed by the target
        column, each with a fresh 0..n-1 index.
    """
    feature_names = SCHEMA["feature_cols"]
    target_name = SCHEMA["target_col"]

    features = df[feature_names].astype(float)
    labels = df[target_name].astype(int)

    feat_train, feat_val, lab_train, lab_val = train_test_split(
        features,
        labels,
        test_size=test_size,
        random_state=random_seed,
        stratify=labels,
    )

    # Statistics come from the train split only.
    center = feat_train.mean(axis=0)
    # A zero std (constant column) is replaced by 1.0 to avoid dividing by zero.
    scale = feat_train.std(axis=0).replace(0, 1.0)

    def _assemble(feats: pd.DataFrame, labs: pd.Series) -> pd.DataFrame:
        # Scale with the train statistics, then glue features and target
        # side by side on a reset index.
        scaled = (feats - center) / scale
        parts = [scaled.reset_index(drop=True), labs.reset_index(drop=True)]
        return pd.concat(parts, axis=1)

    train = _assemble(feat_train, lab_train)
    val = _assemble(feat_val, lab_val)

    # Persist the scaler statistics alongside the two splits.
    scaler_meta = {"mu": center.to_dict(), "sigma": scale.to_dict()}
    (PROCESSED_DIR / "scaler.json").write_text(json.dumps(scaler_meta))
    train.to_csv(PROCESSED_DIR / "train.csv", index=False)
    val.to_csv(PROCESSED_DIR / "val.csv", index=False)

    logging.info("Preprocessing complete. Train / Val shapes: %s / %s", train.shape, val.shape)
    return train, val

# Run the pipeline with its defaults (test_size=0.2, random_seed=42), then
# preview the train split (kept last so the notebook displays it).
train_df, val_df = preprocess_and_split(df)
train_df.head()


## 1.4 — Quick EDA checks (these are lightweight and work in CI)
We avoid heavy plotting here; CI-friendly assertions follow.


In [None]:
def basic_data_checks(train: pd.DataFrame, val: pd.DataFrame) -> None:
    """
    Fail fast in CI when the processed splits look wrong.

    Checks, in order:
    - No NaNs in train, then none in val
    - Train and val each have ``SCHEMA["n_features"] + 1`` columns
      (the features plus the target column)

    The train class distribution is logged for information only; no balance
    threshold is enforced.

    Args:
        train: processed training split.
        val: processed validation split.

    Raises:
        AssertionError: if any check fails. Raised explicitly rather than via
            ``assert`` statements so the checks still run under ``python -O``.
    """
    splits = (("train", train), ("val", val))

    for split_name, frame in splits:
        if frame.isna().any().any():
            raise AssertionError(f"NaNs in {split_name}")

    expected_cols = SCHEMA["n_features"] + 1
    for split_name, frame in splits:
        if frame.shape[1] != expected_cols:
            raise AssertionError(f"Unexpected number of columns in {split_name}")

    # class balance: not enforced, but informative in the CI log
    class_counts = train[SCHEMA["target_col"]].value_counts(normalize=True).to_dict()
    logging.info("Train class distribution: %s", class_counts)

# Run the lightweight checks on the splits produced in section 1.3.
basic_data_checks(train_df, val_df)


End of notebook 01.
