## Constants

In [1]:
# Directory holding the preprocessed inputs this notebook reads and every file it writes.
ART_DIR = "artifacts"
# Default number of KMeans clusters (k) used by train_variant.
N_CLUSTERS = 38
# Seed passed to KMeans as random_state.
RANDOM_STATE = 42
# Name of the identifier column that district_ids.csv must contain.
ID_COL = "district_id"

## Imports

In [2]:
import os, json
import numpy as np
import pandas as pd
import joblib
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

## Helper Functions

`ensure_artifacts_dir()` is a helper function that makes sure the artifacts directory exists before writing any files.

In [None]:
def ensure_artifacts_dir():
    """
    Create the artifacts directory (ART_DIR) when it is missing.

    Intermediate directories are created as needed; an already existing
    directory is left untouched and does not raise.
    """
    os.makedirs(name=ART_DIR, exist_ok=True)

`load_X` is a helper function that loads the pre-scaled feature matrix for a given variant: `"full"` (all features) or `"norace"` (no race features).

In [None]:
def load_X(variant: str) -> np.ndarray:
    """
    Load pre-scaled feature matrix saved by the preprocess notebook.

    Reads ``{ART_DIR}/X_{variant}_scaled.npz`` and returns the array stored
    under the key 'X'.

    Parameters
    ----------
    variant : str
        Variant name used to build the file name (e.g. "full", "norace").

    Returns
    -------
    np.ndarray
        The array saved under key 'X'.

    Raises
    ------
    FileNotFoundError
        If the .npz file does not exist.
    KeyError
        If the archive has no 'X' entry.
    """
    path = f"{ART_DIR}/X_{variant}_scaled.npz"
    if not os.path.exists(path):
        raise FileNotFoundError(
            f"Could not find {path}. Make sure your preprocess step saved this file."
        )
    # np.load on an .npz returns a lazy NpzFile that keeps the archive open;
    # the context manager closes the file handle once the array has been read.
    with np.load(path) as arr:
        if "X" not in arr:
            raise KeyError(f"'X' not found in {path}. Check how the array was saved.")
        # Indexing materialises the array in memory, so it remains valid
        # after the archive is closed.
        return arr["X"]

`load_ids()` is a helper function that loads the district IDs for each row in the feature matrix.

In [None]:
def load_ids() -> pd.DataFrame:
    """
    Read the row identifiers from ``{ART_DIR}/district_ids.csv``.

    The rows are expected to line up 1:1 with the rows of the feature
    matrix; this function only verifies that the file exists and that it
    contains the ID_COL column.

    Raises
    ------
    FileNotFoundError
        If the CSV file is missing.
    KeyError
        If the CSV has no ID_COL column.
    """
    csv_path = f"{ART_DIR}/district_ids.csv"

    if os.path.exists(csv_path):
        frame = pd.read_csv(csv_path)
        if ID_COL in frame.columns:
            return frame
        # File is present but lacks the identifier column.
        found_columns = frame.columns.tolist()
        raise KeyError(
            f"Expected column '{ID_COL}' in {csv_path}, found {found_columns}."
        )

    raise FileNotFoundError(
        f"Missing {csv_path}. Your preprocess should export this with {ID_COL}."
    )

## Training
* `train_variant` is the training function for one variant (full or no race).
* The corresponding feature matrix and the district IDs are loaded. Both are required for model training and exporting aligned labels.
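* The number of ID rows is checked against the number of feature rows, and a mismatch raises a `ValueError`. Equal counts are necessary for 1:1 alignment but do not prove it: the preprocess step must write both files in the same row order.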

In [None]:
def train_variant(variant: str, n_clusters: int = N_CLUSTERS, n_init: int = 10):
    """
    Train KMeans at a fixed k (default: 38), save the model, labels, and training metadata.

    Files written under ART_DIR:
      * ``kmeans_{variant}.joblib``   - the fitted KMeans estimator
      * ``clusters_{variant}.csv``    - the IDs table plus a ``cluster_{variant}`` column
      * ``trainmeta_{variant}.json``  - run parameters, metrics and file paths

    Parameters
    ----------
    variant : str
        Variant name used to locate the feature matrix and to name outputs.
    n_clusters : int
        Number of clusters (k) to fit.
    n_init : int
        Number of KMeans initialisations; forwarded to KMeans and recorded
        in the metadata.

    Raises
    ------
    ValueError
        If X is not 2D or its row count differs from the IDs table.
    FileNotFoundError, KeyError
        Propagated from load_X / load_ids when inputs are missing or malformed.
    """
    ensure_artifacts_dir()

    # Load data
    X = load_X(variant)
    ids = load_ids()

    if X.ndim != 2:
        raise ValueError(f"X must be 2D (n_samples, n_features). Got shape {X.shape}.")
    # Only the row counts can be compared here; the row order itself is
    # the responsibility of the preprocess step.
    if len(ids) != X.shape[0]:
        raise ValueError(
            f"Row mismatch for variant='{variant}': "
            f"ids={len(ids)} vs X={X.shape[0]}.\n"
            f"Ensure your preprocess produced district_ids.csv aligned with X."
        )

    # Fit KMeans once at desired k
    km = KMeans(
        n_clusters=n_clusters,
        random_state=RANDOM_STATE,
        n_init=n_init,
    )
    labels = km.fit_predict(X)

    # Save model
    model_path = f"{ART_DIR}/kmeans_{variant}.joblib"
    joblib.dump(km, model_path)

    # Save labels merged with IDs (copy so the loaded IDs frame is not mutated)
    out_labels_path = f"{ART_DIR}/clusters_{variant}.csv"
    out_ids = ids.copy()
    out_ids[f"cluster_{variant}"] = labels
    out_ids.to_csv(out_labels_path, index=False)

    # Metrics
    inertia = float(km.inertia_)
    sil = float("nan")
    if n_clusters > 1 and X.shape[0] > 1:
        # Silhouette is a best-effort diagnostic: a failure must not abort
        # the run after the model and labels were already saved, but the
        # reason is reported instead of being silently discarded.
        try:
            sil = float(silhouette_score(X, labels))
        except Exception as exc:
            print(f"[{variant}] silhouette unavailable: {exc}")

    # Persist metadata
    # NOTE: json.dump writes a NaN silhouette as the bare token NaN, which
    # Python's json module reads back but strict JSON parsers reject.
    meta = {
        "variant": variant,
        "k": int(n_clusters),
        "random_state": int(RANDOM_STATE),
        # Recorded as passed to KMeans (previously a hard-coded string "10").
        "n_init": n_init,
        "inertia": inertia,
        "silhouette": sil,
        "n_samples": int(X.shape[0]),
        "n_features": int(X.shape[1]),
        "ids_file": f"{ART_DIR}/district_ids.csv",
        "X_file": f"{ART_DIR}/X_{variant}_scaled.npz",
        "model_file": model_path,
        "labels_file": out_labels_path,
        "id_col": ID_COL,
    }
    with open(f"{ART_DIR}/trainmeta_{variant}.json", "w", encoding="utf-8") as f:
        json.dump(meta, f, indent=2)

    print(
        f"[{variant}] k={n_clusters} | inertia={inertia:.3f} | "
        f"silhouette={'NA' if np.isnan(sil) else sil} | "
        f"X shape={X.shape}"
    )

## Run Both Variants

In [None]:
# Train the two feature-set variants in turn, "full" first and then "norace".
for variant in ("full", "norace"):
    train_variant(variant)