In [1]:
# Directory that every artifact in this notebook is read from and written to.
ART_DIR = "artifacts"
# Default number of KMeans clusters (default value of train_variant's n_clusters).
N_CLUSTERS = 38
# Seed passed to KMeans as random_state.
RANDOM_STATE = 42
# NOTE(review): not referenced by any cell visible here; presumably the name of
# the id column in district_ids.csv -- confirm, or remove if unused.
ID_COL = "district_id"

In [2]:
import os, json
import numpy as np
import pandas as pd
import joblib
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

## Loading the Standardized Feature Matrix from the Artifacts Directory
* `load_X(variant)` is a helper that loads the standardized feature matrix for the given variant from `<ART_DIR>/X_<variant>_scaled.npz` (stored under the array key `X`).

In [3]:
def load_X(variant):
    """
    Load the feature matrix stored for `variant`.

    Reads the array saved under key "X" in `{ART_DIR}/X_{variant}_scaled.npz`.

    Parameters
    ----------
    variant : str
        Variant name used to build the file name (e.g. "full", "norace").

    Returns
    -------
    numpy.ndarray
        The array stored under "X" in the archive.
    """
    # np.load on an .npz returns a lazily-read NpzFile that keeps the archive
    # open; the context manager closes it once the array has been read.
    with np.load(f"{ART_DIR}/X_{variant}_scaled.npz") as archive:
        return archive["X"]

In [4]:
def train_variant(variant, n_clusters=N_CLUSTERS):
    """
    Train KMeans with a fixed number of clusters and persist the results.

    Steps:
      1. Load the feature matrix for `variant` via `load_X`.
      2. Check that `district_ids.csv` has one row per row of the matrix.
      3. Fit KMeans once at `n_clusters` (no hyperparameter search).
      4. Write the fitted model (`kmeans_{variant}.joblib`), the per-row
         cluster labels (`clusters_{variant}.csv`) and training metadata
         (`trainmeta_{variant}.json`) into ART_DIR.

    Parameters
    ----------
    variant : str
        Variant name used in every input/output file name.
    n_clusters : int, default N_CLUSTERS
        Number of clusters passed to KMeans.

    Raises
    ------
    ValueError
        If the number of ids does not match the number of rows of the matrix.

    Notes
    -----
    The silhouette score is best-effort: when it cannot be computed it is
    reported as 'NA' in the printed summary and stored as null in the
    metadata JSON, and the reason is printed instead of being silently dropped.
    """
    X = load_X(variant)
    n_rows = X.shape[0]

    # Validate the id/row alignment before the (expensive) fit so that a
    # mismatch fails fast and no artifact is written for inconsistent inputs.
    ids = pd.read_csv(f"{ART_DIR}/district_ids.csv")
    if len(ids) != n_rows:
        raise ValueError(f"Row mismatch: ids={len(ids)} vs X={n_rows} for variant={variant}")

    # Fit KMeans once at the desired k (no hyperparameter loop needed)
    km = KMeans(n_clusters=n_clusters, random_state=RANDOM_STATE, n_init=10)
    labels = km.fit_predict(X)

    joblib.dump(km, f"{ART_DIR}/kmeans_{variant}.joblib")
    ids[f"cluster_{variant}"] = labels
    ids.to_csv(f"{ART_DIR}/clusters_{variant}.csv", index=False)

    # silhouette_score is only defined when the number of distinct labels is
    # between 2 and n_samples - 1; outside that range it raises ValueError.
    # Check the precondition explicitly and say why the score is missing.
    n_labels = len(np.unique(labels))
    sil = float("nan")
    if 2 <= n_labels <= n_rows - 1:
        try:
            sil = float(silhouette_score(X, labels))
        except Exception as exc:
            # Still best-effort (training artifacts are already saved), but
            # surface the failure rather than hiding it.
            print(f"{variant}: silhouette could not be computed ({type(exc).__name__}: {exc})")
    else:
        print(
            f"{variant}: silhouette undefined for {n_labels} distinct labels "
            f"on {n_rows} rows (needs 2 <= labels <= rows - 1)"
        )

    has_sil = not np.isnan(sil)

    # NaN would be serialized as the bare token NaN, which is not valid JSON;
    # store null instead so any JSON parser can read the metadata.
    meta = {"variant": variant, "k": int(n_clusters), "silhouette": sil if has_sil else None}
    with open(f"{ART_DIR}/trainmeta_{variant}.json", "w") as f:
        json.dump(meta, f, indent=2)

    print(f"{variant}: k={n_clusters}, silhouette={sil if has_sil else 'NA'}")

In [5]:
# Train one model per feature-set variant, in the same order as before.
for variant_name in ("full", "norace"):
    train_variant(variant_name)

full: k=38, silhouette=NA
norace: k=38, silhouette=NA
