In [None]:
# Directory prefix used when building artifact file paths.
ART_DIR = "artifacts"

# Inclusive bounds of the cluster counts (k) tried during model selection.
KMIN = 2
KMAX = 6

# Seed passed to KMeans as random_state.
RANDOM_STATE = 42

# NOTE(review): not referenced in the visible code; presumably the name of the
# identifier column in the district ids file -- confirm against other cells.
ID_COL = "district_id"

In [None]:
import os, json
import numpy as np
import pandas as pd
import joblib
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

def load_X(variant):
    """Load the scaled feature matrix saved for ``variant``.

    Reads the array stored under the key ``"X"`` in
    ``{ART_DIR}/X_{variant}_scaled.npz``.

    Args:
        variant: Name fragment used to build the archive file name.

    Returns:
        The array stored under ``"X"`` in the archive.
    """
    # np.load on an .npz returns an NpzFile that keeps the underlying file
    # open; the context manager closes it once the array has been read.
    with np.load(f"{ART_DIR}/X_{variant}_scaled.npz") as archive:
        return archive["X"]

def train_variant(variant):
    """Select k by silhouette score, then persist the model and its outputs.

    Fits one KMeans model per k in ``[KMIN, KMAX]`` (inclusive) on the matrix
    returned by ``load_X(variant)`` and keeps the model with the highest
    silhouette score; on ties the smallest k wins. It then writes three files
    under ``ART_DIR``:

    * ``kmeans_{variant}.joblib`` -- the selected fitted model,
    * ``clusters_{variant}.csv`` -- ``district_ids.csv`` plus a
      ``cluster_{variant}`` column holding the model's labels,
    * ``trainmeta_{variant}.json`` -- variant name, chosen k and its score.

    A one-line summary is printed at the end.

    Args:
        variant: Name fragment identifying the feature set to cluster.

    Raises:
        ValueError: If no candidate model could be selected (for example the
            k range is empty or every silhouette score is NaN).
    """
    X = load_X(variant)

    # -inf rather than -1: -1 is itself a legal silhouette value.
    best_k, best_score, best_model = None, float("-inf"), None
    for k in range(KMIN, KMAX + 1):
        km = KMeans(n_clusters=k, random_state=RANDOM_STATE, n_init=10)
        labels = km.fit_predict(X)
        # silhouette_score returns a NumPy scalar; cast to a builtin float so
        # json.dump below accepts it whatever the scalar's dtype is.
        score = float(silhouette_score(X, labels))
        if score > best_score:
            best_k, best_score, best_model = k, score, km

    # Fail before writing anything instead of pickling None and then crashing
    # on ``None.labels_`` further down.
    if best_model is None:
        raise ValueError(
            f"{variant}: no KMeans model selected for k in [{KMIN}, {KMAX}]"
        )

    joblib.dump(best_model, f"{ART_DIR}/kmeans_{variant}.joblib")

    # Labels are attached positionally, so the rows of district_ids.csv are
    # assumed to be in the same order as the rows of X -- TODO confirm.
    ids = pd.read_csv(f"{ART_DIR}/district_ids.csv")
    ids[f"cluster_{variant}"] = best_model.labels_
    ids.to_csv(f"{ART_DIR}/clusters_{variant}.csv", index=False)

    meta = {"variant": variant, "best_k": best_k, "silhouette": best_score}
    with open(f"{ART_DIR}/trainmeta_{variant}.json", "w", encoding="utf-8") as f:
        json.dump(meta, f, indent=2)

    print(f"{variant}: k={best_k}, silhouette={best_score:.3f}")

# Train and persist one clustering per feature-set variant, in this order.
for variant_name in ("full", "norace"):
    train_variant(variant_name)