## Evaluate clustering using Silhouette Score.

## Evaluate clustering using the Adjusted Rand Index.

## Evaluate classification using the classification report.

In [None]:
import os, json
import numpy as np
import pandas as pd
from sklearn.metrics import silhouette_score, adjusted_rand_score
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

def load_X(variant):
    """Load the scaled feature matrix saved for a clustering variant.

    Args:
        variant: Artifact name suffix; the file read is
            ``{ART_DIR}/X_{variant}_scaled.npz``.

    Returns:
        The array stored under the ``"X"`` key of the archive.
    """
    # np.load on an .npz returns an NpzFile that keeps the archive open;
    # the context manager closes it once the array has been read.
    with np.load(f"{ART_DIR}/X_{variant}_scaled.npz") as archive:
        return archive["X"]

# Dataset snapshot read from ART_DIR; eval_variant merges each variant's
# cluster assignments onto this frame using ID_COL.
df = pd.read_csv(f"{ART_DIR}/dataset_snapshot.csv")

def eval_variant(variant):
    """Print and plot evaluation diagnostics for one clustering variant.

    Reads ``clusters_{variant}.csv`` and ``trainmeta_{variant}.json`` from
    ``ART_DIR``, prints the stored best k and silhouette, prints the mean of
    ``RACE_COLS`` per cluster when those columns are present, recomputes the
    silhouette score on the scaled feature matrix, and shows a 2-D PCA
    scatter coloured by cluster label.

    Args:
        variant: Artifact name suffix, e.g. ``"full"`` or ``"norace"``.

    Returns:
        None. If either artifact file is missing, prints a notice and
        returns without evaluating.

    Raises:
        ValueError: If the merged table and the feature matrix have a
            different number of rows.
    """
    clusters_path = f"{ART_DIR}/clusters_{variant}.csv"
    meta_path = f"{ART_DIR}/trainmeta_{variant}.json"
    if not (os.path.exists(clusters_path) and os.path.exists(meta_path)):
        print("Missing", variant)
        return

    with open(meta_path, encoding="utf-8") as f:
        meta = json.load(f)
    # "\n" (not "\\n") so the header starts on a new line instead of
    # printing a literal backslash-n.
    print("=" * 50, f"\nVariant: {variant}")
    print("Best k:", meta["best_k"], "Silhouette:", meta["silhouette"])

    clusters = pd.read_csv(clusters_path)
    merged = df.merge(clusters, on=ID_COL, how="left")
    cluster_col = f"cluster_{variant}"

    if set(RACE_COLS).issubset(merged.columns):
        print("\nAverage racial composition:")
        print(merged.groupby(cluster_col)[RACE_COLS].mean())

    X = load_X(variant)
    # The score and the scatter pair rows of X with rows of `merged` by
    # position, so both must have the same length (duplicate IDs in the
    # clusters file would inflate `merged`).
    # NOTE(review): assumes X rows follow dataset_snapshot.csv order - confirm.
    if X.shape[0] != len(merged):
        raise ValueError(
            f"Row mismatch for {variant}: X has {X.shape[0]} rows, "
            f"merged table has {len(merged)}"
        )

    # The left merge leaves NaN for snapshot rows that have no cluster
    # assignment; NaN is not a usable label, so exclude those rows.
    has_label = merged[cluster_col].notna().to_numpy()
    if not has_label.all():
        print(f"Dropping {int((~has_label).sum())} rows without a cluster label")
        X = X[has_label]
    labels = merged[cluster_col].to_numpy()[has_label]
    print("Silhouette (recomputed):", silhouette_score(X, labels))

    pca = PCA(2, random_state=42)
    Xt = pca.fit_transform(X)
    plt.scatter(Xt[:, 0], Xt[:, 1], c=labels)
    # Em dash restored; the original title held its mis-decoded bytes.
    plt.title(f"PCA — {variant}")
    plt.show()

# Evaluate each clustering variant.
eval_variant("full")
eval_variant("norace")

# Compare stability: Adjusted Rand Index between the two clusterings on the
# IDs present in both files. Skipped when a file is absent, matching how
# eval_variant treats missing artifacts.
if os.path.exists(f"{ART_DIR}/clusters_full.csv") and os.path.exists(
    f"{ART_DIR}/clusters_norace.csv"
):
    cf = pd.read_csv(f"{ART_DIR}/clusters_full.csv")
    cn = pd.read_csv(f"{ART_DIR}/clusters_norace.csv")
    m = cf.merge(cn, on=ID_COL)
    ari = adjusted_rand_score(m["cluster_full"], m["cluster_norace"])
    # "\n" (not "\\n") so a blank line precedes the result.
    print("\nStability (ARI full vs. no-race):", ari)
else:
    print("Missing cluster files; skipping stability comparison")