## Clinical Phenotyping: PCA + K-Means (3 Phenotypes)

Load `01_cleaned_advanced.csv` → PCA (95% variance) → K-Means k=3 → PCA scatter + Centroid heatmap.

In [None]:
import sys
from pathlib import Path

# Use the directory above the current working directory as the project root
# and make sure it is on the import path (inserted at the front if missing).
ROOT = Path.cwd().parent
_root_entry = str(ROOT)
if _root_entry not in sys.path:
    sys.path.insert(0, _root_entry)

import pandas as pd
from sklearn.decomposition import PCA

from src.models import ClinicalClustering
from src.visualization import plot_pca_scatter, plot_centroid_heatmap

# Figure output directory; created up front (including missing parents) so
# later save calls do not fail on a missing folder.
FIG_DIR = Path("../figures")
FIG_DIR.mkdir(exist_ok=True, parents=True)

# CSV that the notebook reads, and CSV it writes with the cluster labels added.
# All paths are relative to the current working directory.
DATA_PATH = Path("../data/processed/01_cleaned_advanced.csv")
OUT_PATH = Path("../data/processed/02_clustered_data.csv")

In [None]:
# Pipeline: load table -> pick numeric features -> PCA -> cluster -> save.
df = pd.read_csv(DATA_PATH)

# Columns left out of the clustering features; every other numeric column
# in the table is used.
exclude = {
    "Patient ID",
    "SARS-Cov-2 exam result",
    "Patient age quantile",
    "Patient addmited to regular ward (1=yes, 0=no)",
    "Patient addmited to semi-intensive unit (1=yes, 0=no)",
    "Patient addmited to intensive care unit (1=yes, 0=no)",
}

# Walk the columns in file order so the feature order matches the CSV.
feature_cols = []
for column in df.columns:
    if column in exclude:
        continue
    if pd.api.types.is_numeric_dtype(df[column]):
        feature_cols.append(column)
X = df.loc[:, feature_cols].copy()

# A fractional n_components asks PCA to keep as many components as needed
# to explain that share (here 95%) of the variance.
pca = PCA(n_components=0.95, random_state=42)
X_pca_arr = pca.fit_transform(X)
n_comp = X_pca_arr.shape[-1]
pc_names = [f"PC{i+1}" for i in range(n_comp)]
# Reuse X's index so the component rows stay aligned with the source rows.
X_pca = pd.DataFrame(X_pca_arr, index=X.index, columns=pc_names)

# Cluster in PCA space rather than on the raw features.
clustering = ClinicalClustering(method="kmeans", n_clusters=3)
labels = clustering.fit_predict(X_pca)

# Write the full original table plus a "Cluster" column; `df` itself is
# left unmodified.
df_out = df.assign(Cluster=labels)
df_out.to_csv(OUT_PATH, index=False)
print(f"Clustered: {OUT_PATH} | k=3, PCA components={n_comp}")

In [None]:
# Two-dimensional view for plotting: the first two principal components plus
# the cluster label as a string column.
pca_df = X_pca.loc[:, ["PC1", "PC2"]].assign(Cluster=labels.astype(str))

scatter_path = FIG_DIR / "pca_kmeans_k3.png"
plot_pca_scatter(
    pca_df,
    cluster_col="Cluster",
    title="PCA Projection: K-Means k=3",
    save_path=str(scatter_path),
)

In [None]:
# Per-cluster mean of every original (pre-PCA) feature, one row per cluster.
centroids = X.groupby(labels).mean()
row_names = [f"Cluster {i}" for i in centroids.index]
centroids.index = row_names

# Markers to show in the heatmap; any that are absent from the feature
# table are skipped rather than raising.
markers = [
    "Leukocytes",
    "Hemoglobin",
    "Platelets",
    "Lymphocytes",
    "Neutrophils",
    "Urea",
    "Creatinine",
]
available_markers = [name for name in markers if name in centroids.columns]
heatmap_path = FIG_DIR / "centroid_heatmap_k3.png"
plot_centroid_heatmap(
    centroids,
    markers=available_markers,
    title="Cluster Centroids (Key Markers): Clinical Story",
    save_path=str(heatmap_path),
)