In [25]:
# ==============================================================
# PROJECT: Therapeutic Failure Phenotype Discovery
# PHASE: Unsupervised Clustering (Phenotype Discovery)
# FINAL VERSION — ALSO PREPARES REAL-WORLD CLUSTER FILE
# ==============================================================

import os
import time
import joblib
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.model_selection import train_test_split

# Start timer
start = time.time()



In [26]:
# --------------------------------------------------------------
# CONFIG
# --------------------------------------------------------------
BASE_DIR  = r"D:\ML_Project"
DATA_DIR  = os.path.join(BASE_DIR, "data", "processed")
MODELS_DIR = os.path.join(BASE_DIR, "models")
LOG_DIR    = os.path.join(BASE_DIR, "logs")

os.makedirs(MODELS_DIR, exist_ok=True)
os.makedirs(LOG_DIR, exist_ok=True)

SAMPLE_TRAIN_EVAL  = os.path.join(DATA_DIR, "sample_train_eval.csv")
SAMPLE_REALWORLD   = os.path.join(DATA_DIR, "sample_realworld_test.csv")

OUTPUT_CLUSTERED      = os.path.join(DATA_DIR, "PROJECT_CLUSTERED_SEVERITY_DATA.csv")
REALWORLD_CLUSTERED   = os.path.join(DATA_DIR, "REALWORLD_CLUSTERED.csv")

TFIDF_PATH = os.path.join(MODELS_DIR, "tfidf_vectorizer.joblib")
SVD_PATH   = os.path.join(MODELS_DIR, "truncated_svd.joblib")
KMEANS_PATH= os.path.join(MODELS_DIR, "kmeans_failure_pheno.joblib")
LABELMAP_PATH = os.path.join(MODELS_DIR, "cluster_label_mapping.csv")

TFIDF_PARAMS = {
    "max_features": 2000,
    "ngram_range": (1, 2),
    "min_df": 15,
    "max_df": 0.9
}

SVD_COMPONENTS = 50
K_RANGE = range(2, 7)
FORCE_K = 3
RANDOM_STATE = 42

print("Clustering Pipeline Config Loaded.")


Clustering Pipeline Config Loaded.


In [27]:
# --------------------------------------------------------------
# LOAD TRAIN/EVAL SAMPLE
# --------------------------------------------------------------
print("\nLoading sample_train_eval.csv ...")
df = pd.read_csv(SAMPLE_TRAIN_EVAL, low_memory=False)

df_fail = df[df["is_failure"] == 1].copy()
print(f"Failure rows available: {len(df_fail):,}")

if len(df_fail) < 500:
    raise ValueError("Not enough failure rows for clustering.")

# --------------------------------------------------------------
# TEXT SEVERITY HEURISTIC
# --------------------------------------------------------------

def categorize_severity(text):
    t = str(text).lower()
    critical = ["death", "cardiac arrest", "respiratory failure", "shock", "coma", "organ failure"]
    hospital = ["hospital", "emergency", "infection", "pneumonia", "sepsis", "treatment failure"]
    mild = ["headache", "nausea", "vomiting", "rash", "fatigue", "dizziness"]

    if any(x in t for x in critical): return "Critical_Failure"
    if any(x in t for x in hospital): return "Hospitalization_Failure"
    if any(x in t for x in mild): return "SideEffect_Failure"
    return "SideEffect_Failure"

df_fail["severity_category"] = df_fail["all_reaction_pts"].apply(categorize_severity)

weight_map = {"Critical_Failure": 3.0, "Hospitalization_Failure": 2.0, "SideEffect_Failure": 1.0}
df_fail["severity_weight"] = df_fail["severity_category"].map(weight_map)

# --------------------------------------------------------------
# TRAIN / HOLDOUT SPLIT
# --------------------------------------------------------------
train_df, holdout_df = train_test_split(
    df_fail, test_size=0.20, random_state=RANDOM_STATE,
    stratify=df_fail["severity_category"]
)

print(f"Train: {len(train_df):,} | Holdout: {len(holdout_df):,}")

# --------------------------------------------------------------
# TF-IDF + SVD
# --------------------------------------------------------------
print("\nFitting TF-IDF...")
custom_stopwords = sorted(set(ENGLISH_STOP_WORDS).union({
    "drug","effect","reaction","report","adverse","therapy","patient","suspect","treatment"
}))
TFIDF_PARAMS["stop_words"] = custom_stopwords

tfidf = TfidfVectorizer(**TFIDF_PARAMS)

X_train_tfidf = tfidf.fit_transform(train_df["all_reaction_pts"])
X_holdout_tfidf = tfidf.transform(holdout_df["all_reaction_pts"])

print("Applying SVD...")
svd = TruncatedSVD(n_components=SVD_COMPONENTS, random_state=RANDOM_STATE)
X_train_red = svd.fit_transform(X_train_tfidf)
X_holdout_red = svd.transform(X_holdout_tfidf)



Loading sample_train_eval.csv ...
Failure rows available: 249,048
Train: 199,238 | Holdout: 49,810

Fitting TF-IDF...
Applying SVD...


In [28]:
# --------------------------------------------------------------
# FIND BEST K (unless forced)
# --------------------------------------------------------------
print("\nEvaluating K (silhouette)...")

sil_scores = {}
for k in K_RANGE:
    km = KMeans(n_clusters=k, n_init=10,random_state=RANDOM_STATE)
    labels = km.fit_predict(X_train_red)
    s = silhouette_score(X_train_red, labels)
    sil_scores[k] = s
    print(f"K={k} → silhouette={s:.4f}")

best_k = FORCE_K if FORCE_K is not None else max(sil_scores, key=sil_scores.get)
print(f"\nSelected K = {best_k}")

# --------------------------------------------------------------
# FINAL KMEANS FIT
# --------------------------------------------------------------
kmeans = KMeans(n_clusters=best_k, n_init=25, random_state=RANDOM_STATE)

train_labels = kmeans.fit_predict(X_train_red)
holdout_labels = kmeans.predict(X_holdout_red)

train_df["failure_phenotype"] = train_labels
holdout_df["failure_phenotype"] = holdout_labels

label_map = {i: f"Cluster_{i}" for i in range(best_k)}
train_df["failure_phenotype_label"] = train_df["failure_phenotype"].map(label_map)
holdout_df["failure_phenotype_label"] = holdout_df["failure_phenotype"].map(label_map)

# --------------------------------------------------------------
# SAVE CLUSTERED TRAIN+HOLDOUT
# --------------------------------------------------------------
combined = pd.concat([train_df, holdout_df], ignore_index=True)
combined.to_csv(OUTPUT_CLUSTERED, index=False)

print(f"\nSaved clustered TRAIN/EVAL dataset → {OUTPUT_CLUSTERED}")

# --------------------------------------------------------------
# SAVE CLUSTERING ARTIFACTS
# --------------------------------------------------------------
joblib.dump(tfidf, TFIDF_PATH)
joblib.dump(svd,   SVD_PATH)
joblib.dump(kmeans, KMEANS_PATH)
pd.Series(label_map).to_csv(LABELMAP_PATH)

print("\nArtifacts saved.")

# --------------------------------------------------------------
# REAL-WORLD CLUSTERING (NO LEAKAGE)
# --------------------------------------------------------------
print("\nLoading REAL-WORLD TEST sample...")
df_real = pd.read_csv(SAMPLE_REALWORLD, low_memory=False)

if "all_reaction_pts" not in df_real:
    raise KeyError("real-world sample missing 'all_reaction_pts' column.")

print("Applying TF-IDF → SVD → KMeans to real-world data...")

real_tfidf = tfidf.transform(df_real["all_reaction_pts"].astype(str))
real_svd   = svd.transform(real_tfidf)
real_clusters = kmeans.predict(real_svd)

df_real["failure_phenotype"] = real_clusters
df_real["failure_phenotype_label"] = df_real["failure_phenotype"].map(label_map)

df_real.to_csv(REALWORLD_CLUSTERED, index=False)

print(f"Saved REAL-WORLD clustered dataset → {REALWORLD_CLUSTERED}")


Evaluating K (silhouette)...
K=2 → silhouette=0.3895
K=3 → silhouette=0.4526
K=4 → silhouette=0.3588
K=5 → silhouette=0.3662
K=6 → silhouette=0.3577

Selected K = 3

Saved clustered TRAIN/EVAL dataset → D:\ML_Project\data\processed\PROJECT_CLUSTERED_SEVERITY_DATA.csv

Artifacts saved.

Loading REAL-WORLD TEST sample...
Applying TF-IDF → SVD → KMeans to real-world data...
Saved REAL-WORLD clustered dataset → D:\ML_Project\data\processed\REALWORLD_CLUSTERED.csv


In [29]:
# --------------------------------------------------------------
# SILHOUETTE SUMMARY
# --------------------------------------------------------------
sil_train = silhouette_score(X_train_red, train_labels)
sil_all   = silhouette_score(
    np.vstack([X_train_red, X_holdout_red]),
    np.concatenate([train_labels, holdout_labels])
)

summary = f"""
CLUSTERING SUMMARY
===================
Train failures: {len(train_df):,}
Holdout failures: {len(holdout_df):,}
Final K: {best_k}

Silhouette (train):    {sil_train:.4f}
Silhouette (combined): {sil_all:.4f}

Runtime: {round(time.time() - start, 2)} seconds
"""

print(summary)
with open(os.path.join(LOG_DIR, "clustering_summary.txt"), "w") as f:
    f.write(summary)

print("Clustering pipeline complete.")



CLUSTERING SUMMARY
Train failures: 199,238
Holdout failures: 49,810
Final K: 3

Silhouette (train):    0.4526
Silhouette (combined): 0.4526

Runtime: 2369.21 seconds

Clustering pipeline complete.
