In [1]:
# ==============================================================
# PROJECT: Therapeutic Failure Phenotype Discovery
# PHASE: Unsupervised Clustering (Phenotype Discovery)
# FILE: 03_clustering_pipeline_minimal.py
# --------------------------------------------------------------
# FINAL VERSION — only essential outputs:
#   • PROJECT_CLUSTERED_SEVERITY_DATA.csv
#   • tfidf_vectorizer.joblib
#   • truncated_svd.joblib
#   • kmeans_failure_pheno.joblib
# ==============================================================

import os
import time
import joblib
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.model_selection import train_test_split

In [2]:
# --------------------------------------------------------------
# 0) CONFIGURATION
# --------------------------------------------------------------
BASE_DIR = r"D:\ML_Project"
DATA_DIR = os.path.join(BASE_DIR, "data", "processed")
MODELS_DIR = os.path.join(BASE_DIR, "models")
LOG_DIR = os.path.join(BASE_DIR, "logs")

SAMPLE_TRAIN_EVAL = os.path.join(DATA_DIR, "sample_train_eval.csv")
OUT_CLUSTERED_FILE = os.path.join(DATA_DIR, "PROJECT_CLUSTERED_SEVERITY_DATA.csv")

TFIDF_ARTIFACT = os.path.join(MODELS_DIR, "tfidf_vectorizer.joblib")
SVD_ARTIFACT = os.path.join(MODELS_DIR, "truncated_svd.joblib")
KMEANS_ARTIFACT = os.path.join(MODELS_DIR, "kmeans_failure_pheno.joblib")

os.makedirs(MODELS_DIR, exist_ok=True)
os.makedirs(LOG_DIR, exist_ok=True)

TFIDF_PARAMS = {
    "max_features": 2000,
    "ngram_range": (1, 2),
    "min_df": 20,
    "max_df": 0.9,
}
SVD_COMPONENTS = 50
K_RANGE = range(2, 7)
FORCE_K = 3  # set None for auto select
RANDOM_STATE = 42

In [3]:
# --------------------------------------------------------------
# 1) LOAD SAMPLE (NOT FULL MASTER)
# --------------------------------------------------------------
start = time.time()
print("Loading sample_train_eval...")

df = pd.read_csv(SAMPLE_TRAIN_EVAL, low_memory=False)
print(f"Loaded sample: {len(df):,} rows")

df_fail = df[df["is_failure"] == 1].copy()
print(f"Failure rows for clustering: {len(df_fail):,}")

if len(df_fail) < 500:
    raise ValueError("Too few failure rows. Sampling fraction too small.")


# --------------------------------------------------------------
# 2) SEVERITY TAGGING (TEXT-BASED HEURISTIC)
# --------------------------------------------------------------
def categorize_severity(text: str) -> str:
    t = str(text).lower()
    critical = [
        "death",
        "cardiac arrest",
        "respiratory failure",
        "shock",
        "coma",
        "multi organ failure",
    ]
    hospital = [
        "hospital",
        "emergency",
        "infection",
        "pneumonia",
        "sepsis",
        "treatment failure",
    ]
    mild = ["headache", "nausea", "vomiting", "rash", "fatigue", "dizziness"]

    if any(x in t for x in critical):
        return "Critical_Failure"
    if any(x in t for x in hospital):
        return "Hospitalization_Failure"
    if any(x in t for x in mild):
        return "SideEffect_Failure"
    return "SideEffect_Failure"


df_fail["severity_category"] = df_fail["all_reaction_pts"].apply(categorize_severity)

weight_map = {
    "Critical_Failure": 3.0,
    "Hospitalization_Failure": 2.0,
    "SideEffect_Failure": 1.0,
}
df_fail["severity_weight"] = df_fail["severity_category"].map(weight_map)

Loading sample_train_eval...
Loaded sample: 701,665 rows
Failure rows for clustering: 249,048


In [4]:
# --------------------------------------------------------------
# 3) TRAIN / HOLDOUT SPLIT
# --------------------------------------------------------------
train_df, holdout_df = train_test_split(
    df_fail,
    test_size=0.20,
    random_state=RANDOM_STATE,
    stratify=df_fail["severity_category"],
)

print(f"Train failures: {len(train_df):,} | Holdout: {len(holdout_df):,}")

# --------------------------------------------------------------
# 4) TF-IDF + SVD
# --------------------------------------------------------------
custom_stopwords = sorted(
    set(ENGLISH_STOP_WORDS).union(
        {
            "drug",
            "effect",
            "reaction",
            "report",
            "adverse",
            "therapy",
            "patient",
            "suspect",
            "treatment",
        }
    )
)
TFIDF_PARAMS["stop_words"] = custom_stopwords

print("Fitting TF-IDF...")
vectorizer = TfidfVectorizer(**TFIDF_PARAMS)
X_train_tfidf = vectorizer.fit_transform(train_df["all_reaction_pts"])
X_holdout_tfidf = vectorizer.transform(holdout_df["all_reaction_pts"])

print("Applying SVD...")
svd = TruncatedSVD(n_components=SVD_COMPONENTS, random_state=RANDOM_STATE)
X_train_red = svd.fit_transform(X_train_tfidf)
X_holdout_red = svd.transform(X_holdout_tfidf)

# --------------------------------------------------------------
# 5) K SEARCH (FAST)
# --------------------------------------------------------------
print("\nFinding best K...")

sil_scores = {}
for k in K_RANGE:
    km = KMeans(n_clusters=k, n_init=10, random_state=RANDOM_STATE)
    labels = km.fit_predict(X_train_red)
    sil_scores[k] = silhouette_score(X_train_red, labels)
    print(f"K={k} → silhouette={sil_scores[k]:.4f}")

best_k = FORCE_K if FORCE_K is not None else max(sil_scores, key=sil_scores.get)
print(f"Selected K = {best_k}")

# --------------------------------------------------------------
# 6) FINAL KMEANS FIT + LABEL ASSIGNMENT
# --------------------------------------------------------------
print("Fitting final KMeans...")
kmeans = KMeans(n_clusters=best_k, n_init=20, random_state=RANDOM_STATE)

train_labels = kmeans.fit_predict(X_train_red)
holdout_labels = kmeans.predict(X_holdout_red)

train_df["failure_phenotype"] = train_labels
holdout_df["failure_phenotype"] = holdout_labels

label_map = {i: f"Cluster_{i}" for i in range(best_k)}

train_df["failure_phenotype_label"] = train_df["failure_phenotype"].map(label_map)
holdout_df["failure_phenotype_label"] = holdout_df["failure_phenotype"].map(label_map)

Train failures: 199,238 | Holdout: 49,810
Fitting TF-IDF...
Applying SVD...

Finding best K...
K=2 → silhouette=0.3888
K=3 → silhouette=0.4526
K=4 → silhouette=0.3517
K=5 → silhouette=0.3417
K=6 → silhouette=0.3915
Selected K = 3
Fitting final KMeans...


In [5]:
# --------------------------------------------------------------
# 7) SAVE FINAL DATASET
# --------------------------------------------------------------
combined = pd.concat([train_df, holdout_df], ignore_index=True)
combined.to_csv(OUT_CLUSTERED_FILE, index=False)

print(f"\nSaved → {OUT_CLUSTERED_FILE}")
print(f"Rows saved: {len(combined):,}")

# --------------------------------------------------------------
# 8) SAVE ARTIFACTS
# --------------------------------------------------------------
joblib.dump(vectorizer, TFIDF_ARTIFACT)
joblib.dump(svd, SVD_ARTIFACT)
joblib.dump(kmeans, KMEANS_ARTIFACT)

print("\nArtifacts saved:")
print(" - TF-IDF")
print(" - SVD")
print(" - KMeans")


Saved → D:\ML_Project\data\processed\PROJECT_CLUSTERED_SEVERITY_DATA.csv
Rows saved: 249,048

Artifacts saved:
 - TF-IDF
 - SVD
 - KMeans


In [6]:
# --------------------------------------------------------------
# 9) SILHOUETTE REPORT
# --------------------------------------------------------------
sil_train = silhouette_score(X_train_red, train_labels)
sil_combined = silhouette_score(
    np.vstack([X_train_red, X_holdout_red]),
    np.concatenate([train_labels, holdout_labels]),
)

summary = f"""
CLUSTERING SUMMARY
===================
Train failures: {len(train_df):,}
Holdout failures: {len(holdout_df):,}
Final K: {best_k}
Silhouette (train): {sil_train:.4f}
Silhouette (combined): {sil_combined:.4f}
Runtime: {round(time.time() - start, 2)} sec
"""

print(summary)
with open(os.path.join(LOG_DIR, "clustering_summary.txt"), "w") as f:
    f.write(summary)

print("\nClustering pipeline complete.")


CLUSTERING SUMMARY
Train failures: 199,238
Holdout failures: 49,810
Final K: 3
Silhouette (train): 0.4526
Silhouette (combined): 0.4517
Runtime: 2391.68 sec


Clustering pipeline complete.
