
# Phase 3 - Clinical Validation and Phenotype Reporting

This notebook validates that the discovered phenotypes are clinically meaningful.
It expects these files in `phase3_artifacts/`:
- phase3_integrated_data.csv
- cluster_labels.csv

It will:
1. Merge clusters into the integrated data.
2. Detect or accept survival time and event columns.
3. Plot Kaplan-Meier curves by cluster and run pairwise log-rank tests.
4. Compare binary outcomes (ADR, readmission, toxicity, mortality) across clusters.
5. Compare continuous outcomes (frailty score, risk index) across clusters.
6. Summarize each phenotype into a compact card for reporting.


In [1]:

from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats

# Optional survival analysis. Install if missing.
try:
    from lifelines import KaplanMeierFitter
    from lifelines.statistics import pairwise_logrank_test
    HAS_LIFELINES = True
except Exception:
    HAS_LIFELINES = False
    print("lifelines not found. Install with: pip install lifelines")

# Locations of the two input files this notebook reads.
BASE = Path(".")
ART = BASE / "phase3_artifacts"

INTEGRATED = ART / "phase3_integrated_data.csv"
CLUSTERS   = ART / "cluster_labels.csv"

# Fail early with an actionable message instead of a bare pandas traceback.
missing_inputs = [p for p in (INTEGRATED, CLUSTERS) if not p.exists()]
if missing_inputs:
    raise FileNotFoundError(
        "Missing input file(s): "
        + ", ".join(str(p.resolve()) for p in missing_inputs)
        + f". Expected them under {ART} relative to the working directory {Path.cwd()}."
    )

df = pd.read_csv(INTEGRATED)
labels = pd.read_csv(CLUSTERS)

if "cluster" not in labels.columns:
    raise KeyError(f"{CLUSTERS} has no 'cluster' column; found {list(labels.columns)}")

# Labels are attached by row position, so both files must have the same number
# of rows. NOTE(review): this also assumes both files share the same row order;
# confirm against the step that wrote cluster_labels.csv.
if len(labels) != len(df):
    raise ValueError(
        f"Row count mismatch: {INTEGRATED.name} has {len(df)} rows but "
        f"{CLUSTERS.name} has {len(labels)}; cluster labels cannot be aligned by position."
    )

df["cluster"] = labels["cluster"].to_numpy()

print("Shape:", df.shape)
print("Cluster counts:", df["cluster"].value_counts().sort_index().to_dict())
df.head(3)


  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


FileNotFoundError: [Errno 2] No such file or directory: 'phase3_artifacts\\phase3_integrated_data.csv'

In [None]:

# Choose survival columns. Override here if the auto-detector is wrong.
SURV_TIME_COL = None      # e.g., "time_to_event_days"
SURV_EVENT_COL = None     # e.g., "event" (1=event/death, 0=censored)

def find_cols(patterns):
    """Return the columns of `df` whose lower-cased name contains any pattern.

    Parameters
    ----------
    patterns : iterable of str
        Lower-case substrings to look for in each column name.

    Returns
    -------
    list
        Matching column names in `df.columns` order, without duplicates.
    """
    hits = [c for c in df.columns if any(p in str(c).lower() for p in patterns)]
    return list(dict.fromkeys(hits))

def _is_binary_flag(series):
    """True when `series` has at least one non-null value and all of them are 0 or 1."""
    observed = set(pd.Series(series).dropna().unique())
    # An all-NaN column yields an empty set, which is a subset of {0, 1};
    # require at least one observed value so such columns are not accepted.
    return bool(observed) and observed.issubset({0, 1})

# A mistyped override should fail here rather than deep inside a later cell.
for _name, _col in (("SURV_TIME_COL", SURV_TIME_COL), ("SURV_EVENT_COL", SURV_EVENT_COL)):
    if _col is not None and _col not in df.columns:
        raise KeyError(f"{_name}={_col!r} is not a column of df")

cand_time  = find_cols(["survival_time","time_to_event","followup","follow_up","days_to","time"])
cand_event = find_cols(["event","death","mortality","status","censor","censored"])

# Heuristic picks if no override is given
event_was_auto_detected = SURV_EVENT_COL is None

if SURV_TIME_COL is None:
    # A duration must be numeric; a pure 0/1 column is a flag, not a duration.
    SURV_TIME_COL = next(
        (c for c in cand_time
         if pd.api.types.is_numeric_dtype(df[c]) and not _is_binary_flag(df[c])),
        None,
    )

if SURV_EVENT_COL is None:
    # Never reuse the time column as the event indicator.
    SURV_EVENT_COL = next(
        (c for c in cand_event if c != SURV_TIME_COL and _is_binary_flag(df[c])),
        None,
    )

print("Detected time column:", SURV_TIME_COL)
print("Detected event column:", SURV_EVENT_COL)

if event_was_auto_detected and SURV_EVENT_COL is not None and "censor" in str(SURV_EVENT_COL).lower():
    # The downstream analysis treats 1 as "event observed"; a censoring flag is
    # often coded the other way round, so ask the user to confirm.
    print(
        f"Warning: '{SURV_EVENT_COL}' looks like a censoring indicator. "
        "Check that 1 means the event occurred, or set SURV_EVENT_COL explicitly."
    )


In [2]:

# Kaplan Meier curves and pairwise log-rank tests
if HAS_LIFELINES and SURV_TIME_COL and SURV_EVENT_COL:
    # Analyse complete rows only: missing durations or event flags cannot be
    # used by the estimator or the log-rank test.
    surv = df[["cluster", SURV_TIME_COL, SURV_EVENT_COL]].dropna()
    n_dropped = len(df) - len(surv)
    if n_dropped:
        print(f"Dropped {n_dropped} rows with missing cluster, time or event values.")

    kmf = KaplanMeierFitter()
    # Explicit figure/axes so every curve lands on the same plot regardless of
    # whatever figure pyplot currently considers active.
    fig, ax = plt.subplots(figsize=(7, 5))
    for cl, sub in surv.groupby("cluster"):  # groupby iterates clusters in sorted order
        kmf.fit(sub[SURV_TIME_COL], event_observed=sub[SURV_EVENT_COL], label=f"Cluster {cl}")
        kmf.plot(ci_show=False, ax=ax)
    ax.set_title("Kaplan Meier survival by cluster")
    ax.set_xlabel("Time")
    ax.set_ylabel("Survival probability")
    fig.tight_layout()
    fig.savefig(ART / "km_curves_by_cluster.png", dpi=150)
    plt.show()

    # Pairwise log rank tests (need at least two clusters to compare)
    if surv["cluster"].nunique() >= 2:
        res = pairwise_logrank_test(surv[SURV_TIME_COL], surv["cluster"], surv[SURV_EVENT_COL])
        # Let a failed write raise: reporting success after a swallowed error
        # would leave a missing or stale CSV unnoticed.
        res.summary.to_csv(ART / "logrank_pairwise.csv")
        try:
            display(res.summary.head())
        except NameError:
            # `display` only exists inside IPython/Jupyter; fall back to text.
            print(res.summary.head())
    else:
        print("Pairwise log-rank tests were skipped. Fewer than two clusters have survival data.")
else:
    print("KM was skipped. lifelines not installed or survival columns not found.")


NameError: name 'SURV_TIME_COL' is not defined

In [None]:

# Binary outcomes. Save per cluster rates and chi square tests.
# `find_cols` is defined once, in the survival-column cell above; it is reused
# here rather than redefined so the two copies cannot drift apart.
cand_bin = find_cols(["readmission","adr","toxicity","frailty","mortality","death","event"])

tests = []
for c in cand_bin:
    observed = set(pd.Series(df[c]).dropna().unique())
    # Keep strictly 0/1 columns only. The non-empty check matters because an
    # all-NaN column has an empty value set, which is a subset of {0, 1}.
    if not observed or not observed.issubset({0, 1}):
        continue
    # rate table
    tab = df.groupby("cluster")[c].agg(["mean","sum","count"]).rename(columns={"mean":"rate"})
    outp = ART / f"rate_by_cluster__{c}.csv"
    tab.to_csv(outp)
    # chi square test: needs both outcome levels and at least two clusters
    cont = pd.crosstab(df["cluster"], df[c])
    if cont.shape[0] >= 2 and cont.shape[1] == 2:
        chi2, p, dof, _ = stats.chi2_contingency(cont)
        tests.append({"outcome": c, "chi2": chi2, "p_value": p, "dof": dof})

if tests:
    binary_tests_df = pd.DataFrame(tests)
    binary_tests_df.to_csv(ART / "binary_outcome_tests.csv", index=False)
    # A bare expression inside an `if` block is not rendered by Jupyter, so the
    # table has to be displayed explicitly.
    display(binary_tests_df.head())
else:
    print("No binary outcomes detected or they were not strictly 0 or 1.")


In [None]:

# Continuous outcomes. Save per cluster summaries and ANOVA or Kruskal.
CONT_PATTERNS = ["frailty_score","risk_index","score","index","survival_time","time_to_event"]

# Numeric columns (other than the cluster label) whose name matches a pattern.
cand_cont = [
    c for c in df.columns
    if c != "cluster"
    and pd.api.types.is_numeric_dtype(df[c])
    and any(p in str(c).lower() for p in CONT_PATTERNS)
]

cluster_ids = sorted(df["cluster"].dropna().unique())

tests_c = []
for c in cand_cont:
    tab = df.groupby("cluster")[c].agg(["mean","std","median","count"])
    tab.to_csv(ART / f"cont_by_cluster__{c}.csv")

    # One array of observed values per cluster; clusters with no observed
    # values are left out because they cannot contribute to either test.
    groups = [
        g for g in (df.loc[df["cluster"] == cl, c].dropna().to_numpy() for cl in cluster_ids)
        if len(g) > 0
    ]
    if len(groups) < 2:
        continue

    # Try ANOVA first. A failed ANOVA may show up as a NaN result rather than
    # an exception, so a non-finite p-value also triggers the Kruskal fallback.
    test_name, stat, p = "ANOVA", np.nan, np.nan
    try:
        stat, p = stats.f_oneway(*groups)
    except ValueError:
        pass
    if not np.isfinite(p):
        test_name = "Kruskal"
        try:
            stat, p = stats.kruskal(*groups)
        except ValueError:
            # Kruskal can reject degenerate input too; record the outcome with
            # NaN statistics rather than aborting the whole loop.
            stat, p = np.nan, np.nan
    tests_c.append({"outcome": c, "test": test_name, "stat": stat, "p_value": p})

if tests_c:
    continuous_tests_df = pd.DataFrame(tests_c)
    continuous_tests_df.to_csv(ART / "continuous_outcome_tests.csv", index=False)
    # A bare expression inside an `if` block is not rendered by Jupyter, so the
    # table has to be displayed explicitly.
    display(continuous_tests_df.head())
else:
    print("No continuous outcomes matched the simple patterns. Edit `cand_cont` logic if needed.")


In [None]:

# Phenotype cards: quick snapshot per cluster using common outcome names
OUTCOME_NAME_HINTS = ["frailty","readmission","adr","toxicity","mortality","death","event"]

def _is_binary(col):
    """True when df[col] has at least one non-null value and all of them are 0 or 1."""
    observed = set(pd.Series(df[col]).dropna().unique())
    return bool(observed) and observed.issubset({0, 1})

def try_rate(col, cluster=None):
    """Return the mean of a 0/1 column within one cluster, or NaN.

    Parameters
    ----------
    col : str
        Column name to summarise.
    cluster : optional
        Cluster label to look up. When omitted, the notebook-level loop
        variable `cl` is used if it exists, which is what a call without
        this argument relied on before.

    Returns
    -------
    float
        The rate, or NaN when the column is absent, not strictly 0/1, or
        the cluster is unknown.
    """
    if cluster is None:
        cluster = globals().get("cl", np.nan)
    if col in df.columns and _is_binary(col):
        return float(df.groupby("cluster")[col].mean().to_dict().get(cluster, np.nan))
    return np.nan

# Binary outcome columns, found once instead of once per cluster. Ordering
# follows the hint list first and then df.columns; dict.fromkeys removes
# columns that match more than one hint while keeping first-seen order.
rate_cols = list(dict.fromkeys(
    c
    for nm in OUTCOME_NAME_HINTS
    for c in df.columns
    if nm in str(c).lower() and _is_binary(c)
))

clusters = sorted(df["cluster"].dropna().unique())
# Report labels as plain ints when every label converts; otherwise keep the
# original labels so non-numeric cluster names do not crash the cell.
try:
    cluster_ids = [int(cl) for cl in clusters]
except (TypeError, ValueError):
    cluster_ids = list(clusters)

cards = []
for cl, cluster_id in zip(clusters, cluster_ids):
    sub = df[df["cluster"]==cl]
    card = {
        "cluster": cluster_id,
        "n_patients": int(len(sub)),
    }
    for c in rate_cols:
        card[f"rate_{c}"] = float(sub[c].mean())
    cards.append(card)

cards_df = pd.DataFrame(cards).sort_values("cluster")
cards_df.to_csv(ART / "phenotype_cards.csv", index=False)
cards_df.head(10)
