
# Phase 3 - Oncogeriatrics Phenotyping (Clean Notebook)

This notebook rebuilds the phenotyping pipeline from scratch with a clean, reproducible structure.

**What this notebook does**
- Loads the integrated cohort (CSV, Parquet, or Stata `.dta`) or generates a synthetic demo cohort if no file is found at `DATA_PATH`
- Applies coherent preprocessing (type casting, imputation, scaling)
- Performs dimensionality reduction (optional) and clustering
- Evaluates internal validity (silhouette, Davies-Bouldin, and Calinski-Harabasz scores), cluster stability (bootstrap ARI, i.e. adjusted Rand index), and clinical coherence (simple outcome enrichment)
- Optionally fits survival curves per phenotype if `lifelines` is available
- Exports artifacts (labels, summaries, figures) for the next phases

> Tip: run top to bottom. If you have a real dataset, set `DATA_PATH` in the Config cell.


In [1]:

# =====================
# Config and Imports
# =====================
import os
import sys
import warnings
from dataclasses import dataclass
from typing import List, Optional, Tuple, Dict

import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from sklearn.model_selection import train_test_split

# Survival is optional
try:
    from lifelines import KaplanMeierFitter
    LIFELINES_AVAILABLE = True
except Exception:
    LIFELINES_AVAILABLE = False

# Plots - matplotlib only (no seaborn)
import matplotlib.pyplot as plt

# Clean output: silence only deprecation-style noise. A blanket
# filterwarnings("ignore") would also hide warnings that bear on the validity
# of the results (e.g. convergence or numerical warnings from fitted models).
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

# Reproducibility: seed NumPy's global random state, which the synthetic-data
# fallback draws from.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# I/O
DATA_PATH = ""  # e.g., "/mnt/data/your_integrated_cohort.csv" or ".parquet"
OUTPUT_DIR = "/mnt/data/phase3_outputs"
# Created eagerly so later cells can save figures and CSVs without checking.
os.makedirs(OUTPUT_DIR, exist_ok=True)

print(f"lifelines available: {LIFELINES_AVAILABLE}")
print(f"OUTPUT_DIR: {OUTPUT_DIR}")


  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


lifelines available: True
OUTPUT_DIR: /mnt/data/phase3_outputs


In [None]:

# =====================
# Data Loading
# =====================
def load_data(path: Optional[str] = None) -> pd.DataFrame:
    """Load the integrated cohort, or build a synthetic demo cohort.

    Parameters
    ----------
    path:
        File to load (``.csv``, ``.parquet`` or ``.dta``; the extension is
        matched case-insensitively). ``None`` (the default) means "use the
        module-level ``DATA_PATH``", looked up at call time so that editing
        ``DATA_PATH`` takes effect without redefining this function.

    Returns
    -------
    pd.DataFrame
        The file contents when ``path`` points to an existing file. Otherwise
        a 600-row synthetic cohort drawn from NumPy's global random state, so
        the result depends on how that state was seeded.

    Raises
    ------
    ValueError
        If the file exists but its extension is not supported.
    """
    if path is None:
        path = DATA_PATH

    if path and os.path.exists(path):
        readers = {
            ".csv": pd.read_csv,
            ".parquet": pd.read_parquet,
            ".dta": pd.read_stata,
        }
        ext = os.path.splitext(path)[1].lower()
        reader = readers.get(ext)
        if reader is None:
            raise ValueError(f"Unsupported file extension: {ext}")
        df = reader(path)
        print(f"Loaded real dataset: {path} with shape {df.shape}")
        return df

    if path:
        # A path was requested but is missing: keep the runnable fallback, but
        # make it impossible to mistake synthetic rows for the real cohort.
        print(f"WARNING: data file not found: {path!r}. Falling back to SYNTHETIC data.")

    # Synthetic fallback to keep notebook fully runnable
    n = 600
    df = pd.DataFrame({
        "age": np.random.normal(74, 6, n).clip(55, 95),
        "bmi": np.random.normal(26, 4.5, n).clip(16, 45),
        "hemoglobin": np.random.normal(12.5, 1.5, n).clip(7, 18),
        "wbc": np.random.normal(7.2, 2.0, n).clip(2, 20),
        "platelets": np.random.normal(240, 60, n).clip(80, 600),
        "stage": np.random.choice(["I", "II", "III", "IV"], n, p=[0.25, 0.35, 0.25, 0.15]),
        "treatment_line": np.random.choice([1, 2, 3], n, p=[0.6, 0.3, 0.1]),
        "surgery": np.random.choice([0, 1], n, p=[0.4, 0.6]),
        "sex": np.random.choice(["F", "M"], n),
        # outcomes
        "overall_survival_event": np.random.choice([0, 1], n, p=[0.62, 0.38]),
        "overall_survival_days": np.random.exponential(700, size=n).astype(int) + np.random.randint(0, 200, n),
    })
    # Inject some missingness (each lab/anthropometric value is blanked
    # independently with probability 0.06)
    for col in ["bmi", "hemoglobin", "wbc", "platelets"]:
        mask = np.random.rand(n) < 0.06
        df.loc[mask, col] = np.nan
    print(f"Generated synthetic dataset with shape {df.shape}")
    return df

# Load with the default path; the trailing expression previews the first rows
# as the cell output.
df = load_data()
df.head()


In [None]:

# =====================
# Feature Registry and Cohort Definition
# =====================

# Columns routed to the numeric branch of the preprocessor
# (median imputation followed by standard scaling).
NUMERIC_FEATURES = [
    "age","bmi","hemoglobin","wbc","platelets"
]

# Columns routed to the categorical branch of the preprocessor
# (most-frequent imputation followed by one-hot encoding).
CATEGORICAL_FEATURES = [
    "stage","treatment_line","surgery","sex"
]

# Outcome columns: not part of the clustering feature matrix; used for the
# outcome checks and exported next to the phenotype labels.
OUTCOME_COLS = ["overall_survival_event", "overall_survival_days"]

def define_cohort(
    data: pd.DataFrame,
    min_age: float = 55,
    max_age: float = 95,
    required: Tuple[str, ...] = ("age",),
) -> pd.DataFrame:
    """Apply the basic inclusion criteria and return the analysis cohort.

    Parameters
    ----------
    data:
        Input table; must contain an ``age`` column and every column named in
        ``required``.
    min_age, max_age:
        Inclusive age bounds.
    required:
        Columns that must be non-missing for a row to be included. Only
        ``age`` is enforced by default; pass ``("age", "stage")`` to also
        exclude rows with a missing stage instead of leaving them to
        downstream imputation.

    Returns
    -------
    pd.DataFrame
        An independent copy of the included rows, keeping the original index
        so rows can be traced back to ``data``.
    """
    complete = data.dropna(subset=list(required))
    # Series.between is inclusive on both ends and is False for NaN.
    in_range = complete["age"].between(min_age, max_age)
    # Copy after filtering so callers can add columns (e.g. labels) without
    # touching ``data`` or triggering chained-assignment warnings.
    cohort = complete.loc[in_range].copy()
    print(f"Cohort size after basic inclusion: {cohort.shape[0]}")
    return cohort

# Build the analysis cohort; the trailing expression shows a fixed (seeded)
# random sample of 5 rows as the cell output.
cohort = define_cohort(df)
cohort.sample(5, random_state=RANDOM_SEED)


In [None]:

# =====================
# Preprocessing Pipeline
# =====================
# Numeric branch: fill missing values with the column median, then
# standardize each column.
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

# Categorical branch: fill missing values with the most frequent level, then
# one-hot encode into a dense array. Levels not seen at fit time are ignored
# at transform time (handle_unknown="ignore").
# NOTE(review): `sparse_output` presumably requires a recent scikit-learn
# release (older ones used `sparse`) - confirm against the pinned version.
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])

# Route each feature list to its branch; the outputs are concatenated in the
# order listed here ("num" first, then "cat").
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, NUMERIC_FEATURES),
        ("cat", categorical_transformer, CATEGORICAL_FEATURES)
    ]
)

# Optional PCA (set N_COMPONENTS to None to skip)
# Read by embed_features at call time.
N_COMPONENTS = 5

def embed_features(X: pd.DataFrame, use_pca: bool = True) -> Tuple[np.ndarray, Dict[str, object]]:
    """Preprocess ``X`` and optionally project it with PCA.

    The module-level ``preprocessor`` is fitted in place on ``X`` (so it is
    refitted on every call). PCA is applied only when ``use_pca`` is true and
    the module-level ``N_COMPONENTS`` is a positive number.

    Parameters
    ----------
    X:
        Feature table containing the columns the preprocessor expects.
    use_pca:
        Set to ``False`` to return the preprocessed matrix without PCA.

    Returns
    -------
    (embedding, meta)
        ``embedding`` is the matrix to cluster on. ``meta`` holds the fitted
        ``"preprocessor"``, the fitted ``"pca"`` when PCA was applied, and
        ``"embedding_name"`` (``"PCA"`` or ``"RawScaled"``).
    """
    Xt = preprocessor.fit_transform(X)
    meta: Dict[str, object] = {"preprocessor": preprocessor}

    if use_pca and N_COMPONENTS is not None and N_COMPONENTS > 0:
        # PCA needs n_components <= min(n_samples, n_features); capping by the
        # feature count alone fails on cohorts smaller than N_COMPONENTS.
        n_components = min(N_COMPONENTS, *Xt.shape)
        pca = PCA(n_components=n_components, random_state=RANDOM_SEED)
        Z = pca.fit_transform(Xt)
        meta["pca"] = pca
        meta["embedding_name"] = "PCA"
        return Z, meta

    meta["embedding_name"] = "RawScaled"
    return Xt, meta

# Feature matrix = registered numeric + categorical columns only (outcome
# columns are deliberately left out of the clustering input).
X = cohort[NUMERIC_FEATURES + CATEGORICAL_FEATURES].copy()
Z, embed_meta = embed_features(X, use_pca=True)
# Report which embedding was produced and its (rows, dimensions) shape.
print(embed_meta["embedding_name"], Z.shape)


In [None]:

# =====================
# Clustering
# =====================
@dataclass
class ClusterConfig:
    """Settings consumed by ``fit_cluster``: which algorithm to run, how many
    clusters/components to request, and the seed passed to the estimator."""
    # One of the names fit_cluster dispatches on.
    algorithm: str = "kmeans"   # "kmeans", "gmm", "agg"
    # Passed as n_clusters (KMeans, agglomerative) or n_components (GMM).
    n_clusters: int = 3
    # Passed to KMeans and GaussianMixture; the "agg" branch does not use it.
    random_state: int = RANDOM_SEED

# Configuration used by the rest of the notebook (overrides the default of
# 3 clusters with 4).
CONFIG = ClusterConfig(algorithm="kmeans", n_clusters=4, random_state=RANDOM_SEED)

def fit_cluster(Z: np.ndarray, cfg: ClusterConfig) -> Tuple[np.ndarray, object]:
    """Fit the clustering algorithm selected by ``cfg`` and label every row.

    Parameters
    ----------
    Z:
        Embedding matrix to cluster (one row per sample).
    cfg:
        Supplies ``algorithm`` ("kmeans", "gmm" or "agg"), ``n_clusters`` and
        ``random_state``.

    Returns
    -------
    (labels, model)
        The per-row cluster assignments and the fitted estimator.

    Raises
    ------
    ValueError
        If ``cfg.algorithm`` is not one of the supported names.
    """
    # Factories are lazy so only the requested estimator is ever constructed.
    factories = {
        "kmeans": lambda: KMeans(n_clusters=cfg.n_clusters, n_init=10, random_state=cfg.random_state),
        "gmm": lambda: GaussianMixture(n_components=cfg.n_clusters, random_state=cfg.random_state),
        # Agglomerative clustering takes no seed, so random_state is unused here.
        "agg": lambda: AgglomerativeClustering(n_clusters=cfg.n_clusters),
    }
    make_estimator = factories.get(cfg.algorithm)
    if make_estimator is None:
        raise ValueError(f"Unknown algorithm: {cfg.algorithm}")

    estimator = make_estimator()
    assignments = estimator.fit_predict(Z)
    return assignments, estimator

# Cluster the embedding and attach the assignment to the cohort.
labels, cluster_model = fit_cluster(Z, CONFIG)
# NOTE(review): the copy presumably guards against assigning into a filtered
# slice returned by define_cohort - confirm before removing.
cohort = cohort.copy()
cohort["phenotype"] = labels.astype(int)
# Cell output: number of rows per phenotype, ordered by phenotype id.
cohort["phenotype"].value_counts().sort_index()


In [None]:

# =====================
# Internal Validity
# =====================
def internal_validity(z: np.ndarray, labels: np.ndarray) -> dict:
    """Compute silhouette, Calinski-Harabasz and Davies-Bouldin scores.

    Parameters
    ----------
    z:
        Embedding matrix (one row per sample).
    labels:
        Cluster assignment for each row of ``z``.

    Returns
    -------
    dict
        Keys ``"silhouette"``, ``"calinski_harabasz"`` and
        ``"davies_bouldin"``. All three are NaN when the scores cannot be
        computed: a single cluster, or no more samples than clusters.
    """
    n_groups = len(np.unique(labels))
    scorable = n_groups > 1 and z.shape[0] > n_groups

    if not scorable:
        return {
            "silhouette": np.nan,
            "calinski_harabasz": np.nan,
            "davies_bouldin": np.nan,
        }

    return {
        "silhouette": float(silhouette_score(z, labels)),
        "calinski_harabasz": float(calinski_harabasz_score(z, labels)),
        "davies_bouldin": float(davies_bouldin_score(z, labels)),
    }

# Score the fitted clustering on the embedding it was fitted on.
iv = internal_validity(Z, labels)
print(iv)

# 2D scatter of the first two embedding dimensions, one colour per phenotype.
# Skipped when the embedding has fewer than two columns.
if Z.shape[1] >= 2:
    plt.figure(figsize=(6,5))
    for k in np.unique(labels):
        sel = labels == k  # boolean mask of rows assigned to phenotype k
        plt.scatter(Z[sel,0], Z[sel,1], s=10, alpha=0.7, label=f"Phenotype {k}")
    plt.title("Embedding scatter by phenotype")
    # Axes are principal components only when PCA was actually applied.
    plt.xlabel("PC1" if "pca" in embed_meta else "Dim 1")
    plt.ylabel("PC2" if "pca" in embed_meta else "Dim 2")
    plt.legend()
    plt.tight_layout()
    # Save before show() so the figure is written to OUTPUT_DIR.
    plt.savefig(os.path.join(OUTPUT_DIR, "embedding_scatter.png"), dpi=150)
    plt.show()


In [None]:

# =====================
# Stability (Bootstrap ARI)
# =====================
from sklearn.metrics.cluster import adjusted_rand_score

def bootstrap_ari(z: np.ndarray, cfg: "ClusterConfig", n_boot: int = 30, sample_frac: float = 0.8) -> float:
    """Estimate cluster stability as the mean ARI over random subsamples.

    The full data are clustered once. Each round then re-clusters a random
    subset of rows and compares, on that subset only, the new labels with the
    full-data labels using the adjusted Rand index. Despite the name, rows are
    drawn *without* replacement (subsampling), so each row appears at most
    once per round.

    Parameters
    ----------
    z:
        Embedding matrix (one row per sample).
    cfg:
        Clustering configuration; ``cfg.random_state`` also seeds the
        subsampling so repeated calls give the same result.
    n_boot:
        Number of subsampling rounds.
    sample_frac:
        Fraction of rows drawn each round; must be in ``(0, 1]``.

    Returns
    -------
    float
        Mean ARI across rounds, or NaN when ``n_boot`` is 0.

    Raises
    ------
    ValueError
        If ``sample_frac`` is outside ``(0, 1]`` or selects no rows.
    """
    if not 0.0 < sample_frac <= 1.0:
        raise ValueError(f"sample_frac must be in (0, 1], got {sample_frac}")
    n = z.shape[0]
    size = int(n * sample_frac)
    if size < 1:
        raise ValueError(f"sample_frac={sample_frac} selects no rows from {n} samples")

    # Local generator: the estimate no longer depends on (or perturbs) the
    # global NumPy random state, so re-running the cell reproduces it.
    rng = np.random.default_rng(cfg.random_state)

    base_labels, _ = fit_cluster(z, cfg)
    aris = []
    for _round in range(n_boot):
        idx = rng.choice(n, size=size, replace=False)
        boot_labels = fit_cluster(z[idx], cfg)[0]
        # ARI is invariant to label permutation, so the two labelings are
        # compared directly on the shared subset without aligning cluster ids.
        aris.append(adjusted_rand_score(base_labels[idx], boot_labels))

    if not aris:
        return float("nan")
    return float(np.mean(aris))

# Stability estimate: mean ARI over 25 rounds, each re-clustering 80% of rows.
ari = bootstrap_ari(Z, CONFIG, n_boot=25, sample_frac=0.8)
print({"bootstrap_ARI": ari})


In [None]:

# =====================
# Clinical Coherence
# =====================
def summarize_by_phenotype(df_: pd.DataFrame, numeric: list, categorical: list) -> Tuple[pd.DataFrame, Dict[str, pd.DataFrame]]:
    """Describe each phenotype by its numeric and categorical features.

    Parameters
    ----------
    df_:
        Table with a ``phenotype`` column plus the listed feature columns.
    numeric:
        Columns summarised with count, mean, std, median, min and max.
    categorical:
        Columns summarised as within-phenotype proportions.

    Returns
    -------
    (num_summary, cat_summary)
        ``num_summary`` has one row per phenotype and flattened columns named
        ``<feature>_<statistic>``. ``cat_summary`` maps each categorical
        column to a phenotype-by-level table of proportions. The denominator
        is the full phenotype size, so a row sums to less than 1 when the
        column has missing values in that phenotype.
    """
    by_pheno = df_.groupby("phenotype")

    num_summary = by_pheno[numeric].agg(["count", "mean", "std", "median", "min", "max"])
    # flatten the (feature, statistic) column MultiIndex
    num_summary.columns = ["_".join([c for c in col if c]) for col in num_summary.columns.to_flat_index()]

    # Phenotype sizes do not depend on the column, so compute them once.
    pheno_sizes = by_pheno.size()
    cat_summary = {
        c: (df_.groupby(["phenotype", c]).size() / pheno_sizes).unstack(fill_value=0.0)
        for c in categorical
    }
    return num_summary, cat_summary

# Per-phenotype summaries of the registered features; the trailing expression
# previews the numeric table as the cell output.
num_sum, cat_sums = summarize_by_phenotype(cohort, NUMERIC_FEATURES, CATEGORICAL_FEATURES)
num_sum.head()


In [None]:

# =====================
# Outcome Checks (and optional KM)
# =====================
# Chi-square for event rate differences and ANOVA for survival days (simple illustration)
from scipy.stats import chi2_contingency, f_oneway

# Event rates table by phenotype (rows: phenotype, columns: event value)
event_tab = pd.crosstab(cohort["phenotype"], cohort["overall_survival_event"])
if event_tab.shape[1] == 2:
    chi2, p, dof, exp = chi2_contingency(event_tab.values)
    print({"chi2_event_by_pheno": chi2, "p_value": p})
else:
    print("Event table not binary. Skipping chi2.")

# Survival days ANOVA (crude: it ignores censoring; real analysis should use
# survival models)
groups = [g["overall_survival_days"].dropna().values for _, g in cohort.groupby("phenotype")]
# A phenotype with no observed durations cannot contribute to the ANOVA.
groups = [vals for vals in groups if len(vals) > 0]
if len(groups) > 1:
    F, p = f_oneway(*groups)
    print({"anova_surv_days_by_pheno": F, "p_value": p})
else:
    print("Fewer than two phenotypes with survival days. Skipping ANOVA.")

# Kaplan-Meier per phenotype if lifelines is available
if LIFELINES_AVAILABLE:
    km = KaplanMeierFitter()
    plt.figure(figsize=(7,5))
    for k, g in cohort.groupby("phenotype"):
        # Drop rows with a missing duration or event flag so a few incomplete
        # records do not make the whole phenotype fail to fit.
        g = g.dropna(subset=["overall_survival_days", "overall_survival_event"])
        try:
            km.fit(durations=g["overall_survival_days"], event_observed=g["overall_survival_event"], label=f"Pheno {k}")
            km.plot()
        except Exception as err:
            # Still best effort (one bad group must not abort the figure), but
            # report it: a silently missing curve is easy to overlook.
            print(f"KM fit/plot failed for phenotype {k}: {err!r}")
    plt.title("Kaplan-Meier curves by phenotype")
    plt.xlabel("Days")
    plt.ylabel("Survival probability")
    plt.tight_layout()
    plt.savefig(os.path.join(OUTPUT_DIR, "km_by_phenotype.png"), dpi=150)
    plt.show()
else:
    print("lifelines not installed. Skipping KM plot.")


In [None]:

# =====================
# Export Artifacts
# =====================
labels_path = os.path.join(OUTPUT_DIR, "phenotype_labels.csv")
summary_path = os.path.join(OUTPUT_DIR, "numeric_summary.csv")

# Write the cohort index as the first column: without a row key the labels
# cannot be joined back to the source rows (the inclusion filter leaves gaps
# in the index, so row position is not a substitute).
cohort[["phenotype"] + OUTCOME_COLS].to_csv(
    labels_path,
    index=True,
    index_label=cohort.index.name or "cohort_index",
)
num_sum.to_csv(summary_path)

# Save categorical distributions (one phenotype-by-level table per column)
for k, tab in cat_sums.items():
    outp = os.path.join(OUTPUT_DIR, f"cat_{k}_distribution.csv")
    tab.to_csv(outp)

# Only advertise figures that are actually on disk (the KM plot is skipped
# when lifelines is missing, the scatter when the embedding is 1-D).
figure_paths = [
    os.path.join(OUTPUT_DIR, "embedding_scatter.png"),
    os.path.join(OUTPUT_DIR, "km_by_phenotype.png"),
]
print({
    "labels_csv": labels_path,
    "numeric_summary_csv": summary_path,
    "figures_example": [fp for fp in figure_paths if os.path.exists(fp)],
})



## Next steps and how to plug your real data

1. Set `DATA_PATH` in the Config cell to your integrated cohort file (CSV or Parquet recommended).
2. Update `NUMERIC_FEATURES`, `CATEGORICAL_FEATURES`, and `OUTCOME_COLS` to match your schema.
3. Adjust inclusion criteria in `define_cohort` to match clinical definitions for your study.
4. Tune clustering by editing the `CONFIG` instance of `ClusterConfig` (algorithm, number of clusters).
5. Re-run the notebook top-to-bottom and review internal validity, stability, and clinical coherence outputs.
6. The generated artifacts in `/mnt/data/phase3_outputs` can feed your translational and validation phases.
