In [1]:
# pip install pycox torchtuples scikit-survival sklearn-pandas pyarrow
# If PyTorch is missing:
# pip install torch --index-url https://download.pytorch.org/whl/cpu

import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sksurv.nonparametric import kaplan_meier_estimator
from sksurv.metrics import concordance_index_ipcw
from sksurv.util import Surv

from pycox.models import CoxTime
from pycox.models.cox_time import MLPVanillaCoxTime
import torchtuples as tt


In [2]:
# =========================
# Config
# =========================
SEED = 42
EPOCHS = 80
BATCH_SIZE = 128
LR = 3e-4
WEIGHT_DECAY = 1e-4
HIDDEN = [256, 128]
VAL_FRAC = 0.2
HORIZON_QUANTILE = 0.7  # quantile des temps d'événement pour définir t*

np.random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x1866cf85df0>

In [3]:
# =========================
# Données
# =========================
# df: doit contenir OS_YEARS, OS_STATUS et des features
try:
    df = pd.read_csv("../data/df_train_scaled.csv")  # si déjà en mémoire, on l'utilise
except NameError:
    from pycox.datasets import metabric
    df = metabric.read_df().rename(columns={"duration": "OS_YEARS", "event": "OS_STATUS"})
    print("Demo METABRIC. Remplace par ton dataframe `df` avec OS_YEARS, OS_STATUS et tes features.")

target_time = "OS_YEARS"
target_event = "OS_STATUS"
feature_cols = [c for c in df.columns if c not in [target_time, target_event]]

In [4]:
# =========================
# Split
# =========================
df_trainval, df_test = train_test_split(df, test_size=0.2, random_state=SEED, stratify=df[target_event])
df_train, df_val = train_test_split(df_trainval, test_size=VAL_FRAC, random_state=SEED, stratify=df_trainval[target_event])

def xy(df_):
    X = df_[feature_cols].copy()
    y_time = df_[target_time].to_numpy(dtype="float32")
    y_event = df_[target_event].to_numpy(dtype="int64")
    return X, y_time, y_event

X_train, t_train, e_train = xy(df_train)
X_val,   t_val,   e_val   = xy(df_val)
X_test,  t_test,  e_test  = xy(df_test)

In [5]:
# =========================
# Préprocessing float32
# =========================
num_cols = [c for c in X_train.columns if pd.api.types.is_numeric_dtype(X_train[c])]
cat_cols = [c for c in X_train.columns if c not in num_cols]

pre = Pipeline(steps=[
    ("ct", ColumnTransformer([
        ("num", StandardScaler(), num_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False, dtype=np.float32), cat_cols),
    ], remainder="drop")),
])

pre.fit(X_train)
Xtr = pre.transform(X_train).astype(np.float32)
Xva = pre.transform(X_val).astype(np.float32)
Xte = pre.transform(X_test).astype(np.float32)

t_train = t_train.astype(np.float32)
t_val   = t_val.astype(np.float32)
t_test  = t_test.astype(np.float32)
e_train = e_train.astype(np.int64)
e_val   = e_val.astype(np.int64)
e_test  = e_test.astype(np.int64)


In [6]:
# =========================
# Cox-Time + coxtimemlp
# =========================
net = MLPVanillaCoxTime(
    in_features=Xtr.shape[1],
    num_nodes=HIDDEN,
    batch_norm=True,
    dropout=0.05,
)

optimizer = tt.optim.Adam(lr=LR, weight_decay=WEIGHT_DECAY)
model = CoxTime(net, optimizer, device="cpu")

# Label transform Cox-Time
labtrans = CoxTime.label_transform()
y_tr = labtrans.fit_transform(t_train, e_train)
y_va = labtrans.transform(t_val, e_val)

# Entraînement
callbacks = [tt.callbacks.EarlyStopping(patience=12)]
_ = model.fit(Xtr, y_tr, BATCH_SIZE, EPOCHS, callbacks, verbose=False, val_data=(Xva, y_va))


  return self.tuple_.apply(lambda x: x[index])


In [7]:
# =========================
# Survie prédite
# =========================
model.compute_baseline_hazards()

surv_te = model.predict_surv_df(Xte)             # index = grille de temps
time_grid = surv_te.index.values.astype(float)

# t* depuis les temps d'événement d'entraînement, projeté sur la grille
event_times_train = t_train[e_train.astype(bool) == 1]
if event_times_train.size == 0:
    raise ValueError("Aucun événement dans le train. Cox-Time et Uno IPCW en ont besoin.")
t_star_raw = float(np.quantile(event_times_train, HORIZON_QUANTILE))
t_star = float(time_grid[np.argmin(np.abs(time_grid - t_star_raw))])

In [8]:
# =========================
# τ sûr via KM de la censure
# =========================
t_km, Ghat = kaplan_meier_estimator(~e_train.astype(bool), t_train.astype(float))
if np.any(Ghat > 0):
    tau_max = t_km[Ghat > 0][-1]
    tau_safe = float(np.nextafter(tau_max, 0.0))  # léger retrait
else:
    raise ValueError("Aucune observation censurée dans le train. Uno IPCW est indéfini.")

# Clamp de l’horizon à τ
t_star_clamped = min(t_star, tau_safe)
t_star_clamped = float(time_grid[np.argmin(np.abs(time_grid - t_star_clamped))])

# Risk = 1 - S(t*)
risk_test = 1.0 - surv_te.loc[t_star_clamped].to_numpy()

In [9]:
# --- IPCW-C sur plusieurs horizons + IC bootstrap ---
from sksurv.metrics import concordance_index_ipcw
from sksurv.util import Surv

y_train_struct = Surv.from_arrays(event=e_train.astype(bool), time=t_train.astype(float))
y_test_struct  = Surv.from_arrays(event=e_test.astype(bool),  time=t_test.astype(float))

def nearest_time(t, grid):
    g = np.asarray(grid, dtype=float)
    return float(g[np.argmin(np.abs(g - t))])

# Horizons: quantiles d'événements train, clampés à tau
q_list = [0.25, 0.5, 0.75, 0.9]
event_times_train = t_train[e_train.astype(bool)]
eval_times = []
for q in q_list:
    tq = float(np.quantile(event_times_train, q))
    tq = min(tq, tau_safe)
    eval_times.append(nearest_time(tq, surv_te.index.values))

def ipcw_at_t(t_eval, idx=None):
    # risk = 1 - S(t_eval)
    s = surv_te.loc[t_eval].to_numpy()
    risk = 1.0 - s
    if idx is not None:
        return concordance_index_ipcw(y_train_struct,
                                      Surv.from_arrays(event=e_test[idx].astype(bool),
                                                       time=t_test[idx].astype(float)),
                                      risk[idx], tau=tau_safe)[0]
    return concordance_index_ipcw(y_train_struct, y_test_struct, risk, tau=tau_safe)[0]

# Scores par horizon
for t_eval in eval_times:
    c = ipcw_at_t(t_eval)
    print(f"IPCW-C at t={t_eval:.3f}y: {c:.4f}")

# Bootstrap 95% CI au t* utilisé pour ton score principal
rng = np.random.default_rng(42)
B = 300
n = len(df_test)
c_boot = []
for _ in range(B):
    idx = rng.integers(0, n, n)  # bootstrap avec remise
    c_boot.append(ipcw_at_t(t_star_clamped, idx))
c_boot = np.sort(np.array(c_boot))
lo, hi = c_boot[int(0.025*B)], c_boot[int(0.975*B)]
print(f"IPCW-C at t*={t_star_clamped:.3f}y: {ipcw_at_t(t_star_clamped):.4f} "
      f"[95% CI {lo:.4f}, {hi:.4f}]")

IPCW-C at t=0.727y: 0.6591
IPCW-C at t=1.395y: 0.6585
IPCW-C at t=2.749y: 0.6614
IPCW-C at t=4.328y: 0.6664
IPCW-C at t*=2.274y: 0.6582 [95% CI 0.5414, 0.7509]


In [10]:
# ======================================================
# Retrain Cox-Time on FULL df, then predict on df_new
# Output: DataFrame with columns ["ID", "Risk"]
# ======================================================

# Config for this step
ID_COL = "ID"
HORIZON_QUANTILE = 0.7  # choose t* from event-time quantile on full data

# Ensure these names match what you used earlier
target_time = "OS_YEARS"
target_event = "OS_STATUS"

# 1) Build feature set from FULL df
feature_cols_full = [c for c in df.columns if c not in [target_time, target_event]]
X_full = df[feature_cols_full].copy()
t_full = df[target_time].to_numpy(dtype=np.float32)
e_full = df[target_event].to_numpy(dtype=np.int64)

# 2) Preprocess on FULL df (float32)
num_cols_full = [c for c in X_full.columns if pd.api.types.is_numeric_dtype(X_full[c])]
cat_cols_full = [c for c in X_full.columns if c not in num_cols_full]
try:
    ohe_full = OneHotEncoder(handle_unknown="ignore", sparse_output=False, dtype=np.float32)
except TypeError:
    ohe_full = OneHotEncoder(handle_unknown="ignore", sparse=False, dtype=np.float32)

pre_full = ColumnTransformer([
    ("num", StandardScaler(), num_cols_full),
    ("cat", ohe_full, cat_cols_full),
], remainder="drop")

pre_full.fit(X_full)
Xfull = pre_full.transform(X_full).astype(np.float32)

In [11]:
# 3) Cox-Time with MLPVanillaCoxTime on FULL data
net_full = MLPVanillaCoxTime(
    in_features=Xfull.shape[1],
    num_nodes=[256, 128],
    batch_norm=True,
    dropout=0.05,
)
optimizer_full = tt.optim.Adam(lr=3e-4, weight_decay=1e-4)
model_full = CoxTime(net_full, optimizer_full, device="cpu")

labtrans_full = CoxTime.label_transform()
y_full = labtrans_full.fit_transform(t_full, e_full)

_ = model_full.fit(Xfull, y_full, batch_size=256, epochs=120, callbacks=None, verbose=False)

# Baseline hazards are required before predict_surv_df
try:
    model_full.compute_baseline_hazards()
except TypeError:
    model_full.compute_baseline_hazards(Xfull, y_full)

  return self.tuple_.apply(lambda x: x[index])


In [13]:

# 4) Define horizon t* from FULL event times and clamp by tau from censoring KM
surv_full_pred = model_full.predict_surv_df(Xfull)  # just to grab the time grid
time_grid_full = surv_full_pred.index.values.astype(float)

event_times_full = t_full[e_full.astype(bool) == 1]
if event_times_full.size == 0:
    raise ValueError("No events in full data. Cannot define evaluation horizon t*.")

t_star_raw_full = float(np.quantile(event_times_full, HORIZON_QUANTILE))

# Tau from censoring KM on FULL data
t_km_full, Ghat_full = kaplan_meier_estimator(~e_full.astype(bool), t_full.astype(float)
)
if np.any(Ghat_full > 0):
    tau_max_full = t_km_full[Ghat_full > 0][-1]
    tau_safe_full = float(np.nextafter(tau_max_full, 0.0))
else:
    tau_safe_full = float(t_full.max())

def nearest_time(t, grid):
    g = np.asarray(grid, dtype=float)
    return float(g[np.argmin(np.abs(g - float(t)))])

t_star_full = nearest_time(min(t_star_raw_full, tau_safe_full), time_grid_full)



In [14]:

# 5) Load or reference df_new with ID and the SAME feature columns
df_new = pd.read_csv("../data/df_eval_scaled.csv")  # uncomment and set your path
assert ID_COL in df_new.columns, f"Missing ID column '{ID_COL}' in df_new."

missing_cols = [c for c in feature_cols_full if c not in df_new.columns]
if missing_cols:
    raise ValueError(f"df_new is missing required feature columns: {missing_cols}")

X_new = df_new[feature_cols_full].copy()
Xnew = pre_full.transform(X_new).astype(np.float32)

# 6) Predict survival on df_new and compute Risk = 1 - S(t*)
surv_new = model_full.predict_surv_df(Xnew)
grid_new = surv_new.index.values.astype(float)
use_t = nearest_time(min(t_star_full, tau_safe_full), grid_new)

risk_new = 1.0 - surv_new.loc[use_t].to_numpy()

# 7) Final DataFrame with exactly two columns: ID and Risk
scores = pd.DataFrame({ID_COL: df_new[ID_COL].values, "risk_score": risk_new})
print(scores.head())
scores.to_csv("risk_scores.csv", index=False)

     ID  risk_score
0  KYW1    1.000000
1  KYW2    1.000000
2  KYW3    0.131294
3  KYW4    1.000000
4  KYW5    1.000000
