# Synthetic Risk Score Validation Notebook

This notebook provides a **rigorous validation** of a **synthetic survival risk score**.

The goal is to evaluate how well a single feature (e.g., a Cox-based linear predictor)
predicts time-to-event outcomes.

## Assumptions
- The dataset is in a CSV file with at least the following columns:
  - `OS_YEARS`: survival time in years.
  - `OS_STATUS`: event indicator (1 = event, 0 = censored).
  - `MRS`: continuous synthetic risk score (higher = worse prognosis).

---
### What this notebook will compute
1. Descriptive analysis of the risk score.
2. Kaplanâ€“Meier curves by risk quartiles + log-rank test.
3. Univariate Cox proportional hazards model for the risk score.
4. Bootstrap confidence intervals for the C-index.
5. K-fold cross-validated C-index, AUC(t), and Integrated Brier Score (IBS)
   for:
   - a baseline model (no MRS),
   - a Cox model with MRS
6. Permutation test.
7. Calibration plot at a chosen time horizon.
8. Summary + deltas (with - without MRS).


In [1]:
# ===============================
# 0. Imports & configuration
# ===============================
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from lifelines import CoxPHFitter, KaplanMeierFitter
from lifelines.statistics import multivariate_logrank_test
from lifelines.utils import concordance_index

from sksurv.util import Surv
from sksurv.metrics import (
    concordance_index_censored,
    cumulative_dynamic_auc,
    integrated_brier_score,
)
from sksurv.nonparametric import kaplan_meier_estimator

from sklearn.model_selection import KFold
import plotly.graph_objects as go

plt.rcParams['figure.figsize'] = (7, 5)
plt.rcParams['axes.grid'] = True

DATA_PATH = '../../data/train_enhanced.csv'
TIME_COL = 'OS_YEARS'
EVENT_COL = 'OS_STATUS'
RISK_COL = 'RiskScore'

AUC_TIMES = np.array([1.0, 2.0, 3.0])
CALIBRATION_TIME = 2.0

RANDOM_STATE = 42
N_BOOTSTRAP = 1000
N_SPLITS_CV = 5

np.random.seed(RANDOM_STATE)

## 1. Load and inspect data

In [2]:
df = pd.read_csv(DATA_PATH)
df = df[[TIME_COL, EVENT_COL, RISK_COL]].copy().dropna()
df[EVENT_COL] = df[EVENT_COL].astype(int)
print('Data shape:', df.shape)
df.head()

Data shape: (3173, 3)


Unnamed: 0,OS_YEARS,OS_STATUS,RiskScore
0,1.115068,1,1.000392
1,4.928767,0,1.000151
2,2.043836,0,-5.5e-05
3,2.476712,1,1.000328
4,3.145205,0,0.0


In [3]:
print(df[RISK_COL].describe())
fig = go.Figure()
fig.add_trace(go.Histogram(x=df[RISK_COL], nbinsx=30, name='Risk score'))
fig.update_layout(title='Distribution of synthetic risk score',
                  xaxis_title='Risk score', yaxis_title='Count',
                  template='simple_white', bargap=0.05)
fig.show()

count    3173.000000
mean        4.812185
std        12.870150
min        -0.000252
25%        -0.000102
50%         0.000000
75%         0.999898
max        41.275175
Name: RiskScore, dtype: float64


In [4]:
df = df.sort_values(RISK_COL).reset_index(drop=True)
df['risk_quartile'] = pd.qcut(
    df[RISK_COL],
    q=4,
    labels=['Q1 (lowest)','Q2','Q3','Q4 (highest)']
)

kmf = KaplanMeierFitter()
fig = go.Figure()

colors = [
    'rgba(31,119,180,1)',
    'rgba(44,160,44,1)',
    'rgba(255,127,14,1)',
    'rgba(214,39,40,1)',
]

for color, q in zip(colors, df['risk_quartile'].cat.categories):
    mask = df['risk_quartile'] == q

    kmf.fit(
        df.loc[mask, TIME_COL],
        df.loc[mask, EVENT_COL],
        label=str(q),
    )

    sf = kmf.survival_function_
    ci = kmf.confidence_interval_

    t = sf.index.values
    s = sf.iloc[:, 0].values
    lower = ci.iloc[:, 0].values
    upper = ci.iloc[:, 1].values

    # --- borne basse ---
    fig.add_trace(go.Scatter(
        x=t,
        y=lower,
        mode='lines',
        line=dict(width=0),
        showlegend=False,
        hoverinfo='skip',
    ))

    # --- borne haute + remplissage ---
    fig.add_trace(go.Scatter(
        x=t,
        y=upper,
        mode='lines',
        line=dict(width=0),
        fill='tonexty',
        fillcolor=color.replace('1)', '0.15)'),
        showlegend=False,
        hoverinfo='skip',
    ))

    # --- courbe KM ---
    fig.add_trace(go.Scatter(
        x=t,
        y=s,
        mode='lines',
        line=dict(color=color, width=2),
        name=str(q),
    ))

fig.update_layout(
    title='KM survival by risk quartile (95% CI)',
    xaxis_title='Time',
    yaxis_title='Survival probability',
    template='simple_white'
)

fig.show()

lr = multivariate_logrank_test(
    df[TIME_COL],
    df['risk_quartile'],
    df[EVENT_COL]
)
print(lr)


<lifelines.StatisticalResult: multivariate_logrank_test>
               t_0 = -1
 null_distribution = chi squared
degrees_of_freedom = 3
         test_name = multivariate_logrank_test

---
 test_statistic      p  -log2(p)
         600.77 <0.005    429.07


In [5]:
cph = CoxPHFitter()
cph.fit(df[[TIME_COL, EVENT_COL, RISK_COL]], duration_col=TIME_COL, event_col=EVENT_COL)
cph.print_summary()
c_index_in_sample = cph.concordance_index_
print("In-sample C-index =", c_index_in_sample)

0,1
model,lifelines.CoxPHFitter
duration col,'OS_YEARS'
event col,'OS_STATUS'
baseline estimation,breslow
number of observations,3173
number of events observed,1600
partial log-likelihood,-11482.50
time fit was run,2025-12-30 05:22:25 UTC

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,cmp to,z,p,-log2(p)
RiskScore,0.03,1.03,0.0,0.02,0.03,1.02,1.03,0.0,15.35,<0.005,174.3

0,1
Concordance,0.70
Partial AIC,22967.01
log-likelihood ratio test,188.34 on 1 df
-log2(p) of ll-ratio test,139.97


In-sample C-index = 0.6997851212427877


In [6]:
from lifelines.utils import concordance_index

from lifelines.utils import concordance_index

def bootstrap_c_index_ci(df, time_col, event_col, risk_col,
                         n_boot=1000, random_state=42):
    rng = np.random.RandomState(random_state)
    n = df.shape[0]
    cvals = []

    for _ in range(n_boot):
        idx = rng.randint(0, n, size=n)
        s = df.iloc[idx]

        c = concordance_index(
            s[time_col],
            -s[risk_col],          # plus de risque = moins de survie
            s[event_col],
        )
        cvals.append(c)

    cvals = np.array(cvals)

    mean_c = float(cvals.mean())
    std_c  = float(cvals.std(ddof=1))

    lower = float(np.percentile(cvals, 2.5))
    upper = float(np.percentile(cvals, 97.5))

    return mean_c, std_c, lower, upper, cvals


mean_c, std_c, low_c, high_c, boot = bootstrap_c_index_ci(
    df,
    time_col=TIME_COL,
    event_col=EVENT_COL,
    risk_col=RISK_COL,
    n_boot=N_BOOTSTRAP,
    random_state=RANDOM_STATE,
)

print(f"Bootstrap mean C-index   = {mean_c:.3f}")
print(f"Bootstrap std            = {std_c:.3f}")
print(f"95% CI (percentile)     = [{low_c:.3f}, {high_c:.3f}]")

fig=go.Figure()
fig.add_trace(go.Histogram(x=boot,nbinsx=30))
fig.update_layout(title='Bootstrap C-index',template='simple_white')
fig.show()

Bootstrap mean C-index   = 0.699
Bootstrap std            = 0.007
95% CI (percentile)     = [0.686, 0.712]


In [7]:
# ===== 5. Cross-validation =====
y_struct = Surv.from_dataframe(event=EVENT_COL, time=TIME_COL, data=df)
kf = KFold(n_splits=N_SPLITS_CV, shuffle=True, random_state=RANDOM_STATE)
event_field, time_field = y_struct.dtype.names

cindex_with = []; ibs_with = []; auc_with = []
cindex_base = []; ibs_base = []; auc_base = []

for tr, te in kf.split(df):
    df_tr = df.iloc[tr]
    df_te = df.iloc[te]
    y_tr = y_struct[tr]
    y_te = y_struct[te]

    # ===== Cox with MRS =====
    m = CoxPHFitter()
    m.fit(
        df_tr[[TIME_COL, EVENT_COL, RISK_COL]],
        duration_col=TIME_COL,
        event_col=EVENT_COL,
    )

    # scores de risque sur le fold test
    risk = m.predict_partial_hazard(df_te[[RISK_COL]]).values.ravel()

    # C-index (avec MRS)
    cindex_with.append(
        concordance_index_censored(
            y_te[event_field].astype(bool),
            y_te[time_field],
            risk,
        )[0]
    )

    # Survie prÃ©dite sur le test
    surv = m.predict_survival_function(df_te[[RISK_COL]])
    times = surv.index.values
    preds = surv.T.values   # shape (n_test, n_times)

    # ===== IBS (avec RiskScore) =====
    max_t = y_tr[time_field].max()
    mask = y_te[time_field] < max_t
    if mask.sum() > 0:
        y_te_ib = y_te[mask]
        preds_ib = preds[mask]
        tmin = y_te_ib[time_field].min()
        tmax = y_te_ib[time_field].max()
        mt = (times >= tmin) & (times < tmax)
        if mt.sum() >= 2:
            ibs_with.append(
                integrated_brier_score(
                    y_tr, y_te_ib, preds_ib[:, mt], times[mt]
                )
            )
        else:
            ibs_with.append(np.nan)
    else:
        ibs_with.append(np.nan)

    # ===== AUC(t) (avec RiskScore) =====
    auc_vec = np.full(len(AUC_TIMES), np.nan)
    tt = y_te[time_field]
    ok = AUC_TIMES[(AUC_TIMES >= max(tt.min(), 1e-8)) & (AUC_TIMES < tt.max())]

    if len(ok) > 0:
        try:
            et, av = cumulative_dynamic_auc(y_tr, y_te, risk, ok)
            et = np.atleast_1d(et)
            av = np.atleast_1d(av)

            for t0, a0 in zip(et, av):
                idx = np.where(np.isclose(AUC_TIMES, t0, rtol=1e-6, atol=1e-8))[0]
                if len(idx) > 0:
                    auc_vec[idx[0]] = a0
        except ValueError:
            # cas oÃ¹ la fonction de survie du censoring tombe Ã  0 -> on ignore ce fold pour l'AUC
            pass

    auc_with.append(auc_vec)

    # ===== Baseline KM (sans RiskScore) =====
    ttr, strr = kaplan_meier_estimator(
        y_tr[event_field].astype(bool),
        y_tr[time_field],
    )

    risk0 = np.zeros_like(y_te[time_field])

    # C-index baseline
    cindex_base.append(
        concordance_index_censored(
            y_te[event_field].astype(bool),
            y_te[time_field],
            risk0,
        )[0]
    )

    # IBS baseline
    max_t2 = y_tr[time_field].max()
    mask2 = y_te[time_field] < max_t2
    if mask2.sum() > 0:
        y_te2 = y_te[mask2]
        tmin2 = y_te2[time_field].min()
        tmax2 = y_te2[time_field].max()
        mt2 = (ttr >= tmin2) & (ttr < tmax2)
        if mt2.sum() >= 2:
            pred = np.tile(strr[mt2], (len(y_te2), 1))
            ibs_base.append(
                integrated_brier_score(
                    y_tr, y_te2, pred, ttr[mt2]
                )
            )
        else:
            ibs_base.append(np.nan)
    else:
        ibs_base.append(np.nan)

    # ===== AUC(t) baseline =====
    auc_vec2 = np.full(len(AUC_TIMES), np.nan)
    if len(ok) > 0:
        try:
            et2, av2 = cumulative_dynamic_auc(y_tr, y_te, risk0, ok)
            et2 = np.atleast_1d(et2)
            av2 = np.atleast_1d(av2)

            for t0, a0 in zip(et2, av2):
                idx = np.where(np.isclose(AUC_TIMES, t0, rtol=1e-6, atol=1e-8))[0]
                if len(idx) > 0:
                    auc_vec2[idx[0]] = a0
        except ValueError:
            # mÃªme problÃ¨me d'IPCW -> on laisse NaN pour ce fold
            pass

    auc_base.append(auc_vec2)

# Conversion en arrays
cindex_with  = np.array(cindex_with)
ibs_with     = np.array(ibs_with)
auc_with     = np.array(auc_with)

cindex_base  = np.array(cindex_base)
ibs_base     = np.array(ibs_base)
auc_base     = np.array(auc_base)

print('WITH RiskScore C-index mean =', np.nanmean(cindex_with))
print('BASELINE C-index mean       =', np.nanmean(cindex_base))
print('WITH RiskScore IBS mean     =', np.nanmean(ibs_with))
print('BASELINE IBS mean           =', np.nanmean(ibs_base))


WITH RiskScore C-index mean = 0.7001498410348338
BASELINE C-index mean       = 0.5
WITH RiskScore IBS mean     = 0.17723300506107362
BASELINE IBS mean           = 0.17910456076689651


In [8]:
# ===== 6. Permutation test =====
def permutation_test(df,n_perm=500):
    rng=np.random.RandomState(42); vals=[]
    for _ in range(n_perm):
        s=df.copy()
        s[RISK_COL]=rng.permutation(s[RISK_COL].values)
        m=CoxPHFitter()
        m.fit(s[[TIME_COL,EVENT_COL,RISK_COL]],duration_col=TIME_COL,event_col=EVENT_COL)
        vals.append(m.concordance_index_)
    return np.array(vals)

perm=permutation_test(df,500)
true_cindex=c_index_in_sample
print(true_cindex,perm.mean())

fig=go.Figure()
fig.add_trace(go.Histogram(x=perm,nbinsx=30,opacity=0.75))
fig.add_trace(go.Scatter(x=[true_cindex,true_cindex],y=[0,len(perm)/3],mode='lines',name='True'))
fig.update_layout(template='simple_white')
fig.show()

0.6997851212427877 0.5035292463056013


In [9]:
# ===== 7. Calibration =====
m=CoxPHFitter()
m.fit(df[[TIME_COL,EVENT_COL,RISK_COL]],duration_col=TIME_COL,event_col=EVENT_COL)
surv=m.predict_survival_function(df[[RISK_COL]])
times=surv.index.values

idx=np.searchsorted(times,CALIBRATION_TIME,side='right')-1
t_eff=times[idx]
pred=surv.iloc[idx].values

n_bins=10
q=np.quantile(pred,np.linspace(0,1,n_bins+1))
bid=np.digitize(pred,q[1:-1],right=True)

bx=[]; by=[]
for b in range(n_bins):
    mask=bid==b
    if mask.sum()<10: continue
    bx.append(pred[mask].mean())

    t,s=kaplan_meier_estimator(df.loc[mask,EVENT_COL].values.astype(bool),
                               df.loc[mask,TIME_COL].values)
    if (t<=t_eff).any(): by.append(s[t<=t_eff][-1])
    else: by.append(1.0)

fig=go.Figure()
fig.add_trace(go.Scatter(x=bx,y=by,mode='markers+lines',name='Observed'))
fig.add_trace(go.Scatter(x=[0,1],y=[0,1],mode='lines',name='Perfect'))
fig.update_layout(title=f'Calibration at t={t_eff:.2f}',template='simple_white')
fig.show()

In [10]:
# ===== 8. Summary + deltas =====
def summarize(results,label):
    c=np.array(results['c_index']); i=np.array(results['ibs']); a=results['auc_per_time']
    times=np.array(results['times_auc'],dtype=float)
    out={'label':label,'c_index_mean':float(np.nanmean(c)),'ibs_mean':float(np.nanmean(i))}
    if len(a)>0:
        aa=np.array(a); 
        for k,t in enumerate(times):
            out[f'auc_{t:.2f}_mean']=float(np.nanmean(aa[:,k]))
    return pd.Series(out)

results_with={'c_index':cindex_with,'ibs':ibs_with,'auc_per_time':auc_with,'times_auc':AUC_TIMES}
results_no={'c_index':cindex_base,'ibs':ibs_base,'auc_per_time':auc_base,'times_auc':AUC_TIMES}

s_no=summarize(results_no,'no_risk')
s_with=summarize(results_with,'with_risk')

print(pd.concat([s_no,s_with],axis=1))

delta=pd.Series({'delta_c_index':s_with['c_index_mean']-s_no['c_index_mean'],
                 'delta_ibs':s_with['ibs_mean']-s_no['ibs_mean']})
print(delta)

                      0          1
label           no_risk  with_risk
c_index_mean        0.5    0.70015
ibs_mean       0.179105   0.177233
auc_1.00_mean       NaN        NaN
auc_2.00_mean       NaN        NaN
auc_3.00_mean       NaN        NaN
delta_c_index    0.200150
delta_ibs       -0.001872
dtype: float64



Mean of empty slice

