In [1]:
import matplotlib.pyplot as plt
import numpy as np

%matplotlib inline
import pandas as pd
from sklearn import set_config
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline

from sksurv.datasets import load_flchain, load_gbsg2
from sksurv.functions import StepFunction
from sksurv.linear_model import CoxnetSurvivalAnalysis, CoxPHSurvivalAnalysis
from sksurv.ensemble import RandomSurvivalForest
from sksurv.metrics import (
    concordance_index_censored,
    concordance_index_ipcw,
    cumulative_dynamic_auc,
    integrated_brier_score,
)
from sksurv.nonparametric import kaplan_meier_estimator
from sksurv.preprocessing import OneHotEncoder, encode_categorical
from sksurv.util import Surv

import scipy.optimize as opt

set_config(display="text")  # displays text representation of estimators
plt.rcParams["figure.figsize"] = [7.2, 4.8]

In [2]:
def generate_marker(n_samples, m, hazard_ratio, baseline_hazard, rnd):
    # create synthetic risk score
    X = np.array(rnd.randn(n_samples, m))
    w = np.expand_dims(np.array(rnd.uniform(size=m)), axis=0).flatten() # weights

    # create linear model
    logits = np.dot(np.dot(X,w.T), np.log(hazard_ratio))

    # draw actual survival times from exponential distribution,
    # refer to Bender et al. (2005), https://doi.org/10.1002/sim.2059
    u = rnd.uniform(size=n_samples)
    time_event = -np.log(u) / (baseline_hazard * np.exp(logits))

    # compute the actual concordance in the absence of censoring
    Xactual = np.squeeze(np.dot(X, w.T))
    actual = concordance_index_censored(np.ones(n_samples, dtype=bool), time_event, Xactual)
    return X, time_event, actual[0], w # risk scores, time events, actual concordance, weights

In [3]:
def generate_survival_data(n_samples, m, hazard_ratio, baseline_hazard, percentage_cens, rnd):
    X, time_event, actual_c, w = generate_marker(n_samples, m, hazard_ratio, baseline_hazard, rnd)

    def get_observed_time(x): # this censors certain time events
        rnd_cens = np.random.RandomState(0)
        # draw censoring times
        time_censor = rnd_cens.uniform(high=x, size=n_samples)
        event = time_event < time_censor
        time = np.where(event, time_event, time_censor)
        return event, time # returns bool array of if event occured or not/censored, and the time it occured/ was censored

    def censoring_amount(x): # finds optimal time event censoring will be as close to desired censored amount
        event, _ = get_observed_time(x)
        cens = 1.0 - event.sum() / event.shape[0]
        return (cens - percentage_cens) ** 2

    # search for upper limit to obtain the desired censoring amount
    res = opt.minimize_scalar(censoring_amount, method="bounded", bounds=(0, time_event.max()))

    # compute observed time
    event, time = get_observed_time(res.x) # now that we have the optimal time event, we use that to get all the events and times

    # upper time limit such that the probability of being censored is non-zero for `t > tau`
    # we are finding the latest observed event time, and only keeping those events where time is < tau to decrease biases
    tau = time[event].max()
    y = Surv.from_arrays(event=event, time=time)
    mask = time < tau
    X_test = X[mask]
    y_test = y[mask]

    return X_test, y_test, y, actual_c, w # risk scores, event/time with tau applied, event/time without tau, actual c, weights. 

In [None]:
def simulation(n_samples, m, hazard_ratio, n_repeats=100):
    measures = (
        "censoring",
        "Harrel's C",
        "Uno's C",
        "Mean AUC",
    )
    data_mean = {}
    data_std = {}
    for measure in measures:
        data_mean[measure] = []
        data_std[measure] = []

    rnd = np.random.RandomState(seed=987)
    # iterate over different amount of censoring
    for cens in (0.1, 0.25, 0.4, 0.5, 0.6, 0.7):
        data = {
            "censoring": [],
            "Harrel's C": [],
            "Uno's C": [],
            "Mean AUC" : [],
        }

        # repeaditly perform simulation
        for _ in range(n_repeats):
            # generate data
            X_test, y_test, y_train, actual_c, w = generate_survival_data(
                n_samples, m, hazard_ratio, baseline_hazard=0.1, percentage_cens=cens, rnd=rnd
            )

            rsf = RandomSurvivalForest(n_estimators=100, min_samples_split=10, min_samples_leaf=15, 
                    max_features="sqrt", n_jobs=-1, random_state=rnd)
            
            rsf.fit(X_test, y_test)

            # predict risk scores (lower predicted survival time = higher risk)
            risk_scores = -rsf.predict(X_test) # doing neg bec RSF does higher num = better survival time, but harrells c higher num = higher risk or earlier


            # random time points to check auc
            times = np.linspace(y_train["time"].min(), y_train["time"].max(), 50)

            print("y train", y_train["time"].max())
            print("y test", y_test["time"].max())
            print("times", times.max())

            
            # Compute cumulative dynamic AUC
            # this requires that survival times survival_test lie within the range of survival times survival_train
            _, aucs, _ = cumulative_dynamic_auc(y_train, y_test, risk_scores, times)

            # Take the mean AUC across all times
            mean_auc = np.mean(aucs)

            # estimate c-index
            c_harrell = concordance_index_censored(y_test["event"], y_test["time"], risk_scores)
            c_uno = concordance_index_ipcw(y_train, y_test, risk_scores)
            
            # save results
            data["censoring"].append(100.0 - y_test["event"].sum() * 100.0 / y_test.shape[0])
            data["Harrel's C"].append(actual_c - c_harrell[0])
            data["Uno's C"].append(actual_c - c_uno[0])
            data["Mean AUC"].append(mean_auc)

        # aggregate results
        for key, values in data.items():
            data_mean[key].append(np.mean(data[key]))
            data_std[key].append(np.std(data[key], ddof=1))

    data_mean = pd.DataFrame.from_dict(data_mean)
    data_std = pd.DataFrame.from_dict(data_std)
    return data_mean, data_std

In [5]:
def plot_results(data_mean, data_std, **kwargs):
    index = pd.Index(data_mean["censoring"].round(3), name="mean percentage censoring")
    for df in (data_mean, data_std):
        df.drop("censoring", axis=1, inplace=True)
        df.index = index

    ax = data_mean.plot.bar(yerr=data_std, **kwargs)
    ax.set_ylabel("Actual C - Estimated C")
    ax.yaxis.grid(True)
    ax.axhline(0.0, color="gray")
    return ax

In [6]:
data_mean, data_std = simulation(n_samples=100, m=3, hazard_ratio=2.0)
ylim = [-0.035, 0.5] 
plot_results(data_mean, data_std, ylim=ylim, figsize=(10, 6))


y train 68.64415691006276 (True, 14.97480562)
y test 58.62084022942785
times 68.64415691006276


ValueError: all times must be within follow-up time of test data: [0.0350555015279663; 58.62084022942785[