# Random Survival Forest Model

In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns

from sklearn import set_config
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, GridSearchCV, StratifiedKFold

from sksurv.ensemble import RandomSurvivalForest
from sksurv.metrics import concordance_index_censored, as_concordance_index_ipcw_scorer, as_cumulative_dynamic_auc_scorer, as_integrated_brier_score_scorer

import time
from datetime import datetime, timedelta

In [2]:
training_data_imputed_df = pd.read_csv("training_data_imputed_simple_TRAIN.csv.gz") 
# Filter to 30 day window
survival_data_30d = training_data_imputed_df.loc[
    (training_data_imputed_df['survival_time'] < 30) &
    (training_data_imputed_df['survival_time'] > 0)
    ]
survival_data_30d = survival_data_30d.astype({"cdiff_survival_flag": bool})

In [3]:
train_df, val_df, = train_test_split(survival_data_30d, test_size=0.2, random_state=0)

In [4]:
X_train = train_df.drop(['cdiff_2d_flag', 'cdiff_7d_flag', 'cdiff_30d_flag','cdiff_label_elisa_2d','cdiff_survival_flag','survival_time'], axis=1)
y_train = train_df[['cdiff_survival_flag', 'survival_time']]
y_array_event_time_train = np.array(
    list(zip(y_train['cdiff_survival_flag'], y_train['survival_time'])), 
    dtype=[('status', bool), ('survival_in_days', float)]
)
X_val = val_df.drop(['cdiff_2d_flag', 'cdiff_7d_flag', 'cdiff_30d_flag', 'cdiff_survival_flag', 'survival_time'], axis=1)
y_val = val_df[['cdiff_survival_flag', 'survival_time']]
y_array_event_time_val = np.array(
    list(zip(y_val['cdiff_survival_flag'], y_val['survival_time'])), 
    dtype=[('status', bool), ('survival_in_days', float)]
)

In [5]:
cdiff_X_train, cdiff_X_test, cdiff_y_train, cdiff_y_test = train_test_split(
    X_train, y_array_event_time_train, stratify=y_array_event_time_train['status'], random_state=1
)
lower, upper = np.percentile(y_array_event_time_train['survival_in_days'], [10, 90])
cdiff_times = np.arange(lower, upper + 1)

In [None]:
start_time = time.time()
start_datetime = datetime.now()
print(f"Training started at: {start_datetime.strftime('%Y-%m-%d %H:%M:%S')}")

print("Starting GridSearchCV to find optimal parameters...")
rsf = RandomSurvivalForest(
    min_samples_leaf=2, n_jobs=-1, random_state=0, verbose=1
)

cv = KFold(n_splits=3, shuffle=True, random_state=1)
cv_param_grid = {
    "estimator__max_depth": np.arange(1, 5, dtype=int),
    "estimator__n_estimators": [2, 5, 10, 20],
    "estimator__min_samples_split": [20, 50]
}

cv_cindex = GridSearchCV(
    as_concordance_index_ipcw_scorer(rsf, tau=cdiff_times[-1]),
    param_grid=cv_param_grid,
    cv=cv,
).fit(X_train, y_array_event_time_train)

best_model_rsf = cv_cindex.best_estimator_

best_model_rsf_params = cv_cindex.best_params_
print(f"Best cross-validation C-index score: {cv_cindex.best_score_:.4f}")
print(f"Best cross-validated parameters:\n {best_model_rsf_params}")
end_time = time.time()
end_datetime = datetime.now()

duration_seconds = end_time - start_time
duration = timedelta(seconds=duration_seconds)

print(f"Training finished at: {end_datetime.strftime('%Y-%m-%d %H:%M:%S')}")
print(f"Total training time: {duration}")

Training started at: 2025-05-24 16:06:10
Starting GridSearchCV to find optimal parameters...


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:    1.8s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    1.6s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:    1.9s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    1.5s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:    1.9s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    1.6s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel

In [1]:
rsf_best = RandomSurvivalForest(
    **best_model_rsf_params, n_jobs=-1, random_state=0
)
rsf_best.fit(X_train, y_array_event_time_train)

NameError: name 'RandomSurvivalForest' is not defined

In [None]:
c_index_val = rsf_best.score(X_val, y_array_event_time_val)
c_index_val