In [91]:
from sksurv.util import Surv
import numpy as np
import pandas as pd

from sksurv.metrics import concordance_index_censored
from sksurv.metrics import integrated_brier_score
from sklearn.model_selection import train_test_split

from sksurv.ensemble import GradientBoostingSurvivalAnalysis
from sksurv.ensemble import RandomSurvivalForest

In [113]:
sps = pd.read_csv("../data/sps.csv")

sps.drop(columns=['dbsource', 'formulary_drug_cd_list', 'admission_location', 'admittime', 'dischtime',
                  'edregtime', 'edouttime', 'diagnosis', 'latest_gcs_time', 'comorbidities', 'proc.icd9_list'], inplace=True, errors='ignore')

# figure out how we're handling these patients later
sps.drop(columns=['gcs_total', 'gcs_verbal', 'gcs_motor', 'gcs_eye', 'language', 'drug.Miscellaneous', 'diag.Missing'], inplace=True, errors='ignore')

# drop patients with no provided age
sps.dropna(subset=["age.at.admit"], inplace=True)

df = pd.get_dummies(sps, columns=['gender', 'admission_type', 'insurance', 'religion', 'marital_status', 'ethnicity', 'discharge_location', 'intervention.group'])


# one hot encoding for these columns
df = pd.get_dummies(sps, columns=['gender', 'admission_type', 'insurance', 'religion', 'marital_status', 'ethnicity', 'discharge_location', 'intervention.group'])

ethnicity_groups = {
    'ethnicity_WHITE': ['ethnicity_WHITE', 'ethnicity_WHITE - BRAZILIAN', 'ethnicity_WHITE - OTHER EUROPEAN', 'ethnicity_WHITE - RUSSIAN', 'ethnicity_MIDDLE EASTERN'],
    'ethnicity_BLACK': ['ethnicity_BLACK/AFRICAN AMERICAN', 'ethnicity_BLACK/CAPE VERDEAN', 'ethnicity_BLACK/HAITIAN'],
    'ethnicity_HISPANIC_LATINO': ['ethnicity_HISPANIC OR LATINO', 'ethnicity_HISPANIC/LATINO - COLOMBIAN', 'ethnicity_HISPANIC/LATINO - DOMINICAN', 'ethnicity_HISPANIC/LATINO - PUERTO RICAN'],
    'ethnicity_ASIAN': ['ethnicity_ASIAN', 'ethnicity_ASIAN - ASIAN INDIAN', 'ethnicity_ASIAN - CAMBODIAN', 'ethnicity_ASIAN - CHINESE', 'ethnicity_ASIAN - JAPANESE', 'ethnicity_ASIAN - OTHER', 'ethnicity_ASIAN - VIETNAMESE'],
    'ethnicity_OTHER': [ 'ethnicity_AMERICAN INDIAN/ALASKA NATIVE', 'ethnicity_MULTI RACE ETHNICITY', 'ethnicity_OTHER'],
    'ethnicity_UNKNOWN': [ 'ethnicity_PATIENT DECLINED TO ANSWER', 'ethnicity_UNABLE TO OBTAIN', 'ethnicity_UNKNOWN/NOT SPECIFIED']
}

# Combine columns based on the mappings
for new_col, cols_to_merge in ethnicity_groups.items():
    df[new_col] = df[cols_to_merge].any(axis=1).astype(int)

# Drop the original detailed ethnicity columns
#all_original_cols = [col for cols in ethnicity_groups.values() for col in cols]
df.drop(columns=['ethnicity_WHITE - BRAZILIAN', 'ethnicity_WHITE - OTHER EUROPEAN', 'ethnicity_WHITE - RUSSIAN', 'ethnicity_MIDDLE EASTERN'], inplace=True)
df.drop(columns=['ethnicity_BLACK/AFRICAN AMERICAN', 'ethnicity_BLACK/CAPE VERDEAN', 'ethnicity_BLACK/HAITIAN'], inplace=True)
df.drop(columns=['ethnicity_HISPANIC OR LATINO', 'ethnicity_HISPANIC/LATINO - COLOMBIAN', 'ethnicity_HISPANIC/LATINO - DOMINICAN', 'ethnicity_HISPANIC/LATINO - PUERTO RICAN'], inplace=True)
df.drop(columns=['ethnicity_ASIAN - ASIAN INDIAN', 'ethnicity_ASIAN - CAMBODIAN', 'ethnicity_ASIAN - CHINESE', 'ethnicity_ASIAN - JAPANESE', 'ethnicity_ASIAN - OTHER', 'ethnicity_ASIAN - VIETNAMESE'], inplace=True)
df.drop(columns=['ethnicity_AMERICAN INDIAN/ALASKA NATIVE', 'ethnicity_MULTI RACE ETHNICITY', 'ethnicity_UNABLE TO OBTAIN', 'ethnicity_PATIENT DECLINED TO ANSWER', 'ethnicity_UNKNOWN/NOT SPECIFIED'], inplace=True)



df = df[df['survival_days'] > 0]

# survived if > a certain value
df['survived_90'] = df['survival_days'] > 90

# doesn't work with NA columns
incomplete_cols = df.columns[df.isna().sum() != 0]
complete_cols = df.columns[df.isna().sum() == 0]

df[incomplete_cols] = df[incomplete_cols].fillna(0)

# split into cov/predictors and target (also some identifiers removed from X)
#X = df[complete_cols].drop(columns=['subject_id', 'survival_days', 'survived_90', 'event'])
X = df.drop(columns=['subject_id', 'survival_days', 'survived_90', 'event'])
y = df['survived_90']

In [114]:
list(X.columns)

['diag.H5',
 'diag.I1',
 'diag.S1',
 'diag.H6',
 'diag.M1',
 'diag.H4',
 'diag.N1',
 'diag.H3',
 'diag.H1',
 'diag.H2',
 'comor.htn',
 'comor.dm',
 'comor.ckd',
 'comor.chf',
 'comor.copd',
 'comor.resp_fail',
 'comor.liver_cirr',
 'comor.hep_c',
 'comor.cancer',
 'comor.anemia',
 'comor.depression',
 'comor.pulm_ht',
 'comor.dehydration',
 'comor.pain',
 'comor.pud',
 'comor.vte',
 'comor.pad',
 'drug.Electrolytes.IV.Fluids',
 'drug.Diuretics',
 'drug.PPIs.H2Blockers',
 'drug.Uncategorized',
 'drug.Vasodilators',
 'drug.Non.Opioid.Analgesics',
 'drug.Antiepileptics',
 'drug.Opioid.Analgesics',
 'drug.Antidiabetics',
 'drug.Beta.Blockers',
 'drug.Anesthetics...Sedatives',
 'drug.Laxatives',
 'drug.Antibiotics',
 'drug.Calcium.Channel.Blockers',
 'drug.ACE.ARBs',
 'drug.Antiemetics',
 'drug.Anticoagulants',
 'drug.Steroids',
 'drug.Antidepressants',
 'drug.Anxiolytics.Hypnotics',
 'drug.Parkinsons.Dementia',
 'drug.Ophthalmic',
 'drug.Vitamins.Supplements',
 'drug.Respiratory',
 'drug.A

In [129]:


# Format the target: (event_observed, time)
y_structured = Surv.from_dataframe("event", "survival_days", df)

# Features
X = df.drop(columns=["event", "survival_days", "subject_id", "survived_90"])

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y_structured, test_size=0.2, random_state=7)

# Model
model = GradientBoostingSurvivalAnalysis(n_estimators=100, random_state=42)
#model = RandomSurvivalForest(n_estimators=100, min_samples_split=10, min_samples_leaf=15, random_state=7)
model.fit(X_train, y_train)

# Time points you want predictions for
time_points = np.array([90, 360, 720, 1460])

# Predict survival probabilities for each time point
surv_funcs = model.predict_survival_function(X_test)

# Example: Get probabilities for first individual
for i, fn in enumerate(surv_funcs[:20]):
    surv_probs = fn(time_points)
    print(f"Patient {i} survival probabilities:")
    for t, p in zip(time_points, surv_probs):
        print(f"  P(survival > {t} days) = {p:.2f}")


Patient 0 survival probabilities:
  P(survival > 90 days) = 0.96
  P(survival > 360 days) = 0.89
  P(survival > 720 days) = 0.85
  P(survival > 1460 days) = 0.78
Patient 1 survival probabilities:
  P(survival > 90 days) = 0.97
  P(survival > 360 days) = 0.93
  P(survival > 720 days) = 0.90
  P(survival > 1460 days) = 0.86
Patient 2 survival probabilities:
  P(survival > 90 days) = 0.97
  P(survival > 360 days) = 0.93
  P(survival > 720 days) = 0.90
  P(survival > 1460 days) = 0.85
Patient 3 survival probabilities:
  P(survival > 90 days) = 0.94
  P(survival > 360 days) = 0.85
  P(survival > 720 days) = 0.79
  P(survival > 1460 days) = 0.71
Patient 4 survival probabilities:
  P(survival > 90 days) = 0.84
  P(survival > 360 days) = 0.63
  P(survival > 720 days) = 0.52
  P(survival > 1460 days) = 0.38
Patient 5 survival probabilities:
  P(survival > 90 days) = 0.98
  P(survival > 360 days) = 0.94
  P(survival > 720 days) = 0.91
  P(survival > 1460 days) = 0.87
Patient 6 survival probabili

In [123]:
list(X.columns)

['diag.H5',
 'diag.I1',
 'diag.S1',
 'diag.H6',
 'diag.M1',
 'diag.H4',
 'diag.N1',
 'diag.H3',
 'diag.H1',
 'diag.H2',
 'comor.htn',
 'comor.dm',
 'comor.ckd',
 'comor.chf',
 'comor.copd',
 'comor.resp_fail',
 'comor.liver_cirr',
 'comor.hep_c',
 'comor.cancer',
 'comor.anemia',
 'comor.depression',
 'comor.pulm_ht',
 'comor.dehydration',
 'comor.pain',
 'comor.pud',
 'comor.vte',
 'comor.pad',
 'drug.Electrolytes.IV.Fluids',
 'drug.Diuretics',
 'drug.PPIs.H2Blockers',
 'drug.Uncategorized',
 'drug.Vasodilators',
 'drug.Non.Opioid.Analgesics',
 'drug.Antiepileptics',
 'drug.Opioid.Analgesics',
 'drug.Antidiabetics',
 'drug.Beta.Blockers',
 'drug.Anesthetics...Sedatives',
 'drug.Laxatives',
 'drug.Antibiotics',
 'drug.Calcium.Channel.Blockers',
 'drug.ACE.ARBs',
 'drug.Antiemetics',
 'drug.Anticoagulants',
 'drug.Steroids',
 'drug.Antidepressants',
 'drug.Anxiolytics.Hypnotics',
 'drug.Parkinsons.Dementia',
 'drug.Ophthalmic',
 'drug.Vitamins.Supplements',
 'drug.Respiratory',
 'drug.A

In [126]:
rsf = RandomSurvivalForest(n_estimators=100, min_samples_split=10, min_samples_leaf=15, random_state=7)
#rsf = GradientBoostingSurvivalAnalysis(n_estimators=100, random_state=7)
rsf.fit(X_train, y_train)

# Predict risk scores (lower survival = higher risk)
risk_scores = rsf.predict(X_test)  # Lower = higher survival

# Evaluate with C-index
event_observed = y_test["event"]
time = y_test["survival_days"]

c_index, concordant, permissible, _, _ = concordance_index_censored(
    event_observed,
    time,
    risk_scores  # negate because higher survival = lower risk
)

print(f"C-index: {c_index:.4f}")

C-index: 0.7849


In [127]:
print(concordance_index_censored(
    event_observed,
    time,
    risk_scores  # negate because higher survival = lower risk
))

(0.7849393746011487, 1230, 337, 0, 0)


In [119]:
y_train[]

SyntaxError: invalid syntax (51757481.py, line 1)

In [128]:
# 1. Define a fine-grained time grid for evaluation
# Use the range of observed survival times in the test set
test_times = np.arange(1.216, int(np.max(X_test.shape[0])))

# 2. Predict survival functions for test data
surv_fns = model.predict_survival_function(X_test)

# 3. Convert list of survival functions to 2D array: [n_samples, n_time_points]
# At each time point, get the survival prob for each patient
pred_surv_probs = np.asarray([[fn(t) for t in test_times] for fn in surv_fns])

# 4. Compute Integrated Brier Score
ibs = integrated_brier_score(y_train, y_test, pred_surv_probs, test_times)

print(f"Integrated Brier Score: {ibs:.4f}")

Integrated Brier Score: 0.0363
