In [1]:
#!pip download lifelines
#%pip install input/pip-install-lifelines/autograd-1.7.0-py3-none-any.whl
#%pip install input/pip-install-lifelines/autograd-gamma-0.5.0.tar.gz
#%pip install input/pip-install-lifelines/interface_meta-1.3.0-py3-none-any.whl
#%pip install input/pip-install-lifelines/formulaic-1.0.2-py3-none-any.whl
#%pip install input/pip-install-lifelines/lifelines-0.30.0-py3-none-any.whl

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from scipy import stats
from scipy.stats import rankdata 
import numpy as np
from tqdm import tqdm

import lightgbm as lgb
from lightgbm import LGBMRegressor

from scipy.stats import rankdata 
from catboost import CatBoostRegressor
from sklearn.model_selection import KFold, StratifiedKFold
from lifelines import KaplanMeierFitter, NelsonAalenFitter
from sklearn.preprocessing import StandardScaler, LabelEncoder
from lifelines import CoxPHFitter
from lifelines.utils import concordance_index
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.feature_selection import VarianceThreshold
from sklearn.decomposition import PCA
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

import xgboost as xgb
from xgboost import XGBRegressor, XGBClassifier

from catboost import CatBoostRegressor, CatBoostClassifier
import catboost as cb

from metric import score

In [None]:
# set analysis output directory
def create_output_directory(output_path):
    """Create ``output_path`` (including missing parents) and return it unchanged.

    An already existing directory is not treated as an error.
    """
    os.makedirs(output_path, exist_ok=True)
    return output_path

# The trailing slash is deliberate: file names built by plain concatenation
# (output_path + 'name.png') then land INSIDE the directory instead of
# becoming 'working/analysisname.png' next to it.
output_path = 'working/analysis/'
create_output_directory(output_path)

In [None]:
# Let pandas display up to 500 rows / columns before truncating the output.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

# Read the test split first and report its dimensions.
test = pd.read_csv("input/data/test.csv")
print("Test shape:", test.shape)

# Then the training split; the cell output is a preview of its first rows.
train = pd.read_csv("input/data/train.csv")
print("Train shape:", train.shape)
train.head()

In [None]:
# Train targets
# Two overlaid histograms of efs_time: rows with efs == 1 and rows with efs == 0.
plt.hist(train.loc[train.efs==1,"efs_time"],bins=100,label="efs=1, Yes Event")
plt.hist(train.loc[train.efs==0,"efs_time"],bins=100,label="efs=0, Maybe Event")
plt.xlabel("Time of Observation, efs_time")
# plt.hist is called without density=True, so the bars are raw counts.
plt.ylabel("Count")
plt.title("Times of Observation. Either time to event, or time observed without event.")
plt.legend()
# os.path.join supplies the separator between directory and file name, so the
# figure is written inside output_path whether or not it ends with a slash.
plt.savefig(os.path.join(output_path, 'times_of_observation.png'))
plt.show()

In [None]:
# Transform Two Targets into One Target with KaplanMeier
def transform_survival_probability(df, time_col='efs_time', event_col='efs'):
    """Return the Kaplan-Meier survival estimate evaluated at each row's own time.

    A single KaplanMeierFitter is fit on ``df[time_col]`` (durations) and
    ``df[event_col]`` (event indicator); the fitted survival function is then
    read off at every value of ``df[time_col]``.

    Returns the ``.values`` array of that lookup, one entry per row of ``df``.
    """
    durations = df[time_col]
    observed = df[event_col]

    fitter = KaplanMeierFitter()
    fitter.fit(durations, observed)

    return fitter.survival_function_at_times(durations).values
# Single regression target derived from both efs and efs_time.
train["y"] = transform_survival_probability(train, time_col='efs_time', event_col='efs')

# Distribution of the transformed target, split by the event flag.
plt.hist(train.loc[train.efs==1,"y"],bins=100,label="efs=1, Yes Event")
plt.hist(train.loc[train.efs==0,"y"],bins=100,label="efs=0, Maybe Event")
plt.xlabel("Transformed Target y")
# plt.hist is called without density=True, so the bars are raw counts.
plt.ylabel("Count")
plt.title("KaplanMeier Transformed Target y using both efs and efs_time.")
plt.legend()
# os.path.join supplies the separator between directory and file name, so the
# figure is written inside output_path whether or not it ends with a slash.
plt.savefig(os.path.join(output_path, 'kaplanmeier_transformed_target_y.png'))
plt.show()

In [None]:
# Columns that must never be used as model inputs (identifier and targets).
RMV = ["ID", "efs", "efs_time", "y"]

# Every remaining column, in the frame's own column order.
FEATURES = [c for c in train.columns if c not in RMV]

print(f"Number of Features: {len(FEATURES)} FEATURES: {FEATURES}")

In [None]:
# Collect the object-dtype features and give missing values an explicit label.
CATS = []
for c in FEATURES:
    # Dtype is judged on the training frame only.
    if train[c].dtype != "object":
        continue
    CATS.append(c)
    # The same placeholder string is used in both splits.
    train[c] = train[c].fillna("NAN")
    test[c] = test[c].fillna("NAN")

print(f"In these features, there are {len(CATS)} CATEGORICAL FEATURES: {CATS}")

In [None]:
# Stack train on top of test so every categorical value gets one shared code.
combined = pd.concat([train, test], axis=0, ignore_index=True)

# LABEL ENCODE CATEGORICAL FEATURES
print("We LABEL ENCODE the CATEGORICAL FEATURES: ", end="")
for c in FEATURES:

    if c not in CATS:
        # Numeric feature: store 64-bit columns as 32-bit to save memory.
        if combined[c].dtype == "float64":
            combined[c] = combined[c].astype("float32")
        elif combined[c].dtype == "int64":
            combined[c] = combined[c].astype("int32")
        continue

    # Categorical feature: integer codes shifted so the smallest code is 0,
    # cast to int32 and finally to the pandas category dtype.
    print(f"{c}, ", end="")
    combined[c], _ = combined[c].factorize()
    combined[c] = combined[c] - combined[c].min()
    combined[c] = combined[c].astype("int32").astype("category")

# Split back by position: the first len(train) rows are the training rows,
# the rest is the test split (re-indexed from 0).
train = combined.iloc[:len(train)].copy()
test = combined.iloc[len(train):].reset_index(drop=True).copy()

In [10]:
SEED = 9365

def perform_pca(train, test, n_components=None, random_state=42):
    """Fit PCA on the complete rows of ``train`` and project both frames.

    Rows containing any NaN are removed from both inputs before PCA is fit
    (on ``train``) and applied (to ``train`` and ``test``).

    The returned component frames keep the ORIGINAL index labels of the rows
    that survived the NaN filter. This matters for callers that join the
    components back with ``pd.concat(..., axis=1)``: concat aligns on index
    labels, so a freshly numbered 0..m-1 index would attach the components to
    the first m rows of the unfiltered frame instead of the rows they were
    computed from. With the labels preserved, filtered-out rows simply end up
    with NaN components.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames with identical columns. Presumably all-numeric and already
        scaled - TODO confirm against the caller.
    n_components : int or None
        Forwarded to ``PCA``.
    random_state : int
        Forwarded to ``PCA``.

    Returns
    -------
    (train_pca_df, test_pca_df, pca)
        Two DataFrames with columns ``PC_1 .. PC_k`` and the fitted PCA object.
        ``test_pca_df`` is empty (zero rows, same columns) when ``test`` has no
        complete row.
    """
    # Remove rows with NaN values from both datasets
    train = train.dropna()
    test = test.dropna()

    pca = PCA(n_components=n_components, random_state=random_state)
    train_pca = pca.fit_transform(train)
    pc_names = [f'PC_{i+1}' for i in range(train_pca.shape[1])]

    # A test frame without a single complete row would make pca.transform
    # raise on an empty input; return an empty projection instead.
    if len(test) > 0:
        test_pca = pca.transform(test)
    else:
        test_pca = np.empty((0, train_pca.shape[1]))

    explained_variance_ratio = pca.explained_variance_ratio_
    print(f"Explained variance ratio of the components:\n {explained_variance_ratio}")
    print(np.sum(explained_variance_ratio))

    # index=... carries the surviving rows' labels through to the result.
    train_pca_df = pd.DataFrame(train_pca, columns=pc_names, index=train.index)
    test_pca_df = pd.DataFrame(test_pca, columns=pc_names, index=test.index)

    return train_pca_df, test_pca_df, pca

In [None]:
# PCA
# Copies of both frames without the ID column.
train_num = train.drop(columns='ID')
test_num = test.drop(columns='ID')

# Columns stored as int32/float32 are treated as numeric; every other column
# is carried along unchanged. The dtype split is decided on the training frame.
numeric_columns = train.select_dtypes(include=['int32', 'float32']).columns
categorical_columns = train.select_dtypes(exclude=['int32', 'float32']).columns

train_numeric, test_numeric = train_num[numeric_columns], test_num[numeric_columns]
train_categorical, test_categorical = train[categorical_columns], test[categorical_columns]

# Standardise the numeric columns; the scaler is fit on train only and the
# same transformation is applied to test.
scaler = StandardScaler().fit(train_numeric)
train_scaled = pd.DataFrame(scaler.transform(train_numeric), columns=train_numeric.columns)
test_scaled = pd.DataFrame(scaler.transform(test_numeric), columns=test_numeric.columns)

# 15 principal components computed from the scaled numeric columns.
train_pca, test_pca, pca = perform_pca(
    train_scaled,
    test_scaled,
    n_components=15,
    random_state=SEED,
)

# Column-wise join: scaled numerics, untouched columns, PCA components.
train_final = pd.concat([train_scaled, train_categorical, train_pca], axis=1)
test_final = pd.concat([test_scaled, test_categorical, test_pca], axis=1)

In [None]:
# Preview the first rows of the joined training frame (scaled numeric columns,
# remaining columns, PCA components) to check that the concat produced the
# expected columns.
train_final.head()

In [13]:
# From here on `train` / `test` refer to the PCA-augmented frames.
train, test = train_final, test_final

# add pca columns to features list (extends the existing list in place)
FEATURES += list(train_pca.columns)

In [14]:
# feature selection
# Full set: 57 dataset columns followed by the 15 PCA components PC_1..PC_15.
FEATURES = [
    'dri_score', 'psych_disturb', 'cyto_score', 'diabetes',
    'hla_match_c_high', 'hla_high_res_8', 'tbi_status', 'arrhythmia',
    'hla_low_res_6', 'graft_type', 'vent_hist', 'renal_issue',
    'pulm_severe', 'prim_disease_hct', 'hla_high_res_6', 'cmv_status',
    'hla_high_res_10', 'hla_match_dqb1_high', 'tce_imm_match', 'hla_nmdp_6',
    'hla_match_c_low', 'rituximab', 'hla_match_drb1_low', 'hla_match_dqb1_low',
    'prod_type', 'cyto_score_detail', 'conditioning_intensity', 'ethnicity',
    'year_hct', 'obesity', 'mrd_hct', 'in_vivo_tcd',
    'tce_match', 'hla_match_a_high', 'hepatic_severe', 'donor_age',
    'prior_tumor', 'hla_match_b_low', 'peptic_ulcer', 'age_at_hct',
    'hla_match_a_low', 'gvhd_proph', 'rheum_issue', 'sex_match',
    'hla_match_b_high', 'race_group', 'comorbidity_score', 'karnofsky_score',
    'hepatic_mild', 'tce_div_match', 'donor_related', 'melphalan_dose',
    'hla_low_res_8', 'cardiac', 'hla_match_drb1_high', 'pulm_moderate',
    'hla_low_res_10',
] + [f'PC_{k}' for k in range(1, 16)]

In [15]:
# Feature Engineering
def feature_engineering(df):
    """Placeholder for derived features; currently returns ``df`` unchanged."""
    return df

train = feature_engineering(train)
# Keep only rows with at least 10 non-missing values. The index is renumbered
# afterwards because the cross-validation cells address rows of `train` with
# the positional indices produced by KFold.split; gaps left behind by dropped
# rows would make labels and positions disagree.
train = train.dropna(thresh=10, axis=0).reset_index(drop=True)
test = feature_engineering(test)

In [None]:
# XGBoost with KaplanMeier
# Log the library version so the run can be reproduced later.
print("Using XGBoost version", xgb.__version__)

In [None]:
%%time
FOLDS = 10
kf = KFold(n_splits=FOLDS, shuffle=True, random_state=42)

# oof_xgb[k] receives the prediction for training row k from the fold in which
# that row was held out; pred_xgb accumulates the test predictions of all folds.
oof_xgb = np.zeros(len(train))
pred_xgb = np.zeros(len(test))

for i, (train_index, test_index) in enumerate(kf.split(train)):

    print("#"*25)
    print(f"### Fold {i+1}")
    print("#"*25)

    # KFold.split yields POSITIONAL row numbers, so rows are picked with .iloc.
    # Label-based .loc only agrees with positions while the index is exactly
    # 0..n-1, which no longer holds once rows have been dropped from `train`.
    x_train = train.iloc[train_index][FEATURES].copy()
    y_train = train["y"].iloc[train_index]
    x_valid = train.iloc[test_index][FEATURES].copy()
    y_valid = train["y"].iloc[test_index]
    x_test = test[FEATURES].copy()

    model_xgb = XGBRegressor(
        device="cuda",
        max_depth=3,
        colsample_bytree=0.5,
        subsample=0.8,
        n_estimators=2000,
        learning_rate=0.02,
        enable_categorical=True,
        min_child_weight=80,
        #early_stopping_rounds=25,
    )
    # The validation fold is passed for monitoring only (early stopping is
    # commented out above).
    model_xgb.fit(
        x_train, y_train,
        eval_set=[(x_valid, y_valid)],
        verbose=500
    )

    # INFER OOF
    oof_xgb[test_index] = model_xgb.predict(x_valid)
    # INFER TEST
    pred_xgb += model_xgb.predict(x_test)

# COMPUTE AVERAGE TEST PREDS
pred_xgb /= FOLDS

In [None]:
# XGBoost with KaplanMeier
# Log the library version so the run can be reproduced later.
print("Using XGBoost version", xgb.__version__)

In [19]:
# feature selection
# Reduced set. Left out on purpose: vent_hist, hla_match_dqb1_high, rituximab,
# hla_match_b_high, pulm_moderate, and every PCA component except PC_10.
FEATURES = [
    'dri_score', 'psych_disturb', 'cyto_score', 'diabetes',
    'hla_match_c_high', 'hla_high_res_8', 'tbi_status', 'arrhythmia',
    'hla_low_res_6', 'graft_type', 'renal_issue', 'pulm_severe',
    'prim_disease_hct', 'hla_high_res_6', 'cmv_status', 'hla_high_res_10',
    'tce_imm_match', 'hla_nmdp_6', 'hla_match_c_low', 'hla_match_drb1_low',
    'hla_match_dqb1_low', 'prod_type', 'cyto_score_detail', 'conditioning_intensity',
    'ethnicity', 'year_hct', 'obesity', 'mrd_hct',
    'in_vivo_tcd', 'tce_match', 'hla_match_a_high', 'hepatic_severe',
    'donor_age', 'prior_tumor', 'hla_match_b_low', 'peptic_ulcer',
    'age_at_hct', 'hla_match_a_low', 'gvhd_proph', 'rheum_issue',
    'sex_match', 'race_group', 'comorbidity_score', 'karnofsky_score',
    'hepatic_mild', 'tce_div_match', 'donor_related', 'melphalan_dose',
    'hla_low_res_8', 'cardiac', 'hla_match_drb1_high', 'hla_low_res_10',
    'PC_10',
]

In [None]:
%%time
FOLDS = 10
kf = KFold(n_splits=FOLDS, shuffle=True, random_state=42)

# oof_xgb[k] receives the prediction for training row k from the fold in which
# that row was held out; pred_xgb accumulates the test predictions of all folds.
oof_xgb = np.zeros(len(train))
pred_xgb = np.zeros(len(test))

for i, (train_index, test_index) in enumerate(kf.split(train)):

    print("#"*25)
    print(f"### Fold {i+1}")
    print("#"*25)

    # KFold.split yields POSITIONAL row numbers, so rows are picked with .iloc.
    # Label-based .loc only agrees with positions while the index is exactly
    # 0..n-1, which no longer holds once rows have been dropped from `train`.
    x_train = train.iloc[train_index][FEATURES].copy()
    y_train = train["y"].iloc[train_index]
    x_valid = train.iloc[test_index][FEATURES].copy()
    y_valid = train["y"].iloc[test_index]
    x_test = test[FEATURES].copy()

    model_xgb = XGBRegressor(
        device="cuda",
        max_depth=3,
        colsample_bytree=0.5,
        subsample=0.8,
        n_estimators=2000,
        learning_rate=0.02,
        enable_categorical=True,
        min_child_weight=80,
        #early_stopping_rounds=25,
    )
    # The validation fold is passed for monitoring only (early stopping is
    # commented out above).
    model_xgb.fit(
        x_train, y_train,
        eval_set=[(x_valid, y_valid)],
        verbose=500
    )

    # INFER OOF
    oof_xgb[test_index] = model_xgb.predict(x_valid)
    # INFER TEST
    pred_xgb += model_xgb.predict(x_test)

# COMPUTE AVERAGE TEST PREDS
pred_xgb /= FOLDS

In [None]:
# Ground-truth columns handed to the scoring function.
y_true = train.loc[:, ["ID", "efs", "efs_time", "race_group"]].copy()
# Out-of-fold XGBoost predictions keyed by ID.
y_pred = train[["ID"]].assign(prediction=oof_xgb)
m = score(y_true.copy(), y_pred.copy(), "ID")
print("\nOverall CV for XGBoost KaplanMeier =", m)

In [None]:
# Importances of `model_xgb` as it currently stands (the cross-validation loop
# rebinds it on every fold, so this is the last fitted model). FEATURES must
# still be the list that model was trained on for the labels to line up.
feature_importance = model_xgb.feature_importances_
importance_df = pd.DataFrame({
    "Feature": FEATURES,
    "Importance": feature_importance
}).sort_values(by="Importance", ascending=False)
plt.figure(figsize=(15, 15))
plt.barh(importance_df["Feature"], importance_df["Importance"])
plt.xlabel("Importance")
plt.ylabel("Feature")
plt.title("XGBoost KaplanMeier Feature Importance")
plt.gca().invert_yaxis()  # Flip features for better readability
# os.path.join supplies the separator between directory and file name, so the
# figure is written inside output_path whether or not it ends with a slash.
plt.savefig(os.path.join(output_path, 'xgboost_km_feature_importance.png'))
plt.show()

In [None]:
# CatBoost with KaplanMeier
# Log the library version so the run can be reproduced later.
print("Using CatBoost version", cb.__version__)

In [24]:
# feature selection
# Reduced set. Left out on purpose: renal_issue, hla_match_c_low,
# hla_match_drb1_low, prod_type, peptic_ulcer, hla_match_b_high, hepatic_mild.
# All 15 PCA components are kept.
FEATURES = [
    'dri_score', 'psych_disturb', 'cyto_score', 'diabetes',
    'hla_match_c_high', 'hla_high_res_8', 'tbi_status', 'arrhythmia',
    'hla_low_res_6', 'graft_type', 'vent_hist', 'pulm_severe',
    'prim_disease_hct', 'hla_high_res_6', 'cmv_status', 'hla_high_res_10',
    'hla_match_dqb1_high', 'tce_imm_match', 'hla_nmdp_6', 'rituximab',
    'hla_match_dqb1_low', 'cyto_score_detail', 'conditioning_intensity', 'ethnicity',
    'year_hct', 'obesity', 'mrd_hct', 'in_vivo_tcd',
    'tce_match', 'hla_match_a_high', 'hepatic_severe', 'donor_age',
    'prior_tumor', 'hla_match_b_low', 'age_at_hct', 'hla_match_a_low',
    'gvhd_proph', 'rheum_issue', 'sex_match', 'race_group',
    'comorbidity_score', 'karnofsky_score', 'tce_div_match', 'donor_related',
    'melphalan_dose', 'hla_low_res_8', 'cardiac', 'hla_match_drb1_high',
    'pulm_moderate', 'hla_low_res_10',
] + [f'PC_{k}' for k in range(1, 16)]

In [None]:
%%time
FOLDS = 10
kf = KFold(n_splits=FOLDS, shuffle=True, random_state=42)

# oof_cat[k] receives the prediction for training row k from the fold in which
# that row was held out; pred_cat accumulates the test predictions of all folds.
oof_cat = np.zeros(len(train))
pred_cat = np.zeros(len(test))

# CATS lists the categorical columns of the whole frame, while FEATURES can be
# a hand-picked subset. Only names that are actually among the model inputs
# may be declared as categorical to CatBoost.
cat_features_used = [c for c in CATS if c in FEATURES]

for i, (train_index, test_index) in enumerate(kf.split(train)):

    print("#"*25)
    print(f"### Fold {i+1}")
    print("#"*25)

    # KFold.split yields POSITIONAL row numbers, so rows are picked with .iloc.
    # Label-based .loc only agrees with positions while the index is exactly
    # 0..n-1, which no longer holds once rows have been dropped from `train`.
    x_train = train.iloc[train_index][FEATURES].copy()
    y_train = train["y"].iloc[train_index]
    x_valid = train.iloc[test_index][FEATURES].copy()
    y_valid = train["y"].iloc[test_index]
    x_test = test[FEATURES].copy()

    model_cat = CatBoostRegressor(
        task_type="GPU",
        learning_rate=0.1,
        grow_policy='Lossguide',
        #early_stopping_rounds=25,
    )
    model_cat.fit(x_train,y_train,
              eval_set=(x_valid, y_valid),
              cat_features=cat_features_used,
              verbose=250)

    # INFER OOF
    oof_cat[test_index] = model_cat.predict(x_valid)
    # INFER TEST
    pred_cat += model_cat.predict(x_test)

# COMPUTE AVERAGE TEST PREDS
pred_cat /= FOLDS

In [None]:
# Ground-truth columns handed to the scoring function.
y_true = train.loc[:, ["ID", "efs", "efs_time", "race_group"]].copy()
# Out-of-fold CatBoost predictions keyed by ID.
y_pred = train[["ID"]].assign(prediction=oof_cat)
m = score(y_true.copy(), y_pred.copy(), "ID")
print("\nOverall CV for CatBoost KaplanMeier =", m)

In [None]:
# Importances of `model_cat` as it currently stands (the cross-validation loop
# rebinds it on every fold, so this is the last fitted model). FEATURES must
# still be the list that model was trained on for the labels to line up.
feature_importance = model_cat.get_feature_importance()
importance_df = pd.DataFrame({
    "Feature": FEATURES,
    "Importance": feature_importance
}).sort_values(by="Importance", ascending=False)
plt.figure(figsize=(15, 15))
plt.barh(importance_df["Feature"], importance_df["Importance"])
plt.xlabel("Importance")
plt.ylabel("Feature")
plt.title("CatBoost KaplanMeier Feature Importance")
plt.gca().invert_yaxis()  # Flip features for better readability
# os.path.join supplies the separator between directory and file name, so the
# figure is written inside output_path whether or not it ends with a slash.
plt.savefig(os.path.join(output_path, 'catboost_km_feature_importance.png'))
plt.show()

In [None]:
# LightGBM with KaplanMeier
# Log the library version so the run can be reproduced later.
print("Using LightGBM version", lgb.__version__)

In [29]:
# feature selection
# Full set: 57 dataset columns followed by the 15 PCA components PC_1..PC_15.
FEATURES = [
    'dri_score', 'psych_disturb', 'cyto_score', 'diabetes',
    'hla_match_c_high', 'hla_high_res_8', 'tbi_status', 'arrhythmia',
    'hla_low_res_6', 'graft_type', 'vent_hist', 'renal_issue',
    'pulm_severe', 'prim_disease_hct', 'hla_high_res_6', 'cmv_status',
    'hla_high_res_10', 'hla_match_dqb1_high', 'tce_imm_match', 'hla_nmdp_6',
    'hla_match_c_low', 'rituximab', 'hla_match_drb1_low', 'hla_match_dqb1_low',
    'prod_type', 'cyto_score_detail', 'conditioning_intensity', 'ethnicity',
    'year_hct', 'obesity', 'mrd_hct', 'in_vivo_tcd',
    'tce_match', 'hla_match_a_high', 'hepatic_severe', 'donor_age',
    'prior_tumor', 'hla_match_b_low', 'peptic_ulcer', 'age_at_hct',
    'hla_match_a_low', 'gvhd_proph', 'rheum_issue', 'sex_match',
    'hla_match_b_high', 'race_group', 'comorbidity_score', 'karnofsky_score',
    'hepatic_mild', 'tce_div_match', 'donor_related', 'melphalan_dose',
    'hla_low_res_8', 'cardiac', 'hla_match_drb1_high', 'pulm_moderate',
    'hla_low_res_10',
] + [f'PC_{k}' for k in range(1, 16)]

In [None]:
FOLDS = 10
kf = KFold(n_splits=FOLDS, shuffle=True, random_state=42)

# oof_lgb[k] receives the prediction for training row k from the fold in which
# that row was held out; pred_lgb accumulates the test predictions of all folds.
oof_lgb = np.zeros(len(train))
pred_lgb = np.zeros(len(test))

for i, (train_index, test_index) in enumerate(kf.split(train)):

    print("#"*25)
    print(f"### Fold {i+1}")
    print("#"*25)

    # KFold.split yields POSITIONAL row numbers, so rows are picked with .iloc.
    # Label-based .loc only agrees with positions while the index is exactly
    # 0..n-1, which no longer holds once rows have been dropped from `train`.
    x_train = train.iloc[train_index][FEATURES].copy()
    y_train = train["y"].iloc[train_index]
    x_valid = train.iloc[test_index][FEATURES].copy()
    y_valid = train["y"].iloc[test_index]
    x_test = test[FEATURES].copy()

    model_lgb = LGBMRegressor(
        device="gpu",
        max_depth=3,
        colsample_bytree=0.4,
        #subsample=0.9,
        n_estimators=2500,
        learning_rate=0.02,
        objective="regression",
        verbose=-1,
        #early_stopping_rounds=25,
    )
    # The validation fold is passed for monitoring only (early stopping is
    # commented out above).
    model_lgb.fit(
        x_train, y_train,
        eval_set=[(x_valid, y_valid)],
    )

    # INFER OOF
    oof_lgb[test_index] = model_lgb.predict(x_valid)
    # INFER TEST
    pred_lgb += model_lgb.predict(x_test)

# COMPUTE AVERAGE TEST PREDS
pred_lgb /= FOLDS

In [None]:
# Ground-truth columns handed to the scoring function.
y_true = train.loc[:, ["ID", "efs", "efs_time", "race_group"]].copy()
# Out-of-fold LightGBM predictions keyed by ID.
y_pred = train[["ID"]].assign(prediction=oof_lgb)
m = score(y_true.copy(), y_pred.copy(), "ID")
print("\nOverall CV for LightGBM KaplanMeier =", m)

In [None]:
# Importances of `model_lgb` as it currently stands (the cross-validation loop
# rebinds it on every fold, so this is the last fitted model). FEATURES must
# still be the list that model was trained on for the labels to line up.
# The regressor is built without an importance_type argument, so LightGBM's
# default applies: the number of times a feature is used in a split, not gain.
feature_importance = model_lgb.feature_importances_
importance_df = pd.DataFrame({
    "Feature": FEATURES,
    "Importance": feature_importance
}).sort_values(by="Importance", ascending=False)
plt.figure(figsize=(15, 15))
plt.barh(importance_df["Feature"], importance_df["Importance"], color='skyblue')
plt.xlabel("Importance (Split Count)")
plt.ylabel("Feature")
plt.title("LightGBM KaplanMeier Feature Importance")
plt.gca().invert_yaxis()  # Flip features for better readability
# os.path.join supplies the separator between directory and file name, so the
# figure is written inside output_path whether or not it ends with a slash.
plt.savefig(os.path.join(output_path, 'lightgbm_km_feature_importance.png'))
plt.show()

In [33]:
# XGBoost with Survival:Cox
# SURVIVAL COX NEEDS THIS TARGET (TO DIGEST EFS AND EFS_TIME)
# One signed column carries both pieces of information: rows with efs == 0
# get their efs_time negated, all other rows keep it as is.
train["efs_time2"] = train.efs_time.mask(train.efs == 0, -train.efs_time)

In [34]:
# feature selection
# Full set: 57 dataset columns followed by the 15 PCA components PC_1..PC_15.
FEATURES = [
    'dri_score', 'psych_disturb', 'cyto_score', 'diabetes',
    'hla_match_c_high', 'hla_high_res_8', 'tbi_status', 'arrhythmia',
    'hla_low_res_6', 'graft_type', 'vent_hist', 'renal_issue',
    'pulm_severe', 'prim_disease_hct', 'hla_high_res_6', 'cmv_status',
    'hla_high_res_10', 'hla_match_dqb1_high', 'tce_imm_match', 'hla_nmdp_6',
    'hla_match_c_low', 'rituximab', 'hla_match_drb1_low', 'hla_match_dqb1_low',
    'prod_type', 'cyto_score_detail', 'conditioning_intensity', 'ethnicity',
    'year_hct', 'obesity', 'mrd_hct', 'in_vivo_tcd',
    'tce_match', 'hla_match_a_high', 'hepatic_severe', 'donor_age',
    'prior_tumor', 'hla_match_b_low', 'peptic_ulcer', 'age_at_hct',
    'hla_match_a_low', 'gvhd_proph', 'rheum_issue', 'sex_match',
    'hla_match_b_high', 'race_group', 'comorbidity_score', 'karnofsky_score',
    'hepatic_mild', 'tce_div_match', 'donor_related', 'melphalan_dose',
    'hla_low_res_8', 'cardiac', 'hla_match_drb1_high', 'pulm_moderate',
    'hla_low_res_10',
] + [f'PC_{k}' for k in range(1, 16)]

In [None]:
FOLDS = 10
kf = KFold(n_splits=FOLDS, shuffle=True, random_state=42)

# oof_xgb_cox[k] receives the prediction for training row k from the fold in
# which that row was held out; pred_xgb_cox accumulates the test predictions.
oof_xgb_cox = np.zeros(len(train))
pred_xgb_cox = np.zeros(len(test))

for i, (train_index, test_index) in enumerate(kf.split(train)):

    print("#"*25)
    print(f"### Fold {i+1}")
    print("#"*25)

    # KFold.split yields POSITIONAL row numbers, so rows are picked with .iloc.
    # Label-based .loc only agrees with positions while the index is exactly
    # 0..n-1, which no longer holds once rows have been dropped from `train`.
    # The target is the signed time column efs_time2.
    x_train = train.iloc[train_index][FEATURES].copy()
    y_train = train["efs_time2"].iloc[train_index]
    x_valid = train.iloc[test_index][FEATURES].copy()
    y_valid = train["efs_time2"].iloc[test_index]
    x_test = test[FEATURES].copy()

    model_xgb_cox = XGBRegressor(
        device="cuda",
        max_depth=3,
        colsample_bytree=0.5,
        subsample=0.8,
        n_estimators=2000,
        learning_rate=0.02,
        enable_categorical=True,
        min_child_weight=80,
        objective='survival:cox',
        eval_metric='cox-nloglik',
    )
    model_xgb_cox.fit(
        x_train, y_train,
        eval_set=[(x_valid, y_valid)],
        verbose=500
    )

    # INFER OOF
    oof_xgb_cox[test_index] = model_xgb_cox.predict(x_valid)
    # INFER TEST
    pred_xgb_cox += model_xgb_cox.predict(x_test)

# COMPUTE AVERAGE TEST PREDS
pred_xgb_cox /= FOLDS

In [None]:
# Ground-truth columns handed to the scoring function.
y_true = train.loc[:, ["ID", "efs", "efs_time", "race_group"]].copy()
# Out-of-fold XGBoost Cox predictions keyed by ID.
y_pred = train[["ID"]].assign(prediction=oof_xgb_cox)
m = score(y_true.copy(), y_pred.copy(), "ID")
print("\nOverall CV for XGBoost Survival:Cox =", m)

In [None]:
# Importances of `model_xgb_cox` as it currently stands (the cross-validation
# loop rebinds it on every fold, so this is the last fitted model). FEATURES
# must still be the list that model was trained on for the labels to line up.
feature_importance = model_xgb_cox.feature_importances_
importance_df = pd.DataFrame({
    "Feature": FEATURES,
    "Importance": feature_importance
}).sort_values(by="Importance", ascending=False)
plt.figure(figsize=(15, 15))
plt.barh(importance_df["Feature"], importance_df["Importance"])
plt.xlabel("Importance")
plt.ylabel("Feature")
plt.title("XGBoost Survival:Cox Feature Importance")
plt.gca().invert_yaxis()  # Flip features for better readability
# os.path.join supplies the separator between directory and file name, so the
# figure is written inside output_path whether or not it ends with a slash.
plt.savefig(os.path.join(output_path, 'xgboost_cox_feature_importance.png'))
plt.show()

In [38]:
# CatBoost with Survival:Cox
# feature selection
# Full set: 57 dataset columns followed by the 15 PCA components PC_1..PC_15.
FEATURES = [
    'dri_score', 'psych_disturb', 'cyto_score', 'diabetes',
    'hla_match_c_high', 'hla_high_res_8', 'tbi_status', 'arrhythmia',
    'hla_low_res_6', 'graft_type', 'vent_hist', 'renal_issue',
    'pulm_severe', 'prim_disease_hct', 'hla_high_res_6', 'cmv_status',
    'hla_high_res_10', 'hla_match_dqb1_high', 'tce_imm_match', 'hla_nmdp_6',
    'hla_match_c_low', 'rituximab', 'hla_match_drb1_low', 'hla_match_dqb1_low',
    'prod_type', 'cyto_score_detail', 'conditioning_intensity', 'ethnicity',
    'year_hct', 'obesity', 'mrd_hct', 'in_vivo_tcd',
    'tce_match', 'hla_match_a_high', 'hepatic_severe', 'donor_age',
    'prior_tumor', 'hla_match_b_low', 'peptic_ulcer', 'age_at_hct',
    'hla_match_a_low', 'gvhd_proph', 'rheum_issue', 'sex_match',
    'hla_match_b_high', 'race_group', 'comorbidity_score', 'karnofsky_score',
    'hepatic_mild', 'tce_div_match', 'donor_related', 'melphalan_dose',
    'hla_low_res_8', 'cardiac', 'hla_match_drb1_high', 'pulm_moderate',
    'hla_low_res_10',
] + [f'PC_{k}' for k in range(1, 16)]

In [None]:

FOLDS = 10
kf = KFold(n_splits=FOLDS, shuffle=True, random_state=42)

# oof_cat_cox[k] receives the prediction for training row k from the fold in
# which that row was held out; pred_cat_cox accumulates the test predictions.
oof_cat_cox = np.zeros(len(train))
pred_cat_cox = np.zeros(len(test))

# CATS lists the categorical columns of the whole frame, while FEATURES can be
# a hand-picked subset. Only names that are actually among the model inputs
# may be declared as categorical to CatBoost.
cat_features_used = [c for c in CATS if c in FEATURES]

for i, (train_index, test_index) in enumerate(kf.split(train)):

    print("#"*25)
    print(f"### Fold {i+1}")
    print("#"*25)

    # KFold.split yields POSITIONAL row numbers, so rows are picked with .iloc.
    # Label-based .loc only agrees with positions while the index is exactly
    # 0..n-1, which no longer holds once rows have been dropped from `train`.
    # The target is the signed time column efs_time2.
    x_train = train.iloc[train_index][FEATURES].copy()
    y_train = train["efs_time2"].iloc[train_index]
    x_valid = train.iloc[test_index][FEATURES].copy()
    y_valid = train["efs_time2"].iloc[test_index]
    x_test = test[FEATURES].copy()

    model_cat_cox = CatBoostRegressor(
        loss_function="Cox",
        iterations=400,
        learning_rate=0.1,
        grow_policy='Lossguide',
        use_best_model=False,
    )
    model_cat_cox.fit(x_train,y_train,
              eval_set=(x_valid, y_valid),
              cat_features=cat_features_used,
              verbose=100)

    # INFER OOF
    oof_cat_cox[test_index] = model_cat_cox.predict(x_valid)
    # INFER TEST
    pred_cat_cox += model_cat_cox.predict(x_test)

# COMPUTE AVERAGE TEST PREDS
pred_cat_cox /= FOLDS

In [None]:
# Ground-truth columns handed to the scoring function.
y_true = train.loc[:, ["ID", "efs", "efs_time", "race_group"]].copy()
# Out-of-fold CatBoost Cox predictions keyed by ID.
y_pred = train[["ID"]].assign(prediction=oof_cat_cox)
m = score(y_true.copy(), y_pred.copy(), "ID")
print("\nOverall CV for CatBoost Survival:Cox =", m)

In [None]:
# Importances of `model_cat_cox` as it currently stands (the cross-validation
# loop rebinds it on every fold, so this is the last fitted model). FEATURES
# must still be the list that model was trained on for the labels to line up.
feature_importance = model_cat_cox.get_feature_importance()
importance_df = pd.DataFrame({
    "Feature": FEATURES,
    "Importance": feature_importance
}).sort_values(by="Importance", ascending=False)
plt.figure(figsize=(15, 15))
plt.barh(importance_df["Feature"], importance_df["Importance"])
plt.xlabel("Importance")
plt.ylabel("Feature")
plt.title("CatBoost Survival:Cox Feature Importance")
plt.gca().invert_yaxis()  # Flip features for better readability
# os.path.join supplies the separator between directory and file name, so the
# figure is written inside output_path whether or not it ends with a slash.
plt.savefig(os.path.join(output_path, 'catboost_cox_feature_importance.png'))
plt.show()

In [None]:
# Ensemble CAT and XGB and LGB
# Each model's out-of-fold predictions are converted to ranks before being
# added up, so only the ordering produced by a model enters the blend.
y_true = train.loc[:, ["ID", "efs", "efs_time", "race_group"]].copy()
y_pred = train[["ID"]].assign(
    prediction=sum(
        rankdata(oof)
        for oof in (oof_xgb, oof_cat, oof_lgb, oof_xgb_cox, oof_cat_cox)
    )
)
m = score(y_true.copy(), y_pred.copy(), "ID")
print("\nOverall CV for Ensemble =", m)

In [None]:
# Start from the sample submission so the file keeps its expected layout.
sub = pd.read_csv("input/data/sample_submission.csv")
# Same rank-sum blend as the ensemble CV cell, applied to the test predictions.
# The column is written with item assignment: attribute assignment
# (sub.prediction = ...) only updates a column that already exists and would
# otherwise just set a Python attribute, leaving the CSV without predictions.
sub["prediction"] = rankdata(pred_xgb) + rankdata(pred_cat) + rankdata(pred_lgb)\
                     + rankdata(pred_xgb_cox) + rankdata(pred_cat_cox)
sub.to_csv("submission.csv",index=False)
print("Sub shape:",sub.shape)
sub.head()