In [255]:
#!pip download lifelines
#%pip install input/pip-install-lifelines/autograd-1.7.0-py3-none-any.whl
#%pip install input/pip-install-lifelines/autograd-gamma-0.5.0.tar.gz
#%pip install input/pip-install-lifelines/interface_meta-1.3.0-py3-none-any.whl
#%pip install input/pip-install-lifelines/formulaic-1.0.2-py3-none-any.whl
#%pip install input/pip-install-lifelines/lifelines-0.30.0-py3-none-any.whl

In [256]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from scipy import stats
from scipy.stats import rankdata 
import numpy as np
from tqdm import tqdm

import lightgbm as lgb
from lightgbm import LGBMRegressor

from scipy.stats import rankdata 
from catboost import CatBoostRegressor
from sklearn.model_selection import KFold, StratifiedKFold
from lifelines import KaplanMeierFitter, NelsonAalenFitter
from sklearn.preprocessing import StandardScaler, LabelEncoder, PolynomialFeatures
from lifelines import CoxPHFitter
from lifelines.utils import concordance_index
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.feature_selection import VarianceThreshold
from sklearn.decomposition import PCA
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

import xgboost as xgb
from xgboost import XGBRegressor, XGBClassifier

from catboost import CatBoostRegressor, CatBoostClassifier
import catboost as cb

from metric import score
import optuna
from typing import Dict, Tuple, Optional, List
import json

from itertools import combinations_with_replacement

In [None]:
# set analysis output directory
def create_output_directory(output_path):
    """Ensure ``output_path`` exists as a directory and return it unchanged.

    Missing parent directories are created as well; an already existing
    directory is left untouched.
    """
    if not os.path.isdir(output_path):
        # exist_ok keeps this safe if the directory appears in the meantime.
        os.makedirs(output_path, exist_ok=True)
    return output_path

output_path = 'working/analysis'
create_output_directory(output_path)

In [None]:
# Widen pandas' display limits so wide frames are shown without truncation.
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

# Both CSV paths are relative to the notebook's working directory.
test = pd.read_csv("input/data/test.csv")
print("Test shape:", test.shape )

train = pd.read_csv("input/data/train.csv")
print("Train shape:",train.shape)
# Last expression of the cell: rich display of the first training rows.
train.head()

In [None]:
# Train targets
# Overlaid histograms of efs_time, split by the efs flag.
plt.hist(train.loc[train.efs==1,"efs_time"],bins=100,label="efs=1, Yes Event")
plt.hist(train.loc[train.efs==0,"efs_time"],bins=100,label="efs=0, Maybe Event")
plt.xlabel("Time of Observation, efs_time")
# plt.hist is called without density=True, so the bars are raw counts.
plt.ylabel("Count")
plt.title("Times of Observation. Either time to event, or time observed without event.")
plt.legend()
# Save before show(): some backends clear the figure once it is shown.
plt.savefig(f'{output_path}/times_of_observation.png')
plt.show()

In [None]:
# missing values heatmap
def plot_missing_values_heatmap(df, output_path):
    """Draw a heatmap of the null mask of ``df``, save it as a PNG and show it.

    The image is written to ``<output_path>/missing_values_heatmap.png``.
    """
    fig, ax = plt.subplots(figsize=(16, 8))
    null_mask = df.isnull()
    sns.heatmap(null_mask, yticklabels=False, cbar=True, cmap='viridis', ax=ax)
    ax.set_title('Missing Values Heatmap')
    fig.tight_layout()
    fig.savefig(f'{output_path}/missing_values_heatmap.png')
    plt.show()
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close()

plot_missing_values_heatmap(train, output_path)

In [None]:
# missing value percentages
def plot_missing_values_bars(df, output_path):
    """Plot the percentage of missing values per column as horizontal bars.

    Columns are ordered from least to most missing. The image is written to
    ``<output_path>/missing_values_percentage.png`` and then shown.
    """
    null_counts = df.isnull().sum()
    missing_percentages = (null_counts / len(df) * 100).sort_values(ascending=True)

    fig, ax = plt.subplots(figsize=(16, 10))
    sns.barplot(x=missing_percentages.values, y=missing_percentages.index, ax=ax)
    ax.set_title('Percentage of Missing Values by Column')
    ax.set_xlabel('Percentage Missing')
    fig.tight_layout()
    fig.savefig(f'{output_path}/missing_values_percentage.png')
    plt.show()
    plt.close()

plot_missing_values_bars(train, output_path)

In [None]:
# categorical distributions
def plot_categorical_distributions(df, output_path):
    """Save one value-count bar chart per low-cardinality categorical column.

    Only object/category columns with fewer than 30 distinct values are
    plotted. Each chart is written to ``<output_path>/categorical_<col>.png``
    and closed without being shown.
    """
    categorical_cols = df.select_dtypes(include=['object', 'category']).columns

    for col in tqdm(categorical_cols, desc="Creating categorical plots"):
        # High-cardinality columns would give an unreadable chart; skip them.
        if df[col].nunique() >= 30:
            continue

        counts = df[col].value_counts()
        fig, ax = plt.subplots(figsize=(12, 6))
        sns.barplot(x=counts.index, y=counts.values, ax=ax)
        ax.set_title(f'Distribution of {col}')
        plt.xticks(rotation=45, ha='right')
        fig.tight_layout()
        fig.savefig(f'{output_path}/categorical_{col}.png')
        plt.close()

plot_categorical_distributions(train, output_path)

In [None]:
FEATURES = train.columns
print(f"Number of Features: {len(FEATURES)} FEATURES: {FEATURES}")

In [None]:
# Columns with object dtype in train are treated as categorical features.
CATS = [c for c in FEATURES if train[c].dtype == "object"]

# Give missing categories an explicit "Missing" label in both frames.
for c in CATS:
    train[c] = train[c].fillna("Missing")
    test[c] = test[c].fillna("Missing")
print(f"In these features, there are {len(CATS)} CATEGORICAL FEATURES: {CATS}")

In [None]:
print("Train DataFrame duplicate columns:", train.columns[train.columns.duplicated()].tolist())
print("Test DataFrame duplicate columns:", test.columns[test.columns.duplicated()].tolist())

In [None]:
# Find every train column that contains +inf, -inf or NaN.
# Despite its name, `inf_columns` also includes columns that only have NaN,
# because np.nan is part of the isin() list.
inf_columns = train.columns[train.isin([np.inf, -np.inf, np.nan]).any()].tolist()

print("Columns containing infinite or NaN values:")
if inf_columns:
    for col in inf_columns:
        # Per-column count of offending cells (inf and NaN combined).
        print(f"- {col}: {train[col].isin([np.inf, -np.inf, np.nan]).sum()} infinite or NaN values")
else:
    print("No infinite or NaN values found in any column")

In [None]:
inf_columns

In [None]:
# fix columns with NaN values
columns_to_fix = [
            'hla_match_c_high',
            'hla_high_res_8',
            'hla_low_res_6',
            'hla_high_res_6',
            'hla_high_res_10',
            'hla_match_dqb1_high',
            'hla_nmdp_6',
            'hla_match_c_low',
            'hla_match_drb1_low',
            'hla_match_dqb1_low',
            'hla_match_a_high',
            'donor_age',
            'hla_match_b_low',
            'hla_match_a_low',
            'hla_match_b_high',
            'comorbidity_score',
            'karnofsky_score',
            'hla_low_res_8',
            'hla_match_drb1_high',
            'hla_low_res_10',
            'age_at_hct'
]

def clean_nan_inf(df, columns_to_fix):
    """Replace missing and infinite values in the given columns, in place.

    For every column in ``columns_to_fix``:
      * ``+inf`` becomes the largest finite value of the column,
      * ``-inf`` becomes the smallest finite value of the column,
      * empty strings, ``None`` and NaN become 0.

    Args:
        df: DataFrame to clean. It is modified in place.
        columns_to_fix: iterable of column names present in ``df``.

    Returns:
        The same ``df`` object, for call-chaining convenience.
    """
    for c in columns_to_fix:
        values = df[c].replace(['', None], np.nan)

        # Take max/min over finite entries only; otherwise a column holding
        # +/-inf would report inf as its own maximum/minimum.
        finite = values[~values.isin([np.inf, -np.inf])]
        max_value = finite.max()
        min_value = finite.min()

        # replace() needs an explicit replacement value: calling it with only
        # `to_replace` does not substitute max/min for the infinities.
        values = values.replace(np.inf, max_value).replace(-np.inf, min_value)

        # Missing values (and infinities in a column with no finite entry,
        # where max/min are NaN) fall back to 0.
        df[c] = values.fillna(0)
    return df

train = clean_nan_inf(train, columns_to_fix)
test = clean_nan_inf(test, columns_to_fix)

In [None]:
# numerical distributions
def plot_numerical_distributions(df, output_path):
    """Save a histogram with KDE overlay for every numeric column of ``df``.

    Each figure is annotated with the column's mean, median and standard
    deviation, written to ``<output_path>/distribution_<col>.png`` and closed
    without being shown.
    """
    numeric_dtypes = ['int64', 'float64', 'int32', 'float32']
    numerical_cols = df.select_dtypes(include=numeric_dtypes).columns

    for col in tqdm(numerical_cols, desc="Creating distribution plots"):
        fig, ax = plt.subplots(figsize=(10, 6))

        # Histogram with a kernel density estimate drawn on top.
        sns.histplot(data=df, x=col, kde=True, ax=ax)
        ax.set_title(f'Distribution of {col}')
        ax.set_xlabel(col)
        ax.set_ylabel('Count')

        # Summary statistics shown in the top-right corner of the axes.
        series = df[col]
        stats_text = '\n'.join([
            f'Mean: {series.mean():.2f}',
            f'Median: {series.median():.2f}',
            f'Std: {series.std():.2f}',
        ])
        ax.text(0.95, 0.95, stats_text,
                transform=ax.transAxes,
                verticalalignment='top',
                horizontalalignment='right',
                bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))

        fig.tight_layout()
        fig.savefig(f'{output_path}/distribution_{col}.png')
        plt.close()

plot_numerical_distributions(train, output_path)

In [None]:
# correlation matrix
def plot_correlation_matrix(df, output_path):
    """Plot an annotated correlation heatmap of the numeric columns of ``df``.

    Nothing is drawn when fewer than two numeric columns exist. Otherwise the
    figure is written to ``<output_path>/correlation_matrix.png`` and shown.
    """
    numerical_cols = df.select_dtypes(include=['int64', 'float64', 'int32', 'float32']).columns

    # A correlation matrix needs at least two columns to be meaningful.
    if len(numerical_cols) <= 1:
        return

    correlation_matrix = df[numerical_cols].corr()
    fig, ax = plt.subplots(figsize=(24, 16))
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0, ax=ax)
    ax.set_title('Correlation Matrix')
    fig.tight_layout()
    fig.savefig(f'{output_path}/correlation_matrix.png')
    plt.show()
    plt.close()

plot_correlation_matrix(train, output_path)

In [271]:
# Feature Engineering
def feature_engineering(df):
    """Return a copy of ``df`` with the engineered feature columns added.

    The input frame is not modified. Statistics used for imputation, scaling
    and quantile binning (medians, mean/std, max, qcut edges) are computed
    from the frame passed in, so calling this separately on train and test
    uses each frame's own statistics.

    Feature families added: age bins and age crosses, HLA aggregates, GvHD
    prophylaxis flags parsed from ``gvhd_proph``, Karnofsky transforms, DRI
    risk encodings, composite risk/compatibility scores, degree-2 polynomial
    terms of selected numeric columns, and many categorical string crosses.

    Args:
        df: DataFrame holding the raw columns referenced below (for example
            ``age_at_hct``, ``donor_age``, ``dri_score``, ``gvhd_proph``).

    Returns:
        A new DataFrame with the original columns plus the engineered ones.
    """
    df = df.copy()
    # Impute the core numeric columns: comorbidity, donor age and age use the
    # column median, karnofsky_score uses 0.
    median_comorbidity = df['comorbidity_score'].median()
    df['comorbidity_score'] = df['comorbidity_score'].replace([np.inf, -np.inf, '', None], np.nan).fillna(median_comorbidity)
    df['karnofsky_score'] = df['karnofsky_score'].replace([np.inf, -np.inf, '', None], np.nan).fillna(0)
    median_donor_age = df['donor_age'].median()
    df['donor_age'] = df['donor_age'].replace([np.inf, -np.inf, '', None], np.nan).fillna(median_donor_age)
    median_age = df['age_at_hct'].median()
    df['age_at_hct'] = df['age_at_hct'].replace([np.inf, -np.inf, '', None], np.nan).fillna(median_age)

    # Create age bins in 5-year intervals
    df['age_bin'] = pd.cut(df['age_at_hct'],
                          bins=range(0, 95, 5),  # Edges 0, 5, ..., 90
                          labels=[f'{i}-{i+4}' for i in range(0, 90, 5)],
                          include_lowest=True)

    # Dividing by an age of exactly 0 would produce +/-inf; use NaN for the
    # denominator instead so the age ratios are missing for those rows.
    age_nonzero = df['age_at_hct'].replace(0, np.nan)

    # comorbidity by age at hct
    df['comorbidity_age'] = df['comorbidity_score'] / age_nonzero

    df['age_bin_race'] = (df['age_bin'].astype(str) + '_' +
                      df['race_group'].astype(str)).astype('category')

    # age x cyto
    df['cyto_age'] = (df['cyto_score_detail'].astype(str) + '_' + df['age_bin'].astype(str)).astype('category')

    # Concatenate graft_type and prod_type
    # NOTE(review): 'graft_prod' is overwritten further down with
    # graft_type x age_bin, so this graft x prod_type cross never reaches the
    # returned frame. Kept to preserve the column's position.
    df['graft_prod'] = (df['graft_type'].astype(str) + '_' +
                    df['prod_type'].astype(str)).astype('category')

    # Concatenate graft_type and prim_disease_hct
    df['graft_prim_disease'] = (df['graft_type'].astype(str) + '_' +
                    df['prim_disease_hct'].astype(str)).astype('category')

    df['graft_cmv_status'] = (df['graft_type'].astype(str) + '_' +
                    df['cmv_status'].astype(str)).astype('category')

    # Concatenate age bin and pulm severe
    df['age_bin_pulm_severe'] = (df['age_bin'].astype(str) + '_' +
                    df['pulm_severe'].astype(str)).astype('category')

    # Concatenate age bin and dri score
    df['age_bin_dri'] = (df['age_bin'].astype(str) + '_' +
                    df['dri_score'].astype(str)).astype('category')

    # hla high mean
    df['hla_high_res_mean'] = df[['hla_high_res_8', 'hla_high_res_10', 'hla_high_res_6']].mean(axis=1)

    # hla low mean
    df['hla_low_res_mean'] = df[['hla_low_res_8', 'hla_low_res_10', 'hla_low_res_6']].mean(axis=1)

    # ratio of hla high to hla low (+1 in the denominator avoids division by zero)
    df['hla_ratio_res_highlow'] = (df['hla_high_res_mean'])/(df['hla_low_res_mean']+1)

    # ratio of hla low to hla high
    df['hla_ratio_res_lowhigh'] = (df['hla_low_res_mean'])/(df['hla_high_res_mean']+1)

    # age functions
    # (comorbidity_score_by_age_at_hct has the same formula as comorbidity_age)
    df['donor_by_age_at_hct'] = df['donor_age'] / age_nonzero
    df['comorbidity_score_by_age_at_hct'] = df['comorbidity_score'] / age_nonzero

    # match drb
    df['hla_match_drb1_mean'] = df[['hla_match_drb1_high', 'hla_match_drb1_low']].mean(axis=1)

    # match dqb
    df['hla_match_dqb1_mean'] = df[['hla_match_dqb1_high', 'hla_match_dqb1_low']].mean(axis=1)

    # additional ratios
    # (hla_high_low_ratio has the same formula as hla_ratio_res_highlow)
    df['hla_high_low_ratio'] = (df['hla_high_res_mean']) / (df['hla_low_res_mean'] + 1)
    df['drb1_dqb1_ratio'] = (df['hla_match_drb1_mean']) / (df['hla_match_dqb1_mean'] + 1)

    # difference in features
    df['high_low_diff'] = df['hla_high_res_mean'] - df['hla_low_res_mean']
    df['drb1_dqb1_diff'] = df['hla_match_drb1_mean'] - df['hla_match_dqb1_mean']

    # statistical aggregations
    df['hla_mean'] = df[['hla_high_res_mean', 'hla_low_res_mean', 'hla_match_drb1_mean', 'hla_match_dqb1_mean']].mean(axis=1)
    df['hla_std'] = df[['hla_high_res_mean', 'hla_low_res_mean', 'hla_match_drb1_mean', 'hla_match_dqb1_mean']].std(axis=1)
    df['hla_max'] = df[['hla_high_res_mean', 'hla_low_res_mean', 'hla_match_drb1_mean', 'hla_match_dqb1_mean']].max(axis=1)
    df['hla_min'] = df[['hla_high_res_mean', 'hla_low_res_mean', 'hla_match_drb1_mean', 'hla_match_dqb1_mean']].min(axis=1)

    # interaction terms
    df['drb1_high_interaction'] = df['hla_match_drb1_mean'] * df['hla_high_res_mean']
    df['dqb1_low_interaction'] = df['hla_match_dqb1_mean'] * df['hla_low_res_mean']

    # with or without TBI
    df['with_tbi'] = np.where((df['tbi_status']).astype(str) == 'No TBI', 'no', 'yes')

    # Create donor sex feature (first character of sex_match)
    df['sex_donor'] = (df['sex_match']).astype(str).str[0]

    # Create recipient sex feature (third character of sex_match)
    df['sex_recipient'] = (df['sex_match']).astype(str).str[2]

    # Main drug presence
    df['has_FK'] = (df['gvhd_proph']).astype(str).str.contains('FK', na=False).astype(int)
    df['has_MMF'] = (df['gvhd_proph']).astype(str).str.contains('MMF', na=False).astype(int)
    df['has_MTX'] = (df['gvhd_proph']).astype(str).str.contains('MTX', na=False).astype(int)
    df['has_CSA'] = (df['gvhd_proph']).astype(str).str.contains('CSA', na=False).astype(int)
    df['has_cyclophosphamide'] = (df['gvhd_proph']).astype(str).str.contains('Cyclophosphamide', na=False).astype(int)

    # Check for combination therapy
    # Raw strings: '\+' in a normal literal is an invalid escape sequence.
    df['has_combination'] = (df['gvhd_proph']).astype(str).str.contains(r'\+', na=False).astype(int)

    # Count number of agents (approximate by counting '+' signs)
    df['n_agents'] = (df['gvhd_proph']).astype(str).str.count(r'\+').add(1)

    # Complex vs Simple regimen
    df['is_complex'] = (df['gvhd_proph']).astype(str).str.contains('others', na=False).astype(int)

    # Depletion-based therapy
    df['is_depletion_based'] = (df['gvhd_proph']).astype(str).str.contains('TDEPLETION|CDselect', na=False).astype(int)

    # Monotherapy flag
    df['is_monotherapy'] = (df['gvhd_proph']).astype(str).str.contains('alone', na=False).astype(int)

    # No prophylaxis flag
    df['no_prophylaxis'] = ((df['gvhd_proph']).astype(str) == 'No GvHD Prophylaxis').astype(int)

    def get_primary_agent(x):
        """Map one gvhd_proph value to a coarse primary-agent label."""
        if pd.isna(x):
            return 'Unknown'
        x = str(x)
        if 'FK' in x:
            return 'FK-based'
        elif 'CSA' in x:
            return 'CSA-based'
        elif 'Cyclophosphamide' in x:
            return 'Cyclophosphamide-based'
        elif 'TDEPLETION' in x or 'CDselect' in x:
            return 'Depletion-based'
        else:
            return 'Other'

    # Apply to the raw values: after .astype(str) a NaN would be the string
    # 'nan' and the 'Unknown' branch could never be taken.
    df['primary_agent'] = df['gvhd_proph'].apply(get_primary_agent)

    # Standard vs Alternative approach
    df['is_standard_approach'] = (df['gvhd_proph']).astype(str).str.contains(r'FK\+ MMF|FK\+ MTX|CSA \+ MTX', na=False).astype(int)

    # Experimental/Other
    df['is_experimental'] = ((df['gvhd_proph']).astype(str) == 'Other GVHD Prophylaxis').astype(int)

    # Combine with other relevant features
    df['FK_MMF_interaction'] = df['has_FK'] * df['has_MMF']
    #df['CSA_MTX_interaction'] = df['has_CSA'] * df['has_MTX']

    # karnofsky score
    df['karnofsky_age_at_hct'] = df['karnofsky_score'] * df['age_at_hct']
    df['karnofsky_standardized'] = (df['karnofsky_score'] - df['karnofsky_score'].mean()) / df['karnofsky_score'].std()

    # Create performance categories
    # NOTE(review): the first bin is (0, 60], so a score of exactly 0 (the
    # fill value used for missing scores above) gets no category (NaN).
    df['karnofsky_category'] = pd.cut(
        df['karnofsky_score'],
        bins=[0, 60, 80, 90, 100],
        labels=['poor', 'moderate', 'good', 'excellent']
    )

    # Clinical thresholds
    df['karnofsky_below_70'] = (df['karnofsky_score'] < 70).astype(int)
    df['karnofsky_above_80'] = (df['karnofsky_score'] > 80).astype(int)

    # Deviation from median
    median_karnofsky = df['karnofsky_score'].median()
    df['karnofsky_deviation'] = df['karnofsky_score'] - median_karnofsky

    # vivo
    # fillna returns a new Series; assign it back or the fill is discarded.
    df['in_vivo_tcd'] = df['in_vivo_tcd'].fillna("Missing")
    df['prim_disease_hct'] = df['prim_disease_hct'].fillna("Missing")
    df['vivo_age_bin'] = (df['age_bin'].astype(str) + '_' +
                    df['in_vivo_tcd'].astype(str)).astype('category')
    df['vivo_comorbidity'] = (df['comorbidity_score'].astype(str) + '_' +
                    df['in_vivo_tcd'].astype(str)).astype('category')
    df['vivo_prim_disease'] = (df['prim_disease_hct'].astype(str) + '_' +
                    df['in_vivo_tcd'].astype(str)).astype('category')

    # dri
    # Map risk levels to numeric scores
    risk_map = {
        'Low': 1,
        'Intermediate': 2,
        'High': 3,
        'Very high': 4,
        'N/A - non-malignant indication': -1,
        'N/A - pediatric': -2,
        'N/A - disease not classifiable': -3,
        'TBD cytogenetics': -4,
        'Missing disease status': 0
    }

    # Create numeric DRI score
    df['dri_numeric'] = df['dri_score'].map(risk_map)

    # Labels not present in risk_map (including missing ones) become 0
    df['dri_numeric'] = df['dri_numeric'].fillna(0)

    # Create binary features for risk levels
    df['is_high_risk'] = df['dri_score'].isin(['High', 'Very high']).astype(int)
    df['is_standard_risk'] = df['dri_score'].isin(['Low', 'Intermediate']).astype(int)
    # df['is_special_case'] = df['dri_score'].str.contains('N/A|TBD|Missing', na=False).astype(int)

    # Combine with other relevant features
    df['dri_age'] = df['dri_numeric'] * df['age_at_hct']
    df['dri_comorbidity'] = df['dri_numeric'] * df['comorbidity_score']
    df['dri_karnofsky'] = df['dri_numeric'] / (df['karnofsky_score'] + 1)

    # Risk-weighted karnofsky performance
    df['karnofsky_weighted_performance'] = (
        df['karnofsky_score'] *
        (1 - df['dri_age'].clip(0, 1))  # Clip DRI age to [0,1] range
    )

    # Create risk-adjusted Karnofsky
    df['karnofsky_risk_adjusted'] = df['karnofsky_score'] * (1 + df['dri_age'])

    # Create categorical combinations (dri_score x tbi_status)
    df['dri_disease_status'] = (df['dri_score'].astype(str) + '_' +
                               df['tbi_status'].astype(str)).astype('category')

    # Ethnicity
    df['ethnicity'] = df['ethnicity'].fillna("Missing")
    df['age_bin_ethnicity'] = (df['age_bin'].astype(str) + '_' +
                    df['ethnicity'].astype(str)).astype('category')
    df['dri_ethnicity'] = (df['dri_score'].astype(str) + '_' +
                    df['ethnicity'].astype(str)).astype('category')
    df['ethnicity_vivo'] = (df['ethnicity'].astype(str) + '_' +
                    df['in_vivo_tcd'].astype(str)).astype('category')

    # Calculate HLA mismatch score
    # NOTE(review): counting columns minus the row sum assumes each match
    # column is at most 1 — TODO confirm the value range of these columns.
    hla_match_cols = [col for col in df.columns if 'hla_match' in col and ('high' in col or 'low' in col)]
    df['hla_mismatch_score'] = len(hla_match_cols) - df[hla_match_cols].sum(axis=1)

    # HLA Match Score Aggregations
    # NOTE(review): this substring match also selects the derived
    # hla_match_drb1_mean / hla_match_dqb1_mean columns created above, so they
    # are included in the total — confirm that is intended.
    hla_cols = [col for col in df.columns if 'hla_match' in col]
    df['hla_match_total'] = df[hla_cols].sum(axis=1)

    # Calculate weighted HLA match score with available columns
    major_matches = df[['hla_match_drb1_high']].fillna(0) * 2  # Major histocompatibility
    minor_matches = df[['hla_match_dqb1_high']].fillna(0)      # Minor histocompatibility
    df['hla_match_weighted'] = major_matches.sum(axis=1) + minor_matches.sum(axis=1)

    # Time-based Features
    df['transplant_era'] = pd.cut(df['year_hct'],
                                 bins=[2007, 2010, 2013, 2016, 2020],
                                 labels=['2008-2010', '2011-2013', '2014-2016', '2017-2019'])

    # Calculate relative time features
    df['years_since_2008'] = df['year_hct'] - 2008
    df['pre_post_2015'] = (df['year_hct'] > 2015).astype(int)  # Major increase in cases after 2015

    # Complex Disease Indicators - handling binary values correctly
    df['multiple_conditions'] = ((df['cardiac'] == 'Yes').astype(int) +
                               (df['pulm_moderate'] == 'Yes').astype(int) +
                               (df['hepatic_mild'] == 'Yes').astype(int) +
                               (df['renal_issue'] == 'Yes').astype(int))

    # Severity score: the condition count (0..4) capped at 3.
    df['condition_severity'] = df['multiple_conditions'].clip(upper=3)

    # Interaction Features
    df['age_comorbidity_interaction'] = df['age_at_hct'] * df['comorbidity_score']
    df['karnofsky_hla_interaction'] = df['karnofsky_score'] * df['hla_match_total']

    # Risk Stratification
    df['high_risk_combination'] = ((df['is_high_risk'] == 1) &
                                 (df['comorbidity_score'] > df['comorbidity_score'].median()) &
                                 (df['age_at_hct'] > df['age_at_hct'].median())).astype(int)

    # Treatment Intensity Score (intensities outside the map yield NaN -> 0)
    df['treatment_intensity'] = (df['conditioning_intensity'].map({'MAC': 2, 'RIC': 1, 'NMA': 0}) +
                               df['has_combination'] * 0.5 +
                               df['is_complex'] * 0.5).fillna(0)

    # Donor-Recipient Compatibility Score
    df['compatibility_score'] = (df['hla_match_total'] * 0.4 +
                               (df['sex_match'] == 'M-M').astype(int) * 0.3 +
                               (df['donor_age'] < 30).astype(int) * 0.3).fillna(0)

    # Clinical Status Composite
    df['clinical_status_score'] = ((1 - df['karnofsky_score'] / 100) * 0.4 +
                             (df['comorbidity_score'] / df['comorbidity_score'].max()) * 0.3 +
                             df['is_high_risk'] * 0.3)

    # Disease Control Features
    # NOTE(review): cmv_status is split on '/' further down, which suggests
    # its values are not the literals 'negative' / 'positive' compared here
    # and in immune_burden — confirm these comparisons can ever match.
    df['disease_control_factor'] = df.apply(lambda x:
        1 if x['dri_score'] in ['Low', 'Intermediate'] and x['cmv_status'] == 'negative'
        else 0.5 if x['dri_score'] == 'High'
        else 0, axis=1)

    # Immunological Burden Score
    df['immune_burden'] = (df['hla_mismatch_score'] * 0.4 +
                          df['has_combination'] * 0.3 +
                          (df['cmv_status'] == 'positive').astype(int) * 0.3)

    # create quantile of dri for polynomials
    # NOTE(review): with duplicates='drop' fewer than 5 bins may remain, in
    # which case dividing by 4 no longer scales the codes to [0, 1].
    df['dri_quantile'] = pd.qcut(df['dri_numeric'], q=5, labels=False, duplicates='drop') / 4

    # create numeric conditioning intensity
    # Map levels
    intensity_map = {
        'RIC': 3,
        'NMA': 2,
        'MAC': 5,
        'TBD': -1,
        'No drugs reported': 1,
        'N/A, F(pre-TED) not submitted': -2,
        'Missing': 0
    }

    # Create numeric conditioning intensity score
    df['conditioning_intensity_numeric'] = df['conditioning_intensity'].map(intensity_map)

    # Create polynomial features for key numeric variables
    numeric_cols = ['age_at_hct',
                    'comorbidity_score',
                    'karnofsky_score',
                    'donor_age',
                    'dri_numeric',
                    'compatibility_score',
                    'treatment_intensity',
                    'dri_age',
                    'dri_comorbidity',
                    'disease_control_factor',
                    'clinical_status_score',
                    'dri_quantile',
                    'conditioning_intensity_numeric'
                   ]

    poly = PolynomialFeatures(degree=2, include_bias=False)
    poly_features = poly.fit_transform(df[numeric_cols])

    # Create meaningful feature names
    feature_names = []

    # First add original features
    feature_names.extend(numeric_cols)

    # Add squared terms and interactions, in combinations_with_replacement
    # order so the names line up with the transformed columns.
    for combo in combinations_with_replacement(numeric_cols, 2):
        if combo[0] == combo[1]:
            feature_names.append(f'{combo[0]}_squared')
        else:
            feature_names.append(f'{combo[0]}_x_{combo[1]}')

    # Create DataFrame with meaningful names
    poly_df = pd.DataFrame(poly_features, columns=feature_names, index=df.index)

    # Remove original columns since they already exist in df
    poly_df = poly_df.drop(columns=numeric_cols)

    # Concatenate with original dataframe
    df = pd.concat([df, poly_df], axis=1)

    df['age_bin_high_risk'] = (df['age_bin'].astype(str) + '_' +
                            df['is_high_risk'].astype(str)).astype('category')

    df['age_bin_race_high_risk'] = (df['age_bin_race'].astype(str) + '_' +
                            df['is_high_risk'].astype(str)).astype('category')

    # Bin donor age in 5-year intervals
    df['donor_age_bin'] = pd.cut(df['donor_age'],
                          bins=range(0, 95, 5),  # Edges 0, 5, ..., 90
                          labels=[f'{i}-{i+4}' for i in range(0, 90, 5)],
                          include_lowest=True)

    df['donor_age_bin_related'] = (df['donor_age_bin'].astype(str) + '_' +
                            df['donor_related'].astype(str)).astype('category')

    df['age_bin_to_donor_age_bin'] = (df['age_bin'].astype(str) + '_' +
                            df['donor_age_bin'].astype(str)).astype('category')

    # depletion based features
    # Age interactions since age is a key factor
    df['depletion_age_interaction'] = df['is_depletion_based'] * df['age_at_hct']

    # Risk-related interactions
    df['depletion_risk_interaction'] = df['is_depletion_based'] * df['dri_numeric']

    # Comorbidity interaction
    df['depletion_comorbidity_interaction'] = df['is_depletion_based'] * df['comorbidity_score']

    # Combine multiple risk factors
    df['combined_risk_score'] = (
        (df['dri_numeric'] +
        df['comorbidity_score']) *
        (df['age_at_hct']/100)  # Scaled to be on similar range
    )

    # Risk category with depletion consideration
    df['risk_depletion_category'] = np.where(
        df['is_depletion_based'] == 1,
        df['combined_risk_score'] * 0.8,  # Adjust risk score based on depletion status
        df['combined_risk_score']
    )

    # Age groups with depletion consideration
    df['age_risk_group'] = pd.qcut(df['age_at_hct'], q=5, labels=['VL', 'L', 'M', 'H', 'VH'])

    # Age-depletion risk matrix
    df['age_depletion_risk'] = df.apply(
        lambda x: f"age_{x['age_risk_group']}_depl_{x['is_depletion_based']}",
        axis=1
    )

    # weighted risk score
    weights = {
            'dri_numeric': 0.4,
            'comorbidity_score': 0.3,
            'age_at_hct': 0.3
        }

    df['weighted_risk_score'] = (
        (weights['dri_numeric'] * df['dri_numeric']) +
        (weights['comorbidity_score'] * df['comorbidity_score']) +
        (weights['age_at_hct'] * (df['age_at_hct'] / 100))  # Scale age to be comparable
    )

    # Create quantile ranks for each component
    df['comorbidity_quantile'] = pd.qcut(df['comorbidity_score'], q=5, labels=False, duplicates='drop') / 4
    df['age_quantile'] = pd.qcut(df['age_at_hct'], q=5, labels=False, duplicates='drop') / 4

    # Combine using different weighting schemes
    df['risk_score_equal'] = (
        df['dri_quantile'] +
        df['comorbidity_quantile'] +
        df['age_quantile']
    ) / 3

    df['risk_score_dri_weighted'] = (
        0.5 * df['dri_quantile'] +
        0.25 * df['comorbidity_quantile'] +
        0.25 * df['age_quantile']
    )

    # additional graft features
    # NOTE(review): this replaces the graft_type x prod_type value stored in
    # 'graft_prod' earlier; the returned column is graft_type x age_bin.
    df['graft_prod'] = (df['graft_type'].astype(str) + '_' +
                    df['age_bin'].astype(str)).astype('category')

    df['graft_dri'] = (df['graft_type'].astype(str) + '_' +
                    df['dri_score'].astype(str)).astype('category')

    # Rituximab given with specific GVHD prophylaxis combinations
    df['rituximab'] = df['rituximab'].fillna('No')
    df['gvhd_proph'] = df['gvhd_proph'].fillna('Missing')
    df['rituximab_given_gvhd'] = np.where(
                                df['rituximab'] == 'Yes',
                                df['gvhd_proph'].astype(str) + '_with_rituximab',
                                df['gvhd_proph'].astype(str) + '_no_rituximab'
                            )
    df['rituximab_given_gvhd'] = df['rituximab_given_gvhd'].astype('category')

    # cmv status split between donor and recipient
    df['cmv_status'] = df['cmv_status'].fillna('Missing')
    cmv_parts = df['cmv_status'].str.split('/')
    df['cmv_donor'] = cmv_parts.str[0].replace({'': 'Missing'})
    # Values without a '/' (such as the 'Missing' fill value) have no second
    # part, so .str[1] is NaN for them; label those 'Missing' as well.
    df['cmv_recipient'] = cmv_parts.str[1].replace({'': 'Missing'}).fillna('Missing')

    # cmv interactions
    df['cmv_status_gvhd_proph'] = (df['cmv_status'].astype(str) + '_' +
                    df['gvhd_proph'].astype(str)).astype('category')
    df['cmv_status_conditioning_intensity'] = (df['cmv_status'].astype(str) + '_' +
                    df['conditioning_intensity'].astype(str)).astype('category')

    # conditioning intensity interactions to try and boost the catboost km model
    df['conditioning_intensity_prim_disease'] = (df['conditioning_intensity'].astype(str) + '_' +
                    df['prim_disease_hct'].astype(str)).astype('category')
    df['conditioning_intensity_age_bin'] = (df['conditioning_intensity'].astype(str) + '_' +
                    df['age_bin'].astype(str)).astype('category')
    df['conditioning_intensity_cyto'] = (df['conditioning_intensity'].astype(str) + '_' +
                    df['cyto_score'].astype(str)).astype('category')
    df['conditioning_intensity_karnofsky'] = (df['conditioning_intensity'].astype(str) + '_' +
                    df['karnofsky_score'].astype(str)).astype('category')
    df['conditioning_intensity_donor_age_bin'] = (df['conditioning_intensity'].astype(str) + '_' +
                    df['donor_age_bin'].astype(str)).astype('category')
    df['conditioning_intensity_race_group'] = (df['conditioning_intensity'].astype(str) + '_' +
                    df['race_group'].astype(str)).astype('category')
    df['conditioning_intensity_gvhd_proph'] = (df['conditioning_intensity'].astype(str) + '_' +
                    df['gvhd_proph'].astype(str)).astype('category')
    df['conditioning_intensity_rituximab'] = (df['conditioning_intensity'].astype(str) + '_' +
                    df['rituximab'].astype(str)).astype('category')
    df['conditioning_intensity_mrd_hct'] = (df['conditioning_intensity'].astype(str) + '_' +
                    df['mrd_hct'].astype(str)).astype('category')
    df['conditioning_intensity_year'] = (df['conditioning_intensity'].astype(str) + '_' +
                    df['year_hct'].astype(str)).astype('category')
    df['conditioning_intensity_tbi'] = (df['conditioning_intensity'].astype(str) + '_' +
                    df['tbi_status'].astype(str)).astype('category')

    # Clinical Status Composite with conditioning intensity numeric
    df['clinical_status_score_intensity'] = ((1 - df['karnofsky_score'] / 100) * 0.3 +
                             (df['comorbidity_score'] / df['comorbidity_score'].max()) * 0.2 +
                             (df['is_high_risk'] * 0.2) + (df['conditioning_intensity_numeric'] * 0.3))

    return df

# feature_engineering is called separately on each frame, so the medians and
# other statistics it derives come from that frame alone.
train = feature_engineering(train)
# Keep only training rows with at least 10 non-missing values; test rows are
# never dropped.
train = train.dropna(thresh=10, axis=0)
test = feature_engineering(test)

In [None]:
train['dri_quantile']

In [None]:
train['clinical_status_score']

In [274]:
# Feature Engineering
def advanced_fe(df):
    """Add the ``*_fn`` composite columns built from existing polynomial terms.

    Every new column combines two columns that must already exist on ``df``:
    a ``*_squared`` term and an ``*_x_*`` interaction term. All of them are
    sums, except ``comorbidity_treatment_intensity_fn`` which is the
    interaction term minus the squared term.

    Args:
        df: DataFrame holding the squared and interaction source columns.

    Returns:
        The same ``df`` object with the new columns assigned onto it
        (no copy is made).
    """
    # (new column, left operand, operator, right operand), in creation order.
    recipes = [
        # dri
        ('dri_comorbidity_fn', 'dri_comorbidity_squared', '+', 'dri_numeric_x_dri_comorbidity'),
        ('dri_num_comorbidity_fn', 'dri_numeric_squared', '+', 'dri_numeric_x_dri_comorbidity'),
        ('dri_num_compatibility_fn', 'dri_numeric_squared', '+', 'dri_numeric_x_compatibility_score'),
        # compatibility
        ('compatibility_dri_age_fn', 'compatibility_score_squared', '+', 'compatibility_score_x_dri_age'),
        ('compatibility_dri_comorbidity_fn', 'compatibility_score_squared', '+', 'compatibility_score_x_dri_comorbidity'),
        ('compatibility_treatment_intensity_fn', 'compatibility_score_squared', '+', 'compatibility_score_x_treatment_intensity'),
        # comorbidity
        ('comorbidity_compatibility_fn', 'comorbidity_score_squared', '+', 'comorbidity_score_x_compatibility_score'),
        ('comorbidity_dri_age_fn', 'comorbidity_score_squared', '+', 'comorbidity_score_x_dri_age'),
        ('comorbidity_dri_comorbidity_fn', 'comorbidity_score_squared', '+', 'comorbidity_score_x_dri_comorbidity'),
        ('comorbidity_dri_numeric_fn', 'comorbidity_score_squared', '+', 'comorbidity_score_x_dri_numeric'),
        ('comorbidity_karnofsky_fn', 'comorbidity_score_squared', '+', 'comorbidity_score_x_karnofsky_score'),
        # NOTE(review): the only recipe that subtracts -- confirm this is intended.
        ('comorbidity_treatment_intensity_fn', 'comorbidity_score_x_treatment_intensity', '-', 'comorbidity_score_squared'),
        # age
        ('age_comorbidity_fn', 'age_at_hct_squared', '+', 'age_at_hct_x_comorbidity_score'),
        ('age_compatibility_fn', 'age_at_hct_squared', '+', 'age_at_hct_x_compatibility_score'),
        ('age_dri_age_fn', 'age_at_hct_squared', '+', 'age_at_hct_x_dri_age'),
        ('age_dri_fn', 'age_at_hct_squared', '+', 'age_at_hct_x_dri_numeric'),
        ('age_karnofsky_fn', 'age_at_hct_squared', '+', 'age_at_hct_x_karnofsky_score'),
        ('age_treatment_intensity_fn', 'age_at_hct_squared', '+', 'age_at_hct_x_treatment_intensity'),
        # karnofsky
        ('karnofsky_compatibility_fn', 'karnofsky_score_squared', '+', 'karnofsky_score_x_compatibility_score'),
        ('karnofsky_dri_age_fn', 'karnofsky_score_squared', '+', 'karnofsky_score_x_dri_age'),
        ('karnofsky_dri_comorbidity_fn', 'karnofsky_score_squared', '+', 'karnofsky_score_x_dri_comorbidity'),
        ('karnofsky_dri_fn', 'karnofsky_score_squared', '+', 'karnofsky_score_x_dri_numeric'),
        ('karnofsky_treatment_intensity_fn', 'karnofsky_score_squared', '+', 'karnofsky_score_x_treatment_intensity'),
        # treatment intensity
        ('treatment_intensity_dri_age_fn', 'treatment_intensity_x_dri_age', '+', 'treatment_intensity_squared'),
        ('treatment_intensity_dri_comorbidity_fn', 'treatment_intensity_x_dri_comorbidity', '+', 'treatment_intensity_squared'),
    ]

    for new_col, left, op, right in recipes:
        if op == '-':
            df[new_col] = df[left] - df[right]
        else:
            df[new_col] = df[left] + df[right]

    return df

# Add the composite *_fn features to both splits. As after the base feature
# engineering, only training rows with fewer than 10 non-null values are dropped.
train = advanced_fe(train).dropna(axis=0, thresh=10)
test = advanced_fe(test)

In [None]:
# Inspect one of the composite columns created by advanced_fe.
train['dri_comorbidity_fn']

In [276]:
# Snapshot of every column currently on train. This is the full column index,
# not a curated predictor list, and it is not refreshed when later cells add columns.
FEATURES = train.columns

In [277]:
# Names (as a plain list, in FEATURES order) of the columns in FEATURES whose
# dtype on train is object or category.
CATS = train[FEATURES].select_dtypes(include=['object', 'category']).columns.tolist()

In [None]:
# Stack train on top of test so both splits share one integer encoding per column.
n_train_rows = len(train)
combined = pd.concat([train, test], axis=0, ignore_index=True)
#print("Combined data shape:", combined.shape )

# LABEL ENCODE CATEGORICAL FEATURES
print("We LABEL ENCODE the CATEGORICAL FEATURES: ",end="")
for col in FEATURES:
    # Only the object/category columns listed in CATS are encoded.
    if col not in CATS:
        continue

    print(f"{col}, ",end="")
    codes, _ = combined[col].factorize()
    # factorize() marks missing values with -1; subtracting the minimum shifts
    # the codes so the smallest one is 0.
    shifted = (codes - codes.min()).astype("int32")
    # LABEL ENCODE CATEGORICAL AND CONVERT TO INT32 CATEGORY
    combined[col] = pd.Series(shifted, index=combined.index).astype("category")

# Split back into the original train / test row blocks.
train = combined.iloc[:n_train_rows].copy()
test = combined.iloc[n_train_rows:].reset_index(drop=True).copy()

In [279]:
# Re-express every categorical column by its pandas category codes (integers;
# pandas uses -1 for a missing value) and store those codes as a category again.
for col in CATS:
    for frame in (train, test):
        frame[col] = frame[col].astype('category').cat.codes.astype('category')

    # Verify the encoding
    #print(f"\nAfter fixing {col}:")
    #print(train[col].value_counts())

In [None]:
# Transform Two Targets into One Target with KaplanMeier
def transform_survival_probability(df, time_col='efs_time', event_col='efs'):
    """Collapse (time, event) into one Kaplan-Meier survival value per row.

    A single KaplanMeierFitter is fitted on all rows of ``df``; its survival
    function is then evaluated at each row's own ``time_col`` value.

    Note: later cells of this notebook define other functions under this same
    name, so the name is rebound once those cells run.

    Args:
        df: DataFrame containing ``time_col`` and ``event_col``.
        time_col: name of the duration column.
        event_col: name of the event-observed column.

    Returns:
        The ``.values`` array of the fitted survival function evaluated at
        ``df[time_col]``, in row order.
    """
    durations = df[time_col]
    km_fitter = KaplanMeierFitter()
    km_fitter.fit(durations, df[event_col])
    return km_fitter.survival_function_at_times(durations).values
train["y_km"] = transform_survival_probability(train, time_col='efs_time', event_col='efs')

# Overlay the y_km histograms of the efs == 1 rows and the efs == 0 rows.
for efs_value, legend_label in ((1, "efs=1, Yes Event"), (0, "efs=0, Maybe Event")):
    plt.hist(train.loc[train.efs == efs_value, "y_km"], bins=100, label=legend_label)
plt.xlabel("Transformed Target y_km")
plt.ylabel("Density")
plt.title("KaplanMeier Transformed Target y using both efs and efs_time.")
plt.legend()
# Save before show() so the file is written from the still-open figure.
plt.savefig(f'{output_path}/kaplanmeier_transformed_target_y.png')
plt.show()

In [None]:
# Cox fitter
def transform_survival_probability(df):
    """Fit a penalized Cox model on the categorical columns and score each row.

    Works on a copy of ``df``. ``efs_time`` and ``efs`` are coerced to numeric
    (unparseable values become NaN) and rows missing either are dropped. A
    ``CoxPHFitter(penalizer=0.1)`` is fitted on those two columns plus the
    module-level ``CATS`` columns, and the partial hazard is predicted for the
    same rows.

    Note: this reuses the name of the Kaplan-Meier helper defined in an earlier
    cell and therefore replaces it.

    Args:
        df: DataFrame containing ``efs_time``, ``efs`` and every column in ``CATS``.

    Returns:
        Whatever ``CoxPHFitter.predict_partial_hazard`` returns for the rows
        that survived the NaN filter.
    """
    target_cols = ['efs_time', 'efs']
    frame = df.copy()
    for name in target_cols:
        frame[name] = pd.to_numeric(frame[name], errors='coerce')
    frame = frame.dropna(subset=target_cols)

    cox_model = CoxPHFitter(penalizer=0.1)
    cox_model.fit(frame[target_cols + CATS], duration_col='efs_time', event_col='efs')
    return cox_model.predict_partial_hazard(frame)
train["y_cox"] = transform_survival_probability(train)

# Overlay the y_cox histograms of the efs == 1 rows and the efs == 0 rows.
for efs_value, legend_label in ((1, "efs=1, Yes Event"), (0, "efs=0, Maybe Event")):
    plt.hist(train.loc[train.efs == efs_value, "y_cox"], bins=100, label=legend_label)
plt.xlabel("Transformed Target y_cox")
plt.ylabel("Density")
plt.title("Cox Transformed Target y_cox using both efs and efs_time.")
plt.legend()
# Save before show() so the file is written from the still-open figure.
plt.savefig(f'{output_path}/cox_transformed_target_y.png')
plt.show()

In [None]:
# NelsonAalenFitter
def transform_survival_probability(df):
    """Return the negated Nelson-Aalen cumulative hazard at each row's time.

    A NelsonAalenFitter is fitted on ``df['efs_time']`` / ``df['efs']`` and
    its cumulative hazard is evaluated at every row's own ``efs_time``. The
    result is multiplied by -1 before being returned.

    Note: this reuses the name of the helpers defined in earlier cells and
    therefore replaces them.

    Args:
        df: DataFrame containing ``efs_time`` and ``efs``.

    Returns:
        The ``.values`` array of the cumulative hazard at ``df['efs_time']``,
        negated, in row order.
    """
    event_times = df['efs_time']
    na_fitter = NelsonAalenFitter()
    na_fitter.fit(durations=event_times, event_observed=df['efs'])
    cumulative_hazard = na_fitter.cumulative_hazard_at_times(event_times).values
    return cumulative_hazard * -1
# Build the Nelson-Aalen target column.
train["y_na"] = transform_survival_probability(train)

# Plot the y_na column itself so the histograms match the axis label, the
# title and the output filename of this cell.
plt.hist(train.loc[train.efs==1,"y_na"],bins=100,label="efs=1, Yes Event")
plt.hist(train.loc[train.efs==0,"y_na"],bins=100,label="efs=0, Maybe Event")
plt.xlabel("Transformed Target y_na")
plt.ylabel("Density")
plt.title("Nelson Aalen Transformed Target y using both efs and efs_time.")
plt.legend()
# Save before show() so the file is written from the still-open figure.
plt.savefig(f'{output_path}/nelsonaalen_transformed_target_y.png')
plt.show()

In [283]:
def recode_dri_score(df):
    """Derive grouped and interaction features from ``dri_score``.

    Works on a copy of ``df`` and adds, in this order:

    * ``dri_score_is_2``: 1 where ``dri_score == 2``, else 0.
    * ``dri_score_grouped``: category with 11/8/10 -> 'rare', 5/6/0 ->
      'high_risk', 2 -> 'optimal', 1 -> 'baseline', anything else -> 'other'.
    * ``graft_dri_score_is_2``: ``(graft_type == 1)`` as int plus
      ``dri_score_is_2`` (a sum, so it ranges over 0..2).
    * ``graft_risk_interaction``: ``(graft_type == 1)`` as int times
      ``risk_score_dri_weighted``.

    The numeric codes are presumably the label-encoded values produced
    earlier in the notebook -- verify against the encoding cell.

    Args:
        df: DataFrame with ``dri_score``, ``graft_type`` and
            ``risk_score_dri_weighted`` columns.

    Returns:
        A new DataFrame; the input is left unmodified.
    """
    out = df.copy()
    dri = out['dri_score']

    is_score_two = dri == 2
    graft_is_one = (out['graft_type'] == 1).astype(int)

    # Binary flag for the score the original analysis singled out.
    out['dri_score_is_2'] = is_score_two.astype(int)

    # First matching mask wins; rows matching none fall through to 'other'.
    group_masks = [
        dri.isin([11, 8, 10]),   # rare categories
        dri.isin([5, 6, 0]),     # labelled "strong negative correlation" by the author
        is_score_two,            # labelled "strong positive correlation" by the author
        dri == 1,                # labelled "large group, moderate correlation" by the author
    ]
    group_labels = ['rare', 'high_risk', 'optimal', 'baseline']
    out['dri_score_grouped'] = np.select(group_masks, group_labels, default='other')
    out['dri_score_grouped'] = out['dri_score_grouped'].astype('category')

    # Graft-type combinations, both keyed on graft_type == 1.
    out['graft_dri_score_is_2'] = graft_is_one + out['dri_score_is_2']
    out['graft_risk_interaction'] = graft_is_one * out['risk_score_dri_weighted']

    return out

# Apply the dri_score recoding to both splits (each call returns a new frame).
train, test = (recode_dri_score(split) for split in (train, test))

In [None]:
# Inspect the graft_risk_interaction column added by recode_dri_score.
train['graft_risk_interaction']

In [None]:
# Inspect the dri_score_grouped category added by recode_dri_score.
train['dri_score_grouped']

In [286]:
def recode_conditioning_intensity(df):
    """Group ``conditioning_intensity`` codes into an ordered category.

    Works on a copy of ``df`` and adds:

    * ``conditioning_group``: ordered categorical with levels
      low_intensity < optimal_intensity < moderate_intensity <
      high_intensity < other. Code 0 -> low, 2 -> optimal, 3/4 -> moderate,
      5/6 -> high; code 1 and every other value -> other.
    * ``optimal_conditioning``: 1 where ``conditioning_intensity == 2``, else 0.

    The numeric codes are presumably the label-encoded values produced
    earlier in the notebook -- verify against the encoding cell.

    Args:
        df: DataFrame with a ``conditioning_intensity`` column.

    Returns:
        A new DataFrame; the input is left unmodified.
    """
    out = df.copy()
    intensity = out['conditioning_intensity']

    # One list serves as both the np.select choices and the ordered levels.
    levels = [
        'low_intensity',
        'optimal_intensity',
        'moderate_intensity',
        'high_intensity',
        'other',
    ]

    is_optimal = intensity == 2
    # Masks line up with `levels`; the trailing comments are the author's notes.
    masks = [
        intensity == 0,           # strong negative
        is_optimal,               # strong positive
        intensity.isin([3, 4]),   # moderate positive
        intensity.isin([5, 6]),   # weak effects
        intensity == 1,           # mixed effects
    ]

    out['conditioning_group'] = pd.Categorical(
        np.select(masks, levels, default='other'),
        categories=levels,
        ordered=True,
    )

    # Binary flag for the "optimal" code.
    out['optimal_conditioning'] = is_optimal.astype(int)

    return out

# Apply the conditioning-intensity recoding to both splits.
train, test = (recode_conditioning_intensity(split) for split in (train, test))

In [None]:
# Inspect the dri_disease_status column of the training frame.
train['dri_disease_status']

In [288]:
def recode_dri_disease_status(df):
    """Group ``dri_disease_status`` codes and attach a hand-set risk score.

    Works on a copy of ``df`` and adds:

    * ``disease_status_group``: ordered categorical with levels
      optimal_status < favorable_status < high_risk_status < common_status <
      rare_status < other. Codes outside every listed group become 'other'.
    * ``optimal_disease_status``: 1 where the code is in the optimal group
      (code 2), else 0.
    * ``disease_status_risk_score``: the weight looked up in a fixed table,
      0 for codes not in the table.

    The numeric codes are presumably the label-encoded values produced
    earlier in the notebook -- verify against the encoding cell.

    Args:
        df: DataFrame with a ``dri_disease_status`` column.

    Returns:
        A new DataFrame; the input is left unmodified.
    """
    out = df.copy()
    status = out['dri_disease_status']

    optimal_status = [2]  # author's note: strong positive correlations
    # (label, member codes); first match wins. Trailing comments are the author's notes.
    grouped_codes = [
        ('optimal_status', optimal_status),
        ('favorable_status', [17, 3]),            # moderate positive correlations
        ('high_risk_status', [9, 12, 0]),         # strong negative correlations
        ('common_status', [6]),                   # most frequent category
        ('rare_status', list(range(70, 83))),     # high numbered rare categories
    ]
    labels = [label for label, _ in grouped_codes]
    masks = [status.isin(codes) for _, codes in grouped_codes]

    out['disease_status_group'] = pd.Categorical(
        np.select(masks, labels, default='other'),
        categories=labels + ['other'],
        ordered=True,
    )

    # Binary flag for the optimal group.
    out['optimal_disease_status'] = status.isin(optimal_status).astype(int)

    # Hand-set weights per code (author's notes: sign/strength of correlation).
    risk_weights = {
        2: 1.0,
        17: 0.5,
        3: 0.5,
        9: -0.7,
        12: -0.5,
        0: -0.5,
    }
    out['disease_status_risk_score'] = status.map(risk_weights).fillna(0)

    return out

# Apply the disease-status recoding to both splits.
train, test = (recode_dri_disease_status(split) for split in (train, test))

In [None]:
# Inspect the disease_status_group category added by recode_dri_disease_status.
train['disease_status_group']

In [None]:
# Inspect the age_bin_high_risk column of the training frame.
train['age_bin_high_risk']

In [291]:
def recode_age_bin_high_risk(df):
    """Group ``age_bin_high_risk`` codes and attach risk score and flags.

    Works on a copy of ``df`` and adds:

    * ``age_risk_group``: ordered categorical with levels high_risk <
      favorable < moderate_favorable < moderate_risk < neutral < other.
      Codes outside every listed group become 'other'.
    * ``age_bin_risk_score``: the weight looked up in a fixed table, 0 for
      codes not in the table (codes 12 and 13 are grouped as
      moderate_favorable but have no weight, so they score 0).
    * ``age_bin_high_risk_flag``: 1 where the code is 14, else 0.
    * ``age_bin_favorable_flag``: 1 where the code is 23, 20 or 19, else 0.

    The numeric codes are presumably the label-encoded values produced
    earlier in the notebook -- verify against the encoding cell.

    Args:
        df: DataFrame with an ``age_bin_high_risk`` column.

    Returns:
        A new DataFrame; the input is left unmodified.
    """
    out = df.copy()
    age_code = out['age_bin_high_risk']

    strong_negative = [14]           # author's note: very strong negative (-0.355)
    strong_positive = [23, 20, 19]   # author's note: strong positive (>0.15)
    # (label, member codes); first match wins. Trailing comments are the author's notes.
    grouped_codes = [
        ('high_risk', strong_negative),
        ('favorable', strong_positive),
        ('moderate_favorable', [17, 11, 12, 13]),   # moderate positive (0.09-0.15)
        ('moderate_risk', [0, 18, 6]),              # moderate negative (-0.11 to -0.17)
        ('neutral', [1, 7, 22, 28]),                # very weak correlations (<|0.02|)
    ]
    labels = [label for label, _ in grouped_codes]
    masks = [age_code.isin(codes) for _, codes in grouped_codes]

    out['age_risk_group'] = pd.Categorical(
        np.select(masks, labels, default='other'),
        categories=labels + ['other'],
        ordered=True,
    )

    # Hand-set weights per code (author's notes: observed correlation strengths).
    risk_weights = {
        14: -0.355,
        23: 0.189,
        20: 0.186,
        19: 0.154,
        17: 0.134,
        11: 0.133,
        0: -0.165,
        18: -0.140,
        6: -0.115,
    }
    out['age_bin_risk_score'] = age_code.map(risk_weights).fillna(0)

    # Binary flags for the two strongest groups.
    out['age_bin_high_risk_flag'] = age_code.isin(strong_negative).astype(int)
    out['age_bin_favorable_flag'] = age_code.isin(strong_positive).astype(int)

    return out

# Apply the age-bin risk recoding to both splits.
train, test = (recode_age_bin_high_risk(split) for split in (train, test))

In [None]:
# Inspect the age_bin_high_risk_flag column added by recode_age_bin_high_risk.
train['age_bin_high_risk_flag']

In [None]:
# Inspect the age_bin_favorable_flag column added by recode_age_bin_high_risk.
train['age_bin_favorable_flag']

In [294]:
# advanced imputation of columns with significant missing values
def advanced_imputation(train_df, test_df):
    """Impute two fixed groups of columns, fitting on train and applying to both.

    Both inputs are copied first. Two steps are then applied:

    1. KNN step: ``knn_features`` are standardized with a StandardScaler,
       imputed with ``KNNImputer(n_neighbors=5, weights='distance',
       metric='nan_euclidean')`` and mapped back through the scaler's
       inverse transform.
    2. Mode step: ``mode_features`` are filled with
       ``SimpleImputer(strategy='most_frequent')``.

    Scaler and imputers are fitted on the training copy only; the test copy
    is only transformed.

    Args:
        train_df: training DataFrame containing every listed feature.
        test_df: test DataFrame containing every listed feature.

    Returns:
        ``(train, test)`` tuple of the imputed copies.
    """
    # Columns handled by the scaled KNN imputer.
    knn_features = ['tce_match', 'tce_imm_match', 'tce_div_match',
                    'cyto_score', 'cyto_score_detail', 'mrd_hct', 'hla_match_c_high']

    # Columns filled with their most frequent training value.
    mode_features = ['hla_match_drb1_high', 'hla_match_drb1_low',
                     'hla_match_dqb1_high', 'hla_match_dqb1_low',
                     'hla_high_res_6', 'hla_high_res_8', 'hla_high_res_10',
                     'hla_low_res_6', 'hla_low_res_8', 'hla_low_res_10']

    # Work on copies so the caller's frames are not modified.
    train_out, test_out = train_df.copy(), test_df.copy()

    # --- KNN step: scale -> impute -> unscale ---
    scaler = StandardScaler()
    knn_imputer = KNNImputer(n_neighbors=5, weights='distance', metric='nan_euclidean')

    train_filled = knn_imputer.fit_transform(scaler.fit_transform(train_out[knn_features]))
    test_filled = knn_imputer.transform(scaler.transform(test_out[knn_features]))

    train_out[knn_features] = scaler.inverse_transform(train_filled)
    test_out[knn_features] = scaler.inverse_transform(test_filled)

    # --- Mode step ---
    mode_imputer = SimpleImputer(strategy='most_frequent')
    train_out[mode_features] = mode_imputer.fit_transform(train_out[mode_features])
    test_out[mode_features] = mode_imputer.transform(test_out[mode_features])

    return train_out, test_out

# Usage: impute both splits; the imputers are fitted inside the function on
# the training frame only. The originals are left untouched (copies are returned).
train_imputed, test_imputed = advanced_imputation(train, test)

In [295]:
# From here on, train / test refer to the imputed frames.
train, test = train_imputed, test_imputed

In [296]:
# Recompute the list of object/category columns now that imputation may have
# changed some dtypes.
# NOTE(review): FEATURES is still the snapshot taken before the recode_* cells,
# so columns added since (e.g. dri_score_grouped, conditioning_group,
# disease_status_group, age_risk_group) are not considered here -- confirm
# that this is intended.
CATS = train[FEATURES].select_dtypes(include=['object', 'category']).columns.tolist()

In [297]:
# correlation matrix after feature engineering
#def plot_correlation_matrix(df, output_path):
#    df_copy = df.copy()   
#
#    numerical_cols = df_copy.select_dtypes(include=['int64', 'float64', 'int32', 'float32']).columns
#    
#    if len(numerical_cols) > 1:
#        plt.figure(figsize=(90, 90))
#        correlation_matrix = df_copy[numerical_cols].corr()
#        sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0)
#        plt.title('Correlation Matrix After Feature Engineering')
#        plt.tight_layout()
#        plt.savefig(f'{output_path}/correlation_matrix_feature_engineering.png')
#        plt.show()
#        plt.close()

#plot_correlation_matrix(train, output_path)

In [None]:
def check_variances(df, percentile_threshold=20, relative_threshold=0.0001):
    """Print variance diagnostics for the numeric columns of ``df``.

    Only int64/float64/int32/float32 columns are considered. Three reports
    are printed, each sorted from lowest to highest variance:

    1. columns whose variance is below the ``percentile_threshold``-th
       percentile of all variances;
    2. columns whose variance is below ``relative_threshold`` times the
       largest variance;
    3. every column's variance.

    Args:
        df: DataFrame to inspect.
        percentile_threshold: percentile (0-100) used as cutoff in report 1.
        relative_threshold: fraction of the maximum variance used as cutoff
            in report 2.

    Returns:
        None; the function only prints.
    """
    numeric_cols = df.select_dtypes(include=['int64', 'float64', 'int32', 'float32']).columns
    variances = df[numeric_cols].var()

    def show_below(cutoff):
        # Print, in ascending order, the variances strictly below `cutoff`.
        print(variances[variances < cutoff].sort_values())

    print("\n1. Percentile-based check ({}th percentile):".format(percentile_threshold))
    show_below(np.percentile(variances, percentile_threshold))

    print("\n2. Relative variance check (threshold={}*max):".format(relative_threshold))
    show_below(variances.max() * relative_threshold)

    # Full listing for reference.
    print("\nAll features sorted by variance (lowest to highest):")
    print(variances.sort_values())

# Print the variance diagnostics for the training frame using the default thresholds.
check_variances(train)

In [None]:
def plot_correlation_matrix(df, output_path):
    df_copy = df.copy()   
    
    # Features to potentially remove - add features to test
    low_variance_features = [
                    

                            ]

    # list of features to remove based on this analysis. This isn't used anywhere except to track which
    # features to remove based on variance and correlation
    features_to_remove = [
                        'hepatic_severe', 
                        'peptic_ulcer', 
                        'gvhd_proph',
                        'rheum_issue', 
                        'hla_high_low_ratio', 
                        'hla_ratio_res_highlow',
                        'no_prophylaxis',
                        'hla_ratio_res_lowhigh',
                        'has_MTX',
                        'dri_karnofsky',
                        'karnofsky_score',
                        'has_CSA',
                        'is_depletion_based',
                        'is_experimental',
                        'depletion_risk_interaction',
                        'hla_match_dqb1_mean',                         
                        'hla_match_dqb1_low',                       
                        'hla_match_dqb1_high',                        
                        'hla_match_c_low',                             
                        'hla_match_drb1_low',                          
                        'hla_min',                                      
                        'hla_match_drb1_high',                         
                        'hla_match_b_low',                                                     
                        'hla_match_a_low',                            
                        'hla_match_a_high',                             
                        'hla_match_b_high',
                        'sex_donor', 
                        'sex_recipient', 
                        'primary_agent',
                        'vivo_age_bin',
                        'vivo_prim_disease',
                        'dri_disease_status',
                        'transplant_era',
                        'age_bin_high_risk_flag',
                        'age_bin_to_donor_age_bin',
                        'pre_post_2015',
                        'hla_match_c_high',
                        'depletion_comorbidity_interaction',
                        'high_low_diff',                                
                        'hla_std',
                        'tce_div_match',                 
                        'tce_match',                          
                        'hla_mean',
                        'hla_high_res_6',                         
                        'hla_low_res_6',
                        'hla_nmdp_6',                                   
                        'hla_high_res_8',                             
                        'hla_low_res_8',                               
                        'hla_low_res_mean',                          
                        'hla_high_res_mean',
                        'tce_imm_match',                            
                        'hla_high_res_10',                            
                        'hla_low_res_10',
                        'years_since_2008',                            
                        'year_hct',
                        'comorbidity_score_by_age_at_hct',             
                        'comorbidity_age',                                                        
                        'depletion_age_interaction',
                        'compatibility_score_squared',
                        'donor_by_age_at_hct',
                        'karnofsky_score_x_compatibility_score',
                        'karnofsky_hla_interaction',
                        'karnofsky_score_x_donor_age',
                        'karnofsky_score_squared',
                        'karnofsky_weighted_performance',
                        'karnofsky_standardized',
                        'has_cyclophosphamide',
                        'immune_burden',
                        'vent_hist',
                        'compatibility_score',
                        'hla_mismatch_score',
                        'is_monotherapy',
                        'karnofsky_above_80',
                        'karnofsky_deviation',
                        'sex_match',
                        'donor_related', 
                        'melphalan_dose', 
                        'age_bin_race',
                        'graft_prod',
                        'graft_prim_disease',
                        'age_bin_dri',
                        'renal_issue', 
                        'prim_disease_hct', 
                        'cmv_status',
                        'rituximab',
                        'ethnicity',
                        'compatibility_treatment_intensity_fn',
                        'karnofsky_compatibility_fn',
                        'karnofsky_dri_comorbidity_fn',
                        'karnofsky_dri_fn',
                        'karnofsky_treatment_intensity_fn',
                        'conditioning_intensity_donor_age_bin',
                        'conditioning_intensity_race_group',
                        'conditioning_intensity_numeric_squared',
                        'dri_numeric_x_conditioning_intensity_numeric',
                        'treatment_intensity_x_conditioning_intensity_numeric',
                         ]

    # features tested and slated to keep
    features_to_keep = ['drb1_dqb1_ratio',
                        'disease_control_factor',
                        'is_high_risk',
                        'high_risk_combination',
                        'dri_age_squared',
                        'dri_quantile',
                        'risk_score_dri_weighted',
                        'drb1_dqb1_diff',
                        'age_quantile',
                        'risk_score_dri_weighted',
                        'risk_score_equal',
                        'comorbidity_quantile',
                        'has_combination',
                        'hla_match_drb1_mean', #cox only
                        'drb1_dqb1_diff',
                        'has_MMF', #cox high
                        'is_complex', #cox high
                        'has_FK', #cox high
                        'FK_MMF_interaction', #cox high
                        'is_standard_approach', #cox high
                        'is_standard_risk', #cox only, negative for others
                        'condition_severity',
                        'multiple_conditions',
                        'weighted_risk_score',
                        'mrd_hct', #cox only
                        'n_agents', #cox high
                        'dri_numeric',                              
                        'treatment_intensity',
                        'cyto_score_detail',
                        'cyto_score',
                        'hla_max',                                   
                        'hla_match_weighted',                          
                        'comorbidity_score',                            
                        'risk_depletion_category',                     
                        'combined_risk_score',
                        'dri_numeric_x_treatment_intensity',
                        'dri_numeric_squared',                         
                        'treatment_intensity_squared',                
                        'comorbidity_score_x_treatment_intensity',     
                        'comorbidity_score_x_dri_numeric',             
                        'dri_comorbidity',
                        'dqb1_low_interaction',  # cox only                     
                        'drb1_high_interaction', #cox only
                        'treatment_intensity_x_dri_comorbidity',       
                        'dri_numeric_x_compatibility_score',          
                        'compatibility_score_x_treatment_intensity',   
                        'dri_numeric_x_dri_comorbidity',                
                        'comorbidity_score_squared',
                        'comorbidity_score_x_compatibility_score',
                        'comorbidity_score_x_dri_comorbidity',                 
                        'compatibility_score_x_dri_comorbidity',       
                        'donor_age_x_dri_numeric',                   
                        'donor_age_x_treatment_intensity',             
                        'age_at_hct_x_treatment_intensity',          
                        'age_at_hct_x_dri_numeric',                    
                        'dri_comorbidity_squared',
                        'karnofsky_score_x_dri_numeric',                
                        'comorbidity_score_x_donor_age',             
                        'karnofsky_score_x_treatment_intensity',       
                        'age_comorbidity_interaction',
                        'age_at_hct_x_comorbidity_score',            
                        'treatment_intensity_x_dri_age',              
                        'donor_age_x_compatibility_score',             
                        'comorbidity_score_x_karnofsky_score',        
                        'dri_numeric_x_dri_age',
                        'age_at_hct_x_compatibility_score',           
                        'donor_age_x_dri_comorbidity',               
                        'age_at_hct_x_dri_comorbidity',               
                        'comorbidity_score_x_dri_age',
                        'karnofsky_score_x_dri_comorbidity',           
                        'compatibility_score_x_dri_age',              
                        'dri_age_x_dri_comorbidity',
                        'age_at_hct_x_donor_age',                    
                        'donor_age_squared',
                        'age_at_hct_squared',                         
                        'age_at_hct_x_karnofsky_score',
                        'karnofsky_age_at_hct',                       
                        'donor_age_x_dri_age',                         
                        'age_at_hct_x_dri_age',                        
                        'karnofsky_score_x_dri_age',                                       
                        'dri_age_squared',
                        'clinical_status_score',
                        'karnofsky_age_at_hct',
                        'karnofsky_risk_adjusted',
                        'combined_risk_score',
                        'risk_depletion_category',
                        'risk_score_equal',
                        'risk_score_dri_weighted',
                        'dri_score',
                        'dri_score_is_2',
                        'graft_dri_score_is_2',
                        'graft_risk_interaction',
                        'graft_type',
                        'optimal_conditioning',
                        'conditioning_intensity',
                        'race_group',
                        'age_bin',
                        'graft_cmv_status',
                        'karnofsky_category',
                        'disease_status_risk_score',
                        'age_bin_ethnicity', 
                        'dri_ethnicity',
                        'ethnicity_vivo',
                        'age_bin_favorable_flag',
                        'age_bin_risk_score',
                        'optimal_disease_status',
                        'age_bin_high_risk',
                        'age_risk_group',
                        'age_depletion_risk', 
                        'graft_dri',
                        'dri_score_grouped', 
                        'conditioning_group', 
                        'disease_status_group',
                        'vivo_comorbidity',

                        'age_bin_race_high_risk',
                        'donor_age_bin', 
                        'donor_age_bin_related',
                        'hla_match_total',
                        'karnofsky_below_70',
                        'psych_disturb',
                        'diabetes',
                        'tbi_status',
                        'arrhythmia',
                        'hepatic_mild',
                        'cardiac',
                        'pulm_moderate',
                        'cyto_age',
                        'age_bin_pulm_severe',
                        'with_tbi',
                        'pulm_severe',
                        'prod_type',
                        'obesity',
                        'in_vivo_tcd',
                        'prior_tumor',
                        'dri_comorbidity_fn',
                      	'dri_num_comorbidity_fn',
                        'dri_num_compatibility_fn',
                        'compatibility_dri_age_fn',
                        'compatibility_dri_comorbidity_fn',
                        'comorbidity_compatibility_fn',
                        'comorbidity_dri_age_fn',
                        'comorbidity_dri_comorbidity_fn',
                        'comorbidity_dri_numeric_fn',
                        'comorbidity_karnofsky_fn',
                        'comorbidity_treatment_intensity_fn',
                        'age_comorbidity_fn',
                        'age_compatibility_fn',
                        'age_dri_age_fn',
                        'age_dri_fn',
                        'age_karnofsky_fn',
                        'age_treatment_intensity_fn',
                        'karnofsky_dri_age_fn',
                        'treatment_intensity_dri_age_fn',
                        'treatment_intensity_dri_comorbidity_fn',


                        # new
                        'age_at_hct_x_clinical_status_score',
                        'age_at_hct_x_conditioning_intensity_numeric',
                        'age_at_hct_x_disease_control_factor',
                        'age_at_hct_x_dri_quantile',
                        'clinical_status_score_intensity',
                        'clinical_status_score_squared',
                        'clinical_status_score_x_conditioning_intensity_numeric',
                        'clinical_status_score_x_dri_quantile',
                        'comorbidity_score_x_clinical_status_score',
                        'comorbidity_score_x_conditioning_intensity_numeric',
                        'comorbidity_score_x_disease_control_factor',
                        'comorbidity_score_x_dri_quantile',
                        'compatibility_score_x_clinical_status_score',
                        'compatibility_score_x_conditioning_intensity_numeric',
                        'compatibility_score_x_disease_control_factor',
                        'compatibility_score_x_dri_quantile',
                        'rituximab_given_gvhd',
                        'cmv_donor',
                        'cmv_recipient',
                        'cmv_status_gvhd_proph',
                        'cmv_status_conditioning_intensity',
                        'conditioning_intensity_prim_disease',
                        'conditioning_intensity_age_bin',
                        'conditioning_intensity_cyto',
                        'conditioning_intensity_karnofsky',
                        'conditioning_intensity_gvhd_proph',
                        'conditioning_intensity_rituximab',
                        'conditioning_intensity_mrd_hct',
                        'conditioning_intensity_year',
                        'conditioning_intensity_tbi',
                        'conditioning_intensity_numeric',
                        'disease_control_factor_squared',
                        'disease_control_factor_x_clinical_status_score',
                        'disease_control_factor_x_conditioning_intensity_numeric',
                        'disease_control_factor_x_dri_quantile',
                        'donor_age_x_clinical_status_score',
                        'donor_age_x_conditioning_intensity_numeric',
                        'donor_age_x_disease_control_factor',
                        'donor_age_x_dri_quantile',
                        'dri_age_x_clinical_status_score',
                        'dri_age_x_conditioning_intensity_numeric',
                        'dri_age_x_disease_control_factor',
                        'dri_age_x_dri_quantile',
                        'dri_comorbidity_x_clinical_status_score',
                        'dri_comorbidity_x_conditioning_intensity_numeric',
                        'dri_comorbidity_x_disease_control_factor',
                        'dri_comorbidity_x_dri_quantile',
                        'dri_numeric_x_clinical_status_score',
                        'dri_numeric_x_disease_control_factor',
                        'dri_numeric_x_dri_quantile',
                        'dri_quantile_squared',
                        'dri_quantile_x_conditioning_intensity_numeric',
                        'karnofsky_score_x_clinical_status_score',
                        'karnofsky_score_x_conditioning_intensity_numeric',
                        'karnofsky_score_x_disease_control_factor',
                        'karnofsky_score_x_dri_quantile',
                        'treatment_intensity_x_clinical_status_score',
                        'treatment_intensity_x_disease_control_factor',
                        'treatment_intensity_x_dri_quantile'
                       ]
    
    # Important features to keep
    key_features = [
        'age_at_hct',
        'donor_age',
        'dri_age'
    ]
    
    # Target variables
    target_variables = ['y_na', 'y_km', 'y_cox']
    
    # Combine features and targets
    features_to_analyze = low_variance_features + key_features + target_variables
    
    if len(features_to_analyze) > 1:
        plt.figure(figsize=(16, 8))
        
        # Create correlation matrix
        correlation_matrix = df_copy[features_to_analyze].corr()
        
        # Create mask to highlight target correlations
        mask = np.zeros_like(correlation_matrix)
        mask[:-3, :-3] = True  # Mask everything except target correlations
        
        # Plot heatmap
        sns.heatmap(correlation_matrix, 
                   annot=True, 
                   cmap='coolwarm', 
                   center=0,
                   fmt='.2f',
                   mask=mask,
                   vmin=-1, 
                   vmax=1)
        
        plt.title('Feature-Target Correlations')
        plt.tight_layout()
        plt.savefig(f'{output_path}/target_correlation_matrix.png', bbox_inches='tight')
        plt.show()
        plt.close()
        
        # Print sorted correlations with targets
        print("\nCorrelations with targets (sorted):")
        for target in target_variables:
            print(f"\n{target} correlations:")
            target_corr = correlation_matrix[target].sort_values(ascending=False)
            print(target_corr[target_corr.index != target])  # Exclude self-correlation

    return features_to_keep, features_to_remove, key_features

features_to_keep, features_to_remove, key_features = plot_correlation_matrix(train, output_path)

In [None]:
def find_unchecked_features(df, features_to_keep, features_to_remove, key_features=None):
    """Report the columns of ``df`` that are in none of the reviewed feature lists.

    Args:
        df: DataFrame whose columns are audited.
        features_to_keep: Iterable of column names already marked to keep.
        features_to_remove: Iterable of column names already marked to drop.
        key_features: Optional iterable of additional reviewed column names.
            When omitted, the module-level ``key_features`` variable is used
            if it exists (the behaviour of the original implementation),
            otherwise no extra names are assumed.

    Returns:
        set: Column names of ``df`` that appear in none of the three lists.
    """
    if key_features is None:
        # Backward compatibility: earlier callers relied on the notebook global.
        key_features = globals().get('key_features', [])

    # Set unions accept lists, tuples and pandas Index objects alike.
    checked_features = set(features_to_keep) | set(features_to_remove) | set(key_features)

    # Columns that have not been reviewed yet
    unchecked_columns = set(df.columns) - checked_features

    if unchecked_columns:
        print("\nUnchecked columns found:")
        print("------------------------")
        for col in sorted(unchecked_columns):
            print(f"{col} - dtype: {df[col].dtype}")
    else:
        print("\nAll columns have been checked!")

    return unchecked_columns

# Use the function
unchecked = find_unchecked_features(train, features_to_keep, features_to_remove)

In [None]:
def check_type_relationship(df):
    """Print how ``prod_type`` and ``graft_type`` overlap and correlate in ``df``."""
    prod = df['prod_type']
    graft = df['graft_type']

    # Contingency table of the two columns
    overlap_table = pd.crosstab(prod, graft)
    print("Cross-tabulation of prod_type vs graft_type:")
    print(overlap_table)

    # Correlation between the two columns (Series.corr default method)
    type_correlation = prod.corr(graft)
    print("\nCorrelation between prod_type and graft_type:")
    print(type_correlation)

check_type_relationship(train)

In [None]:
def analyze_categorical_correlations(df, features_to_keep, features_to_remove, target_variables=('y_na', 'y_km', 'y_cox')):
    """Correlate one-hot levels of not-yet-reviewed categorical columns with the targets.

    Every object/category column of ``df`` that is in neither
    ``features_to_keep`` nor ``features_to_remove`` is expanded with
    ``pd.get_dummies``; each dummy column is then correlated with each target
    via ``Series.corr``. A report is printed per column.

    Args:
        df: DataFrame holding the categorical columns and the target columns.
        features_to_keep: Iterable of column names already reviewed (kept).
        features_to_remove: Iterable of column names already reviewed (dropped).
        target_variables: Iterable of target column names. The default is an
            immutable tuple so the default object can never be mutated
            between calls.

    Returns:
        dict: ``{categorical_column: {target: {dummy_column: correlation}}}``.
    """
    # Get all categorical columns
    categorical_cols = df.select_dtypes(include=['object', 'category']).columns

    # Skip categorical columns that are already in the keep or remove lists
    # (set union also accepts tuples / pandas Index objects).
    checked_features = set(features_to_keep) | set(features_to_remove)
    unchecked_categorical = [col for col in categorical_cols if col not in checked_features]

    print("Analyzing these categorical columns:", unchecked_categorical)

    def _strength_key(item):
        # Strongest |correlation| first; undefined (NaN) correlations go last.
        # NaN never compares as smaller/greater, so sorting on it directly
        # would leave the order arbitrary.
        value = item[1]
        missing = pd.isna(value)
        return (missing, 0.0 if missing else -abs(value))

    correlations_by_feature = {}

    for cat_col in unchecked_categorical:
        print(f"\nAnalyzing {cat_col}")
        print(f"Unique values: {df[cat_col].nunique()}")
        print(f"Value distribution:\n{df[cat_col].value_counts()}\n")

        # One indicator column per category level
        dummies = pd.get_dummies(df[cat_col], prefix=cat_col)

        # Correlation of every indicator column with every target
        correlations = {
            target: {
                dummy_col: dummies[dummy_col].corr(df[target])
                for dummy_col in dummies.columns
            }
            for target in target_variables
        }

        correlations_by_feature[cat_col] = correlations

        # Print correlations in a readable format
        print(f"Correlations for {cat_col}:")
        for target, corrs in correlations.items():
            print(f"\n{target}:")
            for dummy_col, corr in sorted(corrs.items(), key=_strength_key):
                print(f"  {dummy_col}: {corr:.3f}")
        print("-" * 50)

    return correlations_by_feature

# Use the function
categorical_correlations = analyze_categorical_correlations(train, features_to_keep, features_to_remove)

In [None]:
def analyze_depletion_outcomes(df):
    """Compare the three target columns between the two ``is_depletion_based`` groups.

    Prints the normalized value counts of ``is_depletion_based``, draws one
    boxplot per target, then prints a Mann-Whitney U p-value and the group
    means for each target.
    """
    targets = ['y_na', 'y_km', 'y_cox']

    # Share of each depletion status
    print("Distribution of is_depletion_based:")
    print(df['is_depletion_based'].value_counts(normalize=True))

    # One boxplot panel per target, side by side
    _, axes = plt.subplots(1, 3, figsize=(12, 4))
    for ax, target in zip(axes, targets):
        sns.boxplot(x='is_depletion_based', y=target, data=df, ax=ax)
        ax.set_title(f'{target} by Depletion Status')

    plt.tight_layout()
    plt.show()

    # Group comparison per target
    is_depleted = df['is_depletion_based'] == 1
    not_depleted = df['is_depletion_based'] == 0
    for target in targets:
        depletion_group = df.loc[is_depleted, target]
        non_depletion_group = df.loc[not_depleted, target]

        _, pval = stats.mannwhitneyu(depletion_group, non_depletion_group)
        print(f"\nMann-Whitney U test for {target}:")
        print(f"p-value: {pval:.4f}")

        print(f"Mean for depletion group: {depletion_group.mean():.4f}")
        print(f"Mean for non-depletion group: {non_depletion_group.mean():.4f}")

analyze_depletion_outcomes(train)

In [None]:
def plot_survival_curves(df):
    """Plot Kaplan-Meier curves of ``efs_time``/``efs`` for each ``is_depletion_based`` group.

    The figure is written to
    ``{output_path}/is_depletion_based_survival_curve_km.png`` (``output_path``
    is read from module scope) and then shown.
    """
    from lifelines import KaplanMeierFitter

    fitter = KaplanMeierFitter()

    plt.figure(figsize=(10, 6))

    # (flag value, legend label) for each curve, in drawing order
    groups = [
        (1, 'Depletion-based'),
        (0, 'Non-depletion'),
    ]
    for flag, group_label in groups:
        subset = df[df['is_depletion_based'] == flag]
        fitter.fit(subset['efs_time'], subset['efs'], label=group_label)
        fitter.plot()

    plt.title('Survival Curves by Depletion Status')
    plt.xlabel('Time')
    plt.ylabel('Survival Probability')
    plt.grid(True)
    plt.savefig(f'{output_path}/is_depletion_based_survival_curve_km.png', bbox_inches='tight')
    plt.show()

# Plot the Kaplan-Meier survival curves by depletion status
plot_survival_curves(train)

In [None]:
def analyze_selection_bias(df):
    """Check whether clinical characteristics differ between the ``is_depletion_based`` groups.

    For each clinical variable the group means and a Mann-Whitney U p-value
    are printed. A three-panel figure is then saved to
    ``{output_path}/is_depletion_based_selection_bias.png`` (``output_path``
    is read from module scope) and shown.
    """
    # Clinical characteristics that might influence treatment selection
    clinical_vars = [
        'age_at_hct',
        'karnofsky_score',
        'is_high_risk',
        'condition_severity',
        'comorbidity_score',
        'dri_numeric',  # Disease Risk Index
        'compatibility_score'
    ]

    in_depletion = df['is_depletion_based'] == 1
    in_non_depletion = df['is_depletion_based'] == 0

    print("1. Distribution Analysis:")
    for var in clinical_vars:
        depletion = df.loc[in_depletion, var]
        non_depletion = df.loc[in_non_depletion, var]

        print(f"\n{var}:")
        print(f"Depletion group mean: {depletion.mean():.2f}")
        print(f"Non-depletion group mean: {non_depletion.mean():.2f}")

        # Statistical test
        _, pval = stats.mannwhitneyu(depletion, non_depletion)
        print(f"p-value: {pval:.4f}")

    # Visualize key relationships in three side-by-side panels
    _, (ax_age, ax_risk, ax_comorbidity) = plt.subplots(1, 3, figsize=(15, 5))

    # Panel 1: Age vs Karnofsky by depletion status
    sns.scatterplot(data=df, x='age_at_hct', y='karnofsky_score',
                    hue='is_depletion_based', alpha=0.5, ax=ax_age)
    ax_age.set_title('Age vs Karnofsky Score')

    # Panel 2: Risk factors
    sns.boxplot(data=df, x='is_depletion_based', y='dri_numeric', ax=ax_risk)
    ax_risk.set_title('Disease Risk Index by Depletion Status')

    # Panel 3: Comorbidity distribution
    sns.boxplot(data=df, x='is_depletion_based', y='comorbidity_score', ax=ax_comorbidity)
    ax_comorbidity.set_title('Comorbidity Score by Depletion Status')

    plt.tight_layout()
    plt.savefig(f'{output_path}/is_depletion_based_selection_bias.png', bbox_inches='tight')
    plt.show()

analyze_selection_bias(train)

In [306]:
SEED = 9365

def perform_pca(train, test, n_components=None, random_state=42):
    """Fit a PCA on ``train`` and project both ``train`` and ``test`` onto it.

    Rows containing NaN are dropped from both inputs before fitting. The
    returned frames keep the index labels of the rows that survived, so the
    components can still be matched or joined back to the original data.
    (Previously the index was silently reset to 0..n-1, which misaligned the
    components as soon as any row had been dropped.)

    Args:
        train: DataFrame used to fit the PCA.
        test: DataFrame transformed with the fitted PCA; must be compatible
            with the columns of ``train``.
        n_components: Forwarded to ``PCA``.
        random_state: Forwarded to ``PCA``.

    Returns:
        tuple: ``(train_pca_df, test_pca_df, pca)`` where the two frames have
        columns ``PC_1 .. PC_k`` and ``pca`` is the fitted estimator.
    """
    # Remove rows with NaN values from both datasets
    train_complete = train.dropna()
    test_complete = test.dropna()

    pca = PCA(n_components=n_components, random_state=random_state)
    train_pca = pca.fit_transform(train_complete)
    test_pca = pca.transform(test_complete)

    explained_variance_ratio = pca.explained_variance_ratio_
    print(f"Explained variance ratio of the components:\n {explained_variance_ratio}")
    print(np.sum(explained_variance_ratio))

    # Train and test projections share the same component count and names.
    component_names = [f'PC_{i+1}' for i in range(train_pca.shape[1])]
    train_pca_df = pd.DataFrame(train_pca, columns=component_names, index=train_complete.index)
    test_pca_df = pd.DataFrame(test_pca, columns=component_names, index=test_complete.index)

    return train_pca_df, test_pca_df, pca

In [307]:
# Copies of train/test without the ID column. Both names are reassigned in the
# later cell that defines RMV, so these versions only feed the preview below.
train_num = train.drop('ID', axis=1)
test_num = test.drop('ID', axis=1)

In [None]:
train_num.head()

In [309]:
# Columns that are identifiers or outcomes and must never be scaled or used as features
RMV = ["ID","efs","efs_time","y_cox", "y_km", "y_na"]

# Feature frames: train without the ID/outcome columns, test without the ID
train_num = train.drop(RMV, axis=1)
test_num = test.drop('ID', axis=1)

# Get numeric and categorical columns (column sets are decided on train only)
numeric_columns = train_num.select_dtypes(include=['int32', 'float32', 'int64', 'float64']).columns
categorical_columns = train_num.select_dtypes(exclude=['int32', 'float32', 'int64', 'float64']).columns

# Split into numeric and categorical dataframes
train_numeric = train_num[numeric_columns]
test_numeric = test_num[numeric_columns]
train_categorical = train[categorical_columns]
test_categorical = test[categorical_columns]

# Scale the numeric columns (scaler is fitted on train only).
# The original index is carried over explicitly: the scaler returns a bare
# array, and pd.concat(axis=1) below aligns on index labels, so a rebuilt
# 0..n-1 index would misalign rows whenever train/test do not already carry a
# default RangeIndex.
scaler = StandardScaler()
train_scaled = pd.DataFrame(
    scaler.fit_transform(train_numeric),
    columns=train_numeric.columns,
    index=train_numeric.index
)
test_scaled = pd.DataFrame(
    scaler.transform(test_numeric),
    columns=test_numeric.columns,
    index=test_numeric.index
)

#train_pca, test_pca, pca = perform_pca(train_scaled, test_scaled, n_components=15, random_state=SEED)

# Identifier/outcome columns are kept unscaled and re-attached to train
train_rmv = train[RMV]

# Merge scaled numeric data with categorical data
train_final = pd.concat([train_scaled, train_categorical, train_rmv], axis=1)
test_final = pd.concat([test_scaled, test_categorical], axis=1)

In [310]:
test = test_final
train = train_final

# Candidate feature columns: every column except the ID / outcome columns in
# RMV, so the targets cannot leak into a model that is handed FEATURES.
FEATURES = [col for col in train.columns if col not in RMV]

In [None]:
# Alphabetical list of every column in the merged training frame
print(sorted(train.columns))

In [None]:
train.dtypes

In [313]:
# Convert the recoded columns to integer-coded categoricals whose dtype is
# shared by train and test.
new_cats = ['age_risk_group', 'dri_score_grouped', 'conditioning_group', 'disease_status_group']

for col in new_cats:
    # Integer codes follow the TRAIN category order (missing values code to -1).
    train_categories = train[col].cat.categories
    train_codes = train[col].cat.codes
    # Test is coded against the same categories; levels unknown to train become -1.
    test_codes = test[col].cat.set_categories(train_categories).cat.codes

    # One categorical dtype for both frames. A plain astype('category') builds
    # the categories from the values present in each frame, so a small test set
    # would end up with a different categorical dtype than train and the model
    # could see inconsistent category encodings.
    shared_dtype = pd.CategoricalDtype(categories=sorted(train_codes.unique()))
    train[col] = train_codes.astype(shared_dtype)
    test[col] = test_codes.astype(shared_dtype)

In [None]:
train.head()

In [None]:
# Find the columns of train that hold +/-inf or NaN entries and report the count per column
nonfinite_markers = [np.inf, -np.inf, np.nan]
has_nonfinite = train.isin(nonfinite_markers).any()
inf_columns = train.columns[has_nonfinite].tolist()

print("Columns containing infinite or NaN values:")
if not inf_columns:
    print("No infinite values or NaN values found in any column")
else:
    for col in inf_columns:
        n_nonfinite = train[col].isin(nonfinite_markers).sum()
        print(f"- {col}: {n_nonfinite} infinite or NaN values")

In [316]:
final_features = features_to_keep + key_features

In [None]:
# Alphabetical list of the kept features plus the key features
print(sorted(final_features))

# XGBoost with KaplanMeier

In [None]:
# XGBoost with KaplanMeier: record the library version used for this run
print(f"Using XGBoost version {xgb.__version__}")

In [374]:
# Feature selection: explicit list of columns passed to the XGBoost model (commented-out entries are excluded)
FEATURES = [
    'age_at_hct',
    'age_at_hct_squared',
    'age_at_hct_x_clinical_status_score',
    'age_at_hct_x_comorbidity_score',
    'age_at_hct_x_compatibility_score',
    'age_at_hct_x_conditioning_intensity_numeric',
    'age_at_hct_x_disease_control_factor',
    'age_at_hct_x_donor_age',
    'age_at_hct_x_dri_age',
    'age_at_hct_x_dri_comorbidity',
    'age_at_hct_x_dri_numeric',
    'age_at_hct_x_dri_quantile',
    'age_at_hct_x_karnofsky_score',
    'age_at_hct_x_treatment_intensity',
    'age_bin',
    'age_bin_dri',
    'age_bin_ethnicity',
    'age_bin_favorable_flag',
    'age_bin_high_risk',
    #'age_bin_high_risk_flag',
    'age_bin_pulm_severe',
    'age_bin_race',
    'age_bin_race_high_risk',
    'age_bin_risk_score',
    'age_bin_to_donor_age_bin',
    'age_comorbidity_fn',
    'age_comorbidity_interaction',
    'age_compatibility_fn',
    'age_depletion_risk',
    'age_dri_age_fn',
    'age_dri_fn',
    'age_karnofsky_fn',
    'age_quantile',
    'age_risk_group',
    'age_treatment_intensity_fn',
    'arrhythmia',
    'cardiac',
    'clinical_status_score',
    'clinical_status_score_intensity',
    'clinical_status_score_squared',
    'clinical_status_score_x_conditioning_intensity_numeric',
    'clinical_status_score_x_dri_quantile',
    'cmv_donor',
    'cmv_recipient',
    'cmv_status',
    'cmv_status_conditioning_intensity',
    'cmv_status_gvhd_proph',
    'combined_risk_score',
    'comorbidity_age',
    'comorbidity_compatibility_fn',
    'comorbidity_score_by_age_at_hct',
    'comorbidity_dri_age_fn',
    'comorbidity_dri_comorbidity_fn',
    'comorbidity_dri_numeric_fn',
    'comorbidity_karnofsky_fn',
    'comorbidity_quantile',
    'comorbidity_score',
    'comorbidity_score_squared',
    'comorbidity_score_x_clinical_status_score',
    'comorbidity_score_x_compatibility_score',
    'comorbidity_score_x_conditioning_intensity_numeric',
    'comorbidity_score_x_disease_control_factor',
    'comorbidity_score_x_donor_age',
    'comorbidity_score_x_dri_age',
    'comorbidity_score_x_dri_comorbidity',
    'comorbidity_score_x_dri_numeric',
    'comorbidity_score_x_dri_quantile',
    'comorbidity_score_x_karnofsky_score',
    'comorbidity_score_x_treatment_intensity',
    'comorbidity_treatment_intensity_fn',
    'compatibility_score',
    'compatibility_dri_age_fn',
    'compatibility_dri_comorbidity_fn',
    'compatibility_score_squared',
    'compatibility_score_x_clinical_status_score',
    'compatibility_score_x_conditioning_intensity_numeric',
    'compatibility_score_x_disease_control_factor',
    'compatibility_score_x_dri_age',
    'compatibility_score_x_dri_comorbidity',
    'compatibility_score_x_dri_quantile',
    'compatibility_score_x_treatment_intensity',
    'compatibility_treatment_intensity_fn',
    'condition_severity',
    'conditioning_group',
    'conditioning_intensity',
    'conditioning_intensity_age_bin',
    'conditioning_intensity_cyto',
    'conditioning_intensity_donor_age_bin',
    'conditioning_intensity_gvhd_proph',
    'conditioning_intensity_karnofsky',
    'conditioning_intensity_mrd_hct',
    'conditioning_intensity_numeric',
    'conditioning_intensity_numeric_squared',
    'conditioning_intensity_prim_disease',
    'conditioning_intensity_race_group',
    'conditioning_intensity_rituximab',
    'conditioning_intensity_tbi',
    'conditioning_intensity_year',
    'cyto_age',
    'cyto_score',
    'cyto_score_detail',
    'depletion_age_interaction',
    'depletion_comorbidity_interaction',
    'depletion_risk_interaction',
    'diabetes',
    'disease_control_factor',
    'disease_control_factor_squared',
    'disease_control_factor_x_clinical_status_score',
    'disease_control_factor_x_conditioning_intensity_numeric',
    'disease_control_factor_x_dri_quantile',
    'disease_status_group',
    'disease_status_risk_score',
    'donor_by_age_at_hct',
    'donor_age',
    'donor_age_bin',
    'donor_age_bin_related',
    'donor_related',
    'donor_age_squared',
    'donor_age_x_clinical_status_score',
    'donor_age_x_compatibility_score',
    'donor_age_x_conditioning_intensity_numeric',
    'donor_age_x_disease_control_factor',
    'donor_age_x_dri_age',
    'donor_age_x_dri_comorbidity',
    'donor_age_x_dri_numeric',
    'donor_age_x_dri_quantile',
    'donor_age_x_treatment_intensity',
    'dqb1_low_interaction',
    'drb1_dqb1_diff',
    'drb1_dqb1_ratio',
    'drb1_high_interaction',
    'dri_age',
    'dri_age_squared',
    'dri_age_x_clinical_status_score',
    'dri_age_x_conditioning_intensity_numeric',
    'dri_age_x_disease_control_factor',
    'dri_age_x_dri_comorbidity',
    'dri_age_x_dri_quantile',
    'dri_comorbidity',
    'dri_comorbidity_fn',
    'dri_comorbidity_squared',
    'dri_comorbidity_x_clinical_status_score',
    'dri_comorbidity_x_conditioning_intensity_numeric',
    'dri_comorbidity_x_disease_control_factor',
    'dri_comorbidity_x_dri_quantile',
    'dri_disease_status',
    'dri_ethnicity',
    'dri_karnofsky',
    'dri_num_comorbidity_fn',
    'dri_num_compatibility_fn',
    'dri_numeric',
    'dri_numeric_squared',
    'dri_numeric_x_clinical_status_score',
    'dri_numeric_x_compatibility_score',
    'dri_numeric_x_conditioning_intensity_numeric',
    'dri_numeric_x_disease_control_factor',
    'dri_numeric_x_dri_age',
    'dri_numeric_x_dri_comorbidity',
    'dri_numeric_x_dri_quantile',
    'dri_numeric_x_treatment_intensity',
    'dri_quantile',
    'dri_quantile_squared',
    'dri_quantile_x_conditioning_intensity_numeric',
    'dri_score',
    'dri_score_grouped',
    'dri_score_is_2',
    'ethnicity',
    'ethnicity_vivo',
    'FK_MMF_interaction',
    'graft_cmv_status',
    'graft_dri',
    'graft_dri_score_is_2',
    'graft_prim_disease',
    'graft_prod',
    'graft_risk_interaction',
    'graft_type',
    'gvhd_proph',
    'has_CSA',
    'has_FK',
    'has_MMF',
    'has_MTX',
    'has_combination',
    'has_cyclophosphamide',
    'hepatic_mild',
    'hepatic_severe',
    'high_low_diff',
    'high_risk_combination',
    'hla_high_low_ratio',
    'hla_high_res_10',
    'hla_high_res_6',
    'hla_high_res_8',
    'hla_high_res_mean',
    'hla_low_res_10',
    'hla_low_res_6',
    'hla_low_res_8',
    'hla_low_res_mean',
    'hla_match_a_high',
    'hla_match_a_low',
    'hla_match_b_high',
    'hla_match_b_low',
    'hla_match_c_high',
    'hla_match_c_low',
    'hla_match_dqb1_high',
    'hla_match_dqb1_low',
    'hla_match_dqb1_mean',
    'hla_match_drb1_high',
    'hla_match_drb1_low',
    'hla_match_drb1_mean',
    'hla_match_total',
    'hla_match_weighted',
    'hla_max',
    'hla_mean',
    'hla_min',
    'hla_mismatch_score',
    'hla_nmdp_6',
    'hla_ratio_res_highlow',
    'hla_ratio_res_lowhigh',
    'hla_std',
    'immune_burden',
    'in_vivo_tcd',
    'is_complex',
    'is_depletion_based',
    #'is_experimental',
    'is_high_risk',
    'is_monotherapy',
    'is_standard_approach',
    'is_standard_risk',
    'karnofsky_above_80',
    'karnofsky_age_at_hct',
    'karnofsky_below_70',
    'karnofsky_category',
    'karnofsky_compatibility_fn',
    'karnofsky_deviation',
    'karnofsky_dri_age_fn',
    'karnofsky_dri_comorbidity_fn',
    'karnofsky_dri_fn',
    'karnofsky_hla_interaction',
    'karnofsky_risk_adjusted',
    'karnofsky_score',
    'karnofsky_score_squared',
    'karnofsky_score_x_clinical_status_score',
    'karnofsky_score_x_compatibility_score',
    'karnofsky_score_x_conditioning_intensity_numeric',
    'karnofsky_score_x_disease_control_factor',
    'karnofsky_score_x_donor_age',
    'karnofsky_score_x_dri_age',
    'karnofsky_score_x_dri_comorbidity',
    'karnofsky_score_x_dri_numeric',
    'karnofsky_score_x_dri_quantile',
    'karnofsky_score_x_treatment_intensity',
    'karnofsky_standardized',
    'karnofsky_treatment_intensity_fn',
    'karnofsky_weighted_performance',
    'melphalan_dose',
    'mrd_hct',
    'multiple_conditions',
    'n_agents',
    #'no_prophylaxis',
    'obesity',
    'optimal_conditioning',
    'optimal_disease_status',
    'peptic_ulcer',
    'pre_post_2015',
    'prim_disease_hct',
    'primary_agent',
    'prior_tumor',
    'prod_type',
    'psych_disturb',
    'pulm_moderate',
    'pulm_severe',
    'race_group',
    'renal_issue',
    'rheum_issue',
    'risk_depletion_category',
    'risk_score_dri_weighted',
    'risk_score_equal',
    'rituximab',
    'rituximab_given_gvhd',
    'sex_donor',
    'sex_match',
    'sex_recipient',
    'tbi_status',
    'tce_div_match',
    'tce_imm_match',
    'tce_match',
    'transplant_era',
    'treatment_intensity',
    'treatment_intensity_dri_age_fn',
    'treatment_intensity_dri_comorbidity_fn',
    'treatment_intensity_squared',
    'treatment_intensity_x_clinical_status_score',
    'treatment_intensity_x_conditioning_intensity_numeric',
    'treatment_intensity_x_disease_control_factor',
    'treatment_intensity_x_dri_age',
    'treatment_intensity_x_dri_comorbidity',
    'treatment_intensity_x_dri_quantile',
    'vent_hist',
    'vivo_age_bin',
    'vivo_comorbidity',
    'vivo_prim_disease',
    'weighted_risk_score',
    'with_tbi',
    'year_hct',
    'years_since_2008'
]

# Get categorical columns from train features
CATS = train[FEATURES].select_dtypes(include=['object', 'category']).columns.tolist()

In [375]:
class XGBoostModel:
    """K-fold XGBoost regressor with optional Optuna hyperparameter tuning.

    The model regresses on a single target column (``target``, ``'y_km'`` by
    default) and is evaluated with the project ``score`` metric, which is
    called with the ``ID``, ``efs``, ``efs_time`` and ``race_group`` columns.

    Args:
        n_folds: Number of KFold splits.
        random_state: Seed for the KFold shuffle and the Optuna sampler.
        n_trials: Number of Optuna trials when tuning is requested.
        best_params_path: JSON file used to persist / reload tuned parameters.
        target: Name of the training-target column.
        device: Value passed to XGBoost as ``device`` (e.g. ``'cuda'`` or ``'cpu'``).
    """

    def __init__(
        self,
        n_folds: int = 10,
        random_state: int = 42,
        n_trials: int = 50,
        best_params_path: str = 'best_xgb_params.json',
        target: str = 'y_km',
        device: str = 'cuda'
    ):
        self.n_folds = n_folds
        self.random_state = random_state
        self.n_trials = n_trials
        self.best_params_path = best_params_path
        self.target = target
        self.device = device
        self.best_params = None

    def objective(self, trial: "optuna.Trial", train_data: pd.DataFrame, valid_data: pd.DataFrame,
                 features: list) -> float:
        """Optuna objective: fit one model on ``train_data`` and score it on ``valid_data``.

        Returns:
            The ``score`` metric on the validation rows (the study maximizes it).
        """
        param = {
            'max_depth': trial.suggest_int('max_depth', 3, 5),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.01, 0.3),
            'subsample': trial.suggest_float('subsample', 0.8, 0.99),
            'n_estimators': trial.suggest_int('n_estimators', 1500, 1900),
            'learning_rate': trial.suggest_float('learning_rate', 0.009, 0.04, log=True),
            'min_child_weight': trial.suggest_int('min_child_weight', 50, 100),
            'device': self.device,
            'eval_metric': 'rmse',
            'objective': 'reg:squarederror',
            'enable_categorical': True
        }

        model = XGBRegressor(**param)
        model.fit(
            train_data[features], train_data[self.target],
            eval_set=[(valid_data[features], valid_data[self.target])],
            verbose=0
        )

        predictions = model.predict(valid_data[features])

        # Create prediction DataFrame in the format expected by `score`
        y_true = valid_data[["ID", "efs", "efs_time", "race_group"]].copy()
        y_pred = valid_data[["ID"]].copy()
        y_pred["prediction"] = predictions

        fold_score = score(y_true, y_pred, "ID")
        return fold_score

    def save_best_params(self, params: Dict) -> None:
        """Save the best parameters to the JSON file at ``best_params_path``."""
        with open(self.best_params_path, 'w') as f:
            json.dump(params, f)

    def load_best_params(self) -> Optional[Dict]:
        """Load the best parameters from ``best_params_path``; ``None`` if the file is absent."""
        if os.path.exists(self.best_params_path):
            with open(self.best_params_path, 'r') as f:
                return json.load(f)
        return None

    def train_and_predict(
        self,
        train: pd.DataFrame,
        test: pd.DataFrame,
        features: list,
        tune_hyperparameters: bool = False
    ) -> "Tuple[np.ndarray, np.ndarray, float, XGBRegressor]":
        """
        Train the model and make predictions, with optional hyperparameter tuning.

        Args:
            train: Training DataFrame
            test: Test DataFrame
            features: List of feature columns
            tune_hyperparameters: Whether to perform hyperparameter tuning

        Returns:
            Tuple containing:
            - pred_xgb: Test-set predictions averaged over the folds
            - oof_xgb: Out-of-fold predictions for training set
            - xgb_score: ``score`` of the out-of-fold predictions
            - best_model: The model fitted on the FIRST fold (it is not
              selected by score; the name is kept for compatibility)
        """
        kf = KFold(n_splits=self.n_folds, shuffle=True, random_state=self.random_state)
        oof_xgb = np.zeros(len(train))
        pred_xgb = np.zeros(len(test))
        best_model = None

        if tune_hyperparameters:
            # Perform hyperparameter tuning on first fold
            print("Starting hyperparameter tuning...")
            train_idx, valid_idx = next(kf.split(train))

            # Create proper DataFrame splits for tuning
            train_fold = train.iloc[train_idx].copy()
            valid_fold = train.iloc[valid_idx].copy()

            # Seed the sampler so a tuning run can be reproduced.
            study = optuna.create_study(
                direction='maximize',
                sampler=optuna.samplers.TPESampler(seed=self.random_state)
            )
            study.optimize(
                lambda trial: self.objective(trial, train_fold, valid_fold, features),
                n_trials=self.n_trials
            )

            self.best_params = study.best_params
            self.save_best_params(self.best_params)
            print(f"Best parameters: {self.best_params}")
        else:
            self.best_params = self.load_best_params()
            if self.best_params is None:
                print("No saved parameters found. Using default parameters.")
                self.best_params = {
                    'max_depth': 4,
                    'colsample_bytree': 0.19666394794366462,
                    'subsample': 0.958113786472804,
                    'n_estimators': 1702,
                    'learning_rate': 0.017521549387146534,
                    'min_child_weight': 60,
                    'eval_metric': 'rmse',
                    'objective': 'reg:squarederror',
                    'booster': 'gbtree'
                }

        # These do not change between folds, so build them once.
        x_test = test[features]
        model_params = {
            **self.best_params,
            'device': self.device,
            'enable_categorical': True
        }

        # Train the model with best parameters
        for i, (train_idx, valid_idx) in enumerate(kf.split(train)):
            print(f"Training fold {i+1}/{self.n_folds}")

            fold_train = train.iloc[train_idx]
            fold_valid = train.iloc[valid_idx]
            x_train, y_train = fold_train[features], fold_train[self.target]
            x_valid, y_valid = fold_valid[features], fold_valid[self.target]

            model = XGBRegressor(**model_params)
            model.fit(
                x_train, y_train,
                eval_set=[(x_valid, y_valid)],
                verbose=False
            )

            if i == 0:
                best_model = model

            oof_xgb[valid_idx] = model.predict(x_valid)
            pred_xgb += model.predict(x_test)

        pred_xgb /= self.n_folds

        # Calculate final score on the out-of-fold predictions
        y_true = train[["ID", "efs", "efs_time", "race_group"]].copy()
        y_pred = train[["ID"]].copy()
        y_pred["prediction"] = oof_xgb
        xgb_score = score(y_true.copy(), y_pred.copy(), "ID")
        print(f"\nOverall CV Score: {xgb_score}")

        return pred_xgb, oof_xgb, xgb_score, best_model

In [None]:
# Initialize the model
xgb_model = XGBoostModel(n_folds=10, n_trials=400)

# Train with the saved (or default) parameters; pass tune_hyperparameters=True to run the Optuna search first:
pred_xgb, oof_xgb, xgb_score, model_xgb = xgb_model.train_and_predict(
    train, test, FEATURES, tune_hyperparameters=False
)

# save the model
model_xgb.save_model(f'{output_path}/xgboost_km_model_{xgb_score}.json')

In [None]:
# Rank the features by the importance reported by the fitted XGBoost model
feature_importance = model_xgb.feature_importances_
importance_df = pd.DataFrame(
    {"Feature": FEATURES, "Importance": feature_importance}
).sort_values(by="Importance", ascending=False)

# Bundle the CV score with the ranked importances for the JSON report
importance_dict = dict(
    score=xgb_score,
    feature_importance=importance_df.to_dict('records'),
)

# Save to JSON
with open(f'{output_path}/xgboost_km_feature_importance_{xgb_score}.json', 'w') as f:
    json.dump(importance_dict, f, indent=4)

# Horizontal bar chart, most important feature at the top
importance_fig, importance_ax = plt.subplots(figsize=(22, 30))
importance_ax.barh(importance_df["Feature"], importance_df["Importance"])
importance_ax.set_xlabel("Importance")
importance_ax.set_ylabel("Feature")
importance_ax.set_title("XGBoost KaplanMeier Feature Importance")
importance_ax.invert_yaxis()  # Flip features for better readability
importance_fig.tight_layout()
importance_fig.savefig(f'{output_path}/xgboost_km_feature_importance_{xgb_score}.png')
plt.show()

# CatBoost with KaplanMeier

In [None]:
# CatBoost with KaplanMeier: record the library version used for this run
print(f"Using CatBoost version {cb.__version__}")

In [324]:
# Feature selection for the CatBoost section.
# The order of this list is significant: it is the column order handed to the model
# and it is zipped positionally with the importances in the feature-importance cell.
# NOTE(review): the LightGBM section further down repeats this exact list; consider
# defining it once so the copies cannot drift apart.
FEATURES = [
    'age_at_hct',
    'age_at_hct_squared',
    'age_at_hct_x_clinical_status_score',
    'age_at_hct_x_comorbidity_score',
    'age_at_hct_x_compatibility_score',
    'age_at_hct_x_conditioning_intensity_numeric',
    'age_at_hct_x_disease_control_factor',
    'age_at_hct_x_donor_age',
    'age_at_hct_x_dri_age',
    'age_at_hct_x_dri_comorbidity',
    'age_at_hct_x_dri_numeric',
    'age_at_hct_x_dri_quantile',
    'age_at_hct_x_karnofsky_score',
    'age_at_hct_x_treatment_intensity',
    'age_bin',
    'age_bin_dri',
    'age_bin_ethnicity',
    'age_bin_favorable_flag',
    'age_bin_high_risk',
    'age_bin_high_risk_flag',
    'age_bin_pulm_severe',
    'age_bin_race',
    'age_bin_race_high_risk',
    'age_bin_risk_score',
    'age_bin_to_donor_age_bin',
    'age_comorbidity_fn',
    'age_comorbidity_interaction',
    'age_compatibility_fn',
    'age_depletion_risk',
    'age_dri_age_fn',
    'age_dri_fn',
    'age_karnofsky_fn',
    'age_quantile',
    'age_risk_group',
    'age_treatment_intensity_fn',
    'arrhythmia',
    'cardiac',
    'clinical_status_score',
    'clinical_status_score_intensity',
    'clinical_status_score_squared',
    'clinical_status_score_x_conditioning_intensity_numeric',
    'clinical_status_score_x_dri_quantile',
    'cmv_donor',
    'cmv_recipient',
    'cmv_status',
    'cmv_status_conditioning_intensity',
    'cmv_status_gvhd_proph',
    'combined_risk_score',
    'comorbidity_age',
    'comorbidity_compatibility_fn',
    'comorbidity_score_by_age_at_hct',
    'comorbidity_dri_age_fn',
    'comorbidity_dri_comorbidity_fn',
    'comorbidity_dri_numeric_fn',
    'comorbidity_karnofsky_fn',
    'comorbidity_quantile',
    'comorbidity_score',
    'comorbidity_score_squared',
    'comorbidity_score_x_clinical_status_score',
    'comorbidity_score_x_compatibility_score',
    'comorbidity_score_x_conditioning_intensity_numeric',
    'comorbidity_score_x_disease_control_factor',
    'comorbidity_score_x_donor_age',
    'comorbidity_score_x_dri_age',
    'comorbidity_score_x_dri_comorbidity',
    'comorbidity_score_x_dri_numeric',
    'comorbidity_score_x_dri_quantile',
    'comorbidity_score_x_karnofsky_score',
    'comorbidity_score_x_treatment_intensity',
    'comorbidity_treatment_intensity_fn',
    'compatibility_score',
    'compatibility_dri_age_fn',
    'compatibility_dri_comorbidity_fn',
    'compatibility_score_squared',
    'compatibility_score_x_clinical_status_score',
    'compatibility_score_x_conditioning_intensity_numeric',
    'compatibility_score_x_disease_control_factor',
    'compatibility_score_x_dri_age',
    'compatibility_score_x_dri_comorbidity',
    'compatibility_score_x_dri_quantile',
    'compatibility_score_x_treatment_intensity',
    'compatibility_treatment_intensity_fn',
    'condition_severity',
    'conditioning_group',
    'conditioning_intensity',
    'conditioning_intensity_age_bin',
    'conditioning_intensity_cyto',
    'conditioning_intensity_donor_age_bin',
    'conditioning_intensity_gvhd_proph',
    'conditioning_intensity_karnofsky',
    'conditioning_intensity_mrd_hct',
    'conditioning_intensity_numeric',
    'conditioning_intensity_numeric_squared',
    'conditioning_intensity_prim_disease',
    'conditioning_intensity_race_group',
    'conditioning_intensity_rituximab',
    'conditioning_intensity_tbi',
    'conditioning_intensity_year',
    'cyto_age',
    'cyto_score',
    'cyto_score_detail',
    'depletion_age_interaction',
    'depletion_comorbidity_interaction',
    'depletion_risk_interaction',
    'diabetes',
    'disease_control_factor',
    'disease_control_factor_squared',
    'disease_control_factor_x_clinical_status_score',
    'disease_control_factor_x_conditioning_intensity_numeric',
    'disease_control_factor_x_dri_quantile',
    'disease_status_group',
    'disease_status_risk_score',
    'donor_by_age_at_hct',
    'donor_age',
    'donor_age_bin',
    'donor_age_bin_related',
    'donor_related',
    'donor_age_squared',
    'donor_age_x_clinical_status_score',
    'donor_age_x_compatibility_score',
    'donor_age_x_conditioning_intensity_numeric',
    'donor_age_x_disease_control_factor',
    'donor_age_x_dri_age',
    'donor_age_x_dri_comorbidity',
    'donor_age_x_dri_numeric',
    'donor_age_x_dri_quantile',
    'donor_age_x_treatment_intensity',
    'dqb1_low_interaction',
    'drb1_dqb1_diff',
    'drb1_dqb1_ratio',
    'drb1_high_interaction',
    'dri_age',
    'dri_age_squared',
    'dri_age_x_clinical_status_score',
    'dri_age_x_conditioning_intensity_numeric',
    'dri_age_x_disease_control_factor',
    'dri_age_x_dri_comorbidity',
    'dri_age_x_dri_quantile',
    'dri_comorbidity',
    'dri_comorbidity_fn',
    'dri_comorbidity_squared',
    'dri_comorbidity_x_clinical_status_score',
    'dri_comorbidity_x_conditioning_intensity_numeric',
    'dri_comorbidity_x_disease_control_factor',
    'dri_comorbidity_x_dri_quantile',
    'dri_disease_status',
    'dri_ethnicity',
    'dri_karnofsky',
    'dri_num_comorbidity_fn',
    'dri_num_compatibility_fn',
    'dri_numeric',
    'dri_numeric_squared',
    'dri_numeric_x_clinical_status_score',
    'dri_numeric_x_compatibility_score',
    'dri_numeric_x_conditioning_intensity_numeric',
    'dri_numeric_x_disease_control_factor',
    'dri_numeric_x_dri_age',
    'dri_numeric_x_dri_comorbidity',
    'dri_numeric_x_dri_quantile',
    'dri_numeric_x_treatment_intensity',
    'dri_quantile',
    'dri_quantile_squared',
    'dri_quantile_x_conditioning_intensity_numeric',
    'dri_score',
    'dri_score_grouped',
    'dri_score_is_2',
    'ethnicity',
    'ethnicity_vivo',
    'FK_MMF_interaction',
    'graft_cmv_status',
    'graft_dri',
    'graft_dri_score_is_2',
    'graft_prim_disease',
    'graft_prod',
    'graft_risk_interaction',
    'graft_type',
    'gvhd_proph',
    'has_CSA',
    'has_FK',
    'has_MMF',
    'has_MTX',
    'has_combination',
    'has_cyclophosphamide',
    'hepatic_mild',
    'hepatic_severe',
    'high_low_diff',
    'high_risk_combination',
    'hla_high_low_ratio',
    'hla_high_res_10',
    'hla_high_res_6',
    'hla_high_res_8',
    'hla_high_res_mean',
    'hla_low_res_10',
    'hla_low_res_6',
    'hla_low_res_8',
    'hla_low_res_mean',
    'hla_match_a_high',
    'hla_match_a_low',
    'hla_match_b_high',
    'hla_match_b_low',
    'hla_match_c_high',
    'hla_match_c_low',
    'hla_match_dqb1_high',
    'hla_match_dqb1_low',
    'hla_match_dqb1_mean',
    'hla_match_drb1_high',
    'hla_match_drb1_low',
    'hla_match_drb1_mean',
    'hla_match_total',
    'hla_match_weighted',
    'hla_max',
    'hla_mean',
    'hla_min',
    'hla_mismatch_score',
    'hla_nmdp_6',
    'hla_ratio_res_highlow',
    'hla_ratio_res_lowhigh',
    'hla_std',
    'immune_burden',
    'in_vivo_tcd',
    'is_complex',
    'is_depletion_based',
    'is_experimental',
    'is_high_risk',
    'is_monotherapy',
    'is_standard_approach',
    'is_standard_risk',
    'karnofsky_above_80',
    'karnofsky_age_at_hct',
    'karnofsky_below_70',
    'karnofsky_category',
    'karnofsky_compatibility_fn',
    'karnofsky_deviation',
    'karnofsky_dri_age_fn',
    'karnofsky_dri_comorbidity_fn',
    'karnofsky_dri_fn',
    'karnofsky_hla_interaction',
    'karnofsky_risk_adjusted',
    'karnofsky_score',
    'karnofsky_score_squared',
    'karnofsky_score_x_clinical_status_score',
    'karnofsky_score_x_compatibility_score',
    'karnofsky_score_x_conditioning_intensity_numeric',
    'karnofsky_score_x_disease_control_factor',
    'karnofsky_score_x_donor_age',
    'karnofsky_score_x_dri_age',
    'karnofsky_score_x_dri_comorbidity',
    'karnofsky_score_x_dri_numeric',
    'karnofsky_score_x_dri_quantile',
    'karnofsky_score_x_treatment_intensity',
    'karnofsky_standardized',
    'karnofsky_treatment_intensity_fn',
    'karnofsky_weighted_performance',
    'melphalan_dose',
    'mrd_hct',
    'multiple_conditions',
    'n_agents',
    'no_prophylaxis',
    'obesity',
    'optimal_conditioning',
    'optimal_disease_status',
    'peptic_ulcer',
    'pre_post_2015',
    'prim_disease_hct',
    'primary_agent',
    'prior_tumor',
    'prod_type',
    'psych_disturb',
    'pulm_moderate',
    'pulm_severe',
    'race_group',
    'renal_issue',
    'rheum_issue',
    'risk_depletion_category',
    'risk_score_dri_weighted',
    'risk_score_equal',
    'rituximab',
    'rituximab_given_gvhd',
    'sex_donor',
    'sex_match',
    'sex_recipient',
    'tbi_status',
    'tce_div_match',
    'tce_imm_match',
    'tce_match',
    'transplant_era',
    'treatment_intensity',
    'treatment_intensity_dri_age_fn',
    'treatment_intensity_dri_comorbidity_fn',
    'treatment_intensity_squared',
    'treatment_intensity_x_clinical_status_score',
    'treatment_intensity_x_conditioning_intensity_numeric',
    'treatment_intensity_x_disease_control_factor',
    'treatment_intensity_x_dri_age',
    'treatment_intensity_x_dri_comorbidity',
    'treatment_intensity_x_dri_quantile',
    'vent_hist',
    'vivo_age_bin',
    'vivo_comorbidity',
    'vivo_prim_disease',
    'weighted_risk_score',
    'with_tbi',
    'year_hct',
    'years_since_2008'
]

# Categorical features = the selected columns whose dtype in `train` is object or
# category; this list is what the CatBoost run below receives as `cat_features`.
CATS = train[FEATURES].select_dtypes(include=['object', 'category']).columns.tolist()

In [325]:
class CatBoostModel:
    """K-fold CatBoost regressor for the ``y_km`` target with optional Optuna tuning.

    The training frame must contain the feature columns plus ``y_km`` (regression
    target) and ``ID``, ``efs``, ``efs_time``, ``race_group`` (consumed by the
    ``score`` function imported from ``metric``). Higher scores are treated as better.

    Args:
        n_folds: Number of ``KFold`` splits.
        random_state: Seed for the ``KFold`` shuffle and for the Optuna sampler.
        n_trials: Number of Optuna trials when tuning is requested.
        best_params_path: JSON file used to persist / reload tuned parameters.
    """

    def __init__(
        self,
        n_folds: int = 10,
        random_state: int = 42,
        n_trials: int = 50,
        best_params_path: str = 'best_catboost_params.json'
    ):
        self.n_folds = n_folds
        self.random_state = random_state
        self.n_trials = n_trials
        self.best_params_path = best_params_path
        # Filled in by train_and_predict (tuned, loaded from disk, or defaults)
        self.best_params = None

    def objective(self, trial: "optuna.Trial", train_data: pd.DataFrame, valid_data: pd.DataFrame,
                 features: list, cat_features: List[str]) -> float:
        """Optuna objective: fit one CatBoost model and score it on the validation fold.

        Args:
            trial: Optuna trial used to sample the hyperparameters.
            train_data: Training fold (features plus ``y_km``).
            valid_data: Validation fold (features, ``y_km`` and the metric columns).
            features: Feature columns to train on.
            cat_features: Names of the categorical feature columns.

        Returns:
            The value of ``score`` on the validation fold (maximised by the study).
        """
        param = {
            'task_type': 'GPU',
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
            'grow_policy': trial.suggest_categorical('grow_policy', ['Lossguide', 'Depthwise']),
            'depth': trial.suggest_int('depth', 5, 9),
            'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1, 30),
            'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1.0, 10.0),
            'random_strength': trial.suggest_float('random_strength', 0.1, 10.0),
            'n_estimators': trial.suggest_int('n_estimators', 100, 2000),
            # Single-choice categorical so the value is recorded in study.best_params
            'bootstrap_type': trial.suggest_categorical('bootstrap_type', ['Bernoulli']),
            'early_stopping_rounds': 50,
            'verbose': 0
        }

        # Bootstrap-specific parameters: Bayesian bootstrap is controlled by
        # `bagging_temperature` and does not accept `subsample`, so only one of
        # the two is ever sampled (today the only choice is Bernoulli).
        if param['bootstrap_type'] == 'Bayesian':
            param['bagging_temperature'] = trial.suggest_float('bagging_temperature', 0.0, 10.0)
        else:
            param['subsample'] = trial.suggest_float('subsample', 0.6, 0.99)

        model = CatBoostRegressor(**param)
        model.fit(
            train_data[features], train_data['y_km'],
            eval_set=(valid_data[features], valid_data['y_km']),
            cat_features=cat_features,
            verbose=0
        )

        predictions = model.predict(valid_data[features])

        # Build the solution / submission frames in the layout `score` expects
        y_true = valid_data[["ID", "efs", "efs_time", "race_group"]].copy()
        y_pred = valid_data[["ID"]].copy()
        y_pred["prediction"] = predictions

        fold_score = score(y_true, y_pred, "ID")
        return fold_score

    def save_best_params(self, params: Dict) -> None:
        """Write ``params`` as JSON to ``best_params_path``.

        The parent directory is created when missing so that a long tuning run is
        not lost to a ``FileNotFoundError`` at the very end.
        """
        parent_dir = os.path.dirname(self.best_params_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        with open(self.best_params_path, 'w') as f:
            json.dump(params, f)

    def load_best_params(self) -> Optional[Dict]:
        """Return the parameters stored at ``best_params_path``, or ``None`` if absent."""
        if os.path.exists(self.best_params_path):
            with open(self.best_params_path, 'r') as f:
                return json.load(f)
        return None

    def train_and_predict(
        self,
        train: pd.DataFrame,
        test: pd.DataFrame,
        features: list,
        cat_features: List[str],
        tune_hyperparameters: bool = False
    ) -> Tuple[np.ndarray, np.ndarray, float, "CatBoostRegressor"]:
        """
        Train one model per fold and predict, with optional hyperparameter tuning.

        When ``tune_hyperparameters`` is true, an Optuna study is run on the first
        fold only and its best parameters are saved to ``best_params_path``.
        Otherwise parameters are loaded from that file, falling back to the
        hard-coded defaults below when the file does not exist.

        Args:
            train: Training DataFrame
            test: Test DataFrame
            features: List of feature columns
            cat_features: List of categorical feature columns
            tune_hyperparameters: Whether to perform hyperparameter tuning

        Returns:
            Tuple containing:
            - pred_cat: Test-set predictions averaged over the folds
            - oof_cat: Out-of-fold predictions for training set
            - cat_score: ``score`` computed on the out-of-fold predictions
            - best_model: The model fitted on the first fold (it is not selected
              by score; the name is kept for compatibility with callers)
        """
        kf = KFold(n_splits=self.n_folds, shuffle=True, random_state=self.random_state)
        oof_cat = np.zeros(len(train))
        pred_cat = np.zeros(len(test))
        best_model = None

        if tune_hyperparameters:
            # Perform hyperparameter tuning on first fold
            print("Starting hyperparameter tuning...")
            train_idx, valid_idx = next(kf.split(train))

            # Create proper DataFrame splits for tuning
            train_fold = train.iloc[train_idx].copy()
            valid_fold = train.iloc[valid_idx].copy()

            # Seed the sampler so the search honours `random_state`
            sampler = optuna.samplers.TPESampler(seed=self.random_state)
            study = optuna.create_study(direction='maximize', sampler=sampler)
            study.optimize(
                lambda trial: self.objective(
                    trial, train_fold, valid_fold, features, cat_features
                ),
                n_trials=self.n_trials
            )

            self.best_params = dict(study.best_params)
            # Add fixed parameters
            self.best_params.update({
                'task_type': 'GPU',
                'early_stopping_rounds': 50
            })
            self.save_best_params(self.best_params)
            print(f"Best parameters: {self.best_params}")
        else:
            self.best_params = self.load_best_params()
            if self.best_params is None:
                print("No saved parameters found. Using default parameters.")
                self.best_params = {
                    'task_type': 'GPU',
                    'learning_rate': 0.0271856268721739,
                    'grow_policy': 'Depthwise',
                    'depth': 8,
                    'min_data_in_leaf': 28,
                    'l2_leaf_reg': 7.116371413544365,
                    'random_strength': 8.837483768947592,
                    'n_estimators': 1019,
                    'bootstrap_type': 'Bernoulli',
                    'subsample': 0.8403901314200961,
                    'early_stopping_rounds': 50
                }

        # The test matrix is the same for every fold, so build it once
        x_test = test[features]

        # Train the model with best parameters
        for i, (train_idx, valid_idx) in enumerate(kf.split(train)):
            print(f"Training fold {i+1}/{self.n_folds}")

            x_train = train.iloc[train_idx][features]
            y_train = train.iloc[train_idx]['y_km']
            x_valid = train.iloc[valid_idx][features]
            y_valid = train.iloc[valid_idx]['y_km']

            model = CatBoostRegressor(**self.best_params)
            model.fit(
                x_train, y_train,
                eval_set=(x_valid, y_valid),
                cat_features=cat_features,
                verbose=250
            )

            # Keep the first fold's model as the one handed back to the caller
            if i == 0:
                best_model = model

            oof_cat[valid_idx] = model.predict(x_valid)
            pred_cat += model.predict(x_test)

        pred_cat /= self.n_folds

        # Calculate final score on the out-of-fold predictions; copies are passed
        # so `score` cannot modify the frames built here
        y_true = train[["ID", "efs", "efs_time", "race_group"]].copy()
        y_pred = train[["ID"]].copy()
        y_pred["prediction"] = oof_cat
        cat_score = score(y_true.copy(), y_pred.copy(), "ID")
        print(f"\nOverall CV Score: {cat_score}")

        return pred_cat, oof_cat, cat_score, best_model

In [None]:
# Build the CatBoost cross-validation wrapper: 10 folds, 400 Optuna trials configured
cat_model = CatBoostModel(n_trials=400, n_folds=10)

# tune_hyperparameters=False: no Optuna search is run; parameters are loaded from the
# wrapper's best_params_path when that file exists, otherwise its built-in defaults apply.
cat_outputs = cat_model.train_and_predict(train, test, FEATURES, CATS, tune_hyperparameters=False)
pred_cat, oof_cat, cat_score, model_cat = cat_outputs

# Persist the returned (fold-0) model; the CV score is embedded in the file name.
# NOTE(review): format="cbm" is CatBoost's default, spelled out here to make it
# visible that the file is a CatBoost binary model despite the ".json" suffix.
# Confirm whether the suffix or the format is the intended one.
cat_model_path = f'{output_path}/catboost_km_model_{cat_score}.json'
model_cat.save_model(cat_model_path, format="cbm")

In [None]:
# Importances come from model_cat, i.e. the fold-0 model returned by train_and_predict.
# They are paired positionally with FEATURES, so FEATURES must be the list used for training.
feature_importance = model_cat.get_feature_importance()
importance_df = pd.DataFrame(
    {"Feature": FEATURES, "Importance": feature_importance}
).sort_values(by="Importance", ascending=False)

# JSON payload: the CV score plus one {"Feature", "Importance"} record per feature
importance_dict = {
    "score": cat_score,
    "feature_importance": importance_df.to_dict('records'),
}

# Write the payload next to the other analysis artifacts
with open(f'{output_path}/catboost_km_feature_importance_{cat_score}.json', 'w') as f:
    json.dump(importance_dict, f, indent=4)

# Horizontal bar chart, most important feature at the top
fig, ax = plt.subplots(figsize=(22, 30))
ax.barh(importance_df["Feature"], importance_df["Importance"])
ax.set_xlabel("Importance")
ax.set_ylabel("Feature")
ax.set_title("CatBoost KaplanMeier Feature Importance")
ax.invert_yaxis()  # rows are sorted descending, so flip the axis to read top-down
fig.tight_layout()
fig.savefig(f'{output_path}/catboost_km_feature_importance_{cat_score}.png')
plt.show()

# LightGBM with KaplanMeier

In [None]:
# LightGBM with KaplanMeier: record the library version used for this run
print(f"Using LightGBM version {lgb.__version__}")

In [329]:
# Feature selection for the LightGBM section.
# The order of this list is significant because it fixes the column order of the
# matrices built from `train[FEATURES]`.
# NOTE(review): this list is an entry-by-entry copy of the one defined for the
# CatBoost section above; consider defining it once so the copies cannot drift apart.
FEATURES = [
    'age_at_hct',
    'age_at_hct_squared',
    'age_at_hct_x_clinical_status_score',
    'age_at_hct_x_comorbidity_score',
    'age_at_hct_x_compatibility_score',
    'age_at_hct_x_conditioning_intensity_numeric',
    'age_at_hct_x_disease_control_factor',
    'age_at_hct_x_donor_age',
    'age_at_hct_x_dri_age',
    'age_at_hct_x_dri_comorbidity',
    'age_at_hct_x_dri_numeric',
    'age_at_hct_x_dri_quantile',
    'age_at_hct_x_karnofsky_score',
    'age_at_hct_x_treatment_intensity',
    'age_bin',
    'age_bin_dri',
    'age_bin_ethnicity',
    'age_bin_favorable_flag',
    'age_bin_high_risk',
    'age_bin_high_risk_flag',
    'age_bin_pulm_severe',
    'age_bin_race',
    'age_bin_race_high_risk',
    'age_bin_risk_score',
    'age_bin_to_donor_age_bin',
    'age_comorbidity_fn',
    'age_comorbidity_interaction',
    'age_compatibility_fn',
    'age_depletion_risk',
    'age_dri_age_fn',
    'age_dri_fn',
    'age_karnofsky_fn',
    'age_quantile',
    'age_risk_group',
    'age_treatment_intensity_fn',
    'arrhythmia',
    'cardiac',
    'clinical_status_score',
    'clinical_status_score_intensity',
    'clinical_status_score_squared',
    'clinical_status_score_x_conditioning_intensity_numeric',
    'clinical_status_score_x_dri_quantile',
    'cmv_donor',
    'cmv_recipient',
    'cmv_status',
    'cmv_status_conditioning_intensity',
    'cmv_status_gvhd_proph',
    'combined_risk_score',
    'comorbidity_age',
    'comorbidity_compatibility_fn',
    'comorbidity_score_by_age_at_hct',
    'comorbidity_dri_age_fn',
    'comorbidity_dri_comorbidity_fn',
    'comorbidity_dri_numeric_fn',
    'comorbidity_karnofsky_fn',
    'comorbidity_quantile',
    'comorbidity_score',
    'comorbidity_score_squared',
    'comorbidity_score_x_clinical_status_score',
    'comorbidity_score_x_compatibility_score',
    'comorbidity_score_x_conditioning_intensity_numeric',
    'comorbidity_score_x_disease_control_factor',
    'comorbidity_score_x_donor_age',
    'comorbidity_score_x_dri_age',
    'comorbidity_score_x_dri_comorbidity',
    'comorbidity_score_x_dri_numeric',
    'comorbidity_score_x_dri_quantile',
    'comorbidity_score_x_karnofsky_score',
    'comorbidity_score_x_treatment_intensity',
    'comorbidity_treatment_intensity_fn',
    'compatibility_score',
    'compatibility_dri_age_fn',
    'compatibility_dri_comorbidity_fn',
    'compatibility_score_squared',
    'compatibility_score_x_clinical_status_score',
    'compatibility_score_x_conditioning_intensity_numeric',
    'compatibility_score_x_disease_control_factor',
    'compatibility_score_x_dri_age',
    'compatibility_score_x_dri_comorbidity',
    'compatibility_score_x_dri_quantile',
    'compatibility_score_x_treatment_intensity',
    'compatibility_treatment_intensity_fn',
    'condition_severity',
    'conditioning_group',
    'conditioning_intensity',
    'conditioning_intensity_age_bin',
    'conditioning_intensity_cyto',
    'conditioning_intensity_donor_age_bin',
    'conditioning_intensity_gvhd_proph',
    'conditioning_intensity_karnofsky',
    'conditioning_intensity_mrd_hct',
    'conditioning_intensity_numeric',
    'conditioning_intensity_numeric_squared',
    'conditioning_intensity_prim_disease',
    'conditioning_intensity_race_group',
    'conditioning_intensity_rituximab',
    'conditioning_intensity_tbi',
    'conditioning_intensity_year',
    'cyto_age',
    'cyto_score',
    'cyto_score_detail',
    'depletion_age_interaction',
    'depletion_comorbidity_interaction',
    'depletion_risk_interaction',
    'diabetes',
    'disease_control_factor',
    'disease_control_factor_squared',
    'disease_control_factor_x_clinical_status_score',
    'disease_control_factor_x_conditioning_intensity_numeric',
    'disease_control_factor_x_dri_quantile',
    'disease_status_group',
    'disease_status_risk_score',
    'donor_by_age_at_hct',
    'donor_age',
    'donor_age_bin',
    'donor_age_bin_related',
    'donor_related',
    'donor_age_squared',
    'donor_age_x_clinical_status_score',
    'donor_age_x_compatibility_score',
    'donor_age_x_conditioning_intensity_numeric',
    'donor_age_x_disease_control_factor',
    'donor_age_x_dri_age',
    'donor_age_x_dri_comorbidity',
    'donor_age_x_dri_numeric',
    'donor_age_x_dri_quantile',
    'donor_age_x_treatment_intensity',
    'dqb1_low_interaction',
    'drb1_dqb1_diff',
    'drb1_dqb1_ratio',
    'drb1_high_interaction',
    'dri_age',
    'dri_age_squared',
    'dri_age_x_clinical_status_score',
    'dri_age_x_conditioning_intensity_numeric',
    'dri_age_x_disease_control_factor',
    'dri_age_x_dri_comorbidity',
    'dri_age_x_dri_quantile',
    'dri_comorbidity',
    'dri_comorbidity_fn',
    'dri_comorbidity_squared',
    'dri_comorbidity_x_clinical_status_score',
    'dri_comorbidity_x_conditioning_intensity_numeric',
    'dri_comorbidity_x_disease_control_factor',
    'dri_comorbidity_x_dri_quantile',
    'dri_disease_status',
    'dri_ethnicity',
    'dri_karnofsky',
    'dri_num_comorbidity_fn',
    'dri_num_compatibility_fn',
    'dri_numeric',
    'dri_numeric_squared',
    'dri_numeric_x_clinical_status_score',
    'dri_numeric_x_compatibility_score',
    'dri_numeric_x_conditioning_intensity_numeric',
    'dri_numeric_x_disease_control_factor',
    'dri_numeric_x_dri_age',
    'dri_numeric_x_dri_comorbidity',
    'dri_numeric_x_dri_quantile',
    'dri_numeric_x_treatment_intensity',
    'dri_quantile',
    'dri_quantile_squared',
    'dri_quantile_x_conditioning_intensity_numeric',
    'dri_score',
    'dri_score_grouped',
    'dri_score_is_2',
    'ethnicity',
    'ethnicity_vivo',
    'FK_MMF_interaction',
    'graft_cmv_status',
    'graft_dri',
    'graft_dri_score_is_2',
    'graft_prim_disease',
    'graft_prod',
    'graft_risk_interaction',
    'graft_type',
    'gvhd_proph',
    'has_CSA',
    'has_FK',
    'has_MMF',
    'has_MTX',
    'has_combination',
    'has_cyclophosphamide',
    'hepatic_mild',
    'hepatic_severe',
    'high_low_diff',
    'high_risk_combination',
    'hla_high_low_ratio',
    'hla_high_res_10',
    'hla_high_res_6',
    'hla_high_res_8',
    'hla_high_res_mean',
    'hla_low_res_10',
    'hla_low_res_6',
    'hla_low_res_8',
    'hla_low_res_mean',
    'hla_match_a_high',
    'hla_match_a_low',
    'hla_match_b_high',
    'hla_match_b_low',
    'hla_match_c_high',
    'hla_match_c_low',
    'hla_match_dqb1_high',
    'hla_match_dqb1_low',
    'hla_match_dqb1_mean',
    'hla_match_drb1_high',
    'hla_match_drb1_low',
    'hla_match_drb1_mean',
    'hla_match_total',
    'hla_match_weighted',
    'hla_max',
    'hla_mean',
    'hla_min',
    'hla_mismatch_score',
    'hla_nmdp_6',
    'hla_ratio_res_highlow',
    'hla_ratio_res_lowhigh',
    'hla_std',
    'immune_burden',
    'in_vivo_tcd',
    'is_complex',
    'is_depletion_based',
    'is_experimental',
    'is_high_risk',
    'is_monotherapy',
    'is_standard_approach',
    'is_standard_risk',
    'karnofsky_above_80',
    'karnofsky_age_at_hct',
    'karnofsky_below_70',
    'karnofsky_category',
    'karnofsky_compatibility_fn',
    'karnofsky_deviation',
    'karnofsky_dri_age_fn',
    'karnofsky_dri_comorbidity_fn',
    'karnofsky_dri_fn',
    'karnofsky_hla_interaction',
    'karnofsky_risk_adjusted',
    'karnofsky_score',
    'karnofsky_score_squared',
    'karnofsky_score_x_clinical_status_score',
    'karnofsky_score_x_compatibility_score',
    'karnofsky_score_x_conditioning_intensity_numeric',
    'karnofsky_score_x_disease_control_factor',
    'karnofsky_score_x_donor_age',
    'karnofsky_score_x_dri_age',
    'karnofsky_score_x_dri_comorbidity',
    'karnofsky_score_x_dri_numeric',
    'karnofsky_score_x_dri_quantile',
    'karnofsky_score_x_treatment_intensity',
    'karnofsky_standardized',
    'karnofsky_treatment_intensity_fn',
    'karnofsky_weighted_performance',
    'melphalan_dose',
    'mrd_hct',
    'multiple_conditions',
    'n_agents',
    'no_prophylaxis',
    'obesity',
    'optimal_conditioning',
    'optimal_disease_status',
    'peptic_ulcer',
    'pre_post_2015',
    'prim_disease_hct',
    'primary_agent',
    'prior_tumor',
    'prod_type',
    'psych_disturb',
    'pulm_moderate',
    'pulm_severe',
    'race_group',
    'renal_issue',
    'rheum_issue',
    'risk_depletion_category',
    'risk_score_dri_weighted',
    'risk_score_equal',
    'rituximab',
    'rituximab_given_gvhd',
    'sex_donor',
    'sex_match',
    'sex_recipient',
    'tbi_status',
    'tce_div_match',
    'tce_imm_match',
    'tce_match',
    'transplant_era',
    'treatment_intensity',
    'treatment_intensity_dri_age_fn',
    'treatment_intensity_dri_comorbidity_fn',
    'treatment_intensity_squared',
    'treatment_intensity_x_clinical_status_score',
    'treatment_intensity_x_conditioning_intensity_numeric',
    'treatment_intensity_x_disease_control_factor',
    'treatment_intensity_x_dri_age',
    'treatment_intensity_x_dri_comorbidity',
    'treatment_intensity_x_dri_quantile',
    'vent_hist',
    'vivo_age_bin',
    'vivo_comorbidity',
    'vivo_prim_disease',
    'weighted_risk_score',
    'with_tbi',
    'year_hct',
    'years_since_2008'
]

# Categorical features = the selected columns whose dtype in `train` is object or category
CATS = train[FEATURES].select_dtypes(include=['object', 'category']).columns.tolist()

In [330]:
class LightGBMModel:
    """K-fold LightGBM regressor for the ``y_km`` target with optional Optuna tuning.

    Hyperparameters are resolved in this order: a fresh Optuna search (when
    requested), the JSON file at ``best_params_path``, then a hard-coded fallback.
    """

    # Settings that are never tuned. Shared by the Optuna objective and by the
    # parameters persisted after tuning, so the two cannot drift apart.
    # ``subsample_freq`` must be non-zero: LightGBM ignores ``subsample``
    # (bagging_fraction) while the bagging frequency is 0, which would turn the
    # tuned ``subsample`` value into a no-op.
    _FIXED_PARAMS = {
        'device': 'gpu',
        'gpu_use_dp': True,
        'subsample_freq': 1,
        'objective': 'regression',
        'verbose': -1
    }

    # Fallback used when no tuned parameters have been saved yet.
    # NOTE(review): there is no ``subsample_freq`` here, so ``subsample`` is
    # inactive for this configuration; left untouched so the fallback behaves
    # exactly as it did before.
    _DEFAULT_PARAMS = {
        'device': 'gpu',
        'num_leaves': 60,
        'max_depth': 3,
        'colsample_bytree': 0.2,
        'subsample': 0.708491288060911,
        'n_estimators': 1518,
        'learning_rate': 0.011957265339774753,
        'min_child_samples': 15,
        'reg_alpha': 1.2603421132422994,
        'reg_lambda': 8.423727640917658,
        'objective': 'regression',
        'gpu_use_dp': True,
        'verbose': -1
    }

    def __init__(
        self,
        n_folds: int = 10,
        random_state: int = 42,
        n_trials: int = 50,
        best_params_path: str = 'best_lgb_params.json'
    ):
        """
        Args:
            n_folds: Number of KFold splits used for cross-validation.
            random_state: Seed for the KFold shuffle and for the Optuna sampler.
            n_trials: Number of Optuna trials; only used when tuning is requested.
            best_params_path: JSON file the tuned parameters are written to / read from.
        """
        self.n_folds = n_folds
        self.random_state = random_state
        self.n_trials = n_trials
        self.best_params_path = best_params_path
        self.best_params = None

    def objective(self, trial: optuna.Trial, train_data: pd.DataFrame, valid_data: pd.DataFrame,
                 features: list) -> float:
        """Optuna objective: fit on ``train_data`` and return the metric on ``valid_data``.

        Args:
            trial: Optuna trial used to sample the hyperparameters.
            train_data: Rows used for fitting; must contain ``features`` and ``y_km``.
            valid_data: Held-out rows; must contain ``features``, ``y_km``, ``ID``,
                ``efs``, ``efs_time`` and ``race_group``.
            features: Feature column names.

        Returns:
            The value of ``score`` on the validation rows (maximised by the study).
        """
        param = {
            **self._FIXED_PARAMS,
            'num_leaves': trial.suggest_int('num_leaves', 15, 63),
            'max_depth': trial.suggest_int('max_depth', 2, 6),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.2, 0.5),
            'subsample': trial.suggest_float('subsample', 0.6, 0.99),
            'n_estimators': trial.suggest_int('n_estimators', 600, 2000),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
            'min_child_samples': trial.suggest_int('min_child_samples', 10, 40),
            'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 10.0),
            'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 10.0)
        }

        model = LGBMRegressor(**param)
        model.fit(
            train_data[features], train_data['y_km'],
            eval_set=[(valid_data[features], valid_data['y_km'])]
        )

        predictions = model.predict(valid_data[features])

        # Create prediction DataFrame in the format expected by the metric
        y_true = valid_data[["ID", "efs", "efs_time", "race_group"]].copy()
        y_pred = valid_data[["ID"]].copy()
        y_pred["prediction"] = predictions

        fold_score = score(y_true, y_pred, "ID")
        return fold_score

    def save_best_params(self, params: Dict) -> None:
        """Save the best parameters to the JSON file at ``best_params_path``."""
        with open(self.best_params_path, 'w') as f:
            json.dump(params, f)

    def load_best_params(self) -> Optional[Dict]:
        """Load the best parameters from ``best_params_path``; ``None`` if the file is absent."""
        if os.path.exists(self.best_params_path):
            with open(self.best_params_path, 'r') as f:
                return json.load(f)
        return None

    def _tune(self, train: pd.DataFrame, kf: KFold, features: list) -> Dict:
        """Run the Optuna search on the first CV fold and return the full parameter set."""
        print("Starting hyperparameter tuning...")
        # The seeded, shuffled KFold yields the same first split here as in the
        # training loop, so tuning happens on fold 1 of the later CV run.
        train_idx, valid_idx = next(kf.split(train))

        train_fold = train.iloc[train_idx].copy()
        valid_fold = train.iloc[valid_idx].copy()

        # Seed the sampler so repeated tuning runs propose the same trials.
        study = optuna.create_study(
            direction='maximize',
            sampler=optuna.samplers.TPESampler(seed=self.random_state)
        )
        study.optimize(
            lambda trial: self.objective(trial, train_fold, valid_fold, features),
            n_trials=self.n_trials
        )

        # Tuned values plus the fixed settings the objective trained with.
        return {**study.best_params, **self._FIXED_PARAMS}

    def train_and_predict(
        self,
        train: pd.DataFrame,
        test: pd.DataFrame,
        features: list,
        tune_hyperparameters: bool = False
    ) -> Tuple[np.ndarray, np.ndarray, float, LGBMRegressor]:
        """
        Train one model per fold and make predictions, with optional hyperparameter tuning.

        Args:
            train: Training DataFrame; needs ``features``, ``y_km``, ``ID``, ``efs``,
                ``efs_time`` and ``race_group``.
            test: Test DataFrame; needs ``features``.
            features: List of feature columns
            tune_hyperparameters: Whether to run the Optuna search (and overwrite the
                saved parameter file) before training.

        Returns:
            Tuple containing:
            - pred_lgb: Test-set predictions averaged over the fold models
            - oof_lgb: Out-of-fold predictions for training set
            - lgb_score: ``score`` of the out-of-fold predictions
            - best_model: The model fitted on the FIRST fold (it is not selected
              by score, despite the name)
        """
        kf = KFold(n_splits=self.n_folds, shuffle=True, random_state=self.random_state)
        oof_lgb = np.zeros(len(train))
        pred_lgb = np.zeros(len(test))
        best_model = None

        if tune_hyperparameters:
            self.best_params = self._tune(train, kf, features)
            self.save_best_params(self.best_params)
            print(f"Best parameters: {self.best_params}")
        else:
            self.best_params = self.load_best_params()
            if self.best_params is None:
                print("No saved parameters found. Using default parameters.")
                self.best_params = dict(self._DEFAULT_PARAMS)

        # The test matrix is the same for every fold, so slice it once.
        x_test = test[features]

        # Train the model with best parameters
        for i, (train_idx, valid_idx) in enumerate(kf.split(train)):
            print(f"Training fold {i+1}/{self.n_folds}")

            x_train = train.iloc[train_idx][features]
            y_train = train.iloc[train_idx]['y_km']
            x_valid = train.iloc[valid_idx][features]
            y_valid = train.iloc[valid_idx]['y_km']

            model = LGBMRegressor(**self.best_params)
            model.fit(
                x_train, y_train,
                eval_set=[(x_valid, y_valid)]
            )

            if i == 0:
                best_model = model

            oof_lgb[valid_idx] = model.predict(x_valid)
            pred_lgb += model.predict(x_test)

        # Turn the running sum into the mean over folds
        pred_lgb /= self.n_folds

        # Calculate final score on the out-of-fold predictions
        y_true = train[["ID", "efs", "efs_time", "race_group"]].copy()
        y_pred = train[["ID"]].copy()
        y_pred["prediction"] = oof_lgb
        lgb_score = score(y_true.copy(), y_pred.copy(), "ID")
        print(f"\nOverall CV Score: {lgb_score}")

        return pred_lgb, oof_lgb, lgb_score, best_model

In [None]:
# Build the 10-fold LightGBM wrapper (n_trials is only consulted when tuning is enabled)
lgb_model = LightGBMModel(n_folds=10, n_trials=400)

# Train with the saved/fallback parameters; pass tune_hyperparameters=True to run the Optuna search
pred_lgb, oof_lgb, lgb_score, model_lgb = lgb_model.train_and_predict(
    train=train,
    test=test,
    features=FEATURES,
    tune_hyperparameters=False,
)

In [None]:
# Importances of the model returned by train_and_predict (the one fitted on the
# first CV fold), ranked from most to least important.
feature_importance = model_lgb.feature_importances_
importance_df = pd.DataFrame({
    "Feature": FEATURES,
    "Importance": feature_importance
}).sort_values(by="Importance", ascending=False)

# Bundle the CV score with the ranked importances for the JSON dump
importance_dict = {
    "score": lgb_score,
    "feature_importance": importance_df.to_dict('records')
}

# Save to JSON
with open(f'{output_path}/lightgbm_km_feature_importance_{lgb_score}.json', 'w') as f:
    json.dump(importance_dict, f, indent=4)

fig, ax = plt.subplots(figsize=(22, 30))
ax.barh(importance_df["Feature"], importance_df["Importance"], color='skyblue')
# LGBMRegressor defaults to importance_type='split', so feature_importances_ holds
# split counts; the previous "Gain" label misdescribed the plotted values.
ax.set_xlabel("Importance (split count)")
ax.set_ylabel("Feature")
ax.set_title("LightGBM KaplanMeier Feature Importance")
ax.invert_yaxis()  # Most important feature at the top
fig.tight_layout()
fig.savefig(f'{output_path}/lightgbm_km_feature_importance_{lgb_score}.png')
plt.show()

# XGBoost with Survival:Cox

In [333]:
# XGBoost with Survival:Cox
# The Cox objective takes ONE signed target that encodes both efs and efs_time:
# rows with efs == 0 carry the negated time, all other rows keep efs_time as is.
train["efs_time2"] = np.where(train["efs"] == 0, -train["efs_time"], train["efs_time"])

In [334]:
# feature selection
# Columns handed to the XGBoost survival:cox model as `features`.
# Keep the order stable: the importance cell pairs this list positionally with
# the fitted model's feature_importances_.
FEATURES = [
    'age_at_hct',
    'age_at_hct_squared',
    'age_at_hct_x_clinical_status_score',
    'age_at_hct_x_comorbidity_score',
    'age_at_hct_x_compatibility_score',
    'age_at_hct_x_conditioning_intensity_numeric',
    'age_at_hct_x_disease_control_factor',
    'age_at_hct_x_donor_age',
    'age_at_hct_x_dri_age',
    'age_at_hct_x_dri_comorbidity',
    'age_at_hct_x_dri_numeric',
    'age_at_hct_x_dri_quantile',
    'age_at_hct_x_karnofsky_score',
    'age_at_hct_x_treatment_intensity',
    'age_bin',
    'age_bin_dri',
    'age_bin_ethnicity',
    'age_bin_favorable_flag',
    'age_bin_high_risk',
    'age_bin_high_risk_flag',
    'age_bin_pulm_severe',
    'age_bin_race',
    'age_bin_race_high_risk',
    'age_bin_risk_score',
    'age_bin_to_donor_age_bin',
    'age_comorbidity_fn',
    'age_comorbidity_interaction',
    'age_compatibility_fn',
    'age_depletion_risk',
    'age_dri_age_fn',
    'age_dri_fn',
    'age_karnofsky_fn',
    'age_quantile',
    'age_risk_group',
    'age_treatment_intensity_fn',
    'arrhythmia',
    'cardiac',
    'clinical_status_score',
    'clinical_status_score_intensity',
    'clinical_status_score_squared',
    'clinical_status_score_x_conditioning_intensity_numeric',
    'clinical_status_score_x_dri_quantile',
    'cmv_donor',
    'cmv_recipient',
    'cmv_status',
    'cmv_status_conditioning_intensity',
    'cmv_status_gvhd_proph',
    'combined_risk_score',
    'comorbidity_age',
    'comorbidity_compatibility_fn',
    'comorbidity_score_by_age_at_hct',
    'comorbidity_dri_age_fn',
    'comorbidity_dri_comorbidity_fn',
    'comorbidity_dri_numeric_fn',
    'comorbidity_karnofsky_fn',
    'comorbidity_quantile',
    'comorbidity_score',
    'comorbidity_score_squared',
    'comorbidity_score_x_clinical_status_score',
    'comorbidity_score_x_compatibility_score',
    'comorbidity_score_x_conditioning_intensity_numeric',
    'comorbidity_score_x_disease_control_factor',
    'comorbidity_score_x_donor_age',
    'comorbidity_score_x_dri_age',
    'comorbidity_score_x_dri_comorbidity',
    'comorbidity_score_x_dri_numeric',
    'comorbidity_score_x_dri_quantile',
    'comorbidity_score_x_karnofsky_score',
    'comorbidity_score_x_treatment_intensity',
    'comorbidity_treatment_intensity_fn',
    'compatibility_score',
    'compatibility_dri_age_fn',
    'compatibility_dri_comorbidity_fn',
    'compatibility_score_squared',
    'compatibility_score_x_clinical_status_score',
    'compatibility_score_x_conditioning_intensity_numeric',
    'compatibility_score_x_disease_control_factor',
    'compatibility_score_x_dri_age',
    'compatibility_score_x_dri_comorbidity',
    'compatibility_score_x_dri_quantile',
    'compatibility_score_x_treatment_intensity',
    'compatibility_treatment_intensity_fn',
    'condition_severity',
    'conditioning_group',
    'conditioning_intensity',
    'conditioning_intensity_age_bin',
    'conditioning_intensity_cyto',
    'conditioning_intensity_donor_age_bin',
    'conditioning_intensity_gvhd_proph',
    'conditioning_intensity_karnofsky',
    'conditioning_intensity_mrd_hct',
    'conditioning_intensity_numeric',
    'conditioning_intensity_numeric_squared',
    'conditioning_intensity_prim_disease',
    'conditioning_intensity_race_group',
    'conditioning_intensity_rituximab',
    'conditioning_intensity_tbi',
    'conditioning_intensity_year',
    'cyto_age',
    'cyto_score',
    'cyto_score_detail',
    'depletion_age_interaction',
    'depletion_comorbidity_interaction',
    'depletion_risk_interaction',
    'diabetes',
    'disease_control_factor',
    'disease_control_factor_squared',
    'disease_control_factor_x_clinical_status_score',
    'disease_control_factor_x_conditioning_intensity_numeric',
    'disease_control_factor_x_dri_quantile',
    'disease_status_group',
    'disease_status_risk_score',
    'donor_by_age_at_hct',
    'donor_age',
    'donor_age_bin',
    'donor_age_bin_related',
    'donor_related',
    'donor_age_squared',
    'donor_age_x_clinical_status_score',
    'donor_age_x_compatibility_score',
    'donor_age_x_conditioning_intensity_numeric',
    'donor_age_x_disease_control_factor',
    'donor_age_x_dri_age',
    'donor_age_x_dri_comorbidity',
    'donor_age_x_dri_numeric',
    'donor_age_x_dri_quantile',
    'donor_age_x_treatment_intensity',
    'dqb1_low_interaction',
    'drb1_dqb1_diff',
    'drb1_dqb1_ratio',
    'drb1_high_interaction',
    'dri_age',
    'dri_age_squared',
    'dri_age_x_clinical_status_score',
    'dri_age_x_conditioning_intensity_numeric',
    'dri_age_x_disease_control_factor',
    'dri_age_x_dri_comorbidity',
    'dri_age_x_dri_quantile',
    'dri_comorbidity',
    'dri_comorbidity_fn',
    'dri_comorbidity_squared',
    'dri_comorbidity_x_clinical_status_score',
    'dri_comorbidity_x_conditioning_intensity_numeric',
    'dri_comorbidity_x_disease_control_factor',
    'dri_comorbidity_x_dri_quantile',
    'dri_disease_status',
    'dri_ethnicity',
    'dri_karnofsky',
    'dri_num_comorbidity_fn',
    'dri_num_compatibility_fn',
    'dri_numeric',
    'dri_numeric_squared',
    'dri_numeric_x_clinical_status_score',
    'dri_numeric_x_compatibility_score',
    'dri_numeric_x_conditioning_intensity_numeric',
    'dri_numeric_x_disease_control_factor',
    'dri_numeric_x_dri_age',
    'dri_numeric_x_dri_comorbidity',
    'dri_numeric_x_dri_quantile',
    'dri_numeric_x_treatment_intensity',
    'dri_quantile',
    'dri_quantile_squared',
    'dri_quantile_x_conditioning_intensity_numeric',
    'dri_score',
    'dri_score_grouped',
    'dri_score_is_2',
    'ethnicity',
    'ethnicity_vivo',
    'FK_MMF_interaction',
    'graft_cmv_status',
    'graft_dri',
    'graft_dri_score_is_2',
    'graft_prim_disease',
    'graft_prod',
    'graft_risk_interaction',
    'graft_type',
    'gvhd_proph',
    'has_CSA',
    'has_FK',
    'has_MMF',
    'has_MTX',
    'has_combination',
    'has_cyclophosphamide',
    'hepatic_mild',
    'hepatic_severe',
    'high_low_diff',
    'high_risk_combination',
    'hla_high_low_ratio',
    'hla_high_res_10',
    'hla_high_res_6',
    'hla_high_res_8',
    'hla_high_res_mean',
    'hla_low_res_10',
    'hla_low_res_6',
    'hla_low_res_8',
    'hla_low_res_mean',
    'hla_match_a_high',
    'hla_match_a_low',
    'hla_match_b_high',
    'hla_match_b_low',
    'hla_match_c_high',
    'hla_match_c_low',
    'hla_match_dqb1_high',
    'hla_match_dqb1_low',
    'hla_match_dqb1_mean',
    'hla_match_drb1_high',
    'hla_match_drb1_low',
    'hla_match_drb1_mean',
    'hla_match_total',
    'hla_match_weighted',
    'hla_max',
    'hla_mean',
    'hla_min',
    'hla_mismatch_score',
    'hla_nmdp_6',
    'hla_ratio_res_highlow',
    'hla_ratio_res_lowhigh',
    'hla_std',
    'immune_burden',
    'in_vivo_tcd',
    'is_complex',
    'is_depletion_based',
    'is_experimental',
    'is_high_risk',
    'is_monotherapy',
    'is_standard_approach',
    'is_standard_risk',
    'karnofsky_above_80',
    'karnofsky_age_at_hct',
    'karnofsky_below_70',
    'karnofsky_category',
    'karnofsky_compatibility_fn',
    'karnofsky_deviation',
    'karnofsky_dri_age_fn',
    'karnofsky_dri_comorbidity_fn',
    'karnofsky_dri_fn',
    'karnofsky_hla_interaction',
    'karnofsky_risk_adjusted',
    'karnofsky_score',
    'karnofsky_score_squared',
    'karnofsky_score_x_clinical_status_score',
    'karnofsky_score_x_compatibility_score',
    'karnofsky_score_x_conditioning_intensity_numeric',
    'karnofsky_score_x_disease_control_factor',
    'karnofsky_score_x_donor_age',
    'karnofsky_score_x_dri_age',
    'karnofsky_score_x_dri_comorbidity',
    'karnofsky_score_x_dri_numeric',
    'karnofsky_score_x_dri_quantile',
    'karnofsky_score_x_treatment_intensity',
    'karnofsky_standardized',
    'karnofsky_treatment_intensity_fn',
    'karnofsky_weighted_performance',
    'melphalan_dose',
    'mrd_hct',
    'multiple_conditions',
    'n_agents',
    'no_prophylaxis',
    'obesity',
    'optimal_conditioning',
    'optimal_disease_status',
    'peptic_ulcer',
    'pre_post_2015',
    'prim_disease_hct',
    'primary_agent',
    'prior_tumor',
    'prod_type',
    'psych_disturb',
    'pulm_moderate',
    'pulm_severe',
    'race_group',
    'renal_issue',
    'rheum_issue',
    'risk_depletion_category',
    'risk_score_dri_weighted',
    'risk_score_equal',
    'rituximab',
    'rituximab_given_gvhd',
    'sex_donor',
    'sex_match',
    'sex_recipient',
    'tbi_status',
    'tce_div_match',
    'tce_imm_match',
    'tce_match',
    'transplant_era',
    'treatment_intensity',
    'treatment_intensity_dri_age_fn',
    'treatment_intensity_dri_comorbidity_fn',
    'treatment_intensity_squared',
    'treatment_intensity_x_clinical_status_score',
    'treatment_intensity_x_conditioning_intensity_numeric',
    'treatment_intensity_x_disease_control_factor',
    'treatment_intensity_x_dri_age',
    'treatment_intensity_x_dri_comorbidity',
    'treatment_intensity_x_dri_quantile',
    'vent_hist',
    'vivo_age_bin',
    'vivo_comorbidity',
    'vivo_prim_disease',
    'weighted_risk_score',
    'with_tbi',
    'year_hct',
    'years_since_2008'
]

# Names of the FEATURES columns that pandas stores with object/category dtype in train
CATS = list(train[FEATURES].select_dtypes(include=['object', 'category']).columns)

In [335]:
class XGBoostCoxModel:
    """K-fold XGBoost model with the ``survival:cox`` objective and optional Optuna tuning.

    The model is fitted on the ``efs_time2`` column of the training frame.
    Hyperparameters are resolved in this order: a fresh Optuna search (when
    requested), the JSON file at ``best_params_path``, then a hard-coded fallback.
    """

    # Settings that are never tuned. Shared by the Optuna objective and by the
    # parameters persisted after tuning, so the two cannot drift apart.
    _FIXED_PARAMS = {
        'device': 'cuda',
        'enable_categorical': True,
        'objective': 'survival:cox',
        'eval_metric': 'cox-nloglik'
    }

    # Fallback used when no tuned parameters have been saved yet.
    _DEFAULT_PARAMS = {
        'device': 'cuda',
        'max_depth': 7,
        'colsample_bytree': 0.3086762365744495,
        'subsample': 0.9868819384956341,
        'n_estimators': 2810,
        'learning_rate': 0.013549777860376738,
        'enable_categorical': True,
        'min_child_weight': 56,
        'objective': 'survival:cox',
        'eval_metric': 'cox-nloglik'
    }

    def __init__(
        self,
        n_folds: int = 10,
        random_state: int = 42,
        n_trials: int = 50,
        best_params_path: str = 'best_xgb_cox_params.json'
    ):
        """
        Args:
            n_folds: Number of KFold splits used for cross-validation.
            random_state: Seed for the KFold shuffle and for the Optuna sampler.
            n_trials: Number of Optuna trials; only used when tuning is requested.
            best_params_path: JSON file the tuned parameters are written to / read from.
        """
        self.n_folds = n_folds
        self.random_state = random_state
        self.n_trials = n_trials
        self.best_params_path = best_params_path
        self.best_params = None

    def objective(self, trial: optuna.Trial, train_data: pd.DataFrame, valid_data: pd.DataFrame,
                 features: list) -> float:
        """Optuna objective: fit on ``train_data`` and return the metric on ``valid_data``.

        Args:
            trial: Optuna trial used to sample the hyperparameters.
            train_data: Rows used for fitting; must contain ``features`` and ``efs_time2``.
            valid_data: Held-out rows; must contain ``features``, ``efs_time2``, ``ID``,
                ``efs``, ``efs_time`` and ``race_group``.
            features: Feature column names.

        Returns:
            The value of ``score`` on the validation rows (maximised by the study).
        """
        param = {
            **self._FIXED_PARAMS,
            'max_depth': trial.suggest_int('max_depth', 4, 8),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.3, 0.5),
            'subsample': trial.suggest_float('subsample', 0.8, 0.99),
            'n_estimators': trial.suggest_int('n_estimators', 2000, 4000),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
            'min_child_weight': trial.suggest_int('min_child_weight', 40, 80)
        }

        model = XGBRegressor(**param)
        model.fit(
            train_data[features], train_data['efs_time2'],
            eval_set=[(valid_data[features], valid_data['efs_time2'])],
            verbose=0
        )

        predictions = model.predict(valid_data[features])

        # Create prediction DataFrame in the format expected by the metric
        y_true = valid_data[["ID", "efs", "efs_time", "race_group"]].copy()
        y_pred = valid_data[["ID"]].copy()
        y_pred["prediction"] = predictions

        fold_score = score(y_true, y_pred, "ID")
        return fold_score

    def save_best_params(self, params: Dict) -> None:
        """Save the best parameters to the JSON file at ``best_params_path``."""
        with open(self.best_params_path, 'w') as f:
            json.dump(params, f)

    def load_best_params(self) -> Optional[Dict]:
        """Load the best parameters from ``best_params_path``; ``None`` if the file is absent."""
        if os.path.exists(self.best_params_path):
            with open(self.best_params_path, 'r') as f:
                return json.load(f)
        return None

    def _tune(self, train: pd.DataFrame, kf: KFold, features: list) -> Dict:
        """Run the Optuna search on the first CV fold and return the full parameter set."""
        print("Starting hyperparameter tuning...")
        # The seeded, shuffled KFold yields the same first split here as in the
        # training loop, so tuning happens on fold 1 of the later CV run.
        train_idx, valid_idx = next(kf.split(train))

        train_fold = train.iloc[train_idx].copy()
        valid_fold = train.iloc[valid_idx].copy()

        # Seed the sampler so repeated tuning runs propose the same trials.
        study = optuna.create_study(
            direction='maximize',
            sampler=optuna.samplers.TPESampler(seed=self.random_state)
        )
        study.optimize(
            lambda trial: self.objective(trial, train_fold, valid_fold, features),
            n_trials=self.n_trials
        )

        # Tuned values plus the fixed settings the objective trained with.
        return {**study.best_params, **self._FIXED_PARAMS}

    def train_and_predict(
        self,
        train: pd.DataFrame,
        test: pd.DataFrame,
        features: list,
        tune_hyperparameters: bool = False
    ) -> Tuple[np.ndarray, np.ndarray, float, XGBRegressor]:
        """
        Train one model per fold and make predictions, with optional hyperparameter tuning.

        Args:
            train: Training DataFrame; needs ``features``, ``efs_time2``, ``ID``,
                ``efs``, ``efs_time`` and ``race_group``.
            test: Test DataFrame; needs ``features``.
            features: List of feature columns
            tune_hyperparameters: Whether to run the Optuna search (and overwrite the
                saved parameter file) before training.

        Returns:
            Tuple containing:
            - pred_xgb_cox: Test-set predictions averaged over the fold models
            - oof_xgb_cox: Out-of-fold predictions for training set
            - xgb_cox_score: ``score`` of the out-of-fold predictions
            - best_model: The model fitted on the FIRST fold (it is not selected
              by score, despite the name)
        """
        kf = KFold(n_splits=self.n_folds, shuffle=True, random_state=self.random_state)
        oof_xgb_cox = np.zeros(len(train))
        pred_xgb_cox = np.zeros(len(test))
        best_model = None

        if tune_hyperparameters:
            self.best_params = self._tune(train, kf, features)
            self.save_best_params(self.best_params)
            print(f"Best parameters: {self.best_params}")
        else:
            self.best_params = self.load_best_params()
            if self.best_params is None:
                print("No saved parameters found. Using default parameters.")
                self.best_params = dict(self._DEFAULT_PARAMS)

        # The test matrix is the same for every fold, so slice it once.
        x_test = test[features]

        # Train the model with best parameters
        for i, (train_idx, valid_idx) in enumerate(kf.split(train)):
            print(f"Training fold {i+1}/{self.n_folds}")

            x_train = train.iloc[train_idx][features]
            y_train = train.iloc[train_idx]['efs_time2']
            x_valid = train.iloc[valid_idx][features]
            y_valid = train.iloc[valid_idx]['efs_time2']

            model = XGBRegressor(**self.best_params)
            model.fit(
                x_train, y_train,
                eval_set=[(x_valid, y_valid)],
                verbose=500
            )

            if i == 0:
                best_model = model

            oof_xgb_cox[valid_idx] = model.predict(x_valid)
            pred_xgb_cox += model.predict(x_test)

        # Turn the running sum into the mean over folds
        pred_xgb_cox /= self.n_folds

        # Calculate final score on the out-of-fold predictions
        y_true = train[["ID", "efs", "efs_time", "race_group"]].copy()
        y_pred = train[["ID"]].copy()
        y_pred["prediction"] = oof_xgb_cox
        xgb_cox_score = score(y_true.copy(), y_pred.copy(), "ID")
        print(f"\nOverall CV Score: {xgb_cox_score}")

        return pred_xgb_cox, oof_xgb_cox, xgb_cox_score, best_model

In [None]:
# Build the 10-fold XGBoost Cox wrapper (n_trials is only consulted when tuning is enabled)
xgb_cox_model = XGBoostCoxModel(n_folds=10, n_trials=400)

# Train with the saved/fallback parameters; pass tune_hyperparameters=True to run the Optuna search
pred_xgb_cox, oof_xgb_cox, xgb_cox_score, model_xgb_cox = xgb_cox_model.train_and_predict(
    train=train,
    test=test,
    features=FEATURES,
    tune_hyperparameters=False,
)

# Persist the returned model (the one fitted on the first fold), tagged with the CV score
model_xgb_cox.save_model(f'{output_path}/xgboost_cox_model_{xgb_cox_score}.json')

In [None]:
# Rank the features by the importances reported by the returned XGBoost model
feature_importance = model_xgb_cox.feature_importances_
importance_df = (
    pd.DataFrame({"Feature": FEATURES, "Importance": feature_importance})
    .sort_values(by="Importance", ascending=False)
)

# Bundle the CV score with the ranked importances for the JSON dump
importance_dict = {
    "score": xgb_cox_score,
    "feature_importance": importance_df.to_dict('records'),
}

# Save to JSON
with open(f'{output_path}/xgboost_cox_feature_importance_{xgb_cox_score}.json', 'w') as f:
    json.dump(importance_dict, f, indent=4)

# Horizontal bar chart, most important feature at the top
fig, ax = plt.subplots(figsize=(22, 30))
ax.barh(importance_df["Feature"], importance_df["Importance"])
ax.set_xlabel("Importance")
ax.set_ylabel("Feature")
ax.set_title("XGBoost Survival:Cox Feature Importance")
ax.invert_yaxis()
fig.tight_layout()
fig.savefig(f'{output_path}/xgboost_cox_feature_importance_{xgb_cox_score}.png')
plt.show()

# CatBoost with Survival:Cox

In [338]:
# CatBoost with Survival:Cox
# feature selection
# Rebinds the module-level FEATURES for this section; later cells that read
# FEATURES see this list.
# NOTE(review): presumably passed to CatBoostCoxModel as `features` — confirm
# against the cell that runs the model.
FEATURES =[
    'age_at_hct',
    'age_at_hct_squared',
    'age_at_hct_x_clinical_status_score',
    'age_at_hct_x_comorbidity_score',
    'age_at_hct_x_compatibility_score',
    'age_at_hct_x_conditioning_intensity_numeric',
    'age_at_hct_x_disease_control_factor',
    'age_at_hct_x_donor_age',
    'age_at_hct_x_dri_age',
    'age_at_hct_x_dri_comorbidity',
    'age_at_hct_x_dri_numeric',
    'age_at_hct_x_dri_quantile',
    'age_at_hct_x_karnofsky_score',
    'age_at_hct_x_treatment_intensity',
    'age_bin',
    'age_bin_dri',
    'age_bin_ethnicity',
    'age_bin_favorable_flag',
    'age_bin_high_risk',
    'age_bin_high_risk_flag',
    'age_bin_pulm_severe',
    'age_bin_race',
    'age_bin_race_high_risk',
    'age_bin_risk_score',
    'age_bin_to_donor_age_bin',
    'age_comorbidity_fn',
    'age_comorbidity_interaction',
    'age_compatibility_fn',
    'age_depletion_risk',
    'age_dri_age_fn',
    'age_dri_fn',
    'age_karnofsky_fn',
    'age_quantile',
    'age_risk_group',
    'age_treatment_intensity_fn',
    'arrhythmia',
    'cardiac',
    'clinical_status_score',
    'clinical_status_score_intensity',
    'clinical_status_score_squared',
    'clinical_status_score_x_conditioning_intensity_numeric',
    'clinical_status_score_x_dri_quantile',
    'cmv_donor',
    'cmv_recipient',
    'cmv_status',
    'cmv_status_conditioning_intensity',
    'cmv_status_gvhd_proph',
    'combined_risk_score',
    'comorbidity_age',
    'comorbidity_compatibility_fn',
    'comorbidity_score_by_age_at_hct',
    'comorbidity_dri_age_fn',
    'comorbidity_dri_comorbidity_fn',
    'comorbidity_dri_numeric_fn',
    'comorbidity_karnofsky_fn',
    'comorbidity_quantile',
    'comorbidity_score',
    'comorbidity_score_squared',
    'comorbidity_score_x_clinical_status_score',
    'comorbidity_score_x_compatibility_score',
    'comorbidity_score_x_conditioning_intensity_numeric',
    'comorbidity_score_x_disease_control_factor',
    'comorbidity_score_x_donor_age',
    'comorbidity_score_x_dri_age',
    'comorbidity_score_x_dri_comorbidity',
    'comorbidity_score_x_dri_numeric',
    'comorbidity_score_x_dri_quantile',
    'comorbidity_score_x_karnofsky_score',
    'comorbidity_score_x_treatment_intensity',
    'comorbidity_treatment_intensity_fn',
    'compatibility_score',
    'compatibility_dri_age_fn',
    'compatibility_dri_comorbidity_fn',
    'compatibility_score_squared',
    'compatibility_score_x_clinical_status_score',
    'compatibility_score_x_conditioning_intensity_numeric',
    'compatibility_score_x_disease_control_factor',
    'compatibility_score_x_dri_age',
    'compatibility_score_x_dri_comorbidity',
    'compatibility_score_x_dri_quantile',
    'compatibility_score_x_treatment_intensity',
    'compatibility_treatment_intensity_fn',
    'condition_severity',
    'conditioning_group',
    'conditioning_intensity',
    'conditioning_intensity_age_bin',
    'conditioning_intensity_cyto',
    'conditioning_intensity_donor_age_bin',
    'conditioning_intensity_gvhd_proph',
    'conditioning_intensity_karnofsky',
    'conditioning_intensity_mrd_hct',
    'conditioning_intensity_numeric',
    'conditioning_intensity_numeric_squared',
    'conditioning_intensity_prim_disease',
    'conditioning_intensity_race_group',
    'conditioning_intensity_rituximab',
    'conditioning_intensity_tbi',
    'conditioning_intensity_year',
    'cyto_age',
    'cyto_score',
    'cyto_score_detail',
    'depletion_age_interaction',
    'depletion_comorbidity_interaction',
    'depletion_risk_interaction',
    'diabetes',
    'disease_control_factor',
    'disease_control_factor_squared',
    'disease_control_factor_x_clinical_status_score',
    'disease_control_factor_x_conditioning_intensity_numeric',
    'disease_control_factor_x_dri_quantile',
    'disease_status_group',
    'disease_status_risk_score',
    'donor_by_age_at_hct',
    'donor_age',
    'donor_age_bin',
    'donor_age_bin_related',
    'donor_related',
    'donor_age_squared',
    'donor_age_x_clinical_status_score',
    'donor_age_x_compatibility_score',
    'donor_age_x_conditioning_intensity_numeric',
    'donor_age_x_disease_control_factor',
    'donor_age_x_dri_age',
    'donor_age_x_dri_comorbidity',
    'donor_age_x_dri_numeric',
    'donor_age_x_dri_quantile',
    'donor_age_x_treatment_intensity',
    'dqb1_low_interaction',
    'drb1_dqb1_diff',
    'drb1_dqb1_ratio',
    'drb1_high_interaction',
    'dri_age',
    'dri_age_squared',
    'dri_age_x_clinical_status_score',
    'dri_age_x_conditioning_intensity_numeric',
    'dri_age_x_disease_control_factor',
    'dri_age_x_dri_comorbidity',
    'dri_age_x_dri_quantile',
    'dri_comorbidity',
    'dri_comorbidity_fn',
    'dri_comorbidity_squared',
    'dri_comorbidity_x_clinical_status_score',
    'dri_comorbidity_x_conditioning_intensity_numeric',
    'dri_comorbidity_x_disease_control_factor',
    'dri_comorbidity_x_dri_quantile',
    'dri_disease_status',
    'dri_ethnicity',
    'dri_karnofsky',
    'dri_num_comorbidity_fn',
    'dri_num_compatibility_fn',
    'dri_numeric',
    'dri_numeric_squared',
    'dri_numeric_x_clinical_status_score',
    'dri_numeric_x_compatibility_score',
    'dri_numeric_x_conditioning_intensity_numeric',
    'dri_numeric_x_disease_control_factor',
    'dri_numeric_x_dri_age',
    'dri_numeric_x_dri_comorbidity',
    'dri_numeric_x_dri_quantile',
    'dri_numeric_x_treatment_intensity',
    'dri_quantile',
    'dri_quantile_squared',
    'dri_quantile_x_conditioning_intensity_numeric',
    'dri_score',
    'dri_score_grouped',
    'dri_score_is_2',
    'ethnicity',
    'ethnicity_vivo',
    'FK_MMF_interaction',
    'graft_cmv_status',
    'graft_dri',
    'graft_dri_score_is_2',
    'graft_prim_disease',
    'graft_prod',
    'graft_risk_interaction',
    'graft_type',
    'gvhd_proph',
    'has_CSA',
    'has_FK',
    'has_MMF',
    'has_MTX',
    'has_combination',
    'has_cyclophosphamide',
    'hepatic_mild',
    'hepatic_severe',
    'high_low_diff',
    'high_risk_combination',
    'hla_high_low_ratio',
    'hla_high_res_10',
    'hla_high_res_6',
    'hla_high_res_8',
    'hla_high_res_mean',
    'hla_low_res_10',
    'hla_low_res_6',
    'hla_low_res_8',
    'hla_low_res_mean',
    'hla_match_a_high',
    'hla_match_a_low',
    'hla_match_b_high',
    'hla_match_b_low',
    'hla_match_c_high',
    'hla_match_c_low',
    'hla_match_dqb1_high',
    'hla_match_dqb1_low',
    'hla_match_dqb1_mean',
    'hla_match_drb1_high',
    'hla_match_drb1_low',
    'hla_match_drb1_mean',
    'hla_match_total',
    'hla_match_weighted',
    'hla_max',
    'hla_mean',
    'hla_min',
    'hla_mismatch_score',
    'hla_nmdp_6',
    'hla_ratio_res_highlow',
    'hla_ratio_res_lowhigh',
    'hla_std',
    'immune_burden',
    'in_vivo_tcd',
    'is_complex',
    'is_depletion_based',
    'is_experimental',
    'is_high_risk',
    'is_monotherapy',
    'is_standard_approach',
    'is_standard_risk',
    'karnofsky_above_80',
    'karnofsky_age_at_hct',
    'karnofsky_below_70',
    'karnofsky_category',
    'karnofsky_compatibility_fn',
    'karnofsky_deviation',
    'karnofsky_dri_age_fn',
    'karnofsky_dri_comorbidity_fn',
    'karnofsky_dri_fn',
    'karnofsky_hla_interaction',
    'karnofsky_risk_adjusted',
    'karnofsky_score',
    'karnofsky_score_squared',
    'karnofsky_score_x_clinical_status_score',
    'karnofsky_score_x_compatibility_score',
    'karnofsky_score_x_conditioning_intensity_numeric',
    'karnofsky_score_x_disease_control_factor',
    'karnofsky_score_x_donor_age',
    'karnofsky_score_x_dri_age',
    'karnofsky_score_x_dri_comorbidity',
    'karnofsky_score_x_dri_numeric',
    'karnofsky_score_x_dri_quantile',
    'karnofsky_score_x_treatment_intensity',
    'karnofsky_standardized',
    'karnofsky_treatment_intensity_fn',
    'karnofsky_weighted_performance',
    'melphalan_dose',
    'mrd_hct',
    'multiple_conditions',
    'n_agents',
    'no_prophylaxis',
    'obesity',
    'optimal_conditioning',
    'optimal_disease_status',
    'peptic_ulcer',
    'pre_post_2015',
    'prim_disease_hct',
    'primary_agent',
    'prior_tumor',
    'prod_type',
    'psych_disturb',
    'pulm_moderate',
    'pulm_severe',
    'race_group',
    'renal_issue',
    'rheum_issue',
    'risk_depletion_category',
    'risk_score_dri_weighted',
    'risk_score_equal',
    'rituximab',
    'rituximab_given_gvhd',
    'sex_donor',
    'sex_match',
    'sex_recipient',
    'tbi_status',
    'tce_div_match',
    'tce_imm_match',
    'tce_match',
    'transplant_era',
    'treatment_intensity',
    'treatment_intensity_dri_age_fn',
    'treatment_intensity_dri_comorbidity_fn',
    'treatment_intensity_squared',
    'treatment_intensity_x_clinical_status_score',
    'treatment_intensity_x_conditioning_intensity_numeric',
    'treatment_intensity_x_disease_control_factor',
    'treatment_intensity_x_dri_age',
    'treatment_intensity_x_dri_comorbidity',
    'treatment_intensity_x_dri_quantile',
    'vent_hist',
    'vivo_age_bin',
    'vivo_comorbidity',
    'vivo_prim_disease',
    'weighted_risk_score',
    'with_tbi',
    'year_hct',
    'years_since_2008'
]

# Names of the FEATURES columns that pandas stores with object/category dtype in train
CATS = list(train[FEATURES].select_dtypes(include=['object', 'category']).columns)

In [339]:
class CatBoostCoxModel:
    """Cross-validated CatBoost regressor trained with the ``Cox`` loss on ``efs_time2``.

    Hyperparameters come from one of three places: an Optuna search run on the
    first CV fold (``tune_hyperparameters=True``), a JSON file written by a
    previous search, or the built-in defaults in ``_DEFAULT_PARAMS``.
    """

    # Settings that are never searched; merged into every Optuna trial and into
    # the final parameter set so both use the same configuration.
    _FIXED_PARAMS = {
        'loss_function': 'Cox',
        'use_best_model': False,
        'early_stopping_rounds': 100
    }

    # Fallback used when tuning is disabled and no saved parameter file exists.
    _DEFAULT_PARAMS = {
        'loss_function': 'Cox',
        'iterations': 1173,
        'learning_rate': 0.0427748557274786,
        'grow_policy': 'Depthwise',
        'use_best_model': False,
        'early_stopping_rounds': 100,
        'depth': 8,
        'min_data_in_leaf': 34,
        'rsm': 0.10440070890268134,
        'l2_leaf_reg': 8.027755811547786
    }

    def __init__(
        self,
        n_folds: int = 10,
        random_state: int = 42,
        n_trials: int = 50,
        best_params_path: str = 'best_catboost_cox_params.json'
    ):
        """Store the CV / tuning configuration; no training or file I/O happens here.

        Args:
            n_folds: Number of KFold splits.
            random_state: Seed for the KFold shuffle and for the Optuna sampler.
            n_trials: Number of Optuna trials when tuning is requested.
            best_params_path: JSON file used to save / load the tuned parameters.
        """
        self.n_folds = n_folds
        self.random_state = random_state
        self.n_trials = n_trials
        self.best_params_path = best_params_path
        self.best_params = None

    # Third-party types are quoted in annotations so that defining the class does
    # not evaluate them.
    def objective(self, trial: "optuna.Trial", train_data: pd.DataFrame, valid_data: pd.DataFrame,
                 features: list, cat_features: List[str]) -> float:
        """Optuna objective: fit one candidate on ``train_data`` and score it on ``valid_data``.

        Args:
            trial: Optuna trial used to sample the candidate hyperparameters.
            train_data: Rows used for fitting; must contain ``features`` and ``efs_time2``.
            valid_data: Rows used for evaluation; must also contain ``ID``, ``efs``,
                ``efs_time`` and ``race_group`` for the metric.
            features: Feature column names.
            cat_features: Names of the categorical feature columns.

        Returns:
            The value of ``score`` (imported from ``metric``) on the validation rows.
        """
        param = {
            'iterations': trial.suggest_int('iterations', 800, 1200),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
            'grow_policy': trial.suggest_categorical('grow_policy', ['Lossguide', 'Depthwise']),
            'depth': trial.suggest_int('depth', 4, 8),
            'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 4.0, 10.0),
            'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 30, 80),
            'rsm': trial.suggest_float('rsm', 0.1, 0.2),
            **self._FIXED_PARAMS
        }

        model = CatBoostRegressor(**param)
        # NOTE(review): `efs_time2` is presumably the signed-time label the Cox loss
        # expects -- confirm against the cell that builds it.
        model.fit(
            train_data[features], train_data['efs_time2'],
            eval_set=(valid_data[features], valid_data['efs_time2']),
            cat_features=cat_features,
            verbose=0
        )

        # Assemble the two frames the metric expects: ground truth and predictions keyed by ID.
        y_true = valid_data[["ID", "efs", "efs_time", "race_group"]].copy()
        y_pred = valid_data[["ID"]].copy()
        y_pred["prediction"] = model.predict(valid_data[features])

        return score(y_true, y_pred, "ID")

    def save_best_params(self, params: Dict) -> None:
        """Write ``params`` as JSON to ``self.best_params_path``."""
        with open(self.best_params_path, 'w') as f:
            json.dump(params, f)

    def load_best_params(self) -> Optional[Dict]:
        """Return the parameters stored at ``self.best_params_path``, or None if the file is absent."""
        if not os.path.exists(self.best_params_path):
            return None
        with open(self.best_params_path, 'r') as f:
            return json.load(f)

    def train_and_predict(
        self,
        train: pd.DataFrame,
        test: pd.DataFrame,
        features: list,
        cat_features: List[str],
        tune_hyperparameters: bool = False
    ) -> Tuple[np.ndarray, np.ndarray, float, "CatBoostRegressor"]:
        """Run K-fold training and return test predictions, OOF predictions and the CV score.

        Args:
            train: Training frame with ``features``, ``efs_time2`` and the metric
                columns ``ID``, ``efs``, ``efs_time``, ``race_group``.
            test: Test frame with ``features``.
            features: Feature column names.
            cat_features: Names of the categorical feature columns.
            tune_hyperparameters: If True, run an Optuna search on the first fold and
                save the result; otherwise load saved parameters or use the defaults.

        Returns:
            Tuple of:
            - test predictions averaged over the folds,
            - out-of-fold predictions for ``train``,
            - the metric computed on the out-of-fold predictions,
            - the model fitted on the first fold (not selected by score).
        """
        kf = KFold(n_splits=self.n_folds, shuffle=True, random_state=self.random_state)
        oof_cat_cox = np.zeros(len(train))
        pred_cat_cox = np.zeros(len(test))
        best_model = None

        if tune_hyperparameters:
            print("Starting hyperparameter tuning...")
            # The search only ever sees the first split of the CV.
            train_idx, valid_idx = next(kf.split(train))
            train_fold = train.iloc[train_idx].copy()
            valid_fold = train.iloc[valid_idx].copy()

            # Seed the sampler so that repeated searches with the same random_state
            # propose the same sequence of candidates.
            study = optuna.create_study(
                direction='maximize',
                sampler=optuna.samplers.TPESampler(seed=self.random_state)
            )
            study.optimize(
                lambda trial: self.objective(
                    trial, train_fold, valid_fold, features, cat_features
                ),
                n_trials=self.n_trials
            )

            # study.best_params holds only the sampled values; add the fixed ones back.
            self.best_params = {**study.best_params, **self._FIXED_PARAMS}
            self.save_best_params(self.best_params)
            print(f"Best parameters: {self.best_params}")
        else:
            self.best_params = self.load_best_params()
            if self.best_params is None:
                self.best_params = dict(self._DEFAULT_PARAMS)

        # The test matrix is the same for every fold, so slice it once.
        x_test = test[features]

        for i, (train_idx, valid_idx) in enumerate(kf.split(train)):
            print(f"Training fold {i+1}/{self.n_folds}")

            x_train = train.iloc[train_idx][features]
            y_train = train.iloc[train_idx]['efs_time2']
            x_valid = train.iloc[valid_idx][features]
            y_valid = train.iloc[valid_idx]['efs_time2']

            model = CatBoostRegressor(**self.best_params)
            model.fit(
                x_train, y_train,
                eval_set=(x_valid, y_valid),
                cat_features=cat_features,
                verbose=100
            )

            # Keep the first fold's model as the one handed back to the caller.
            if i == 0:
                best_model = model

            oof_cat_cox[valid_idx] = model.predict(x_valid)
            pred_cat_cox += model.predict(x_test)

        pred_cat_cox /= self.n_folds

        y_true = train[["ID", "efs", "efs_time", "race_group"]].copy()
        y_pred = train[["ID"]].copy()
        y_pred["prediction"] = oof_cat_cox
        cat_cox_score = score(y_true.copy(), y_pred.copy(), "ID")
        print(f"\nOverall CV Score: {cat_cox_score}")

        return pred_cat_cox, oof_cat_cox, cat_cox_score, best_model

In [None]:
# Build the CatBoost Cox wrapper: 10-fold CV; n_trials is only used when tuning is enabled
cat_cox_model = CatBoostCoxModel(n_trials=400, n_folds=10)

# Tuning is off, so saved (or default) hyperparameters are used. The call returns
# test predictions, out-of-fold predictions, the CV score and one fitted model.
pred_cat_cox, oof_cat_cox, cat_cox_score, model_cat_cox = cat_cox_model.train_and_predict(
    train=train,
    test=test,
    features=FEATURES,
    cat_features=CATS,
    tune_hyperparameters=False,
)

# Persist the returned model, tagging the file name with the CV score.
# NOTE(review): save_model is called without `format=`, so the file may be written in
# CatBoost's default format despite the .json suffix -- confirm this is intended.
model_cat_cox.save_model(
    f'{output_path}/catboost_cox_model_{cat_cox_score}.json'
)

In [None]:
# Importance values reported by the returned CatBoost model, one per entry in FEATURES
feature_importance = model_cat_cox.get_feature_importance()
importance_df = pd.DataFrame(
    {"Feature": FEATURES, "Importance": feature_importance}
)
importance_df = importance_df.sort_values(by="Importance", ascending=False)

# Bundle the CV score with the ranked importances for the JSON report
importance_dict = {
    "score": cat_cox_score,
    "feature_importance": importance_df.to_dict('records'),
}

# Write the report; the file name carries the CV score
with open(f'{output_path}/catboost_cox_feature_importance_{cat_cox_score}.json', 'w') as f:
    json.dump(importance_dict, f, indent=4)

# Horizontal bar chart, most important feature at the top
fig, ax = plt.subplots(figsize=(22, 30))
ax.barh(importance_df["Feature"], importance_df["Importance"])
ax.set_xlabel("Importance")
ax.set_ylabel("Feature")
ax.set_title("CatBoost Survival:Cox Feature Importance")
ax.invert_yaxis()  # barh draws the first row at the bottom; flip so the ranking reads top-down
fig.tight_layout()
fig.savefig(f'{output_path}/catboost_cox_feature_importance_{cat_cox_score}.png')
plt.show()

# LightGBM with Cox

In [None]:
# LightGBM section: report the installed library version
print(f"Using LightGBM version {lgb.__version__}")

In [343]:
# Feature selection for the LightGBM section: the columns of `train` / `test` that are
# passed to LightGBMModel.train_and_predict below.
# NOTE: this rebinds the notebook-level FEATURES name, so any cell executed after this
# one sees this list.
FEATURES = [
    'age_at_hct',
    'age_at_hct_squared',
    'age_at_hct_x_clinical_status_score',
    'age_at_hct_x_comorbidity_score',
    'age_at_hct_x_compatibility_score',
    'age_at_hct_x_conditioning_intensity_numeric',
    'age_at_hct_x_disease_control_factor',
    'age_at_hct_x_donor_age',
    'age_at_hct_x_dri_age',
    'age_at_hct_x_dri_comorbidity',
    'age_at_hct_x_dri_numeric',
    'age_at_hct_x_dri_quantile',
    'age_at_hct_x_karnofsky_score',
    'age_at_hct_x_treatment_intensity',
    'age_bin',
    'age_bin_dri',
    'age_bin_ethnicity',
    'age_bin_favorable_flag',
    'age_bin_high_risk',
    'age_bin_high_risk_flag',
    'age_bin_pulm_severe',
    'age_bin_race',
    'age_bin_race_high_risk',
    'age_bin_risk_score',
    'age_bin_to_donor_age_bin',
    'age_comorbidity_fn',
    'age_comorbidity_interaction',
    'age_compatibility_fn',
    'age_depletion_risk',
    'age_dri_age_fn',
    'age_dri_fn',
    'age_karnofsky_fn',
    'age_quantile',
    'age_risk_group',
    'age_treatment_intensity_fn',
    'arrhythmia',
    'cardiac',
    'clinical_status_score',
    'clinical_status_score_intensity',
    'clinical_status_score_squared',
    'clinical_status_score_x_conditioning_intensity_numeric',
    'clinical_status_score_x_dri_quantile',
    'cmv_donor',
    'cmv_recipient',
    'cmv_status',
    'cmv_status_conditioning_intensity',
    'cmv_status_gvhd_proph',
    'combined_risk_score',
    'comorbidity_age',
    'comorbidity_compatibility_fn',
    'comorbidity_score_by_age_at_hct',
    'comorbidity_dri_age_fn',
    'comorbidity_dri_comorbidity_fn',
    'comorbidity_dri_numeric_fn',
    'comorbidity_karnofsky_fn',
    'comorbidity_quantile',
    'comorbidity_score',
    'comorbidity_score_squared',
    'comorbidity_score_x_clinical_status_score',
    'comorbidity_score_x_compatibility_score',
    'comorbidity_score_x_conditioning_intensity_numeric',
    'comorbidity_score_x_disease_control_factor',
    'comorbidity_score_x_donor_age',
    'comorbidity_score_x_dri_age',
    'comorbidity_score_x_dri_comorbidity',
    'comorbidity_score_x_dri_numeric',
    'comorbidity_score_x_dri_quantile',
    'comorbidity_score_x_karnofsky_score',
    'comorbidity_score_x_treatment_intensity',
    'comorbidity_treatment_intensity_fn',
    'compatibility_score',
    'compatibility_dri_age_fn',
    'compatibility_dri_comorbidity_fn',
    'compatibility_score_squared',
    'compatibility_score_x_clinical_status_score',
    'compatibility_score_x_conditioning_intensity_numeric',
    'compatibility_score_x_disease_control_factor',
    'compatibility_score_x_dri_age',
    'compatibility_score_x_dri_comorbidity',
    'compatibility_score_x_dri_quantile',
    'compatibility_score_x_treatment_intensity',
    'compatibility_treatment_intensity_fn',
    'condition_severity',
    'conditioning_group',
    'conditioning_intensity',
    'conditioning_intensity_age_bin',
    'conditioning_intensity_cyto',
    'conditioning_intensity_donor_age_bin',
    'conditioning_intensity_gvhd_proph',
    'conditioning_intensity_karnofsky',
    'conditioning_intensity_mrd_hct',
    'conditioning_intensity_numeric',
    'conditioning_intensity_numeric_squared',
    'conditioning_intensity_prim_disease',
    'conditioning_intensity_race_group',
    'conditioning_intensity_rituximab',
    'conditioning_intensity_tbi',
    'conditioning_intensity_year',
    'cyto_age',
    'cyto_score',
    'cyto_score_detail',
    'depletion_age_interaction',
    'depletion_comorbidity_interaction',
    'depletion_risk_interaction',
    'diabetes',
    'disease_control_factor',
    'disease_control_factor_squared',
    'disease_control_factor_x_clinical_status_score',
    'disease_control_factor_x_conditioning_intensity_numeric',
    'disease_control_factor_x_dri_quantile',
    'disease_status_group',
    'disease_status_risk_score',
    'donor_by_age_at_hct',
    'donor_age',
    'donor_age_bin',
    'donor_age_bin_related',
    'donor_related',
    'donor_age_squared',
    'donor_age_x_clinical_status_score',
    'donor_age_x_compatibility_score',
    'donor_age_x_conditioning_intensity_numeric',
    'donor_age_x_disease_control_factor',
    'donor_age_x_dri_age',
    'donor_age_x_dri_comorbidity',
    'donor_age_x_dri_numeric',
    'donor_age_x_dri_quantile',
    'donor_age_x_treatment_intensity',
    'dqb1_low_interaction',
    'drb1_dqb1_diff',
    'drb1_dqb1_ratio',
    'drb1_high_interaction',
    'dri_age',
    'dri_age_squared',
    'dri_age_x_clinical_status_score',
    'dri_age_x_conditioning_intensity_numeric',
    'dri_age_x_disease_control_factor',
    'dri_age_x_dri_comorbidity',
    'dri_age_x_dri_quantile',
    'dri_comorbidity',
    'dri_comorbidity_fn',
    'dri_comorbidity_squared',
    'dri_comorbidity_x_clinical_status_score',
    'dri_comorbidity_x_conditioning_intensity_numeric',
    'dri_comorbidity_x_disease_control_factor',
    'dri_comorbidity_x_dri_quantile',
    'dri_disease_status',
    'dri_ethnicity',
    'dri_karnofsky',
    'dri_num_comorbidity_fn',
    'dri_num_compatibility_fn',
    'dri_numeric',
    'dri_numeric_squared',
    'dri_numeric_x_clinical_status_score',
    'dri_numeric_x_compatibility_score',
    'dri_numeric_x_conditioning_intensity_numeric',
    'dri_numeric_x_disease_control_factor',
    'dri_numeric_x_dri_age',
    'dri_numeric_x_dri_comorbidity',
    'dri_numeric_x_dri_quantile',
    'dri_numeric_x_treatment_intensity',
    'dri_quantile',
    'dri_quantile_squared',
    'dri_quantile_x_conditioning_intensity_numeric',
    'dri_score',
    'dri_score_grouped',
    'dri_score_is_2',
    'ethnicity',
    'ethnicity_vivo',
    'FK_MMF_interaction',
    'graft_cmv_status',
    'graft_dri',
    'graft_dri_score_is_2',
    'graft_prim_disease',
    'graft_prod',
    'graft_risk_interaction',
    'graft_type',
    'gvhd_proph',
    'has_CSA',
    'has_FK',
    'has_MMF',
    'has_MTX',
    'has_combination',
    'has_cyclophosphamide',
    'hepatic_mild',
    'hepatic_severe',
    'high_low_diff',
    'high_risk_combination',
    'hla_high_low_ratio',
    'hla_high_res_10',
    'hla_high_res_6',
    'hla_high_res_8',
    'hla_high_res_mean',
    'hla_low_res_10',
    'hla_low_res_6',
    'hla_low_res_8',
    'hla_low_res_mean',
    'hla_match_a_high',
    'hla_match_a_low',
    'hla_match_b_high',
    'hla_match_b_low',
    'hla_match_c_high',
    'hla_match_c_low',
    'hla_match_dqb1_high',
    'hla_match_dqb1_low',
    'hla_match_dqb1_mean',
    'hla_match_drb1_high',
    'hla_match_drb1_low',
    'hla_match_drb1_mean',
    'hla_match_total',
    'hla_match_weighted',
    'hla_max',
    'hla_mean',
    'hla_min',
    'hla_mismatch_score',
    'hla_nmdp_6',
    'hla_ratio_res_highlow',
    'hla_ratio_res_lowhigh',
    'hla_std',
    'immune_burden',
    'in_vivo_tcd',
    'is_complex',
    'is_depletion_based',
    'is_experimental',
    'is_high_risk',
    'is_monotherapy',
    'is_standard_approach',
    'is_standard_risk',
    'karnofsky_above_80',
    'karnofsky_age_at_hct',
    'karnofsky_below_70',
    'karnofsky_category',
    'karnofsky_compatibility_fn',
    'karnofsky_deviation',
    'karnofsky_dri_age_fn',
    'karnofsky_dri_comorbidity_fn',
    'karnofsky_dri_fn',
    'karnofsky_hla_interaction',
    'karnofsky_risk_adjusted',
    'karnofsky_score',
    'karnofsky_score_squared',
    'karnofsky_score_x_clinical_status_score',
    'karnofsky_score_x_compatibility_score',
    'karnofsky_score_x_conditioning_intensity_numeric',
    'karnofsky_score_x_disease_control_factor',
    'karnofsky_score_x_donor_age',
    'karnofsky_score_x_dri_age',
    'karnofsky_score_x_dri_comorbidity',
    'karnofsky_score_x_dri_numeric',
    'karnofsky_score_x_dri_quantile',
    'karnofsky_score_x_treatment_intensity',
    'karnofsky_standardized',
    'karnofsky_treatment_intensity_fn',
    'karnofsky_weighted_performance',
    'melphalan_dose',
    'mrd_hct',
    'multiple_conditions',
    'n_agents',
    'no_prophylaxis',
    'obesity',
    'optimal_conditioning',
    'optimal_disease_status',
    'peptic_ulcer',
    'pre_post_2015',
    'prim_disease_hct',
    'primary_agent',
    'prior_tumor',
    'prod_type',
    'psych_disturb',
    'pulm_moderate',
    'pulm_severe',
    'race_group',
    'renal_issue',
    'rheum_issue',
    'risk_depletion_category',
    'risk_score_dri_weighted',
    'risk_score_equal',
    'rituximab',
    'rituximab_given_gvhd',
    'sex_donor',
    'sex_match',
    'sex_recipient',
    'tbi_status',
    'tce_div_match',
    'tce_imm_match',
    'tce_match',
    'transplant_era',
    'treatment_intensity',
    'treatment_intensity_dri_age_fn',
    'treatment_intensity_dri_comorbidity_fn',
    'treatment_intensity_squared',
    'treatment_intensity_x_clinical_status_score',
    'treatment_intensity_x_conditioning_intensity_numeric',
    'treatment_intensity_x_disease_control_factor',
    'treatment_intensity_x_dri_age',
    'treatment_intensity_x_dri_comorbidity',
    'treatment_intensity_x_dri_quantile',
    'vent_hist',
    'vivo_age_bin',
    'vivo_comorbidity',
    'vivo_prim_disease',
    'weighted_risk_score',
    'with_tbi',
    'year_hct',
    'years_since_2008'
]

# Categorical feature names: the FEATURES columns of `train` whose dtype is object or
# category. NOTE(review): the LightGBM cells visible below never pass CATS to the
# model -- confirm whether it is still needed in this section.
CATS = train[FEATURES].select_dtypes(include=['object', 'category']).columns.tolist()

In [344]:
class LightGBMModel:
    """Cross-validated LightGBM regressor fitted on the ``efs_time2`` target.

    Hyperparameters come from one of three places: an Optuna search run on the
    first CV fold (``tune_hyperparameters=True``), a JSON file written by a
    previous search, or the built-in defaults in ``_DEFAULT_PARAMS``.
    """

    # Settings that are never searched. They are applied to every Optuna trial AND
    # merged into the tuned result, so the configuration that was scored during the
    # search is the configuration used for the final CV models.
    _FIXED_PARAMS = {
        'objective': 'regression',
        'extra_trees': True,
        'metric': 'rmse',
        'verbose': -1,
        'seed': 42
    }

    # Fallback used when tuning is disabled and no saved parameter file exists.
    _DEFAULT_PARAMS = {
        'objective': 'regression',
        'min_child_samples': 41,
        'learning_rate': 0.061942323887844736,
        'extra_trees': True,
        'reg_lambda': 7.453459703925752,
        'reg_alpha': 0.04971354334359003,
        'num_leaves': 70,
        'metric': 'rmse',
        'max_depth': 8,
        'device': 'cpu',
        'max_bin': 115,
        'verbose': -1,
        'seed': 42
    }

    def __init__(
        self,
        n_folds: int = 10,
        random_state: int = 42,
        n_trials: int = 50,
        best_params_path: str = 'best_lgb_cox_params.json'
    ):
        """Store the CV / tuning configuration; no training or file I/O happens here.

        Args:
            n_folds: Number of KFold splits.
            random_state: Seed for the KFold shuffle and for the Optuna sampler.
            n_trials: Number of Optuna trials when tuning is requested.
            best_params_path: JSON file used to save / load the tuned parameters.
        """
        self.n_folds = n_folds
        self.random_state = random_state
        self.n_trials = n_trials
        self.best_params_path = best_params_path
        self.best_params = None

    # Third-party types are quoted in annotations so that defining the class does
    # not evaluate them.
    def objective(self, trial: "optuna.Trial", train_data: pd.DataFrame, valid_data: pd.DataFrame,
                 features: list) -> float:
        """Optuna objective: fit one candidate on ``train_data`` and score it on ``valid_data``.

        Args:
            trial: Optuna trial used to sample the candidate hyperparameters.
            train_data: Rows used for fitting; must contain ``features`` and ``efs_time2``.
            valid_data: Rows used for evaluation; must also contain ``ID``, ``efs``,
                ``efs_time`` and ``race_group`` for the metric.
            features: Feature column names.

        Returns:
            The value of ``score`` (imported from ``metric``) on the validation rows.
        """
        param = {
            **self._FIXED_PARAMS,
            # NOTE(review): trials run on the GPU while the final CV models are
            # fitted with device='cpu' -- confirm this split is intended.
            'device': 'gpu',
            'gpu_use_dp': True,
            'min_child_samples': trial.suggest_int('min_child_samples', 24, 100),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
            'reg_lambda': trial.suggest_float('reg_lambda', 6.0, 10.0),
            'reg_alpha': trial.suggest_float('reg_alpha', 0.01, 0.1),
            'num_leaves': trial.suggest_int('num_leaves', 30, 80),
            'max_depth': trial.suggest_int('max_depth', 5, 9),
            'max_bin': trial.suggest_int('max_bin', 100, 200)
        }

        model = LGBMRegressor(**param)
        model.fit(
            train_data[features], train_data['efs_time2'],
            eval_set=[(valid_data[features], valid_data['efs_time2'])]
        )

        # Assemble the two frames the metric expects: ground truth and predictions keyed by ID.
        y_true = valid_data[["ID", "efs", "efs_time", "race_group"]].copy()
        y_pred = valid_data[["ID"]].copy()
        y_pred["prediction"] = model.predict(valid_data[features])

        return score(y_true, y_pred, "ID")

    def _final_params(self, tuned_params: Dict) -> Dict:
        """Combine sampled values with the fixed settings for the final (CPU) models."""
        return {**tuned_params, **self._FIXED_PARAMS, 'device': 'cpu'}

    def save_best_params(self, params: Dict) -> None:
        """Save the best parameters to a JSON file."""
        with open(self.best_params_path, 'w') as f:
            json.dump(params, f)

    def load_best_params(self) -> Optional[Dict]:
        """Load the best parameters from a JSON file if it exists, else return None."""
        if not os.path.exists(self.best_params_path):
            return None
        with open(self.best_params_path, 'r') as f:
            return json.load(f)

    def train_and_predict(
        self,
        train: pd.DataFrame,
        test: pd.DataFrame,
        features: list,
        tune_hyperparameters: bool = False
    ) -> Tuple[np.ndarray, np.ndarray, float, "LGBMRegressor"]:
        """
        Train the model and make predictions, with optional hyperparameter tuning.

        Args:
            train: Training DataFrame with ``features``, ``efs_time2`` and the metric
                columns ``ID``, ``efs``, ``efs_time``, ``race_group``
            test: Test DataFrame with ``features``
            features: List of feature columns
            tune_hyperparameters: Whether to run an Optuna search on the first fold;
                otherwise saved parameters (or the defaults) are used

        Returns:
            Tuple containing:
            - pred_lgb_cox: Test-set predictions averaged over the folds
            - oof_lgb_cox: Out-of-fold predictions for the training set
            - lgb_cox_score: Metric computed on the out-of-fold predictions
            - best_model: The model fitted on the first fold (not selected by score)
        """
        kf = KFold(n_splits=self.n_folds, shuffle=True, random_state=self.random_state)
        oof_lgb_cox = np.zeros(len(train))
        pred_lgb_cox = np.zeros(len(test))
        best_model = None

        if tune_hyperparameters:
            # Perform hyperparameter tuning on first fold
            print("Starting hyperparameter tuning...")
            train_idx, valid_idx = next(kf.split(train))

            # Create proper DataFrame splits for tuning
            train_fold = train.iloc[train_idx].copy()
            valid_fold = train.iloc[valid_idx].copy()

            # Seed the sampler so that repeated searches with the same random_state
            # propose the same sequence of candidates.
            study = optuna.create_study(
                direction='maximize',
                sampler=optuna.samplers.TPESampler(seed=self.random_state)
            )
            study.optimize(
                lambda trial: self.objective(trial, train_fold, valid_fold, features),
                n_trials=self.n_trials
            )

            # study.best_params holds only the sampled values. Re-attach every fixed
            # setting the trials used (extra_trees, metric, verbose, seed, objective)
            # so the final models match what the search actually evaluated.
            self.best_params = self._final_params(study.best_params)
            self.save_best_params(self.best_params)
            print(f"Best parameters: {self.best_params}")
        else:
            self.best_params = self.load_best_params()
            if self.best_params is None:
                print("No saved parameters found. Using default parameters.")
                self.best_params = dict(self._DEFAULT_PARAMS)

        # The test matrix is the same for every fold, so slice it once.
        x_test = test[features]

        # Train the model with best parameters
        for i, (train_idx, valid_idx) in enumerate(kf.split(train)):
            print(f"Training fold {i+1}/{self.n_folds}")

            x_train = train.iloc[train_idx][features]
            y_train = train.iloc[train_idx]['efs_time2']
            x_valid = train.iloc[valid_idx][features]
            y_valid = train.iloc[valid_idx]['efs_time2']

            model = LGBMRegressor(**self.best_params)
            model.fit(
                x_train, y_train,
                eval_set=[(x_valid, y_valid)]
            )

            # Keep the first fold's model as the one handed back to the caller.
            if i == 0:
                best_model = model

            oof_lgb_cox[valid_idx] = model.predict(x_valid)
            pred_lgb_cox += model.predict(x_test)

        pred_lgb_cox /= self.n_folds

        # Calculate final score
        y_true = train[["ID", "efs", "efs_time", "race_group"]].copy()
        y_pred = train[["ID"]].copy()
        y_pred["prediction"] = oof_lgb_cox
        lgb_cox_score = score(y_true.copy(), y_pred.copy(), "ID")
        print(f"\nOverall CV Score: {lgb_cox_score}")

        return pred_lgb_cox, oof_lgb_cox, lgb_cox_score, best_model

In [None]:
# Build the LightGBM wrapper: 10-fold CV; n_trials is only used when tuning is enabled
lgb_cox_model = LightGBMModel(n_trials=400, n_folds=10)

# Tuning is off here, so saved (or default) hyperparameters are used. The call returns
# test predictions, out-of-fold predictions, the CV score and one fitted model.
pred_lgb_cox, oof_lgb_cox, lgb_cox_score, model_lgb_cox = lgb_cox_model.train_and_predict(
    train=train,
    test=test,
    features=FEATURES,
    tune_hyperparameters=False,
)

In [None]:
# Importance values exposed by the returned LightGBM model, one per entry in FEATURES
feature_importance = model_lgb_cox.feature_importances_
importance_df = pd.DataFrame({
    "Feature": FEATURES,
    "Importance": feature_importance
}).sort_values(by="Importance", ascending=False)

# Create dictionary for JSON
importance_dict = {
   "score": lgb_cox_score,
   "feature_importance": importance_df.to_dict('records')
}

# Save to JSON
with open(f'{output_path}/lightgbm_cox_feature_importance_{lgb_cox_score}.json', 'w') as f:
   json.dump(importance_dict, f, indent=4)

plt.figure(figsize=(22, 30))
plt.barh(importance_df["Feature"], importance_df["Importance"], color='skyblue')
# feature_importances_ follows the estimator's importance_type (split counts unless the
# model was built with importance_type='gain'), so label the axis from the model itself
# instead of hard-coding "Gain".
plt.xlabel(f"Importance ({model_lgb_cox.importance_type})")
plt.ylabel("Feature")
plt.title("LightGBM Cox Feature Importance")
plt.gca().invert_yaxis()  # Flip features for better readability
plt.tight_layout()
plt.savefig(f'{output_path}/lightgbm_cox_feature_importance_{lgb_cox_score}.png')
plt.show()

# XGBoost with Nelson Aalen

In [None]:
# XGBoost (Nelson-Aalen target) section: report the installed library version
print(f"Using XGBoost version {xgb.__version__}")

In [348]:
# Feature selection for the XGBoost section.
# NOTE: this rebinds the notebook-level FEATURES name, so any cell executed after this
# one sees this list.
FEATURES = [
    'age_at_hct',
    'age_at_hct_squared',
    'age_at_hct_x_clinical_status_score',
    'age_at_hct_x_comorbidity_score',
    'age_at_hct_x_compatibility_score',
    'age_at_hct_x_conditioning_intensity_numeric',
    'age_at_hct_x_disease_control_factor',
    'age_at_hct_x_donor_age',
    'age_at_hct_x_dri_age',
    'age_at_hct_x_dri_comorbidity',
    'age_at_hct_x_dri_numeric',
    'age_at_hct_x_dri_quantile',
    'age_at_hct_x_karnofsky_score',
    'age_at_hct_x_treatment_intensity',
    'age_bin',
    'age_bin_dri',
    'age_bin_ethnicity',
    'age_bin_favorable_flag',
    'age_bin_high_risk',
    'age_bin_high_risk_flag',
    'age_bin_pulm_severe',
    'age_bin_race',
    'age_bin_race_high_risk',
    'age_bin_risk_score',
    'age_bin_to_donor_age_bin',
    'age_comorbidity_fn',
    'age_comorbidity_interaction',
    'age_compatibility_fn',
    'age_depletion_risk',
    'age_dri_age_fn',
    'age_dri_fn',
    'age_karnofsky_fn',
    'age_quantile',
    'age_risk_group',
    'age_treatment_intensity_fn',
    'arrhythmia',
    'cardiac',
    'clinical_status_score',
    'clinical_status_score_intensity',
    'clinical_status_score_squared',
    'clinical_status_score_x_conditioning_intensity_numeric',
    'clinical_status_score_x_dri_quantile',
    'cmv_donor',
    'cmv_recipient',
    'cmv_status',
    'cmv_status_conditioning_intensity',
    'cmv_status_gvhd_proph',
    'combined_risk_score',
    'comorbidity_age',
    'comorbidity_compatibility_fn',
    'comorbidity_score_by_age_at_hct',
    'comorbidity_dri_age_fn',
    'comorbidity_dri_comorbidity_fn',
    'comorbidity_dri_numeric_fn',
    'comorbidity_karnofsky_fn',
    'comorbidity_quantile',
    'comorbidity_score',
    'comorbidity_score_squared',
    'comorbidity_score_x_clinical_status_score',
    'comorbidity_score_x_compatibility_score',
    'comorbidity_score_x_conditioning_intensity_numeric',
    'comorbidity_score_x_disease_control_factor',
    'comorbidity_score_x_donor_age',
    'comorbidity_score_x_dri_age',
    'comorbidity_score_x_dri_comorbidity',
    'comorbidity_score_x_dri_numeric',
    'comorbidity_score_x_dri_quantile',
    'comorbidity_score_x_karnofsky_score',
    'comorbidity_score_x_treatment_intensity',
    'comorbidity_treatment_intensity_fn',
    'compatibility_score',
    'compatibility_dri_age_fn',
    'compatibility_dri_comorbidity_fn',
    'compatibility_score_squared',
    'compatibility_score_x_clinical_status_score',
    'compatibility_score_x_conditioning_intensity_numeric',
    'compatibility_score_x_disease_control_factor',
    'compatibility_score_x_dri_age',
    'compatibility_score_x_dri_comorbidity',
    'compatibility_score_x_dri_quantile',
    'compatibility_score_x_treatment_intensity',
    'compatibility_treatment_intensity_fn',
    'condition_severity',
    'conditioning_group',
    'conditioning_intensity',
    'conditioning_intensity_age_bin',
    'conditioning_intensity_cyto',
    'conditioning_intensity_donor_age_bin',
    'conditioning_intensity_gvhd_proph',
    'conditioning_intensity_karnofsky',
    'conditioning_intensity_mrd_hct',
    'conditioning_intensity_numeric',
    'conditioning_intensity_numeric_squared',
    'conditioning_intensity_prim_disease',
    'conditioning_intensity_race_group',
    'conditioning_intensity_rituximab',
    'conditioning_intensity_tbi',
    'conditioning_intensity_year',
    'cyto_age',
    'cyto_score',
    'cyto_score_detail',
    'depletion_age_interaction',
    'depletion_comorbidity_interaction',
    'depletion_risk_interaction',
    'diabetes',
    'disease_control_factor',
    'disease_control_factor_squared',
    'disease_control_factor_x_clinical_status_score',
    'disease_control_factor_x_conditioning_intensity_numeric',
    'disease_control_factor_x_dri_quantile',
    'disease_status_group',
    'disease_status_risk_score',
    'donor_by_age_at_hct',
    'donor_age',
    'donor_age_bin',
    'donor_age_bin_related',
    'donor_related',
    'donor_age_squared',
    'donor_age_x_clinical_status_score',
    'donor_age_x_compatibility_score',
    'donor_age_x_conditioning_intensity_numeric',
    'donor_age_x_disease_control_factor',
    'donor_age_x_dri_age',
    'donor_age_x_dri_comorbidity',
    'donor_age_x_dri_numeric',
    'donor_age_x_dri_quantile',
    'donor_age_x_treatment_intensity',
    'dqb1_low_interaction',
    'drb1_dqb1_diff',
    'drb1_dqb1_ratio',
    'drb1_high_interaction',
    'dri_age',
    'dri_age_squared',
    'dri_age_x_clinical_status_score',
    'dri_age_x_conditioning_intensity_numeric',
    'dri_age_x_disease_control_factor',
    'dri_age_x_dri_comorbidity',
    'dri_age_x_dri_quantile',
    'dri_comorbidity',
    'dri_comorbidity_fn',
    'dri_comorbidity_squared',
    'dri_comorbidity_x_clinical_status_score',
    'dri_comorbidity_x_conditioning_intensity_numeric',
    'dri_comorbidity_x_disease_control_factor',
    'dri_comorbidity_x_dri_quantile',
    'dri_disease_status',
    'dri_ethnicity',
    'dri_karnofsky',
    'dri_num_comorbidity_fn',
    'dri_num_compatibility_fn',
    'dri_numeric',
    'dri_numeric_squared',
    'dri_numeric_x_clinical_status_score',
    'dri_numeric_x_compatibility_score',
    'dri_numeric_x_conditioning_intensity_numeric',
    'dri_numeric_x_disease_control_factor',
    'dri_numeric_x_dri_age',
    'dri_numeric_x_dri_comorbidity',
    'dri_numeric_x_dri_quantile',
    'dri_numeric_x_treatment_intensity',
    'dri_quantile',
    'dri_quantile_squared',
    'dri_quantile_x_conditioning_intensity_numeric',
    'dri_score',
    'dri_score_grouped',
    'dri_score_is_2',
    'ethnicity',
    'ethnicity_vivo',
    'FK_MMF_interaction',
    'graft_cmv_status',
    'graft_dri',
    'graft_dri_score_is_2',
    'graft_prim_disease',
    'graft_prod',
    'graft_risk_interaction',
    'graft_type',
    'gvhd_proph',
    'has_CSA',
    'has_FK',
    'has_MMF',
    'has_MTX',
    'has_combination',
    'has_cyclophosphamide',
    'hepatic_mild',
    'hepatic_severe',
    'high_low_diff',
    'high_risk_combination',
    'hla_high_low_ratio',
    'hla_high_res_10',
    'hla_high_res_6',
    'hla_high_res_8',
    'hla_high_res_mean',
    'hla_low_res_10',
    'hla_low_res_6',
    'hla_low_res_8',
    'hla_low_res_mean',
    'hla_match_a_high',
    'hla_match_a_low',
    'hla_match_b_high',
    'hla_match_b_low',
    'hla_match_c_high',
    'hla_match_c_low',
    'hla_match_dqb1_high',
    'hla_match_dqb1_low',
    'hla_match_dqb1_mean',
    'hla_match_drb1_high',
    'hla_match_drb1_low',
    'hla_match_drb1_mean',
    'hla_match_total',
    'hla_match_weighted',
    'hla_max',
    'hla_mean',
    'hla_min',
    'hla_mismatch_score',
    'hla_nmdp_6',
    'hla_ratio_res_highlow',
    'hla_ratio_res_lowhigh',
    'hla_std',
    'immune_burden',
    'in_vivo_tcd',
    'is_complex',
    'is_depletion_based',
    'is_experimental',
    'is_high_risk',
    'is_monotherapy',
    'is_standard_approach',
    'is_standard_risk',
    'karnofsky_above_80',
    'karnofsky_age_at_hct',
    'karnofsky_below_70',
    'karnofsky_category',
    'karnofsky_compatibility_fn',
    'karnofsky_deviation',
    'karnofsky_dri_age_fn',
    'karnofsky_dri_comorbidity_fn',
    'karnofsky_dri_fn',
    'karnofsky_hla_interaction',
    'karnofsky_risk_adjusted',
    'karnofsky_score',
    'karnofsky_score_squared',
    'karnofsky_score_x_clinical_status_score',
    'karnofsky_score_x_compatibility_score',
    'karnofsky_score_x_conditioning_intensity_numeric',
    'karnofsky_score_x_disease_control_factor',
    'karnofsky_score_x_donor_age',
    'karnofsky_score_x_dri_age',
    'karnofsky_score_x_dri_comorbidity',
    'karnofsky_score_x_dri_numeric',
    'karnofsky_score_x_dri_quantile',
    'karnofsky_score_x_treatment_intensity',
    'karnofsky_standardized',
    'karnofsky_treatment_intensity_fn',
    'karnofsky_weighted_performance',
    'melphalan_dose',
    'mrd_hct',
    'multiple_conditions',
    'n_agents',
    'no_prophylaxis',
    'obesity',
    'optimal_conditioning',
    'optimal_disease_status',
    'peptic_ulcer',
    'pre_post_2015',
    'prim_disease_hct',
    'primary_agent',
    'prior_tumor',
    'prod_type',
    'psych_disturb',
    'pulm_moderate',
    'pulm_severe',
    'race_group',
    'renal_issue',
    'rheum_issue',
    'risk_depletion_category',
    'risk_score_dri_weighted',
    'risk_score_equal',
    'rituximab',
    'rituximab_given_gvhd',
    'sex_donor',
    'sex_match',
    'sex_recipient',
    'tbi_status',
    'tce_div_match',
    'tce_imm_match',
    'tce_match',
    'transplant_era',
    'treatment_intensity',
    'treatment_intensity_dri_age_fn',
    'treatment_intensity_dri_comorbidity_fn',
    'treatment_intensity_squared',
    'treatment_intensity_x_clinical_status_score',
    'treatment_intensity_x_conditioning_intensity_numeric',
    'treatment_intensity_x_disease_control_factor',
    'treatment_intensity_x_dri_age',
    'treatment_intensity_x_dri_comorbidity',
    'treatment_intensity_x_dri_quantile',
    'vent_hist',
    'vivo_age_bin',
    'vivo_comorbidity',
    'vivo_prim_disease',
    'weighted_risk_score',
    'with_tbi',
    'year_hct',
    'years_since_2008'
]

# Categorical feature names: the FEATURES columns of `train` whose dtype is object or
# category
CATS = train[FEATURES].select_dtypes(include=['object', 'category']).columns.tolist()

In [349]:
class XGBoostModel:
    """K-fold XGBoost regressor trained on the ``y_na`` target column.

    Hyperparameters are either tuned with Optuna on the first fold, reloaded
    from ``best_params_path``, or taken from a built-in default set. One model
    is then fitted per fold; out-of-fold predictions are scored with the
    imported ``score`` metric and test predictions are averaged across folds.
    """

    def __init__(
        self,
        n_folds: int = 10,
        random_state: int = 42,
        n_trials: int = 50,
        best_params_path: str = 'best_xgb_na_params.json'
    ):
        """
        Args:
            n_folds: Number of KFold splits.
            random_state: Seed for the KFold shuffle and the Optuna sampler.
            n_trials: Number of Optuna trials (only used when tuning).
            best_params_path: JSON file used to save / reload tuned parameters.
        """
        self.n_folds = n_folds
        self.random_state = random_state
        self.n_trials = n_trials
        self.best_params_path = best_params_path
        self.best_params = None

    def objective(self, trial: optuna.Trial, train_data: pd.DataFrame, valid_data: pd.DataFrame,
                 features: list) -> float:
        """Optuna objective: fit one model on ``train_data`` and score it on ``valid_data``.

        Args:
            trial: Optuna trial used to sample the hyperparameters.
            train_data: Training split; must contain ``features`` and ``y_na``.
            valid_data: Validation split; must contain ``features``, ``y_na`` and
                the ``ID``, ``efs``, ``efs_time``, ``race_group`` columns.
            features: Feature column names fed to the model.

        Returns:
            The value of ``score`` on the validation split (the study maximizes it).
        """
        param = {
            'max_depth': trial.suggest_int('max_depth', 4, 7),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.08, 0.2),
            'subsample': trial.suggest_float('subsample', 0.6, 0.9),
            'n_estimators': trial.suggest_int('n_estimators', 600, 1000),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
            'min_child_weight': trial.suggest_int('min_child_weight', 80, 140),
            'device': 'cuda',
            'enable_categorical': True
        }

        model = XGBRegressor(**param)
        # eval_set is passed for monitoring only; no early_stopping_rounds is set here.
        model.fit(
            train_data[features], train_data['y_na'],
            eval_set=[(valid_data[features], valid_data['y_na'])],
            verbose=0
        )

        predictions = model.predict(valid_data[features])

        # Build the two frames expected by score(): ground truth and predictions keyed by ID
        y_true = valid_data[["ID", "efs", "efs_time", "race_group"]].copy()
        y_pred = valid_data[["ID"]].copy()
        y_pred["prediction"] = predictions

        fold_score = score(y_true, y_pred, "ID")
        return fold_score

    def save_best_params(self, params: Dict) -> None:
        """Write ``params`` as JSON to ``self.best_params_path``."""
        with open(self.best_params_path, 'w') as f:
            json.dump(params, f)

    def load_best_params(self) -> Optional[Dict]:
        """Return the parameters stored at ``self.best_params_path``, or None if the file is absent."""
        if os.path.exists(self.best_params_path):
            with open(self.best_params_path, 'r') as f:
                return json.load(f)
        return None

    def train_and_predict(
        self,
        train: pd.DataFrame,
        test: pd.DataFrame,
        features: list,
        tune_hyperparameters: bool = False
    ) -> Tuple[np.ndarray, np.ndarray, float, XGBRegressor]:
        """
        Train one model per fold on ``y_na`` and predict, with optional tuning.

        Args:
            train: Training DataFrame (needs ``features``, ``y_na``, ``ID``,
                ``efs``, ``efs_time`` and ``race_group``).
            test: Test DataFrame (needs ``features``).
            features: List of feature columns.
            tune_hyperparameters: If True, run an Optuna search on the first fold
                and save the result; otherwise reload saved parameters or fall
                back to the built-in defaults.

        Returns:
            Tuple containing:
            - pred_xgb_na: Test predictions averaged over all folds
            - oof_xgb_na: Out-of-fold predictions for the training set
            - xgb_na_score: ``score`` of the out-of-fold predictions
            - best_model: The model fitted on the first fold (not selected by score)
        """
        kf = KFold(n_splits=self.n_folds, shuffle=True, random_state=self.random_state)
        oof_xgb_na = np.zeros(len(train))
        pred_xgb_na = np.zeros(len(test))
        best_model = None

        if tune_hyperparameters:
            # Tuning uses the first fold only
            print("Starting hyperparameter tuning...")
            train_idx, valid_idx = next(kf.split(train))

            train_fold = train.iloc[train_idx].copy()
            valid_fold = train.iloc[valid_idx].copy()

            # Seed the sampler so that the sequence of suggested trials is repeatable
            study = optuna.create_study(
                direction='maximize',
                sampler=optuna.samplers.TPESampler(seed=self.random_state)
            )
            study.optimize(
                lambda trial: self.objective(trial, train_fold, valid_fold, features),
                n_trials=self.n_trials
            )

            self.best_params = study.best_params
            self.save_best_params(self.best_params)
            print(f"Best parameters: {self.best_params}")
        else:
            self.best_params = self.load_best_params()
            if self.best_params is None:
                print("No saved parameters found. Using default parameters.")
                self.best_params = {
                    'max_depth': 7,
                    'colsample_bytree': 0.1095622657488886,
                    'subsample': 0.8706654993908751,
                    'n_estimators': 817,
                    'learning_rate': 0.021020104462379174,
                    'min_child_weight': 99,
                }

        # Fixed settings are layered on top of the tuned / loaded values (same for every fold)
        model_params = {
            **self.best_params,
            'device': 'cuda',
            'enable_categorical': True
        }
        x_test = test[features]

        for i, (train_idx, valid_idx) in enumerate(kf.split(train)):
            print(f"Training fold {i+1}/{self.n_folds}")

            # Train on the same target the objective tunes against (y_na);
            # the previous code fitted the fold models on y_km instead.
            x_train = train.iloc[train_idx][features]
            y_train = train.iloc[train_idx]['y_na']
            x_valid = train.iloc[valid_idx][features]
            y_valid = train.iloc[valid_idx]['y_na']

            model = XGBRegressor(**model_params)
            model.fit(
                x_train, y_train,
                eval_set=[(x_valid, y_valid)],
                verbose=500
            )

            # Keep the first-fold model as the one handed back to the caller
            if i == 0:
                best_model = model

            oof_xgb_na[valid_idx] = model.predict(x_valid)
            pred_xgb_na += model.predict(x_test)

        pred_xgb_na /= self.n_folds

        # Score the full set of out-of-fold predictions
        y_true = train[["ID", "efs", "efs_time", "race_group"]].copy()
        y_pred = train[["ID"]].copy()
        y_pred["prediction"] = oof_xgb_na
        xgb_na_score = score(y_true.copy(), y_pred.copy(), "ID")
        print(f"\nOverall CV Score: {xgb_na_score}")

        return pred_xgb_na, oof_xgb_na, xgb_na_score, best_model

In [None]:
# Build the Nelson-Aalen XGBoost wrapper (n_trials is only consumed when tuning is enabled)
xgb_na_model = XGBoostModel(n_trials=400, n_folds=10)

# Cross-validated training with saved/default parameters;
# pass tune_hyperparameters=True to run the Optuna search first
pred_xgb_na, oof_xgb_na, xgb_na_score, model_xgb_na = xgb_na_model.train_and_predict(
    train=train,
    test=test,
    features=FEATURES,
    tune_hyperparameters=False,
)

# Persist the returned (first-fold) model, tagging the file name with the CV score
model_xgb_na.save_model(f'{output_path}/xgboost_na_model_{xgb_na_score}.json')

In [None]:
# Rank the features by the importances reported by the fitted XGBoost model
feature_importance = model_xgb_na.feature_importances_
importance_df = pd.DataFrame(
    {"Feature": FEATURES, "Importance": feature_importance}
).sort_values("Importance", ascending=False)

# Bundle the CV score together with the ranked importances
importance_dict = dict(
    score=xgb_na_score,
    feature_importance=importance_df.to_dict(orient='records'),
)

# Export the bundle as JSON
with open(f'{output_path}/xgboost_na_feature_importance_{xgb_na_score}.json', 'w') as f:
    json.dump(importance_dict, f, indent=4)

# Horizontal bar chart; the y-axis is inverted so the top-ranked feature is drawn first
fig, ax = plt.subplots(figsize=(22, 30))
ax.barh(importance_df["Feature"], importance_df["Importance"])
ax.set_xlabel("Importance")
ax.set_ylabel("Feature")
ax.set_title("XGBoost Nelson Aalen Feature Importance")
ax.invert_yaxis()
fig.tight_layout()
fig.savefig(f'{output_path}/xgboost_na_feature_importance_{xgb_na_score}.png')
plt.show()

# CatBoost with Nelson-Aalen

In [352]:
# CatBoost with Nelson Aalen
# feature selection
FEATURES = [
    'age_at_hct',
    'age_at_hct_squared',
    'age_at_hct_x_clinical_status_score',
    'age_at_hct_x_comorbidity_score',
    'age_at_hct_x_compatibility_score',
    'age_at_hct_x_conditioning_intensity_numeric',
    'age_at_hct_x_disease_control_factor',
    'age_at_hct_x_donor_age',
    'age_at_hct_x_dri_age',
    'age_at_hct_x_dri_comorbidity',
    'age_at_hct_x_dri_numeric',
    'age_at_hct_x_dri_quantile',
    'age_at_hct_x_karnofsky_score',
    'age_at_hct_x_treatment_intensity',
    'age_bin',
    'age_bin_dri',
    'age_bin_ethnicity',
    'age_bin_favorable_flag',
    'age_bin_high_risk',
    'age_bin_high_risk_flag',
    'age_bin_pulm_severe',
    'age_bin_race',
    'age_bin_race_high_risk',
    'age_bin_risk_score',
    'age_bin_to_donor_age_bin',
    'age_comorbidity_fn',
    'age_comorbidity_interaction',
    'age_compatibility_fn',
    'age_depletion_risk',
    'age_dri_age_fn',
    'age_dri_fn',
    'age_karnofsky_fn',
    'age_quantile',
    'age_risk_group',
    'age_treatment_intensity_fn',
    'arrhythmia',
    'cardiac',
    'clinical_status_score',
    'clinical_status_score_intensity',
    'clinical_status_score_squared',
    'clinical_status_score_x_conditioning_intensity_numeric',
    'clinical_status_score_x_dri_quantile',
    'cmv_donor',
    'cmv_recipient',
    'cmv_status',
    'cmv_status_conditioning_intensity',
    'cmv_status_gvhd_proph',
    'combined_risk_score',
    'comorbidity_age',
    'comorbidity_compatibility_fn',
    'comorbidity_score_by_age_at_hct',
    'comorbidity_dri_age_fn',
    'comorbidity_dri_comorbidity_fn',
    'comorbidity_dri_numeric_fn',
    'comorbidity_karnofsky_fn',
    'comorbidity_quantile',
    'comorbidity_score',
    'comorbidity_score_squared',
    'comorbidity_score_x_clinical_status_score',
    'comorbidity_score_x_compatibility_score',
    'comorbidity_score_x_conditioning_intensity_numeric',
    'comorbidity_score_x_disease_control_factor',
    'comorbidity_score_x_donor_age',
    'comorbidity_score_x_dri_age',
    'comorbidity_score_x_dri_comorbidity',
    'comorbidity_score_x_dri_numeric',
    'comorbidity_score_x_dri_quantile',
    'comorbidity_score_x_karnofsky_score',
    'comorbidity_score_x_treatment_intensity',
    'comorbidity_treatment_intensity_fn',
    'compatibility_score',
    'compatibility_dri_age_fn',
    'compatibility_dri_comorbidity_fn',
    'compatibility_score_squared',
    'compatibility_score_x_clinical_status_score',
    'compatibility_score_x_conditioning_intensity_numeric',
    'compatibility_score_x_disease_control_factor',
    'compatibility_score_x_dri_age',
    'compatibility_score_x_dri_comorbidity',
    'compatibility_score_x_dri_quantile',
    'compatibility_score_x_treatment_intensity',
    'compatibility_treatment_intensity_fn',
    'condition_severity',
    'conditioning_group',
    'conditioning_intensity',
    'conditioning_intensity_age_bin',
    'conditioning_intensity_cyto',
    'conditioning_intensity_donor_age_bin',
    'conditioning_intensity_gvhd_proph',
    'conditioning_intensity_karnofsky',
    'conditioning_intensity_mrd_hct',
    'conditioning_intensity_numeric',
    'conditioning_intensity_numeric_squared',
    'conditioning_intensity_prim_disease',
    'conditioning_intensity_race_group',
    'conditioning_intensity_rituximab',
    'conditioning_intensity_tbi',
    'conditioning_intensity_year',
    'cyto_age',
    'cyto_score',
    'cyto_score_detail',
    'depletion_age_interaction',
    'depletion_comorbidity_interaction',
    'depletion_risk_interaction',
    'diabetes',
    'disease_control_factor',
    'disease_control_factor_squared',
    'disease_control_factor_x_clinical_status_score',
    'disease_control_factor_x_conditioning_intensity_numeric',
    'disease_control_factor_x_dri_quantile',
    'disease_status_group',
    'disease_status_risk_score',
    'donor_by_age_at_hct',
    'donor_age',
    'donor_age_bin',
    'donor_age_bin_related',
    'donor_related',
    'donor_age_squared',
    'donor_age_x_clinical_status_score',
    'donor_age_x_compatibility_score',
    'donor_age_x_conditioning_intensity_numeric',
    'donor_age_x_disease_control_factor',
    'donor_age_x_dri_age',
    'donor_age_x_dri_comorbidity',
    'donor_age_x_dri_numeric',
    'donor_age_x_dri_quantile',
    'donor_age_x_treatment_intensity',
    'dqb1_low_interaction',
    'drb1_dqb1_diff',
    'drb1_dqb1_ratio',
    'drb1_high_interaction',
    'dri_age',
    'dri_age_squared',
    'dri_age_x_clinical_status_score',
    'dri_age_x_conditioning_intensity_numeric',
    'dri_age_x_disease_control_factor',
    'dri_age_x_dri_comorbidity',
    'dri_age_x_dri_quantile',
    'dri_comorbidity',
    'dri_comorbidity_fn',
    'dri_comorbidity_squared',
    'dri_comorbidity_x_clinical_status_score',
    'dri_comorbidity_x_conditioning_intensity_numeric',
    'dri_comorbidity_x_disease_control_factor',
    'dri_comorbidity_x_dri_quantile',
    'dri_disease_status',
    'dri_ethnicity',
    'dri_karnofsky',
    'dri_num_comorbidity_fn',
    'dri_num_compatibility_fn',
    'dri_numeric',
    'dri_numeric_squared',
    'dri_numeric_x_clinical_status_score',
    'dri_numeric_x_compatibility_score',
    'dri_numeric_x_conditioning_intensity_numeric',
    'dri_numeric_x_disease_control_factor',
    'dri_numeric_x_dri_age',
    'dri_numeric_x_dri_comorbidity',
    'dri_numeric_x_dri_quantile',
    'dri_numeric_x_treatment_intensity',
    'dri_quantile',
    'dri_quantile_squared',
    'dri_quantile_x_conditioning_intensity_numeric',
    'dri_score',
    'dri_score_grouped',
    'dri_score_is_2',
    'ethnicity',
    'ethnicity_vivo',
    'FK_MMF_interaction',
    'graft_cmv_status',
    'graft_dri',
    'graft_dri_score_is_2',
    'graft_prim_disease',
    'graft_prod',
    'graft_risk_interaction',
    'graft_type',
    'gvhd_proph',
    'has_CSA',
    'has_FK',
    'has_MMF',
    'has_MTX',
    'has_combination',
    'has_cyclophosphamide',
    'hepatic_mild',
    'hepatic_severe',
    'high_low_diff',
    'high_risk_combination',
    'hla_high_low_ratio',
    'hla_high_res_10',
    'hla_high_res_6',
    'hla_high_res_8',
    'hla_high_res_mean',
    'hla_low_res_10',
    'hla_low_res_6',
    'hla_low_res_8',
    'hla_low_res_mean',
    'hla_match_a_high',
    'hla_match_a_low',
    'hla_match_b_high',
    'hla_match_b_low',
    'hla_match_c_high',
    'hla_match_c_low',
    'hla_match_dqb1_high',
    'hla_match_dqb1_low',
    'hla_match_dqb1_mean',
    'hla_match_drb1_high',
    'hla_match_drb1_low',
    'hla_match_drb1_mean',
    'hla_match_total',
    'hla_match_weighted',
    'hla_max',
    'hla_mean',
    'hla_min',
    'hla_mismatch_score',
    'hla_nmdp_6',
    'hla_ratio_res_highlow',
    'hla_ratio_res_lowhigh',
    'hla_std',
    'immune_burden',
    'in_vivo_tcd',
    'is_complex',
    'is_depletion_based',
    'is_experimental',
    'is_high_risk',
    'is_monotherapy',
    'is_standard_approach',
    'is_standard_risk',
    'karnofsky_above_80',
    'karnofsky_age_at_hct',
    'karnofsky_below_70',
    'karnofsky_category',
    'karnofsky_compatibility_fn',
    'karnofsky_deviation',
    'karnofsky_dri_age_fn',
    'karnofsky_dri_comorbidity_fn',
    'karnofsky_dri_fn',
    'karnofsky_hla_interaction',
    'karnofsky_risk_adjusted',
    'karnofsky_score',
    'karnofsky_score_squared',
    'karnofsky_score_x_clinical_status_score',
    'karnofsky_score_x_compatibility_score',
    'karnofsky_score_x_conditioning_intensity_numeric',
    'karnofsky_score_x_disease_control_factor',
    'karnofsky_score_x_donor_age',
    'karnofsky_score_x_dri_age',
    'karnofsky_score_x_dri_comorbidity',
    'karnofsky_score_x_dri_numeric',
    'karnofsky_score_x_dri_quantile',
    'karnofsky_score_x_treatment_intensity',
    'karnofsky_standardized',
    'karnofsky_treatment_intensity_fn',
    'karnofsky_weighted_performance',
    'melphalan_dose',
    'mrd_hct',
    'multiple_conditions',
    'n_agents',
    'no_prophylaxis',
    'obesity',
    'optimal_conditioning',
    'optimal_disease_status',
    'peptic_ulcer',
    'pre_post_2015',
    'prim_disease_hct',
    'primary_agent',
    'prior_tumor',
    'prod_type',
    'psych_disturb',
    'pulm_moderate',
    'pulm_severe',
    'race_group',
    'renal_issue',
    'rheum_issue',
    'risk_depletion_category',
    'risk_score_dri_weighted',
    'risk_score_equal',
    'rituximab',
    'rituximab_given_gvhd',
    'sex_donor',
    'sex_match',
    'sex_recipient',
    'tbi_status',
    'tce_div_match',
    'tce_imm_match',
    'tce_match',
    'transplant_era',
    'treatment_intensity',
    'treatment_intensity_dri_age_fn',
    'treatment_intensity_dri_comorbidity_fn',
    'treatment_intensity_squared',
    'treatment_intensity_x_clinical_status_score',
    'treatment_intensity_x_conditioning_intensity_numeric',
    'treatment_intensity_x_disease_control_factor',
    'treatment_intensity_x_dri_age',
    'treatment_intensity_x_dri_comorbidity',
    'treatment_intensity_x_dri_quantile',
    'vent_hist',
    'vivo_age_bin',
    'vivo_comorbidity',
    'vivo_prim_disease',
    'weighted_risk_score',
    'with_tbi',
    'year_hct',
    'years_since_2008'
]

# Names of the selected feature columns whose dtype in train is object or category
CATS = list(train[FEATURES].select_dtypes(include=['object', 'category']).columns)

In [353]:
class CatBoostModel:
    """K-fold CatBoost regressor trained on the ``y_na`` target column.

    Hyperparameters are either tuned with Optuna on the first fold, reloaded
    from ``best_params_path``, or taken from a built-in default set. One model
    is then fitted per fold; out-of-fold predictions are scored with the
    imported ``score`` metric and test predictions are averaged across folds.
    """

    def __init__(
        self,
        n_folds: int = 10,
        random_state: int = 42,
        n_trials: int = 50,
        best_params_path: str = 'best_catboost_na_params.json'
    ):
        """
        Args:
            n_folds: Number of KFold splits.
            random_state: Seed for the KFold shuffle and the Optuna sampler.
            n_trials: Number of Optuna trials (only used when tuning).
            best_params_path: JSON file used to save / reload tuned parameters.
        """
        self.n_folds = n_folds
        self.random_state = random_state
        self.n_trials = n_trials
        self.best_params_path = best_params_path
        self.best_params = None

    def objective(self, trial: optuna.Trial, train_data: pd.DataFrame, valid_data: pd.DataFrame,
                 features: list, cat_features: List[str]) -> float:
        """Optuna objective: fit one model on ``train_data`` and score it on ``valid_data``.

        Args:
            trial: Optuna trial used to sample the hyperparameters.
            train_data: Training split; must contain ``features`` and ``y_na``.
            valid_data: Validation split; must contain ``features``, ``y_na`` and
                the ``ID``, ``efs``, ``efs_time``, ``race_group`` columns.
            features: Feature column names fed to the model.
            cat_features: Subset of ``features`` to be treated as categorical.

        Returns:
            The value of ``score`` on the validation split (the study maximizes it).
        """
        param = {
            'task_type': 'GPU',
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.05, log=True),
            'grow_policy': trial.suggest_categorical('grow_policy', ['Lossguide']),
            'depth': trial.suggest_int('depth', 2, 8),
            'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1, 30),
            'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1.0, 10.0),
            'random_strength': trial.suggest_float('random_strength', 0.1, 10.0),
            'n_estimators': trial.suggest_int('n_estimators', 600, 2000),
            'bootstrap_type': trial.suggest_categorical('bootstrap_type', ['Bernoulli']),
            'subsample': trial.suggest_float('subsample', 0.6, 0.9),
            'early_stopping_rounds': 50,
            'verbose': 0
        }

        # Only relevant if 'Bayesian' is added to the bootstrap_type choices above;
        # with the current single choice ('Bernoulli') this branch never runs.
        if param['bootstrap_type'] == 'Bayesian':
            param['bagging_temperature'] = trial.suggest_float('bagging_temperature', 0.0, 10.0)

        model = CatBoostRegressor(**param)
        model.fit(
            train_data[features], train_data['y_na'],
            eval_set=(valid_data[features], valid_data['y_na']),
            cat_features=cat_features,
            verbose=0
        )

        predictions = model.predict(valid_data[features])

        # Build the two frames expected by score(): ground truth and predictions keyed by ID
        y_true = valid_data[["ID", "efs", "efs_time", "race_group"]].copy()
        y_pred = valid_data[["ID"]].copy()
        y_pred["prediction"] = predictions

        fold_score = score(y_true, y_pred, "ID")
        return fold_score

    def save_best_params(self, params: Dict) -> None:
        """Write ``params`` as JSON to ``self.best_params_path``."""
        with open(self.best_params_path, 'w') as f:
            json.dump(params, f)

    def load_best_params(self) -> Optional[Dict]:
        """Return the parameters stored at ``self.best_params_path``, or None if the file is absent."""
        if os.path.exists(self.best_params_path):
            with open(self.best_params_path, 'r') as f:
                return json.load(f)
        return None

    def train_and_predict(
        self,
        train: pd.DataFrame,
        test: pd.DataFrame,
        features: list,
        cat_features: List[str],
        tune_hyperparameters: bool = False
    ) -> Tuple[np.ndarray, np.ndarray, float, CatBoostRegressor]:
        """
        Train one model per fold on ``y_na`` and predict, with optional tuning.

        Args:
            train: Training DataFrame (needs ``features``, ``y_na``, ``ID``,
                ``efs``, ``efs_time`` and ``race_group``).
            test: Test DataFrame (needs ``features``).
            features: List of feature columns.
            cat_features: List of categorical feature columns.
            tune_hyperparameters: If True, run an Optuna search on the first fold
                and save the result; otherwise reload saved parameters or fall
                back to the built-in defaults.

        Returns:
            Tuple containing:
            - pred_cat_na: Test predictions averaged over all folds
            - oof_cat_na: Out-of-fold predictions for the training set
            - cat_na_score: ``score`` of the out-of-fold predictions
            - best_model: The model fitted on the first fold (not selected by score)
        """
        kf = KFold(n_splits=self.n_folds, shuffle=True, random_state=self.random_state)
        oof_cat_na = np.zeros(len(train))
        pred_cat_na = np.zeros(len(test))
        best_model = None

        if tune_hyperparameters:
            # Tuning uses the first fold only
            print("Starting hyperparameter tuning...")
            train_idx, valid_idx = next(kf.split(train))

            train_fold = train.iloc[train_idx].copy()
            valid_fold = train.iloc[valid_idx].copy()

            # Seed the sampler so that the sequence of suggested trials is repeatable
            study = optuna.create_study(
                direction='maximize',
                sampler=optuna.samplers.TPESampler(seed=self.random_state)
            )
            study.optimize(
                lambda trial: self.objective(
                    trial, train_fold, valid_fold, features, cat_features
                ),
                n_trials=self.n_trials
            )

            self.best_params = study.best_params
            # Re-attach the fixed settings that are not part of the search space
            self.best_params.update({
                'task_type': 'GPU',
                'early_stopping_rounds': 50
            })
            self.save_best_params(self.best_params)
            print(f"Best parameters: {self.best_params}")
        else:
            self.best_params = self.load_best_params()
            if self.best_params is None:
                print("No saved parameters found. Using default parameters.")
                self.best_params = {
                    'task_type': 'GPU',
                    'learning_rate': 0.05491682916054387,
                    'grow_policy': 'Lossguide',
                    'depth': 6,
                    'min_data_in_leaf': 5,
                    'l2_leaf_reg': 1.9599483952785932,
                    'random_strength': 6.957242248187078,
                    'n_estimators': 1870,
                    'bootstrap_type': 'Bernoulli',
                    'subsample': 0.8646459088430101,
                    'early_stopping_rounds': 50
                }

        # The test matrix is identical for every fold, so slice it once
        x_test = test[features]

        for i, (train_idx, valid_idx) in enumerate(kf.split(train)):
            print(f"Training fold {i+1}/{self.n_folds}")

            x_train = train.iloc[train_idx][features]
            y_train = train.iloc[train_idx]['y_na']
            x_valid = train.iloc[valid_idx][features]
            y_valid = train.iloc[valid_idx]['y_na']

            model = CatBoostRegressor(**self.best_params)
            model.fit(
                x_train, y_train,
                eval_set=(x_valid, y_valid),
                cat_features=cat_features,
                verbose=250
            )

            # Keep the first-fold model as the one handed back to the caller
            if i == 0:
                best_model = model

            oof_cat_na[valid_idx] = model.predict(x_valid)
            pred_cat_na += model.predict(x_test)

        pred_cat_na /= self.n_folds

        # Score the full set of out-of-fold predictions
        y_true = train[["ID", "efs", "efs_time", "race_group"]].copy()
        y_pred = train[["ID"]].copy()
        y_pred["prediction"] = oof_cat_na
        cat_na_score = score(y_true.copy(), y_pred.copy(), "ID")
        print(f"\nOverall CV Score: {cat_na_score}")

        return pred_cat_na, oof_cat_na, cat_na_score, best_model

In [None]:
# Build the Nelson-Aalen CatBoost wrapper (n_trials is only consumed when tuning is enabled)
cat_na_model = CatBoostModel(n_folds=10, n_trials=400)

# Cross-validated training with saved/default parameters;
# pass tune_hyperparameters=True to run the Optuna search first
pred_cat_na, oof_cat_na, cat_na_score, model_cat_na = cat_na_model.train_and_predict(
    train, test, FEATURES, CATS, tune_hyperparameters=False
)

# Persist the returned (first-fold) model. No `format` argument is passed, so
# CatBoost writes its native binary format; use a .cbm extension so the file
# name matches its content instead of advertising JSON.
model_cat_na.save_model(f'{output_path}/catboost_na_model_{cat_na_score}.cbm')

In [None]:
# Rank the features by the importances reported by the fitted CatBoost model
feature_importance = model_cat_na.get_feature_importance()
importance_df = pd.DataFrame(
    {"Feature": FEATURES, "Importance": feature_importance}
).sort_values("Importance", ascending=False)

# Bundle the CV score together with the ranked importances
importance_dict = dict(
    score=cat_na_score,
    feature_importance=importance_df.to_dict(orient='records'),
)

# Export the bundle as JSON
with open(f'{output_path}/catboost_na_feature_importance_{cat_na_score}.json', 'w') as f:
    json.dump(importance_dict, f, indent=4)

# Horizontal bar chart; the y-axis is inverted so the top-ranked feature is drawn first
fig, ax = plt.subplots(figsize=(22, 30))
ax.barh(importance_df["Feature"], importance_df["Importance"])
ax.set_xlabel("Importance")
ax.set_ylabel("Feature")
ax.set_title("CatBoost Nelson Aalen Feature Importance")
ax.invert_yaxis()
fig.tight_layout()
fig.savefig(f'{output_path}/catboost_na_feature_importance_{cat_na_score}.png')
plt.show()

# LightGBM with Nelson-Aalen

In [None]:
# LightGBM with Nelson Aalen: report the installed library version
print(f"Using LightGBM version {lgb.__version__}")

In [357]:
# feature selection
FEATURES = [
    'age_at_hct',
    'age_at_hct_squared',
    'age_at_hct_x_clinical_status_score',
    'age_at_hct_x_comorbidity_score',
    'age_at_hct_x_compatibility_score',
    'age_at_hct_x_conditioning_intensity_numeric',
    'age_at_hct_x_disease_control_factor',
    'age_at_hct_x_donor_age',
    'age_at_hct_x_dri_age',
    'age_at_hct_x_dri_comorbidity',
    'age_at_hct_x_dri_numeric',
    'age_at_hct_x_dri_quantile',
    'age_at_hct_x_karnofsky_score',
    'age_at_hct_x_treatment_intensity',
    'age_bin',
    'age_bin_dri',
    'age_bin_ethnicity',
    'age_bin_favorable_flag',
    'age_bin_high_risk',
    'age_bin_high_risk_flag',
    'age_bin_pulm_severe',
    'age_bin_race',
    'age_bin_race_high_risk',
    'age_bin_risk_score',
    'age_bin_to_donor_age_bin',
    'age_comorbidity_fn',
    'age_comorbidity_interaction',
    'age_compatibility_fn',
    'age_depletion_risk',
    'age_dri_age_fn',
    'age_dri_fn',
    'age_karnofsky_fn',
    'age_quantile',
    'age_risk_group',
    'age_treatment_intensity_fn',
    'arrhythmia',
    'cardiac',
    'clinical_status_score',
    'clinical_status_score_intensity',
    'clinical_status_score_squared',
    'clinical_status_score_x_conditioning_intensity_numeric',
    'clinical_status_score_x_dri_quantile',
    'cmv_donor',
    'cmv_recipient',
    'cmv_status',
    'cmv_status_conditioning_intensity',
    'cmv_status_gvhd_proph',
    'combined_risk_score',
    'comorbidity_age',
    'comorbidity_compatibility_fn',
    'comorbidity_score_by_age_at_hct',
    'comorbidity_dri_age_fn',
    'comorbidity_dri_comorbidity_fn',
    'comorbidity_dri_numeric_fn',
    'comorbidity_karnofsky_fn',
    'comorbidity_quantile',
    'comorbidity_score',
    'comorbidity_score_squared',
    'comorbidity_score_x_clinical_status_score',
    'comorbidity_score_x_compatibility_score',
    'comorbidity_score_x_conditioning_intensity_numeric',
    'comorbidity_score_x_disease_control_factor',
    'comorbidity_score_x_donor_age',
    'comorbidity_score_x_dri_age',
    'comorbidity_score_x_dri_comorbidity',
    'comorbidity_score_x_dri_numeric',
    'comorbidity_score_x_dri_quantile',
    'comorbidity_score_x_karnofsky_score',
    'comorbidity_score_x_treatment_intensity',
    'comorbidity_treatment_intensity_fn',
    'compatibility_score',
    'compatibility_dri_age_fn',
    'compatibility_dri_comorbidity_fn',
    'compatibility_score_squared',
    'compatibility_score_x_clinical_status_score',
    'compatibility_score_x_conditioning_intensity_numeric',
    'compatibility_score_x_disease_control_factor',
    'compatibility_score_x_dri_age',
    'compatibility_score_x_dri_comorbidity',
    'compatibility_score_x_dri_quantile',
    'compatibility_score_x_treatment_intensity',
    'compatibility_treatment_intensity_fn',
    'condition_severity',
    'conditioning_group',
    'conditioning_intensity',
    'conditioning_intensity_age_bin',
    'conditioning_intensity_cyto',
    'conditioning_intensity_donor_age_bin',
    'conditioning_intensity_gvhd_proph',
    'conditioning_intensity_karnofsky',
    'conditioning_intensity_mrd_hct',
    'conditioning_intensity_numeric',
    'conditioning_intensity_numeric_squared',
    'conditioning_intensity_prim_disease',
    'conditioning_intensity_race_group',
    'conditioning_intensity_rituximab',
    'conditioning_intensity_tbi',
    'conditioning_intensity_year',
    'cyto_age',
    'cyto_score',
    'cyto_score_detail',
    'depletion_age_interaction',
    'depletion_comorbidity_interaction',
    'depletion_risk_interaction',
    'diabetes',
    'disease_control_factor',
    'disease_control_factor_squared',
    'disease_control_factor_x_clinical_status_score',
    'disease_control_factor_x_conditioning_intensity_numeric',
    'disease_control_factor_x_dri_quantile',
    'disease_status_group',
    'disease_status_risk_score',
    'donor_by_age_at_hct',
    'donor_age',
    'donor_age_bin',
    'donor_age_bin_related',
    'donor_related',
    'donor_age_squared',
    'donor_age_x_clinical_status_score',
    'donor_age_x_compatibility_score',
    'donor_age_x_conditioning_intensity_numeric',
    'donor_age_x_disease_control_factor',
    'donor_age_x_dri_age',
    'donor_age_x_dri_comorbidity',
    'donor_age_x_dri_numeric',
    'donor_age_x_dri_quantile',
    'donor_age_x_treatment_intensity',
    'dqb1_low_interaction',
    'drb1_dqb1_diff',
    'drb1_dqb1_ratio',
    'drb1_high_interaction',
    'dri_age',
    'dri_age_squared',
    'dri_age_x_clinical_status_score',
    'dri_age_x_conditioning_intensity_numeric',
    'dri_age_x_disease_control_factor',
    'dri_age_x_dri_comorbidity',
    'dri_age_x_dri_quantile',
    'dri_comorbidity',
    'dri_comorbidity_fn',
    'dri_comorbidity_squared',
    'dri_comorbidity_x_clinical_status_score',
    'dri_comorbidity_x_conditioning_intensity_numeric',
    'dri_comorbidity_x_disease_control_factor',
    'dri_comorbidity_x_dri_quantile',
    'dri_disease_status',
    'dri_ethnicity',
    'dri_karnofsky',
    'dri_num_comorbidity_fn',
    'dri_num_compatibility_fn',
    'dri_numeric',
    'dri_numeric_squared',
    'dri_numeric_x_clinical_status_score',
    'dri_numeric_x_compatibility_score',
    'dri_numeric_x_conditioning_intensity_numeric',
    'dri_numeric_x_disease_control_factor',
    'dri_numeric_x_dri_age',
    'dri_numeric_x_dri_comorbidity',
    'dri_numeric_x_dri_quantile',
    'dri_numeric_x_treatment_intensity',
    'dri_quantile',
    'dri_quantile_squared',
    'dri_quantile_x_conditioning_intensity_numeric',
    'dri_score',
    'dri_score_grouped',
    'dri_score_is_2',
    'ethnicity',
    'ethnicity_vivo',
    'FK_MMF_interaction',
    'graft_cmv_status',
    'graft_dri',
    'graft_dri_score_is_2',
    'graft_prim_disease',
    'graft_prod',
    'graft_risk_interaction',
    'graft_type',
    'gvhd_proph',
    'has_CSA',
    'has_FK',
    'has_MMF',
    'has_MTX',
    'has_combination',
    'has_cyclophosphamide',
    'hepatic_mild',
    'hepatic_severe',
    'high_low_diff',
    'high_risk_combination',
    'hla_high_low_ratio',
    'hla_high_res_10',
    'hla_high_res_6',
    'hla_high_res_8',
    'hla_high_res_mean',
    'hla_low_res_10',
    'hla_low_res_6',
    'hla_low_res_8',
    'hla_low_res_mean',
    'hla_match_a_high',
    'hla_match_a_low',
    'hla_match_b_high',
    'hla_match_b_low',
    'hla_match_c_high',
    'hla_match_c_low',
    'hla_match_dqb1_high',
    'hla_match_dqb1_low',
    'hla_match_dqb1_mean',
    'hla_match_drb1_high',
    'hla_match_drb1_low',
    'hla_match_drb1_mean',
    'hla_match_total',
    'hla_match_weighted',
    'hla_max',
    'hla_mean',
    'hla_min',
    'hla_mismatch_score',
    'hla_nmdp_6',
    'hla_ratio_res_highlow',
    'hla_ratio_res_lowhigh',
    'hla_std',
    'immune_burden',
    'in_vivo_tcd',
    'is_complex',
    'is_depletion_based',
    'is_experimental',
    'is_high_risk',
    'is_monotherapy',
    'is_standard_approach',
    'is_standard_risk',
    'karnofsky_above_80',
    'karnofsky_age_at_hct',
    'karnofsky_below_70',
    'karnofsky_category',
    'karnofsky_compatibility_fn',
    'karnofsky_deviation',
    'karnofsky_dri_age_fn',
    'karnofsky_dri_comorbidity_fn',
    'karnofsky_dri_fn',
    'karnofsky_hla_interaction',
    'karnofsky_risk_adjusted',
    'karnofsky_score',
    'karnofsky_score_squared',
    'karnofsky_score_x_clinical_status_score',
    'karnofsky_score_x_compatibility_score',
    'karnofsky_score_x_conditioning_intensity_numeric',
    'karnofsky_score_x_disease_control_factor',
    'karnofsky_score_x_donor_age',
    'karnofsky_score_x_dri_age',
    'karnofsky_score_x_dri_comorbidity',
    'karnofsky_score_x_dri_numeric',
    'karnofsky_score_x_dri_quantile',
    'karnofsky_score_x_treatment_intensity',
    'karnofsky_standardized',
    'karnofsky_treatment_intensity_fn',
    'karnofsky_weighted_performance',
    'melphalan_dose',
    'mrd_hct',
    'multiple_conditions',
    'n_agents',
    'no_prophylaxis',
    'obesity',
    'optimal_conditioning',
    'optimal_disease_status',
    'peptic_ulcer',
    'pre_post_2015',
    'prim_disease_hct',
    'primary_agent',
    'prior_tumor',
    'prod_type',
    'psych_disturb',
    'pulm_moderate',
    'pulm_severe',
    'race_group',
    'renal_issue',
    'rheum_issue',
    'risk_depletion_category',
    'risk_score_dri_weighted',
    'risk_score_equal',
    'rituximab',
    'rituximab_given_gvhd',
    'sex_donor',
    'sex_match',
    'sex_recipient',
    'tbi_status',
    'tce_div_match',
    'tce_imm_match',
    'tce_match',
    'transplant_era',
    'treatment_intensity',
    'treatment_intensity_dri_age_fn',
    'treatment_intensity_dri_comorbidity_fn',
    'treatment_intensity_squared',
    'treatment_intensity_x_clinical_status_score',
    'treatment_intensity_x_conditioning_intensity_numeric',
    'treatment_intensity_x_disease_control_factor',
    'treatment_intensity_x_dri_age',
    'treatment_intensity_x_dri_comorbidity',
    'treatment_intensity_x_dri_quantile',
    'vent_hist',
    'vivo_age_bin',
    'vivo_comorbidity',
    'vivo_prim_disease',
    'weighted_risk_score',
    'with_tbi',
    'year_hct',
    'years_since_2008'
]

# Names of the selected feature columns whose dtype in train is object or category
CATS = list(train[FEATURES].select_dtypes(include=['object', 'category']).columns)

In [358]:
class LightGBMModel:
    """K-fold LightGBM regression on the ``y_na`` column with optional Optuna tuning.

    Hyperparameters are taken from an Optuna search when tuning is requested;
    otherwise from the JSON file at ``best_params_path`` if it exists; otherwise
    from a hard-coded fallback set.
    """

    def __init__(
        self,
        n_folds: int = 10,
        random_state: int = 42,
        n_trials: int = 50,
        best_params_path: str = 'best_lgb_na_params.json'
    ):
        """
        Args:
            n_folds: Number of KFold splits used for cross-validation.
            random_state: Seed for the KFold shuffle and for the Optuna sampler.
            n_trials: Number of Optuna trials; only used when tuning.
            best_params_path: JSON file the tuned parameters are saved to / loaded from.
        """
        self.n_folds = n_folds
        self.random_state = random_state
        self.n_trials = n_trials
        self.best_params_path = best_params_path
        self.best_params = None

    def objective(self, trial: optuna.Trial, train_data: pd.DataFrame, valid_data: pd.DataFrame,
                 features: list) -> float:
        """Optuna objective: fit one model on ``train_data`` and score it on ``valid_data``.

        Args:
            trial: Optuna trial used to sample the hyperparameters.
            train_data: Rows to fit on; must contain ``features`` and ``y_na``.
            valid_data: Rows to evaluate on; must contain ``features``, ``y_na``,
                ``ID``, ``efs``, ``efs_time`` and ``race_group``.
            features: Feature column names.

        Returns:
            The value returned by ``score`` for the validation predictions.
        """
        # NOTE(review): LightGBM documents that `subsample` (bagging_fraction) is
        # only applied when `subsample_freq` (bagging_freq) is non-zero. Neither
        # this search space nor the fallback defaults set it, so `subsample` may
        # have no effect on the fitted models -- confirm before relying on it.
        param = {
            'device': 'gpu',
            'gpu_use_dp': True,
            'num_leaves': trial.suggest_int('num_leaves', 25, 63),
            'max_depth': trial.suggest_int('max_depth', 2, 9),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.2, 0.4),
            'subsample': trial.suggest_float('subsample', 0.2, 0.8),
            'n_estimators': trial.suggest_int('n_estimators', 400, 1400),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
            'min_child_samples': trial.suggest_int('min_child_samples', 40, 100),
            'reg_alpha': trial.suggest_float('reg_alpha', 0.01, 0.1),
            'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 0.1),
            'objective': 'regression',
            'verbose': -1
        }

        model = LGBMRegressor(**param)
        model.fit(
            train_data[features], train_data['y_na'],
            eval_set=[(valid_data[features], valid_data['y_na'])]
        )

        predictions = model.predict(valid_data[features])

        # Build the two frames in the layout `score` is called with everywhere
        # else in this class: ground-truth columns and an ID + prediction frame.
        y_true = valid_data[["ID", "efs", "efs_time", "race_group"]].copy()
        y_pred = valid_data[["ID"]].copy()
        y_pred["prediction"] = predictions

        fold_score = score(y_true, y_pred, "ID")
        return fold_score

    def save_best_params(self, params: Dict) -> None:
        """Write ``params`` as JSON to ``self.best_params_path`` (overwriting it)."""
        with open(self.best_params_path, 'w') as f:
            json.dump(params, f)

    def load_best_params(self) -> Optional[Dict]:
        """Return the parameters stored at ``self.best_params_path``, or None if the file is absent."""
        if os.path.exists(self.best_params_path):
            with open(self.best_params_path, 'r') as f:
                return json.load(f)
        return None

    def train_and_predict(
        self,
        train: pd.DataFrame,
        test: pd.DataFrame,
        features: list,
        tune_hyperparameters: bool = False
    ) -> Tuple[np.ndarray, np.ndarray, float, LGBMRegressor]:
        """
        Run K-fold training and prediction, with optional hyperparameter tuning.

        Args:
            train: Training DataFrame; must contain ``features``, ``y_na``,
                ``ID``, ``efs``, ``efs_time`` and ``race_group``.
            test: Test DataFrame; must contain ``features``.
            features: List of feature columns
            tune_hyperparameters: If True, run an Optuna search on the first
                fold and save the result; if False, load saved parameters or
                fall back to the hard-coded defaults.

        Returns:
            Tuple containing:
            - pred_lgb_na: Test-set predictions averaged over all folds
            - oof_lgb_na: Out-of-fold predictions for the training set
            - lgb_na_score: Value of ``score`` on the out-of-fold predictions
            - model: The model fitted on the FIRST fold (not selected by score)
        """
        kf = KFold(n_splits=self.n_folds, shuffle=True, random_state=self.random_state)
        oof_lgb_na = np.zeros(len(train))
        pred_lgb_na = np.zeros(len(test))
        first_fold_model = None

        if tune_hyperparameters:
            # Tune on the first fold only. Because the splitter is seeded, this is
            # the same split as fold 1 of the cross-validation loop below, so that
            # fold's validation rows have already influenced the chosen parameters.
            print("Starting hyperparameter tuning...")
            train_idx, valid_idx = next(kf.split(train))

            train_fold = train.iloc[train_idx].copy()
            valid_fold = train.iloc[valid_idx].copy()

            # Seed the sampler so repeated searches propose the same trials.
            # TPESampler is Optuna's default single-objective sampler, so only
            # the seeding changes, not the search algorithm.
            study = optuna.create_study(
                direction='maximize',
                sampler=optuna.samplers.TPESampler(seed=self.random_state)
            )
            study.optimize(
                lambda trial: self.objective(trial, train_fold, valid_fold, features),
                n_trials=self.n_trials
            )

            self.best_params = study.best_params
            # Re-attach the fixed (non-searched) parameters; Optuna only
            # reports the values it sampled.
            self.best_params.update({
                'device': 'gpu',
                'gpu_use_dp': True,
                'objective': 'regression'
            })
            self.save_best_params(self.best_params)
            print(f"Best parameters: {self.best_params}")
        else:
            self.best_params = self.load_best_params()
            if self.best_params is None:
                print("No saved parameters found. Using default parameters.")
                self.best_params = {
                    'device': 'gpu',
                    'num_leaves': 51,
                    'max_depth': 5,
                    'colsample_bytree': 0.21335213389258528,
                    'subsample': 0.20066896717181626,
                    'n_estimators': 1177,
                    'learning_rate': 0.017497134941706616,
                    'min_child_samples': 77,
                    'reg_alpha': 0.07150358714370106,
                    'reg_lambda': 0.09214762014336535,
                    'objective': 'regression',
                    'gpu_use_dp': True
                }

        # The test features do not depend on the fold, so slice them once.
        x_test = test[features]

        # Cross-validated training with the selected parameters.
        for i, (train_idx, valid_idx) in enumerate(kf.split(train)):
            print(f"Training fold {i+1}/{self.n_folds}")

            x_train = train.iloc[train_idx][features]
            y_train = train.iloc[train_idx]['y_na']
            x_valid = train.iloc[valid_idx][features]
            y_valid = train.iloc[valid_idx]['y_na']

            model = LGBMRegressor(**self.best_params)
            model.fit(
                x_train, y_train,
                eval_set=[(x_valid, y_valid)]
            )

            # Keep the first fold's model as the one handed back to the caller.
            if i == 0:
                first_fold_model = model

            oof_lgb_na[valid_idx] = model.predict(x_valid)
            pred_lgb_na += model.predict(x_test)

        # Turn the running sum of per-fold test predictions into their mean.
        pred_lgb_na /= self.n_folds

        # Score the out-of-fold predictions over the whole training set.
        y_true = train[["ID", "efs", "efs_time", "race_group"]].copy()
        y_pred = train[["ID"]].copy()
        y_pred["prediction"] = oof_lgb_na
        lgb_na_score = score(y_true.copy(), y_pred.copy(), "ID")
        print(f"\nOverall CV Score: {lgb_na_score}")

        return pred_lgb_na, oof_lgb_na, lgb_na_score, first_fold_model

In [None]:
# Build the LightGBM wrapper (10 folds; `n_trials` is only used when tuning).
lgb_na_model = LightGBMModel(n_folds=10, n_trials=400)

# Tuning is switched off here, so the model uses saved parameters if the JSON
# file exists and its built-in defaults otherwise. Pass
# tune_hyperparameters=True to run the Optuna search instead.
pred_lgb_na, oof_lgb_na, lgb_na_score, model_lgb_na = lgb_na_model.train_and_predict(
    train=train,
    test=test,
    features=FEATURES,
    tune_hyperparameters=False,
)

In [None]:
# `feature_importances_` is computed with the estimator's `importance_type`
# (LightGBM's default is 'split', i.e. how often a feature is used to split),
# so read the type from the model instead of assuming it is gain.
lgb_na_importance_type = model_lgb_na.importance_type
feature_importance = model_lgb_na.feature_importances_
importance_df = pd.DataFrame({
    "Feature": FEATURES,
    "Importance": feature_importance
}).sort_values(by="Importance", ascending=False)

# Create dictionary for JSON
importance_dict = {
   "score": lgb_na_score,
   "feature_importance": importance_df.to_dict('records')
}

# Save to JSON (the CV score is part of the file name so runs do not overwrite each other)
with open(f'{output_path}/lightgbm_na_feature_importance_{lgb_na_score}.json', 'w') as f:
   json.dump(importance_dict, f, indent=4)

fig, ax = plt.subplots(figsize=(22, 30))
ax.barh(importance_df["Feature"], importance_df["Importance"], color='skyblue')
# Label the axis with the importance type that was actually computed.
ax.set_xlabel(f"Importance ({str(lgb_na_importance_type).capitalize()})")
ax.set_ylabel("Feature")
ax.set_title("LightGBM Nelson Aalen Feature Importance")
ax.invert_yaxis()  # Most important feature at the top
fig.tight_layout()
fig.savefig(f'{output_path}/lightgbm_na_feature_importance_{lgb_na_score}.png')
plt.show()

In [None]:
# CV score of every individual model, one per line (blank line before each).
print("\nxgb score =", xgb_score)
print("\ncat score =", cat_score)
print("\nlgb score =", lgb_score)
print("\nxgb cox score =", xgb_cox_score)
print("\ncat cox score =", cat_cox_score)
print("\nlgb cox score =", lgb_cox_score)
print("\nxgb na score =", xgb_na_score)
print("\ncat na score =", cat_na_score)
print("\nlgb na score =", lgb_na_score)

In [362]:
# Out-of-fold predictions that enter the blend, one entry per model.
# Entries that are commented out are left out of the ensemble.
# Original note: add lgb_cox, xgb_na, cat_na, lgb_na
oof_preds = [
    # oof_xgb,      # km
    oof_cat,        # km
    # oof_lgb,      # km
    oof_xgb_cox,    # cox efstime2
    oof_cat_cox,    # cox efstime2
    # oof_lgb_cox,  # cox efstime2
    # oof_xgb_na,   # na
    oof_cat_na,     # na
    # oof_lgb_na    # na
]

# One blend weight per active entry of `oof_preds`, in the same order.
weights = [1.0] * 4

In [363]:
# Replace each model's out-of-fold predictions by their ranks (one row per model).
ranked_oof_preds = np.array(list(map(rankdata, oof_preds)))

In [364]:
# Weighted sum of the per-model rank vectors.
ensemble_oof_preds = np.sum(
    [weight * ranks for weight, ranks in zip(weights, ranked_oof_preds)],
    axis=0,
)

In [None]:
# Score the rank-blended out-of-fold predictions of the models in `oof_preds`.
y_true = train[["ID", "efs", "efs_time", "race_group"]].copy()
y_pred = train[["ID"]].assign(prediction=ensemble_oof_preds)
m = score(y_true.copy(), y_pred.copy(), "ID")
print("\nOverall CV for Ensemble =", m)

In [366]:
# Test-set predictions that enter the blend.
# This list must hold the same models, in the same order, as `oof_preds`:
# both are combined with the shared `weights` list, and the CV score of the
# ensemble is only meaningful for the submission if the two blends match.
preds = [
    #pred_xgb,
    pred_cat,
    #pred_lgb,
    pred_xgb_cox,
    pred_cat_cox,
    #pred_lgb_cox,
    #pred_xgb_na,
    pred_cat_na,
    #pred_lgb_na
]

# zip() stops at the shorter input, so a length mismatch would silently drop
# models from the blend; fail loudly instead.
assert len(preds) == len(weights), (
    f"{len(preds)} test prediction sets but {len(weights)} ensemble weights"
)

In [367]:
# Replace each model's test predictions by their ranks (one row per model).
ranked_preds = np.array(list(map(rankdata, preds)))

In [368]:
# Weighted sum of the per-model rank vectors; zip() pairs weights and models
# positionally and stops at the shorter of the two.
ensemble_preds = np.sum(
    [weight * ranks for weight, ranks in zip(weights, ranked_preds)],
    axis=0,
)

In [None]:
sub = pd.read_csv("input/data/sample_submission.csv")

# Lift NumPy's truncation threshold for printed arrays. This is a global NumPy
# setting; the pandas Series printed below is governed by pandas' own display
# options, not by this call.
np.set_printoptions(threshold=np.inf)

# Write the blended ranks into the submission frame. Item assignment is used
# because attribute-style assignment (`sub.prediction = ...`) would only set a
# plain Python attribute, not a column, if "prediction" were missing.
sub["prediction"] = ensemble_preds

print("\nFinal ensemble scores:")
print(sub["prediction"])

sub.to_csv("submission.csv", index=False)
print("\nSub shape:", sub.shape)
sub.head()