In [3]:
# Directory that holds train.csv and test.csv; both files are read relative to it below.
DATA_DIR = 'equity-post-HCT-survival-predictions-1'


In [1]:
import pandas as pd
import numpy as np

import warnings

# NOTE(review): this silences every warning for the whole notebook, including
# deprecation/future warnings raised by pandas and scikit-learn calls below.
warnings.filterwarnings('ignore')

In [4]:
# Kaggle-hosted alternative (disabled):
#train_data = pd.read_csv('/kaggle/input/equity-post-HCT-survival-predictions/train.csv')
#test_data  = pd.read_csv('/kaggle/input/equity-post-HCT-survival-predictions/test.csv')

# Read both splits from the local data directory.
train_data = pd.read_csv(f'{DATA_DIR}/train.csv')
test_data = pd.read_csv(f'{DATA_DIR}/test.csv')

In [5]:
from lifelines import KaplanMeierFitter

def calculate_survival_probabilities(df, time_col, event_col):
    """Evaluate a Kaplan-Meier survival curve at every row's own time.

    Args:
        df: DataFrame holding the duration and event columns.
        time_col: Name of the column with the durations.
        event_col: Name of the column with the event indicator.

    Returns:
        The values of the fitted survival function at ``df[time_col]``,
        one entry per row of ``df``.
    """
    durations = df[time_col]
    observed = df[event_col]

    km_estimator = KaplanMeierFitter()
    km_estimator.fit(durations, observed)

    survival_at_times = km_estimator.survival_function_at_times(durations)
    return survival_at_times.values

def preprocess_survival_data(df, time_col='efs_time', event_col='efs'):
    """Attach a Kaplan-Meier based regression target to ``df``.

    The new ``'target'`` column is written onto the frame that was passed in
    (no copy is made) and that same frame is returned.

    Args:
        df: DataFrame containing ``time_col`` and ``event_col``.
        time_col: Duration column name.
        event_col: Event-indicator column name.

    Returns:
        ``df`` itself, now carrying the ``'target'`` column.
    """
    survival_probabilities = calculate_survival_probabilities(df, time_col, event_col)
    df['target'] = survival_probabilities
    # Disabled experiments for treating censored rows differently:
    # df.loc[df[event_col] == 0, 'target'] -= 0.2  # Adjust for censored data
    # df['efs_time2'] = df[time_col].where(df[event_col] == 1, -df[time_col])  # Negative for censored
    return df

# Apply preprocessing.
# preprocess_survival_data writes the 'target' column onto the frame it is
# given and returns that same frame, so train_data gains 'target' here and
# `df` refers to the same object as train_data.
df = preprocess_survival_data(train_data)

In [6]:
# Tag every row with its origin so the two splits can be separated again later
train_data['Dataset'] = 'train'
test_data['Dataset'] = 'test'

# Stack train on top of test with a fresh 0..n-1 index
df = pd.concat([train_data, test_data], ignore_index=True)

In [12]:
# Partition the column names by dtype: 64-bit numerics vs. object columns
numerical_columns = df.select_dtypes(include=['float64', 'int64']).columns
categorical_columns = df.select_dtypes(include=['object']).columns

# Show both groups, heading first and the column Index on the next line
print("Numerical Columns:", numerical_columns, sep="\n")
print("\nCategorical Columns:", categorical_columns, sep="\n")

Numerical Columns:
Index(['ID', 'hla_match_c_high', 'hla_high_res_8', 'hla_low_res_6',
       'hla_high_res_6', 'hla_high_res_10', 'hla_match_dqb1_high',
       'hla_nmdp_6', 'hla_match_c_low', 'hla_match_drb1_low',
       'hla_match_dqb1_low', 'year_hct', 'hla_match_a_high', 'donor_age',
       'hla_match_b_low', 'age_at_hct', 'hla_match_a_low', 'hla_match_b_high',
       'comorbidity_score', 'karnofsky_score', 'hla_low_res_8',
       'hla_match_drb1_high', 'hla_low_res_10', 'efs', 'efs_time', 'target'],
      dtype='object')

Categorical Columns:
Index(['Dataset'], dtype='object')


In [13]:
# Handling missing values for categorical columns
for column in categorical_columns:
    # Replace missing values with 'unknown' and assign the result back.
    # Calling fillna(..., inplace=True) on df[column] is chained assignment:
    # it may act on a temporary copy and leave df unchanged (it is deprecated
    # in pandas 2.x and has no effect under Copy-on-Write).
    df[column] = df[column].fillna('unknown')

In [14]:
# Columns that the next step label-encodes and converts to the 'category' dtype.
categories = ['dri_score', 'psych_disturb', 'cyto_score', 'diabetes', 'tbi_status',
       'arrhythmia', 'graft_type', 'vent_hist', 'renal_issue', 'pulm_severe',
       'prim_disease_hct', 'cmv_status', 'tce_imm_match', 'rituximab',
       'prod_type', 'cyto_score_detail', 'conditioning_intensity', 'ethnicity',
       'obesity', 'mrd_hct', 'in_vivo_tcd', 'tce_match', 'hepatic_severe',
       'prior_tumor', 'peptic_ulcer', 'gvhd_proph', 'rheum_issue', 'sex_match',
       'race_group', 'hepatic_mild', 'tce_div_match', 'donor_related',
       'melphalan_dose', 'cardiac', 'pulm_moderate']

In [15]:
from sklearn.preprocessing import LabelEncoder

# A single encoder instance is refit for each column in turn, so after the
# loop it only remembers the classes of the last column processed.
label_encoder = LabelEncoder()

# Swap each listed column for its integer codes, stored as 'category' dtype
for cat in categories:
    codes = label_encoder.fit_transform(df[cat])
    df[cat] = pd.Series(codes, index=df.index).astype('category')

In [16]:
# Undo the earlier concatenation using the 'Dataset' marker column
from_train = df['Dataset'] == 'train'
from_test = df['Dataset'] == 'test'
target_related = ['efs', 'efs_time', 'target']

# Train keeps the target-related columns but loses the marker and the ID;
# test keeps the ID (needed for the submission) but loses everything target-related.
train_data = df.loc[from_train].drop(columns=['Dataset', 'ID'])
test_data = df.loc[from_test].drop(columns=['Dataset'] + target_related)

# Features (X) and single-column target frame (y) for training
X = train_data.drop(columns=target_related)
y = train_data[['target']]

In [18]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import numpy as np
from lifelines.utils import concordance_index

# Separate numerical and categorical columns
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object', 'category']).columns

# Create preprocessing pipeline: median imputation for numeric columns,
# most-frequent imputation for categorical ones. The transformer emits the
# 'num' block first and the 'cat' block second; the feature-importance table
# at the bottom of this cell relies on that order.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', SimpleImputer(strategy='median'), numeric_features),
        ('cat', SimpleImputer(strategy='most_frequent'), categorical_features)
    ])

# Create pipeline with preprocessing and model
model_pipeline = Pipeline([
    ('imputer', preprocessor),
    ('regressor', RandomForestRegressor(
        n_estimators=1000,
        max_depth=6,
        min_samples_split=2,
        min_samples_leaf=1,
        random_state=42,
        n_jobs=-1
    ))
])

# Initialize K-Fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Lists to store fold-specific results
fold_scores = []  # RMSE for each fold
fold_c_indices = []  # C-index for each fold

# Perform cross-validation
for fold, (train_index, val_index) in enumerate(kf.split(X), 1):
    # Split the dataset into training and validation sets for the current fold
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]

    # Fit the pipeline and make predictions.
    # y is a single-column DataFrame; flatten it so the regressor receives the
    # 1-D target it expects instead of an (n, 1) column vector.
    model_pipeline.fit(X_train, np.ravel(y_train))
    y_pred = model_pipeline.predict(X_val)

    # Evaluate RMSE (Root Mean Squared Error).
    # The `squared=False` argument of mean_squared_error was deprecated in
    # scikit-learn 1.4 and removed in 1.6; taking the square root explicitly
    # gives the same value on every version.
    fold_score = np.sqrt(mean_squared_error(y_val, y_pred))
    fold_scores.append(fold_score)

    # Concordance between the validation target and the predictions
    # (no event indicator is passed to concordance_index here).
    c_index = concordance_index(y_val, y_pred)
    fold_c_indices.append(c_index)

    # Print metrics for the current fold
    print(f"\nFold {fold} Results:")
    print(f"RMSE: {fold_score:.4f}")
    print(f"C-index: {c_index:.4f}")

# Summary of cross-validation results
print("\nOverall Results:")
print(f"Mean RMSE: {np.mean(fold_scores):.4f} ± {np.std(fold_scores):.4f}")
print(f"Mean C-index: {np.mean(fold_c_indices):.4f} ± {np.std(fold_c_indices):.4f}")

# Refit on all training rows. `final_model` is the same Pipeline object as
# `model_pipeline`, and is what later cells use to predict on the test set.
final_model = model_pipeline.fit(X, np.ravel(y))
feature_importance = pd.DataFrame({
    'feature': numeric_features.tolist() + categorical_features.tolist(),
    'importance': final_model.named_steps['regressor'].feature_importances_
}).sort_values('importance', ascending=False)

print("\nTop 10 Most Important Features:")
print(feature_importance.head(10))


Fold 1 Results:
RMSE: 0.1629
C-index: 0.6197

Fold 2 Results:
RMSE: 0.1646
C-index: 0.6227

Fold 3 Results:
RMSE: 0.1632
C-index: 0.6230

Fold 4 Results:
RMSE: 0.1646
C-index: 0.6141

Fold 5 Results:
RMSE: 0.1649
C-index: 0.6147

Overall Results:
Mean RMSE: 0.1640 ± 0.0008
Mean C-index: 0.6189 ± 0.0038

Top 10 Most Important Features:
                   feature  importance
22               dri_score    0.338414
38  conditioning_intensity    0.203659
17       comorbidity_score    0.153233
10                year_hct    0.071567
18         karnofsky_score    0.054577
24              cyto_score    0.024289
12               donor_age    0.021703
41                 mrd_hct    0.016722
32        prim_disease_hct    0.014838
14              age_at_hct    0.013956


In [21]:
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
import numpy as np
from lifelines.utils import concordance_index
import pandas as pd

def prepare_categorical_features(X):
    """Label-encode every 'category'-dtype column of ``X``.

    Each such column is cast to ``str`` and encoded with its own
    ``LabelEncoder``; ``X`` itself is left untouched.

    Args:
        X: Feature DataFrame.

    Returns:
        A tuple ``(X_prepared, label_encoders)``: an encoded copy of ``X`` and
        a dict mapping each encoded column name to its fitted encoder.
    """
    X_prepared = X.copy()
    category_columns = X.select_dtypes(include=['category']).columns
    label_encoders = {name: LabelEncoder() for name in category_columns}

    for name, encoder in label_encoders.items():
        as_text = X_prepared[name].astype(str)
        X_prepared[name] = encoder.fit_transform(as_text)

    return X_prepared, label_encoders

# Approach 1: Using Label Encoding
def train_xgboost_label_encoded():
    """Cross-validate an XGBoost regressor on label-encoded features.

    Reads the notebook-level ``X`` and ``y``, label-encodes the categorical
    columns of ``X``, runs 5-fold cross-validation printing RMSE and C-index
    per fold, then refits on all rows to print the top feature importances.

    Returns:
        The mean RMSE across the folds.
    """
    # Prepare data
    X_prepared, _ = prepare_categorical_features(X)

    # Initialize K-Fold cross-validation
    kf = KFold(n_splits=5, shuffle=True, random_state=42)

    # Lists to store fold-specific results
    fold_scores = []
    fold_c_indices = []

    # XGBoost parameters
    xgb_params = {
        'objective': 'reg:squarederror',
        'max_depth': 6,
        'learning_rate': 0.05,
        'n_estimators': 1000,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'random_state': 42
    }

    for fold, (train_index, val_index) in enumerate(kf.split(X_prepared), 1):
        # Split the dataset
        X_train, X_val = X_prepared.iloc[train_index], X_prepared.iloc[val_index]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]

        # Train XGBoost model (the eval_set is only monitored; no early
        # stopping is configured, so all n_estimators rounds are trained)
        model = xgb.XGBRegressor(**xgb_params)
        model.fit(
            X_train,
            y_train,
            eval_set=[(X_val, y_val)],
            verbose=False
        )

        # Make predictions
        y_pred = model.predict(X_val)

        # Calculate metrics.
        # `squared=False` was deprecated in scikit-learn 1.4 and removed in
        # 1.6; an explicit square root works on every version.
        fold_score = np.sqrt(mean_squared_error(y_val, y_pred))
        fold_scores.append(fold_score)

        c_index = concordance_index(y_val, y_pred)
        fold_c_indices.append(c_index)

        print(f"\nFold {fold} Results:")
        print(f"RMSE: {fold_score:.4f}")
        print(f"C-index: {c_index:.4f}")

    print("\nXGBoost Results (Label Encoded Features):")
    print(f"Mean RMSE: {np.mean(fold_scores):.4f} ± {np.std(fold_scores):.4f}")
    print(f"Mean C-index: {np.mean(fold_c_indices):.4f} ± {np.std(fold_c_indices):.4f}")

    # Feature importance. This `final_model` is local to the function and does
    # not replace any notebook-level variable of the same name.
    final_model = xgb.XGBRegressor(**xgb_params)
    final_model.fit(X_prepared, y)
    importance_df = pd.DataFrame({
        'feature': X_prepared.columns,
        'importance': final_model.feature_importances_
    }).sort_values('importance', ascending=False)

    print("\nTop 10 Most Important Features:")
    print(importance_df.head(10))

    return np.mean(fold_scores)

# Approach 2: Using Native Categorical Support (requires newer XGBoost version)
def train_xgboost_native_categorical():
    """Cross-validate an XGBoost regressor using native categorical handling.

    Reads the notebook-level ``X`` and ``y`` and passes the 'category'-dtype
    columns to XGBoost directly via ``enable_categorical=True``. Runs 5-fold
    cross-validation, printing RMSE and C-index per fold.

    Returns:
        The mean RMSE across the folds, or ``None`` if any exception was
        raised during training/evaluation (the error is printed, not raised).
    """
    try:
        # Initialize K-Fold cross-validation
        kf = KFold(n_splits=5, shuffle=True, random_state=42)

        # Lists to store fold-specific results
        fold_scores = []
        fold_c_indices = []

        # XGBoost parameters with enable_categorical=True
        xgb_params = {
            'objective': 'reg:squarederror',
            'max_depth': 6,
            'learning_rate': 0.05,
            'n_estimators': 1000,
            'subsample': 0.8,
            'colsample_bytree': 0.8,
            'random_state': 42,
            'enable_categorical': True
        }

        for fold, (train_index, val_index) in enumerate(kf.split(X), 1):
            # Split the dataset
            X_train, X_val = X.iloc[train_index], X.iloc[val_index]
            y_train, y_val = y.iloc[train_index], y.iloc[val_index]

            # Train XGBoost model
            model = xgb.XGBRegressor(**xgb_params)
            model.fit(
                X_train,
                y_train,
                eval_set=[(X_val, y_val)],
                verbose=False
            )

            # Make predictions
            y_pred = model.predict(X_val)

            # Calculate metrics.
            # `squared=False` was deprecated in scikit-learn 1.4 and removed
            # in 1.6. Left in place, the resulting TypeError would be caught
            # below and misreported as missing categorical support.
            fold_score = np.sqrt(mean_squared_error(y_val, y_pred))
            fold_scores.append(fold_score)

            c_index = concordance_index(y_val, y_pred)
            fold_c_indices.append(c_index)

            print(f"\nFold {fold} Results:")
            print(f"RMSE: {fold_score:.4f}")
            print(f"C-index: {c_index:.4f}")

        print("\nXGBoost Results (Native Categorical):")
        print(f"Mean RMSE: {np.mean(fold_scores):.4f} ± {np.std(fold_scores):.4f}")
        print(f"Mean C-index: {np.mean(fold_c_indices):.4f} ± {np.std(fold_c_indices):.4f}")
        return np.mean(fold_scores)
    except Exception as e:
        # Deliberate best-effort: any failure (not only an XGBoost build
        # without categorical support) ends up here, so read the printed
        # error before trusting the first message.
        print("Native categorical support failed. Your XGBoost version might not support it.")
        print(f"Error: {str(e)}")
        return None

# Run the label-encoded variant first, then the native-categorical one
print("Training with Label Encoded features...")
encoded_rmse = train_xgboost_label_encoded()

print("\nTrying Native Categorical support...")
native_cat_rmse = train_xgboost_native_categorical()

# Assemble the comparison report; the native line is only included when that
# run produced a score (it yields None on failure)
comparison_lines = [
    "\nFinal Comparison:",
    f"Label Encoded RMSE: {encoded_rmse:.4f}",
]
if native_cat_rmse is not None:
    comparison_lines.append(f"Native Categorical RMSE: {native_cat_rmse:.4f}")
print("\n".join(comparison_lines))

Training with Label Encoded features...

Fold 1 Results:
RMSE: 0.1571
C-index: 0.6433

Fold 2 Results:
RMSE: 0.1570
C-index: 0.6490

Fold 3 Results:
RMSE: 0.1569
C-index: 0.6461

Fold 4 Results:
RMSE: 0.1585
C-index: 0.6380

Fold 5 Results:
RMSE: 0.1580
C-index: 0.6426

XGBoost Results (Label Encoded Features):
Mean RMSE: 0.1575 ± 0.0006
Mean C-index: 0.6438 ± 0.0037

Top 10 Most Important Features:
                   feature  importance
26  conditioning_intensity    0.042109
0                dri_score    0.035516
46       comorbidity_score    0.028024
47         karnofsky_score    0.024304
25       cyto_score_detail    0.023847
30                 mrd_hct    0.023382
28                year_hct    0.022835
12             pulm_severe    0.022195
43               sex_match    0.020454
7               arrhythmia    0.019967

Trying Native Categorical support...

Fold 1 Results:
RMSE: 0.1556
C-index: 0.6470

Fold 2 Results:
RMSE: 0.1574
C-index: 0.6466

Fold 3 Results:
RMSE: 0.1577
C-index:

In [28]:
# Display the notebook-level final_model: the RandomForest pipeline that was
# refit on all training rows (the XGBoost functions keep their models local).
final_model

In [22]:
# Build the test feature frame: everything in test_data except the ID column.
# errors='ignore' keeps this from raising if 'ID' is already absent.
test_features = test_data.drop(columns=['ID'], errors='ignore')  # Drop the identifier, which is not a model feature

In [25]:
# Predict using the notebook-level final_model, i.e. the RandomForest pipeline
# fitted on the full training set (no CatBoost model is trained in this notebook).
test_data['prediction'] = final_model.predict(test_features)

In [26]:
# Inspect the predictions written onto the test rows.
test_data['prediction']

28800    0.504417
28801    0.642748
28802    0.504220
Name: prediction, dtype: float64

In [27]:
# Write the submission: one row per test record with its 'ID' and 'prediction'
submission_columns = ['ID', 'prediction']
submission = test_data[submission_columns]
submission.to_csv('submission.csv', index=False)

print("Submission file created: submission.csv")

Submission file created: submission.csv
