In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
import wandb
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
from xgboost import XGBClassifier
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# load the data
raw_df_train = pd.read_csv("/kaggle/input/equity-post-HCT-survival-predictions/train.csv")
raw_df_test = pd.read_csv("/kaggle/input/equity-post-HCT-survival-predictions/test.csv")

In [3]:
# differentiate the categorical and numerical cols
def identify_numerical_catergorical_columns(df):
    """
    Identify the Numerical and the Categorical Columns in the dataframe
    """
    numerical_cols = df.select_dtypes(include=['number']).columns.tolist()
    categorical_cols = df.select_dtypes(exclude=['number']).columns.tolist()

    return numerical_cols, categorical_cols

In [29]:
# create data pre procesing pipeline
def get_data_transformer_object(numerical_cols, categorical_cols):
    num_pipeline = Pipeline(
                steps=[
                    ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
                    ('standar_scaler', StandardScaler())
                    
                ]
            )

    cat_pipeline = Pipeline(
                steps = [
                    ('imputer', SimpleImputer(strategy='constant', fill_value='other')),
                    ("label_encoder", OrdinalEncoder())
                ]
            )

    preprocessor = ColumnTransformer(
                [
                    ("num", num_pipeline, numerical_cols),
                    ("cat", cat_pipeline, categorical_cols)
                ]
            )

    return preprocessor

In [30]:
# final data creation step
target = 'efs'
df_features = raw_df_train.drop(columns=[target, 'ID', 'efs_time'], axis = 1)
print("df_features shape : {}".format(df_features.shape))


df_label  = np.array(raw_df_train[target]).reshape(-1, 1)
print("Train Label Shape  : {}".format(df_label.shape))

numerical_cols, categorical_cols = identify_numerical_catergorical_columns(df_features)
print("----- Numerical columns -------")
print(numerical_cols)
print("----- Categorical columns -------")
print(categorical_cols)
preprocesser_obj = get_data_transformer_object(numerical_cols, categorical_cols)

processed_df_arr = preprocesser_obj.fit_transform(df_features)

X_train, X_valid, y_train, y_valid = train_test_split(processed_df_arr, df_label, test_size=0.2, random_state=42)

print(X_train.shape)
print(X_valid.shape)

df_features shape : (28800, 57)
Train Label Shape  : (28800, 1)
----- Numerical columns -------
['hla_match_c_high', 'hla_high_res_8', 'hla_low_res_6', 'hla_high_res_6', 'hla_high_res_10', 'hla_match_dqb1_high', 'hla_nmdp_6', 'hla_match_c_low', 'hla_match_drb1_low', 'hla_match_dqb1_low', 'year_hct', 'hla_match_a_high', 'donor_age', 'hla_match_b_low', 'age_at_hct', 'hla_match_a_low', 'hla_match_b_high', 'comorbidity_score', 'karnofsky_score', 'hla_low_res_8', 'hla_match_drb1_high', 'hla_low_res_10']
----- Categorical columns -------
['dri_score', 'psych_disturb', 'cyto_score', 'diabetes', 'tbi_status', 'arrhythmia', 'graft_type', 'vent_hist', 'renal_issue', 'pulm_severe', 'prim_disease_hct', 'cmv_status', 'tce_imm_match', 'rituximab', 'prod_type', 'cyto_score_detail', 'conditioning_intensity', 'ethnicity', 'obesity', 'mrd_hct', 'in_vivo_tcd', 'tce_match', 'hepatic_severe', 'prior_tumor', 'peptic_ulcer', 'gvhd_proph', 'rheum_issue', 'sex_match', 'race_group', 'hepatic_mild', 'tce_div_mat

In [31]:
# create model instance
from sklearn.model_selection import cross_val_score, StratifiedKFold 
import gc
folds = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
validation_score_arr = np.zeros(X_train.shape[0])
xgboost_model = XGBClassifier(
    n_estimators=100,       # Number of trees
    learning_rate=0.1,      # Step size shrinkage
    max_depth=4,            # Maximum tree depth
    objective='binary:logistic',  # For binary classification
    eval_metric='logloss',# Evaluation metric
    enable_categorical=True,
    random_state=42,
    tree_method='hist',
    device='cuda',
    early_stopping_rounds=10,
)


for fold_index, (train_index,val_index) in enumerate(folds.split(X_train,y_train)):
    print("Train Index : ", type(train_index))
    print("Validation Index : ", type(val_index))
    print('Batch {} started...'.format(fold_index))
    gc.collect()
    bst = xgboost_model.fit(X_train[train_index],y_train[train_index],
              eval_set = [(X_train[val_index],y_train[val_index])],
              verbose= 200
              )

    validation_score_arr[val_index] = xgboost_model.predict_proba(X_train[val_index])[:, 1]
    print(validation_score_arr[val_index])
    validation_score_arr[val_index] = (validation_score_arr[val_index] >= 0.5).astype(int)
    accuracy = accuracy_score(y_train[val_index], validation_score_arr[val_index])
    print("Accuracy_score : {}".format(accuracy))
"""
# fit model
xgboost_baseline_model.fit(X_train, y_train)
# make predictions
preds = xgboost_baseline_model.predict(X_valid)
y_pred_proba = xgboost_baseline_model.predict_proba(X_valid)[:, 1] 
accuracy_score(y_valid, preds)
"""

Train Index :  <class 'numpy.ndarray'>
Validation Index :  <class 'numpy.ndarray'>
Batch 0 started...
[0]	validation_0-logloss:0.67868
[99]	validation_0-logloss:0.58867
[0.67874253 0.6348446  0.58607972 ... 0.65730953 0.67232025 0.53507066]
Accuracy_score : 0.6869791666666667
Train Index :  <class 'numpy.ndarray'>
Validation Index :  <class 'numpy.ndarray'>
Batch 1 started...
[0]	validation_0-logloss:0.67890
[99]	validation_0-logloss:0.59166
[0.33418384 0.7266981  0.64476806 ... 0.77162361 0.59264064 0.67738056]
Accuracy_score : 0.6829427083333334
Train Index :  <class 'numpy.ndarray'>
Validation Index :  <class 'numpy.ndarray'>
Batch 2 started...
[0]	validation_0-logloss:0.67855
[99]	validation_0-logloss:0.59539
[0.71849811 0.69594926 0.70964676 ... 0.72905672 0.56391186 0.33038661]
Accuracy_score : 0.6825520833333333


'\n# fit model\nxgboost_baseline_model.fit(X_train, y_train)\n# make predictions\npreds = xgboost_baseline_model.predict(X_valid)\ny_pred_proba = xgboost_baseline_model.predict_proba(X_valid)[:, 1] \naccuracy_score(y_valid, preds)\n'

In [22]:
test_df = raw_df_test.drop(columns = ['ID'], axis = 1)
numerical_cols_test, categorical_cols_test = identify_numerical_catergorical_columns(df_features)
print(numerical_cols)
print(categorical_cols_test)
preprocesser_obj_test = get_data_transformer_object(numerical_cols_test, categorical_cols_test)
processed_test_arr = preprocesser_obj_test.fit_transform(test_df)
processed_test_arr.shape

['hla_match_c_high', 'hla_high_res_8', 'hla_low_res_6', 'hla_high_res_6', 'hla_high_res_10', 'hla_match_dqb1_high', 'hla_nmdp_6', 'hla_match_c_low', 'hla_match_drb1_low', 'hla_match_dqb1_low', 'year_hct', 'hla_match_a_high', 'donor_age', 'hla_match_b_low', 'age_at_hct', 'hla_match_a_low', 'hla_match_b_high', 'comorbidity_score', 'karnofsky_score', 'hla_low_res_8', 'hla_match_drb1_high', 'hla_low_res_10']
['dri_score', 'psych_disturb', 'cyto_score', 'diabetes', 'tbi_status', 'arrhythmia', 'graft_type', 'vent_hist', 'renal_issue', 'pulm_severe', 'prim_disease_hct', 'cmv_status', 'tce_imm_match', 'rituximab', 'prod_type', 'cyto_score_detail', 'conditioning_intensity', 'ethnicity', 'obesity', 'mrd_hct', 'in_vivo_tcd', 'tce_match', 'hepatic_severe', 'prior_tumor', 'peptic_ulcer', 'gvhd_proph', 'rheum_issue', 'sex_match', 'race_group', 'hepatic_mild', 'tce_div_match', 'donor_related', 'melphalan_dose', 'cardiac', 'pulm_moderate']


(3, 57)

In [26]:
submission_prob = np.max(xgboost_model.predict_proba(processed_test_arr), axis = 1)
print(submission_prob)
submission_df = pd.read_csv("/kaggle/input/equity-post-HCT-survival-predictions/sample_submission.csv")
submission_df.head()
submission_df["prediction"] = submission_prob
submission_df.to_csv("/kaggle/working/submission.csv")

[0.7677845  0.73663646 0.7753409 ]
