In [3]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
"""
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
"""
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

"\nfor dirname, _, filenames in os.walk('/kaggle/input'):\n    for filename in filenames:\n        print(os.path.join(dirname, filename))\n"

# Data Preparation Steps for the Baseline Models

In [4]:
# load the data
# Read the competition's train and test CSV files from the Kaggle input
# directory; later cells derive new frames from these via .drop(...).
raw_df_train = pd.read_csv("/kaggle/input/equity-post-HCT-survival-predictions/train.csv")
raw_df_test = pd.read_csv("/kaggle/input/equity-post-HCT-survival-predictions/test.csv")

In [5]:
# differentiate the categorical and numerical cols
def identify_numerical_catergorical_columns(df):
    """
    Split the column names of `df` by dtype.

    Columns with a numeric dtype are reported as numerical; every other
    column is reported as categorical. Column order within each list
    follows the order in `df`.

    Returns
    -------
    (numerical_cols, categorical_cols) : tuple of two lists of column names
    """
    numeric_kind = "number"
    numeric_part = df.select_dtypes(include=numeric_kind)
    non_numeric_part = df.select_dtypes(exclude=numeric_kind)
    return numeric_part.columns.to_list(), non_numeric_part.columns.to_list()

In [6]:
# create the data preprocessing pipeline
def get_data_transformer_object(numerical_cols, categorical_cols):
    """
    Build the (unfitted) preprocessing ColumnTransformer for the baseline models.

    Parameters
    ----------
    numerical_cols : list of str
        Columns whose missing values are filled with the constant 0.
    categorical_cols : list of str
        Columns whose missing values are filled with the string 'other'
        and which are then ordinal-encoded to numeric codes.

    Returns
    -------
    ColumnTransformer
        Transformer that outputs the numerical columns first, followed by
        the encoded categorical columns.
    """
    num_pipeline = Pipeline(
        steps=[
            ('imputer', SimpleImputer(strategy='constant', fill_value=0))
        ]
    )

    cat_pipeline = Pipeline(
        steps=[
            ('imputer', SimpleImputer(strategy='constant', fill_value='other')),
            # Categories that were not present when the encoder was fitted are
            # mapped to -1 instead of raising, so a preprocessor fitted on the
            # training data can be applied to validation/test data safely.
            (
                "label_encoder",
                OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1),
            ),
        ]
    )

    preprocessor = ColumnTransformer(
        [
            ("num", num_pipeline, numerical_cols),
            ("cat", cat_pipeline, categorical_cols)
        ]
    )

    return preprocessor

In [7]:
# final data creation step
# `efs` is the label the baseline classifiers below are trained to predict.
target = 'efs'
# Remove the target together with 'ID' and 'efs_time' so none of them is used as a feature.
df_features = raw_df_train.drop(columns=[target, 'ID', 'efs_time'], axis = 1)
print("df_features shape : {}".format(df_features.shape))


# Labels are reshaped into a single-column 2-D array of shape (n_rows, 1).
df_label  = np.array(raw_df_train[target]).reshape(-1, 1)
print("Train Label Shape  : {}".format(df_label.shape))

# Split feature names by dtype and build the matching preprocessing transformer.
numerical_cols, categorical_cols = identify_numerical_catergorical_columns(df_features)
print("----- Numerical columns -------")
print(numerical_cols)
print("----- Categorical columns -------")
print(categorical_cols)
preprocesser_obj = get_data_transformer_object(numerical_cols, categorical_cols)

# NOTE(review): the preprocessor is fitted on all rows *before* the
# train/validation split, so the fitted encoder also reflects validation
# rows — confirm this is acceptable for a baseline.
processed_df_arr = preprocesser_obj.fit_transform(df_features)

# Hold out 20% of the rows for validation; random_state makes the split repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(processed_df_arr, df_label, test_size=0.2, random_state=42)

print(X_train.shape)
print(X_valid.shape)

df_features shape : (28800, 57)
Train Label Shape  : (28800, 1)
----- Numerical columns -------
['hla_match_c_high', 'hla_high_res_8', 'hla_low_res_6', 'hla_high_res_6', 'hla_high_res_10', 'hla_match_dqb1_high', 'hla_nmdp_6', 'hla_match_c_low', 'hla_match_drb1_low', 'hla_match_dqb1_low', 'year_hct', 'hla_match_a_high', 'donor_age', 'hla_match_b_low', 'age_at_hct', 'hla_match_a_low', 'hla_match_b_high', 'comorbidity_score', 'karnofsky_score', 'hla_low_res_8', 'hla_match_drb1_high', 'hla_low_res_10']
----- Categorical columns -------
['dri_score', 'psych_disturb', 'cyto_score', 'diabetes', 'tbi_status', 'arrhythmia', 'graft_type', 'vent_hist', 'renal_issue', 'pulm_severe', 'prim_disease_hct', 'cmv_status', 'tce_imm_match', 'rituximab', 'prod_type', 'cyto_score_detail', 'conditioning_intensity', 'ethnicity', 'obesity', 'mrd_hct', 'in_vivo_tcd', 'tce_match', 'hepatic_severe', 'prior_tumor', 'peptic_ulcer', 'gvhd_proph', 'rheum_issue', 'sex_match', 'race_group', 'hepatic_mild', 'tce_div_mat

# Baseline for XGBoost Classifier

In [20]:
from xgboost import XGBClassifier
# create model instance
xgboost_baseline_model = XGBClassifier(
    n_estimators=100,       # Number of trees
    learning_rate=0.1,      # Step size shrinkage
    max_depth=4,            # Maximum tree depth
    objective='binary:logistic',  # For binary classification
    eval_metric='logloss',# Evaluation metric
    # NOTE(review): X_train comes out of the ColumnTransformer as already
    # ordinal-encoded values, presumably a plain array without categorical
    # dtypes — confirm whether enable_categorical has any effect here.
    enable_categorical=True,
    random_state=42
)
# fit model
xgboost_baseline_model.fit(X_train, y_train)
# make predictions
# Hard class predictions on the validation split.
preds = xgboost_baseline_model.predict(X_valid)
# Column 1 of predict_proba (presumably the efs=1 class); not used again in the visible cells.
y_pred_proba = xgboost_baseline_model.predict_proba(X_valid)[:, 1] 
# Last expression of the cell: validation accuracy of the hard predictions.
accuracy_score(y_valid, preds)

0.6821180555555556

**The XGBoost classifier's baseline accuracy score on the validation split is 0.6821180555555556.**

In [None]:
# The test set must go through the SAME preprocessor that was fitted on the
# training features. Fitting a fresh encoder on the test data would assign
# category codes from the test rows alone, so the integers fed to the model
# would no longer mean what they meant during training.
test_df = raw_df_test.drop(columns=['ID'])

# Column groups are those of the training features; the `_test` names are
# kept so any later cell that refers to them keeps working.
numerical_cols_test, categorical_cols_test = numerical_cols, categorical_cols
print(numerical_cols_test)
print(categorical_cols_test)

# Reuse the already-fitted training preprocessor; transform only, no re-fit.
preprocesser_obj_test = preprocesser_obj
processed_test_arr = preprocesser_obj_test.transform(test_df)
processed_test_arr.shape

In [None]:
# Submit the probability of the positive class (column 1 of predict_proba),
# the same column used for y_pred_proba on the validation split. Taking
# np.max over the class axis would instead return the confidence of whichever
# class won (always >= 0.5), which is not a score for the positive class.
submission_prob = xgboost_baseline_model.predict_proba(processed_test_arr)[:, 1]
print(submission_prob)

In [None]:
# Start from the sample submission so the file keeps its expected columns,
# then overwrite the prediction column with the model scores.
submission_df = pd.read_csv("/kaggle/input/equity-post-HCT-survival-predictions/sample_submission.csv")
submission_df["prediction"] = submission_prob
# index=False keeps the DataFrame's positional index out of the CSV; without
# it an extra unnamed column is written in front of the submission columns.
submission_df.to_csv("/kaggle/working/submission.csv", index=False)
# Last expression of the cell, so the written submission is actually displayed.
submission_df.head()

# Baseline for LightGBM

In [2]:
import lightgbm as lgb

In [13]:
# create the LightGBM datasets
# Wrap the train/validation arrays and their labels in lgb.Dataset objects,
# which are passed to lgb.train in a later cell.
train_data = lgb.Dataset(X_train, label=y_train)
validation_data = lgb.Dataset(X_valid, label=y_valid)

In [14]:
# defining the LightGBM parameters
# This dict is passed as the first argument to lgb.train in a later cell.
params = {
    "objective": "binary",
    "metric": "binary_logloss",  # Use "auc" for area under the curve metric
    "boosting_type": "gbdt",    # Gradient Boosted Decision Trees
    "learning_rate": 0.1,
    "num_leaves": 31,
    "max_depth": -1,  # per the LightGBM docs, a value <= 0 means no depth limit
    "feature_fraction": 0.9,
    "bagging_fraction": 0.8,
    "bagging_freq": 5,  # per the LightGBM docs, bagging is performed every 5 iterations
    "verbose": -1
}

In [16]:
# Train the model
# Run up to 1000 boosting rounds, scoring each round on validation_data.
# The early_stopping callback ends training once the validation score has
# not improved for 50 consecutive rounds.
model = lgb.train(
    params,
    train_data,
    num_boost_round=1000,
    valid_sets = [validation_data],
    callbacks = [
        lgb.early_stopping(stopping_rounds = 50)
    ]
)



Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[125]	valid_0's binary_logloss: 0.588832


In [17]:
# Display the boosting round that early stopping selected as best.
model.best_iteration

125

In [18]:
# Score the validation split, limiting prediction to the best iteration found by early stopping.
y_pred_prob = model.predict(X_valid, num_iteration=model.best_iteration)
# Threshold the scores at 0.5 to obtain 0/1 class labels.
y_pred = (y_pred_prob > 0.5).astype(int)
print(y_pred)

[1 1 1 ... 0 1 1]


In [19]:
# Accuracy of the thresholded LightGBM predictions on the validation split.
evaluation_score = accuracy_score(y_valid, y_pred)
print("Evaluation score is {}".format(evaluation_score))

Evaluation score is 0.6875


**The LightGBM classifier's baseline accuracy score on the validation split is 0.6875.**