In [279]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os
import warnings
from sklearn import model_selection
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder #, TargetEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgbm
from lightgbm import LGBMClassifier
import optuna.integration.lightgbm as lgb
from optuna.integration.lightgbm import LightGBMTunerCV, LightGBMTuner

warnings.filterwarnings('ignore')

In [280]:
class ModelType:
    """String identifiers for the model families that can be selected in Config.

    Every constant's value equals its own name, so it compares equal to the
    matching plain string.
    """

    # Presumably LightGBM / XGBoost / CatBoost (boosting libraries) — names only.
    LGBM = "LGBM"
    XGB = "XGB"
    CATBOOST = "CATBOOST"
    # Presumably random forest / logistic regression (both imported from sklearn above).
    RF = "RF"
    LR = "LR"

In [281]:
class Config:
    """Notebook-wide settings, read directly as class attributes."""

    # --- reproducibility ---
    RANDOM_SEED = 42

    # --- cross-validation / training ---
    NUM_FOLDS = 5
    EARLY_STOPPING = 500

    # --- target ---
    TARGET_COL_NAME = "outcome"
    NUM_CLASSES = 3  # number of target classes

    # --- model selection and persisted results ---
    MODEL_TYPE = ModelType.RF
    RESULTS_FILE = "model_execution_results.pkl"

# Directory holding train.csv / test.csv. Keep the trailing slash: the file
# names are appended to it with plain string concatenation.
DATA_PATH = "./data/"

In [282]:
# Read the raw train / test tables; DATA_PATH already ends with "/".
df_train, df_test = (
    pd.read_csv(f"{DATA_PATH}{file_name}") for file_name in ("train.csv", "test.csv")
)

In [283]:
# split the training dataframe into kfolds for cross validation. We do this before any processing is done
# on the data. We use stratified kfold if the target distribution is unbalanced
def strat_kfold_dataframe(df, target_col_name, num_folds=5, random_state=None):
    """Return a shuffled copy of ``df`` with a stratified ``kfold`` column.

    The rows are shuffled, the index is reset to 0..n-1, and every row is
    labelled with the number of the fold in which it serves as validation data.
    The caller's DataFrame is not modified.

    Args:
        df: training data containing ``target_col_name``.
        target_col_name: column whose class distribution is preserved per fold.
        num_folds: number of folds to create.
        random_state: seed for both the row shuffle and the fold split;
            defaults to ``Config.RANDOM_SEED`` when None.

    Returns:
        A new DataFrame with an integer ``kfold`` column in ``range(num_folds)``.
    """
    if random_state is None:
        random_state = Config.RANDOM_SEED
    # ``sample`` returns a new frame, so adding the column below no longer
    # leaks a ``kfold`` column into the DataFrame the caller passed in.
    shuffled = df.sample(frac=1, random_state=random_state).reset_index(drop=True)
    # -1 marks rows that have not been assigned to a fold yet
    shuffled["kfold"] = -1
    y = shuffled[target_col_name].values
    skf = model_selection.StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=random_state)
    for fold, (_, val_index) in enumerate(skf.split(X=shuffled, y=y)):
        # split() yields positions; they equal the index labels after reset_index
        shuffled.loc[val_index, "kfold"] = fold
    return shuffled

# Assign the validation fold of every training row before any preprocessing.
df_train = strat_kfold_dataframe(
    df_train,
    target_col_name=Config.TARGET_COL_NAME,
    num_folds=Config.NUM_FOLDS,
)
df_train.head()

Unnamed: 0,id,surgery,age,hospital_number,rectal_temp,pulse,respiratory_rate,temp_of_extremities,peripheral_pulse,mucous_membrane,...,total_protein,abdomo_appearance,abdomo_protein,surgical_lesion,lesion_1,lesion_2,lesion_3,cp_data,outcome,kfold
0,753,no,adult,535381,39.4,86.0,21.0,normal,normal,pale_pink,...,75.0,cloudy,2.0,yes,3205,0,0,no,euthanized,3
1,582,yes,adult,535029,37.5,112.0,12.0,cold,normal,bright_pink,...,57.0,serosanguious,2.0,yes,4205,0,0,no,euthanized,0
2,548,yes,adult,529461,38.5,72.0,44.0,cool,reduced,bright_red,...,8.6,cloudy,4.3,yes,2112,0,0,yes,died,1
3,113,yes,adult,534157,38.4,40.0,16.0,cool,reduced,pale_pink,...,77.0,serosanguious,2.0,yes,2209,0,0,no,euthanized,3
4,174,yes,adult,529777,38.9,40.0,24.0,normal,normal,pale_pink,...,6.0,clear,5.4,yes,2206,0,0,yes,lived,1


In [284]:
# Count how many columns there are of each dtype, to see how the table
# splits into string-typed (object) and numeric (float64 / int64) columns.
df_train.dtypes.value_counts()

object     17
float64     7
int64       6
dtype: int64

In [285]:
# Column names grouped by dtype: float, int64 and object (string) columns.
cols_float, cols_int, cols_str = (
    df_train.select_dtypes(include=[dtype_name]).columns.to_list()
    for dtype_name in ("float", "int64", "object")
)
# The target is an object column too, but it is not a feature.
cols_str.remove(Config.TARGET_COL_NAME)

In [286]:
# Show the float columns; these are the ones later treated as continuous features.
print(cols_float)

['rectal_temp', 'pulse', 'respiratory_rate', 'nasogastric_reflux_ph', 'packed_cell_volume', 'total_protein', 'abdomo_protein']


In [287]:
# print the list cols_str with 5 words per line
def print_list_cols(cols_str, per_line=5):
    """Print ``cols_str`` in consecutive chunks, one chunk per output line.

    Args:
        cols_str: list (or any sliceable sequence) of items to print.
        per_line: maximum number of items per printed line (default 5).

    Raises:
        ValueError: if ``per_line`` is smaller than 1.
    """
    if per_line < 1:
        raise ValueError(f"per_line must be >= 1, got {per_line}")
    for start in range(0, len(cols_str), per_line):
        print(cols_str[start:start + per_line])

# List the string-typed feature names (target already removed), a few per line.
print_list_cols(cols_str)

['surgery', 'age', 'temp_of_extremities', 'peripheral_pulse', 'mucous_membrane']
['capillary_refill_time', 'pain', 'peristalsis', 'abdominal_distention', 'nasogastric_tube']
['nasogastric_reflux', 'rectal_exam_feces', 'abdomen', 'abdomo_appearance', 'surgical_lesion']
['cp_data']


In [288]:
# For each categorical feature, calculate distinct categories and their counts
def get_category_summary(df):
    """Summarise the distinct values of every object-dtype column of ``df``.

    Args:
        df: DataFrame to inspect; only columns whose dtype is ``object`` are used.

    Returns:
        A DataFrame with one row per (column, distinct value) pair and the
        columns ``Feature``, ``Distinct_Categories`` and ``Category_Count``.
        Counts come from ``Series.value_counts()`` with its defaults, so missing
        values are not counted. The per-feature frames are concatenated without
        resetting the index, so index labels restart at 0 for each feature.
        If ``df`` has no object columns an empty frame with the same three
        columns is returned (``pd.concat`` would otherwise raise on an empty list).
    """
    summary_columns = ['Feature', 'Distinct_Categories', 'Category_Count']
    # one small summary frame per categorical feature, concatenated at the end
    category_summary_list = []
    for column in df.columns:
        if df[column].dtype == 'object':
            cat_val_cnt = df[column].value_counts()
            category_summary_list.append(pd.DataFrame(data={
                'Feature': [column] * len(cat_val_cnt),
                'Distinct_Categories': cat_val_cnt.index.values.tolist(),
                'Category_Count': cat_val_cnt.values.tolist()
            }))

    if not category_summary_list:
        return pd.DataFrame(columns=summary_columns)
    return pd.concat(category_summary_list)

In [289]:
# Per-feature category counts for the train and the test table.
df_categories_train, df_categories_test = map(get_category_summary, (df_train, df_test))

In [290]:
# Categories that occur in train but not in test. A left merge with
# indicator=True marks rows without a partner in the right frame as "left_only".
# The full summary frames are merged here, so the result also carries the
# Category_Count_x / Category_Count_y columns produced by the merge.
train_cat_notin_test = (
    df_categories_train.merge(
        df_categories_test,
        on=['Feature', 'Distinct_Categories'],
        how='left',
        indicator=True
    )
    .query('_merge == "left_only"')
    .drop(columns=['_merge'])
    .reset_index(drop=True)
)

# Categories that occur in test but not in train (same technique with the roles
# swapped); only the Feature / Distinct_Categories columns are kept.
test_cat_notin_train = (
    df_categories_test[["Feature", "Distinct_Categories"]].merge(
        df_categories_train[["Feature", "Distinct_Categories"]],
        on=['Feature', 'Distinct_Categories'],
        how='left',
        indicator=True
    )
    .query('_merge == "left_only"')
    .drop(columns=['_merge'])
    .reset_index(drop=True)
)

In [291]:
# drop the row from test which has category not present in train
# df_test = df_test.drop(df_test[df_test.pain == "moderate"].index)

In [292]:
# Bookkeeping / target columns that are never used as model features.
cols_to_leave = ["id", "kfold"] + [Config.TARGET_COL_NAME + suffix for suffix in ("", "_encoded")]
col_names = [name for name in df_train.columns.tolist() if name not in cols_to_leave]
print(f"len(col_names)={len(col_names)}")
# Split the feature names into continuous (float), everything that is not
# float, and categorical (string) groups.
cont_col_names = [name for name in cols_float if name not in cols_to_leave]
noncont_col_names = [name for name in col_names if name not in cols_float]
cat_col_names = [name for name in cols_str if name not in cols_to_leave]
print(f"len(cont_col_names)={len(cont_col_names)}, len(cat_col_names)={len(cat_col_names)}, len(noncont_col_names)={len(noncont_col_names)}")

len(col_names)=27
len(cont_col_names)=7, len(cat_col_names)=16, len(noncont_col_names)=20


#### Encoding Categorical Columns using different strategies

Encoding: For LightGBM, categorical features must be encoded as non-negative integers (int) less than Int32.MaxValue (2147483647). This means that you cannot use string or float values for categorical features. You can use various encoding methods, such as label encoding, ordinal encoding, or frequency encoding, to convert categorical values into numeric codes. However, you should avoid using one-hot encoding, as it may reduce the performance and accuracy of LightGBM. (Note: the cells below nevertheless one-hot encode the categorical columns; the ordinal-encoding alternative is left commented out.)

In [293]:
# # Ordinal encoding of categorical features 
# from sklearn.preprocessing import OrdinalEncoder

# ordinal_encoder = OrdinalEncoder()
# ordinal_encoder = ordinal_encoder.fit(df_train[cat_col_names])
# encoded_cat_cols = ordinal_encoder.transform(df_train[cat_col_names])
# other_col_names = [item for item in df_train.columns.values.tolist() if item not in cat_col_names]
# df_train_cat = pd.DataFrame(encoded_cat_cols, columns=cat_col_names)
# df_train_other = df_train[other_col_names]
# df_train_processed = pd.concat([df_train_other, df_train_cat], axis=1)

In [294]:
# Fit the one-hot encoder on train only and apply it to both tables. Categories
# seen fewer than 5 times are grouped as "infrequent"; categories unseen at fit
# time are mapped to that group when it exists.
# NOTE: `sparse=` was renamed to `sparse_output=` in scikit-learn 1.2 and removed
# in 1.4; switch the argument name when running on a newer scikit-learn.
one_hot_enc = OneHotEncoder(handle_unknown="infrequent_if_exist", sparse=False, min_frequency=5)
one_hot_enc = one_hot_enc.fit(df_train[cat_col_names])
encoded_train = one_hot_enc.transform(df_train[cat_col_names])
encoded_test = one_hot_enc.transform(df_test[cat_col_names])
print(f"encoded_train.shape = {encoded_train.shape}")
print(f"encoded_test.shape = {encoded_test.shape}")
# Wrap the encoded arrays in frames that reuse the source index. pd.concat(axis=1)
# aligns on index labels, so a fresh 0..n-1 index would misalign rows (and
# introduce NaNs) whenever rows had been dropped from df_train / df_test.
df_train_oh = pd.DataFrame(encoded_train, columns=one_hot_enc.get_feature_names_out(), index=df_train.index)
df_test_oh = pd.DataFrame(encoded_test, columns=one_hot_enc.get_feature_names_out(), index=df_test.index)
# Drop the raw categorical columns (in place, so this cell cannot be re-run
# without reloading the data) ...
df_train.drop(columns=cat_col_names, inplace=True)
df_test.drop(columns=cat_col_names, inplace=True)
# ... and append their one-hot encoded replacements.
df_train_processed = pd.concat([df_train, df_train_oh], axis=1)
df_test_processed = pd.concat([df_test, df_test_oh], axis=1)

encoded_train.shape = (1235, 72)
encoded_test.shape = (824, 72)


#### Label Encoding of target

In [295]:
# Map the string target classes to integer codes.
label_encoder = LabelEncoder().fit(df_train[Config.TARGET_COL_NAME])
target_encoded = label_encoder.transform(df_train[Config.TARGET_COL_NAME])
# Store the encoded target on both the raw and the processed training frame.
df_train[f"{Config.TARGET_COL_NAME}_encoded"] = target_encoded
df_train_processed[f"{Config.TARGET_COL_NAME}_encoded"] = target_encoded

In [296]:
# One probability column name per target class, in label-encoder class order.
tgt_proba_cols = [
    f"{Config.TARGET_COL_NAME}_proba_{class_name}" for class_name in label_encoder.classes_
]
tgt_proba_cols

['outcome_proba_died', 'outcome_proba_euthanized', 'outcome_proba_lived']

In [297]:
def normalize_data(df, cont_col_names, cols_to_leave):
    """Standardise the continuous columns of ``df`` and stack them with the rest.

    A new ``StandardScaler`` is fitted on the frame passed in on every call.

    Args:
        df: input DataFrame.
        cont_col_names: list of continuous columns to scale.
        cols_to_leave: list of columns excluded from the output entirely.

    Returns:
        A 2-D NumPy array: the scaled continuous columns first, followed by all
        remaining columns of ``df`` (those in neither list) in their original order.
    """
    scaled_cont = StandardScaler().fit_transform(df[cont_col_names])
    excluded = cont_col_names + cols_to_leave
    passthrough_cols = [name for name in df.columns.tolist() if name not in excluded]
    return np.concatenate((scaled_cont, df[passthrough_cols]), axis=1)

In [298]:
def get_fold_data(fold, df, cols_to_leave, target_col_name):
    """Split ``df`` into the training and validation parts of one CV fold.

    Args:
        fold: fold number; rows with ``kfold == fold`` form the validation set.
        df: DataFrame containing a ``kfold`` column, the features and the target.
        cols_to_leave: column names that must not be used as features.
        target_col_name: name of the target column.

    Returns:
        ``(X_train, y_train, X_val, y_val)`` where the X frames contain every
        column of ``df`` that is not in ``cols_to_leave``.
    """
    # Take the feature columns from the frame that was passed in, not from the
    # module-level ``df_train_processed``, so the function works for any frame.
    feature_cols = [col for col in df.columns.values.tolist() if col not in cols_to_leave]
    fold_train = df[df.kfold != fold]
    fold_val = df[df.kfold == fold]
    return (
        fold_train[feature_cols],
        fold_train[target_col_name],
        fold_val[feature_cols],
        fold_val[target_col_name],
    )

In [299]:
def run_training(train_df, train_y, val_df, val_y, params=None, callbacks=None):
    """Train a LightGBM model on one fold and score it on the validation data.

    Args:
        train_df: training features (DataFrame).
        train_y: training labels.
        val_df: validation features (DataFrame).
        val_y: validation labels.
        params: LightGBM parameter dict passed to ``lgbm.train``.
        callbacks: optional list of LightGBM callbacks (e.g. early stopping).

    Returns:
        ``(f1, model, val_pred_probs)``: the micro-averaged F1 score on the
        validation data, the trained booster, and the predicted class
        probabilities for the validation rows.

    Columns listed in the module-level ``cols_to_leave`` are excluded from the
    features. The one-hot encoded columns are passed as ordinary numeric
    features (no ``categorical_feature`` is declared).
    """
    col_names = [item for item in train_df.columns.values.tolist() if item not in cols_to_leave]
    train_data = lgbm.Dataset(data=train_df[col_names], label=train_y, feature_name=col_names)
    val_data = lgbm.Dataset(
        data=val_df[col_names], label=val_y, feature_name=col_names, reference=train_data
    )
    # Per-iteration logging is silenced with a callback instead of the
    # `verbose_eval` argument, which was removed from lgbm.train in LightGBM 4.0.
    train_callbacks = list(callbacks) if callbacks is not None else []
    train_callbacks.append(lgbm.log_evaluation(period=0))
    model = lgbm.train(
        params,
        train_set=train_data,
        valid_sets=[val_data],
        callbacks=train_callbacks
    )
    # Predict on exactly the columns the model was trained on; passing the whole
    # val_df would break as soon as it contains a column from cols_to_leave.
    val_pred_probs = model.predict(val_df[col_names], num_iteration=model.best_iteration)
    # multiclass output is one probability per class; argmax gives the class id
    val_preds = np.argmax(val_pred_probs, axis=1)
    f1 = f1_score(val_y, val_preds, average="micro")
    return f1, model, val_pred_probs

In [300]:
from lightgbm import early_stopping
from lightgbm import log_evaluation

def tune_params(train_df, train_y, params=None):
    """Run Optuna's ``LightGBMTunerCV`` on the given data and return the tuner.

    Args:
        train_df: training features (DataFrame); columns listed in the
            module-level ``cols_to_leave`` are excluded.
        train_y: training labels.
        params: base LightGBM parameter dict handed to the tuner.

    Returns:
        The ``LightGBMTunerCV`` object after ``run()``; its best parameters and
        best score are also printed.
    """
    feature_cols = [name for name in train_df.columns.tolist() if name not in cols_to_leave]
    dataset = lgbm.Dataset(data=train_df[feature_cols], label=train_y, feature_name=feature_cols)
    # stop after 100 rounds without improvement; log every 100 rounds
    tuner_callbacks = [early_stopping(100), log_evaluation(100)]
    lgbmtuner_cv = LightGBMTunerCV(
        params,
        train_set=dataset,
        stratified=True,
        shuffle=True,
        nfold=Config.NUM_FOLDS,
        verbose_eval=-1,
        callbacks=tuner_callbacks,
    )
    lgbmtuner_cv.run()
    print("Best Params: ", lgbmtuner_cv.best_params)
    print("Best score: ", lgbmtuner_cv.best_score)
    return lgbmtuner_cv

In [301]:
# params = {
#         "objective": "multiclass",
#         "num_class": Config.NUM_CLASSES,
#         "metric": "softmax",
#         "verbosity": -1,
#         "boosting_type": "gbdt",
#     }

# train_y = df_train_processed[Config.TARGET_COL_NAME+"_encoded"]
# tuned_model = tune_params(df_train_processed, train_y, params)

In [302]:
# First LightGBM parameter set (presumably produced by the tuner — verify).
# The training loop in this notebook passes lgbm_params2, not this dict.
lgbm_params = dict(
    objective='multiclass',
    num_class=3,
    metric='multi_logloss',
    verbosity=-1,
    boosting_type='gbdt',
    feature_pre_filter=False,
    lambda_l1=7.816997746908849,
    lambda_l2=1.543480389482826e-08,
    num_leaves=31,
    feature_fraction=0.4,
    bagging_fraction=0.4417446162614227,
    bagging_freq=4,
    min_child_samples=5,
)

In [303]:
# Second LightGBM parameter set; this is the one the cross-validation loop uses.
lgbm_params2 = {
    "objective": "multiclass",
    "num_class": 3,
    "metric": "multi_logloss",
    "verbosity": -1,
    "boosting_type": "gbdt",
    "feature_pre_filter": False,
    "lambda_l1": 1.090149418097443,
    "lambda_l2": 2.9628897819981876,
    "num_leaves": 4,
    "feature_fraction": 0.58,
    "bagging_fraction": 1.0,
    "bagging_freq": 0,
    "min_child_samples": 5,
}

In [304]:
# (rounded fold F1 score, trained model) for every fold
fold_metrics_model = []
# not populated in this cell
test_preds = {}

for fold in range(Config.NUM_FOLDS):
    X_train, y_train, X_val, y_val = get_fold_data(
        fold=fold,
        df=df_train_processed,
        cols_to_leave=cols_to_leave,
        target_col_name=Config.TARGET_COL_NAME+"_encoded"
    )
    fold_f1_score, model, fold_val_pred_proba = run_training(X_train, y_train, X_val, y_val, lgbm_params2)
    print(f"fold {fold } f1 score = {fold_f1_score}")
    # df_train and df_train_processed share the same rows in the same order, so
    # the fold mask on df_train selects exactly the rows that were validated.
    fold_rows = df_train.kfold == fold
    # store the out-of-fold class probabilities for this fold
    df_train.loc[fold_rows, tgt_proba_cols] = fold_val_pred_proba
    # The model outputs one probability per class, so argmax gives the predicted
    # class. Reuse the probabilities that were already computed (and scored) in
    # run_training instead of running a second prediction pass over X_val.
    df_train.loc[fold_rows, "val_preds"] = np.argmax(fold_val_pred_proba, axis=1)
    fold_metrics_model.append((round(fold_f1_score, 4), model))

fold 0 f1 score = 0.7408906882591093
fold 1 f1 score = 0.757085020242915
fold 2 f1 score = 0.7368421052631579
fold 3 f1 score = 0.6923076923076923
fold 4 f1 score = 0.728744939271255


In [305]:
import statistics

# Per-fold scores, as stored by the training loop (rounded to 4 decimals).
fold_metrics = [score for score, _model in fold_metrics_model]
print(f"f1 scores = {fold_metrics}")
# Despite the cv_auc_* names, these are the mean / sample standard deviation of
# the fold F1 scores.
cv_auc_mean, cv_auc_stdev = statistics.mean(fold_metrics), statistics.stdev(fold_metrics)
print(f"mean f1 across folds = {cv_auc_mean}, f1 stdev across folds = {cv_auc_stdev}")

f1 scores = [0.7409, 0.7571, 0.7368, 0.6923, 0.7287]
mean f1 across folds = 0.73116, f1 stdev across folds = 0.024060922675575004


In [306]:
# Pooled cross-validation score: micro F1 over all out-of-fold predictions.
# The encoded-target column name is derived from Config (as where the column is
# created) instead of hard-coding "outcome_encoded".
cv_f1 = f1_score(
    y_true=df_train[Config.TARGET_COL_NAME + "_encoded"],
    y_pred=df_train["val_preds"],
    average="micro",
)
print(f"Cross validation F1 score across {len(fold_metrics)} folds = {cv_f1}")

Cross validation F1 score across 5 folds = 0.7311740890688259
