In [188]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os
import warnings
from sklearn import model_selection
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder #, TargetEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
# from category_encoders import TargetEncoder
# from category_encoders.wrapper import PolynomialWrapper

warnings.filterwarnings('ignore')

In [189]:
class ModelType:
    LGBM = "LGBM"
    XGB = "XGB"
    RF = "RF"
    LR = "LR"
    CATBOOST = "CATBOOST"

In [190]:
class Config:
    RANDOM_SEED = 42
    NUM_FOLDS = 5
    TARGET_COL_NAME = "outcome"    
    EARLY_STOPPING = 500
    RESULTS_FILE = "model_execution_results.pkl"
    MODEL_TYPE = ModelType.RF

DATA_PATH = "./data/"

In [191]:
df_train = pd.read_csv(DATA_PATH + 'train.csv')
df_test = pd.read_csv(DATA_PATH + 'test.csv')

In [192]:
# split the training dataframe into kfolds for cross validation. We do this before any processing is done
# on the data. We use stratified kfold if the target distribution is unbalanced
def strat_kfold_dataframe(df, target_col_name, num_folds=5):
    # we create a new column called kfold and fill it with -1
    df["kfold"] = -1
    # randomize of shuffle the rows of dataframe before splitting is done
    df = df.sample(frac=1, random_state=Config.RANDOM_SEED).reset_index(drop=True)
    # get the target data
    y = df[target_col_name].values
    skf = model_selection.StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=Config.RANDOM_SEED)
    for fold, (train_index, val_index) in enumerate(skf.split(X=df, y=y)):
        df.loc[val_index, "kfold"] = fold    
    return df     

df_train = strat_kfold_dataframe(df_train, target_col_name=Config.TARGET_COL_NAME, num_folds=Config.NUM_FOLDS)
df_train.head()

Unnamed: 0,id,surgery,age,hospital_number,rectal_temp,pulse,respiratory_rate,temp_of_extremities,peripheral_pulse,mucous_membrane,...,total_protein,abdomo_appearance,abdomo_protein,surgical_lesion,lesion_1,lesion_2,lesion_3,cp_data,outcome,kfold
0,753,no,adult,535381,39.4,86.0,21.0,normal,normal,pale_pink,...,75.0,cloudy,2.0,yes,3205,0,0,no,euthanized,3
1,582,yes,adult,535029,37.5,112.0,12.0,cold,normal,bright_pink,...,57.0,serosanguious,2.0,yes,4205,0,0,no,euthanized,0
2,548,yes,adult,529461,38.5,72.0,44.0,cool,reduced,bright_red,...,8.6,cloudy,4.3,yes,2112,0,0,yes,died,1
3,113,yes,adult,534157,38.4,40.0,16.0,cool,reduced,pale_pink,...,77.0,serosanguious,2.0,yes,2209,0,0,no,euthanized,3
4,174,yes,adult,529777,38.9,40.0,24.0,normal,normal,pale_pink,...,6.0,clear,5.4,yes,2206,0,0,yes,lived,1


In [193]:
# Get the count of each column type
df_train.dtypes.value_counts()

object     17
float64     7
int64       6
dtype: int64

In [194]:
cols_float = df_train.select_dtypes(include=["float"]).columns.to_list()
cols_int = df_train.select_dtypes(include=["int64"]).columns.to_list()
cols_str = df_train.select_dtypes(include=["object"]).columns.to_list()
# remove target "outcome" from the list cols_str
cols_str.remove(Config.TARGET_COL_NAME)

In [195]:
print(cols_float)

['rectal_temp', 'pulse', 'respiratory_rate', 'nasogastric_reflux_ph', 'packed_cell_volume', 'total_protein', 'abdomo_protein']


In [196]:
# print the list cols_str with 5 words per line
def print_list_cols(cols_str):
    for i in range(0, len(cols_str), 5):
        print(cols_str[i:i+5])

print_list_cols(cols_str)    

['surgery', 'age', 'temp_of_extremities', 'peripheral_pulse', 'mucous_membrane']
['capillary_refill_time', 'pain', 'peristalsis', 'abdominal_distention', 'nasogastric_tube']
['nasogastric_reflux', 'rectal_exam_feces', 'abdomen', 'abdomo_appearance', 'surgical_lesion']
['cp_data']


In [197]:
# For each categorical feature, calculate distinct categories and their counts
def get_category_summary(df):
    # Initialize an empty DataFrame to store the results
    category_summary = pd.DataFrame(columns=['Feature', 'Distinct_Categories', 'Category_Count'])
    # Loop through columns to identify categorical features
    for column in df.columns:
        if df[column].dtype == 'object':
            # For categorical features, calculate distinct categories and their counts        
            cat_val_cnt = df[column].value_counts()        
            # create a dataframe for this specific categorical feature, distinct categories and their count
            cat_feature_df = pd.DataFrame(data={
                'Feature': [column] * len(cat_val_cnt),
                'Distinct_Categories': cat_val_cnt.index.values.tolist(), 
                'Category_Count': cat_val_cnt.values.tolist()
            })
            # Append the results to the categorsummary DataFrame
            category_summary= category_summary.append(cat_feature_df)
    
    category_summary = category_summary.reset_index(drop=True)
    return category_summary

In [198]:
df_categories_train = get_category_summary(df_train)
df_categories_test = get_category_summary(df_test)

In [199]:
train_cat_notin_test = df_categories_train[["Feature", "Distinct_Categories"]].merge(
                            df_categories_test[["Feature", "Distinct_Categories"]], 
                            on=['Feature', 'Distinct_Categories'], 
                            how='left', 
                            indicator=True
                        ).query('_merge == "left_only"')

train_cat_notin_test = df_categories_train.merge(
                            df_categories_test, 
                            on=['Feature', 'Distinct_Categories'], 
                            how='left', 
                            indicator=True
                        ).query('_merge == "left_only"')

# Find rows present in df2 but missing in df1
test_cat_notin_train = df_categories_test[["Feature", "Distinct_Categories"]].merge(
                            df_categories_train[["Feature", "Distinct_Categories"]], 
                            on=['Feature', 'Distinct_Categories'], 
                            how='left', 
                            indicator=True
                        ).query('_merge == "left_only"')

# # Drop the '_merge' column and reset the index
train_cat_notin_test = train_cat_notin_test.drop(columns=['_merge']).reset_index(drop=True)
test_cat_notin_train = test_cat_notin_train.drop(columns=['_merge']).reset_index(drop=True)

In [200]:
train_cat_notin_test

Unnamed: 0,Feature,Distinct_Categories,Category_Count_x,Category_Count_y
0,pain,slight,1,
1,peristalsis,distend_small,1,
2,nasogastric_reflux,slight,1,
3,rectal_exam_feces,serosanguious,1,
4,outcome,lived,574,
5,outcome,died,410,
6,outcome,euthanized,251,


In [201]:
test_cat_notin_train

Unnamed: 0,Feature,Distinct_Categories
0,pain,moderate


In [202]:
# drop the row from test which has category not present in train
# df_test = df_test.drop(df_test[df_test.pain == "moderate"].index)

In [203]:
cols_to_leave = ["id", "kfold", Config.TARGET_COL_NAME, Config.TARGET_COL_NAME + "_encoded"]
col_names = [item for item in df_train.columns.values.tolist() if item not in cols_to_leave]
print(f"len(col_names)={len(col_names)}")        
# get all columns from df_train that are not of type float
noncont_col_names = [item for item in col_names if item not in cols_float]
cont_col_names = [item for item in cols_float if item not in cols_to_leave]
cat_col_names = [item for item in cols_str if item not in cols_to_leave]
print(f"len(cont_col_names)={len(cont_col_names)}, len(cat_col_names)={len(cat_col_names)}, len(noncont_col_names)={len(noncont_col_names)}")      

len(col_names)=27
len(cont_col_names)=7, len(cat_col_names)=16, len(noncont_col_names)=20


#### Encoding Categorical Columns using different strategies

In [204]:
# # use one hot encoding for categorical columns using pandas get_dummies
# # since some of the categories are missing for some categorical features in test data we combine both test and train befor
# # doing one hot encoding
# df_combined = pd.concat([df_train, df_test])
# df_combined = pd.get_dummies(df_combined, prefix=cols_str, columns=cols_str)
# df_train = df_combined[:len(df_train)]
# df_test = df_combined[len(df_train):]
# print(f"len(df_combined.columns)={len(df_combined.columns)}")
# print(f"len(df_train.columns)={len(df_train.columns)}")
# print(f"len(df_test.columns)={len(df_train.columns)}")

##### Encoding of categorical columns using sklearn OneHotEncoder

In [205]:
one_hot_enc = OneHotEncoder(handle_unknown="infrequent_if_exist", sparse=False, min_frequency=5)
one_hot_enc = one_hot_enc.fit(df_train[cat_col_names])
encoded_train = one_hot_enc.transform(df_train[cat_col_names])
encoded_test = one_hot_enc.transform(df_test[cat_col_names])
print(f"encoded_train.shape = {encoded_train.shape}")
print(f"encoded_test.shape = {encoded_test.shape}")
df_train.drop(columns=cat_col_names, inplace=True)
df_test.drop(columns=cat_col_names, inplace=True)
# drop the categorical columns from df_train
df_train_oh = pd.DataFrame(encoded_train, columns=one_hot_enc.get_feature_names_out())
df_test_oh = pd.DataFrame(encoded_test, columns=one_hot_enc.get_feature_names_out())
# append the one hot encoded data to df_train
df_train_processed = pd.concat([df_train, df_train_oh], axis=1)
df_test_processed = pd.concat([df_test, df_test_oh], axis=1)

encoded_train.shape = (1235, 72)
encoded_test.shape = (824, 72)


In [206]:
print(f"len(df_train.columns) = {len(df_train.columns)}")
print(f"len(df_test.columns) = {len(df_test.columns)}")
print(f"len(df_test_processed.columns) = {len(df_test_processed.columns)}")
print(f"len(df_train_processed.columns) = {len(df_train_processed.columns)}")

len(df_train.columns) = 14
len(df_test.columns) = 12
len(df_test_processed.columns) = 84
len(df_train_processed.columns) = 86


#### Label Encoding of target

In [207]:
# Encode the target
label_encoder = LabelEncoder()
target_encoded = label_encoder.fit_transform(df_train[Config.TARGET_COL_NAME])
# add the target_encoded as a new column to the dataframe
# df_train[Config.TARGET_COL_NAME + "_encoded"] = target_encoded
df_train_processed[Config.TARGET_COL_NAME + "_encoded"] = target_encoded

##### Target encoding of categorical features

In [208]:
# from category_encoders import TargetEncoder
# from category_encoders.wrapper import PolynomialWrapper

# # This target encoder acts on either binary or a continuous target
# base_target_encoder = TargetEncoder(cols=cat_col_names, handle_unknown="return_nan", handle_missing="return_nan")
# # This is a wrapper to be used for target encoding in case of multiclass target
# target_encoder = PolynomialWrapper(base_target_encoder)
# encoded = target_encoder.fit_transform(df_train.loc[:, cat_col_names], df_train.loc[:, Config.TARGET_COL_NAME+"_encoded"])


In [209]:
tgt_proba_cols = [Config.TARGET_COL_NAME + "_proba_" + tgt_cls for tgt_cls in label_encoder.classes_]
tgt_proba_cols

['outcome_proba_died', 'outcome_proba_euthanized', 'outcome_proba_lived']

In [210]:
def normalize_data(df, cont_col_names, cols_to_leave):
    # normalize continuous features
    scaler = StandardScaler()
    X_cont = df[cont_col_names]    
    X_cont_scaled = scaler.fit_transform(X_cont)     
    # get the columns other than continuous features
    other_col_names = [item for item in df.columns.values.tolist() if item not in cont_col_names + cols_to_leave]
    # combine the normalized continuous features with others
    X_processed = np.concatenate([X_cont_scaled, df[other_col_names]], axis=1)    
    return X_processed

In [211]:
def get_fold_data(fold, df, cont_col_names, cols_to_leave, target_col_name):
    df_train = df[df.kfold != fold]
    df_val = df[df.kfold == fold]
    # normalize the data
    X_train = normalize_data(df_train, cont_col_names, cols_to_leave)
    X_val = normalize_data(df_val, cont_col_names, cols_to_leave)
    y_train = df_train[target_col_name]
    y_val = df_val[target_col_name]
    return X_train, y_train, X_val, y_val 

In [212]:
def create_model(model_params, model_type):
    model = None
    if model_type == ModelType.LR:
        model = LogisticRegression(
            random_state=Config.RANDOM_SEED,
            n_jobs=-1, 
            solver=model_params["solver"],         
            max_iter=model_params["max_iter"], 
            multi_class=model_params["multi_class"],
            C=model_params["C"],
            penalty=model_params["penalty"]
        )
    elif model_type == ModelType.RF:
        model = RandomForestClassifier(
                    n_estimators=model_params["n_estimators"],                 
                    max_depth=model_params["max_depth"],
                    min_samples_leaf=model_params["min_samples_leaf"],
                    min_samples_split=model_params["min_samples_split"],
                    max_features=model_params["max_features"],
                    random_state=Config.RANDOM_SEED,
                    n_jobs=-1
                )     
    return model

In [213]:
def run_training(model, train_X, train_y, val_X, val_y):    
    model.fit(train_X, train_y.ravel())
    val_y_pred = model.predict(val_X)
    val_y_proba = model.predict_proba(val_X)
    f1 = f1_score(val_y, val_y_pred, average="micro")
    return f1, model, val_y_proba

In [214]:
def get_model_tuning_params(trial, model_type):
    if model_type == ModelType.LR:
        return {
            "solver": trial.suggest_categorical("solver", ["liblinear", "saga"]),
            "C": trial.suggest_loguniform("C", low=0.001, high=1000.0),
            "penalty": trial.suggest_categorical("penalty", ["l1", "l2"]),
            "max_iter": trial.suggest_categorical("max_iter", [100, 200, 500, 1000]),
            "multi_class": trial.suggest_categorical("multi_class", ["auto", "ovr"])
        }
    elif model_type == ModelType.RF:
        return {        
            "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
            "max_depth": trial.suggest_int("max_depth", 4, 32),
            "min_samples_leaf": trial.suggest_categorical("min_samples_leaf", [1, 2, 4]),
            "min_samples_split": trial.suggest_categorical("min_samples_split", [2, 4, 8]),
            "max_features": trial.suggest_categorical("max_features", ["sqrt", "log2"])
        }

In [215]:
import optuna
import statistics

def rf_objective(trial):       
    params = get_model_tuning_params(trial, ModelType.LR)
    model = create_model(params, ModelType.LR)
    fold_f1 = []
    for fold in range(Config.NUM_FOLDS):
        train_X, train_y, val_X, val_y = get_fold_data(
                                            fold=fold, 
                                            df=df_train_processed, 
                                            cont_col_names=cont_col_names, 
                                            cols_to_leave=cols_to_leave,
                                            target_col_name=Config.TARGET_COL_NAME+"_encoded"
                                        )
        f1, _, _ = run_training(model, train_X, train_y, val_X, val_y)
        fold_f1.append(f1)
    mean_f1 = statistics.mean(fold_f1)                
    return mean_f1

# study = optuna.create_study(direction="maximize", study_name="RFModelTuning")    
# study.optimize(rf_objective, n_trials=20)
# print("Best trial:")
# print(study.best_params)

In [216]:
# # RandomizedSearchCV trials reveal that l1 penalty gives the best f1 score, 
# # but l1 penalty works with liblinear solver

# lr_model_params = {
#     "C": [0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 500.0, 1000.0],
#     "penalty": ["l1", "l2"],
#     "max_iter": [100, 200, 500, 1000],
#     "multi_class": ["auto", "ovr", "multinomial"]
# }
# # Create the Logistic Regression model
# lr_model = LogisticRegression(    
#     n_jobs=-1, 
#     random_state=Config.RANDOM_SEED,
#     solver="liblinear",
# )
# random_search_cv = model_selection.RandomizedSearchCV(
#     estimator=lr_model,
#     param_distributions=lr_model_params,
#     scoring="f1_micro",
#     n_jobs=-1    
# )    
# X_train = normalize_data(df_train_processed, cont_col_names, cols_to_leave)
# y_train = df_train_processed[Config.TARGET_COL_NAME+"_encoded"]
# random_search_cv.fit(X_train, y_train)
# print(f"best params = {random_search_cv.best_params_}")
# print(f"best score = {random_search_cv.best_score_}")

In [217]:
fold_metrics_model = []
test_preds = {}
lr_params = {'C': 1.0, 'penalty': 'l1', 'max_iter': 200, 'multi_class': 'ovr', 'solver': 'liblinear'}
rf_params = {'n_estimators': 299, 'max_depth': 20, 'min_samples_leaf': 2, 'min_samples_split': 4, 'max_features': 'log2'}
model = create_model(rf_params, ModelType.RF)

for fold in range(Config.NUM_FOLDS):
    X_train, y_train, X_val, y_val = get_fold_data(
        fold=fold, 
        df=df_train_processed, 
        cont_col_names=cont_col_names, 
        cols_to_leave=cols_to_leave,
        target_col_name=Config.TARGET_COL_NAME+"_encoded"
    )
    fold_f1_score, model, fold_val_pred_proba = run_training(model, X_train, y_train, X_val, y_val)
    print(f"fold {fold } f1 score = {fold_f1_score}")    
    # add the validation probability predictions for the fold to a new column in train data
    df_train.loc[df_train.kfold == fold, tgt_proba_cols] = fold_val_pred_proba    
    X_test = normalize_data(df_test_processed, cont_col_names, cols_to_leave)
    fold_test_preds = model.predict(X_test)
    pred_col_name = f"fold_{fold}_test_preds"
    test_preds[pred_col_name] = fold_test_preds    
    fold_metrics_model.append((round(fold_f1_score, 4), model))

fold 0 f1 score = 0.708502024291498
fold 1 f1 score = 0.757085020242915
fold 2 f1 score = 0.7246963562753036
fold 3 f1 score = 0.6963562753036437
fold 4 f1 score = 0.7004048582995951


In [230]:
rf_clf = fold_metrics_model[0][1]
# get the columns other than continuous features
other_col_names = [item for item in df_train_processed.columns.values.tolist() if item not in cont_col_names + cols_to_leave]
feature_names = cont_col_names + other_col_names
print(f"len(rf_clf.feature_importances_) = {len(rf_clf.feature_importances_)}")
print(f"len(df_train_processed.columns) = {len(df_train_processed.columns)}")
print(f"len(feature_names) = {len(feature_names)}")
feature_importances = pd.Series(rf_clf.feature_importances_, index=feature_names).sort_values(ascending=False)
#feature_importances.plot(kind='bar')

len(rf_clf.feature_importances_) = 83
len(df_train_processed.columns) = 87
len(feature_names) = 83


In [229]:
print(feature_importances.to_string())

total_protein                               0.069035
pulse                                       0.066725
abdomo_protein                              0.064298
hospital_number                             0.058183
packed_cell_volume                          0.056347
nasogastric_reflux_ph                       0.054488
lesion_1                                    0.042332
respiratory_rate                            0.035764
rectal_temp                                 0.034385
pain_mild_pain                              0.024885
abdomo_appearance_serosanguious             0.021804
surgical_lesion_no                          0.017346
peripheral_pulse_normal                     0.017052
pain_severe_pain                            0.016649
mucous_membrane_normal_pink                 0.015959
peripheral_pulse_reduced                    0.015197
surgical_lesion_yes                         0.015142
capillary_refill_time_more_3_sec            0.014961
rectal_exam_feces_absent                    0.

With OneHotEncoder

model_params = {'C': 1.0, 'penalty': 'l1', 'max_iter': 200, 'multi_class': 'ovr', 'solver': 'liblinear'} \
fold 0 f1 score = 0.6518218623481782 \
fold 1 f1 score = 0.7206477732793523 \
fold 2 f1 score = 0.6923076923076923 \
fold 3 f1 score = 0.6761133603238867 \
fold 4 f1 score = 0.6842105263157895 

Best trial:
{'solver': 'liblinear', 'C': 0.3344158923659617, 'penalty': 'l1', 'max_iter': 200, 'multi_class': 'ovr'}
best score = 0.6939271255060728

Random Forest \
Best trial:
{'n_estimators': 299, 'max_depth': 20, 'min_samples_leaf': 2, 'min_samples_split': 4, 'max_features': 'log2'} \
best score = 0.7174089068825911