In [23]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os
import warnings
from sklearn import model_selection
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression

warnings.filterwarnings('ignore')

In [24]:
class Config:
    RANDOM_SEED = 42
    NUM_FOLDS = 5
    TARGET_COL_NAME = "outcome"    
    EARLY_STOPPING = 500
    RESULTS_FILE = "model_execution_results.pkl"
    MODEL = "LGBM"

DATA_PATH = "./data/"

In [25]:
df_train = pd.read_csv(DATA_PATH + 'train.csv')
df_test = pd.read_csv(DATA_PATH + 'test.csv')

In [26]:
# split the training dataframe into kfolds for cross validation. We do this before any processing is done
# on the data. We use stratified kfold if the target distribution is unbalanced
def strat_kfold_dataframe(df, target_col_name, num_folds=5):
    # we create a new column called kfold and fill it with -1
    df["kfold"] = -1
    # randomize of shuffle the rows of dataframe before splitting is done
    df = df.sample(frac=1, random_state=Config.RANDOM_SEED).reset_index(drop=True)
    # get the target data
    y = df[target_col_name].values
    skf = model_selection.StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=Config.RANDOM_SEED)
    for fold, (train_index, val_index) in enumerate(skf.split(X=df, y=y)):
        df.loc[val_index, "kfold"] = fold    
    return df     

df_train = strat_kfold_dataframe(df_train, target_col_name=Config.TARGET_COL_NAME, num_folds=Config.NUM_FOLDS)
df_train.head()

Unnamed: 0,id,surgery,age,hospital_number,rectal_temp,pulse,respiratory_rate,temp_of_extremities,peripheral_pulse,mucous_membrane,...,total_protein,abdomo_appearance,abdomo_protein,surgical_lesion,lesion_1,lesion_2,lesion_3,cp_data,outcome,kfold
0,753,no,adult,535381,39.4,86.0,21.0,normal,normal,pale_pink,...,75.0,cloudy,2.0,yes,3205,0,0,no,euthanized,3
1,582,yes,adult,535029,37.5,112.0,12.0,cold,normal,bright_pink,...,57.0,serosanguious,2.0,yes,4205,0,0,no,euthanized,0
2,548,yes,adult,529461,38.5,72.0,44.0,cool,reduced,bright_red,...,8.6,cloudy,4.3,yes,2112,0,0,yes,died,1
3,113,yes,adult,534157,38.4,40.0,16.0,cool,reduced,pale_pink,...,77.0,serosanguious,2.0,yes,2209,0,0,no,euthanized,3
4,174,yes,adult,529777,38.9,40.0,24.0,normal,normal,pale_pink,...,6.0,clear,5.4,yes,2206,0,0,yes,lived,1


In [27]:
# Get the count of each column type
df_train.dtypes.value_counts()

object     17
float64     7
int64       6
dtype: int64

In [28]:
cols_float = df_train.select_dtypes(include=["float"]).columns.to_list()
cols_int = df_train.select_dtypes(include=["int64"]).columns.to_list()
cols_str = df_train.select_dtypes(include=["object"]).columns.to_list()
# remove target "outcome" from the list cols_str
cols_str.remove(Config.TARGET_COL_NAME)

In [29]:
print(cols_float)

['rectal_temp', 'pulse', 'respiratory_rate', 'nasogastric_reflux_ph', 'packed_cell_volume', 'total_protein', 'abdomo_protein']


In [30]:
# print the list cols_str with 5 words per line
def print_list_cols(cols_str):
    for i in range(0, len(cols_str), 5):
        print(cols_str[i:i+5])

print_list_cols(cols_str)    

['surgery', 'age', 'temp_of_extremities', 'peripheral_pulse', 'mucous_membrane']
['capillary_refill_time', 'pain', 'peristalsis', 'abdominal_distention', 'nasogastric_tube']
['nasogastric_reflux', 'rectal_exam_feces', 'abdomen', 'abdomo_appearance', 'surgical_lesion']
['cp_data']


In [34]:
# For each categorical feature, calculate distinct categories and their counts
def get_category_summary(df):
    # Initialize an empty DataFrame to store the results
    category_summary = pd.DataFrame(columns=['Feature', 'Distinct_Categories', 'Category_Count'])
    # Loop through columns to identify categorical features
    for column in df.columns:
        if df[column].dtype == 'object':
            # For categorical features, calculate distinct categories and their counts        
            cat_val_cnt = df[column].value_counts()        
            # create a dataframe for this specific categorical feature, distinct categories and their count
            cat_feature_df = pd.DataFrame(data={
                'Feature': [column] * len(cat_val_cnt),
                'Distinct_Categories': cat_val_cnt.index.values.tolist(), 
                'Category_Count': cat_val_cnt.values.tolist()
            })
            # Append the results to the categorsummary DataFrame
            category_summary= category_summary.append(cat_feature_df)
    
    category_summary = category_summary.reset_index(drop=True)
    return category_summary

In [37]:
df_categories = get_category_summary(df_train)
df_categories[df_categories.Category_Count < 10]

Unnamed: 0,Feature,Distinct_Categories,Category_Count
13,peripheral_pulse,increased,4
23,capillary_refill_time,,6
24,capillary_refill_time,3,2
31,pain,slight,1
37,peristalsis,distend_small,1
51,nasogastric_reflux,slight,1
57,rectal_exam_feces,serosanguious,1


#### Encoding Categorical Columns using different strategies

In [9]:
# use one hot encoding for categorical columns using pandas get_dummies
# since some of the categories are missing for some categorical features in test data we combine both test and train befor
# doing one hot encoding
df_combined = pd.concat([df_train, df_test])
df_combined = pd.get_dummies(df_combined, prefix=cols_str, columns=cols_str)
df_train = df_combined[:len(df_train)]
df_test = df_combined[len(df_train):]
print(f"len(df_combined.columns)={len(df_combined.columns)}")
print(f"len(df_train.columns)={len(df_train.columns)}")
print(f"len(df_test.columns)={len(df_train.columns)}")

##### Encoding of categorical columns using sklearn OneHotEncoder

len(df_combined.columns)=87
len(df_train.columns)=87
len(df_test.columns)=87


#### Label Encoding of target

In [11]:
# Encode the target
label_encoder = LabelEncoder()
target_encoded = label_encoder.fit_transform(df_train[Config.TARGET_COL_NAME])
# add the target_encoded as a new column to the dataframe
df_train[Config.TARGET_COL_NAME + "_encoded"] = target_encoded

In [12]:
tgt_proba_cols = [Config.TARGET_COL_NAME + "_proba_" + tgt_cls for tgt_cls in label_encoder.classes_]

In [13]:
cols_to_leave = ["id", "kfold", Config.TARGET_COL_NAME, Config.TARGET_COL_NAME + "_encoded"]
col_names = [item for item in df_train.columns.values.tolist() if item not in cols_to_leave]
print(f"len(col_names)={len(col_names)}")        
# get all columns from df_train that are not of type float
noncont_col_names = [item for item in col_names if item not in cols_float]
print(f"len(noncont_col_names)={len(noncont_col_names)}")        
cont_col_names = [item for item in cols_float if item not in cols_to_leave]
print(f"len(cont_col_names)={len(cont_col_names)}")        
# col_names_test = [item for item in df_test.columns.values.tolist() if item not in cols_to_leave]        
# print(f"len(col_names_test)={len(col_names_test)}")        

len(col_names)=84
len(noncont_col_names)=77
len(cont_col_names)=7


In [14]:
def normalize_data(df, cont_col_names, noncont_col_names):
    # normalize continuous features
    scaler = StandardScaler()
    X_cont = df[cont_col_names]    
    X_cont_scaled = scaler.fit_transform(X_cont)    
    # combine the normalized continuous features with the non normalized
    X_scaled = np.concatenate([X_cont_scaled, df[noncont_col_names]], axis=1)    
    return X_scaled

In [15]:
def get_fold_data(fold, df, cont_col_names, noncont_col_names, target_col_name):
    df_train = df[df.kfold != fold]
    df_val = df[df.kfold == fold]
    # normalize the data
    X_train = normalize_data(df_train, cont_col_names, noncont_col_names)
    X_val = normalize_data(df_val, cont_col_names, noncont_col_names)
    y_train = df_train[target_col_name]
    y_val = df_val[target_col_name]
    return X_train, y_train, X_val, y_val 

In [16]:
def run_training(train_X, train_y, val_X, val_y, params=None):
    # Create the Logistic Regression model
    model = LogisticRegression(
        random_state=Config.RANDOM_SEED,
        n_jobs=-1, 
        solver=params["solver"],         
        max_iter=params["max_iter"], 
        multi_class=params["multi_class"],
        C=params["C"],
        penalty=params["penalty"]
    )    
    model.fit(train_X, train_y.ravel())
    val_y_pred = model.predict(val_X)
    val_y_proba = model.predict_proba(val_X)
    f1 = f1_score(val_y, val_y_pred, average="micro")
    return f1, model, val_y_proba

In [17]:
# RandomizedSearchCV trials reveal that l1 penalty gives the best f1 score, 
# but l1 penalty works with liblinear solver

lr_model_params = {
    "C": [0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 500.0, 1000.0],
    "penalty": ["l1", "l2"],
    "max_iter": [100, 200, 500, 1000],
    "multi_class": ["auto", "ovr", "multinomial"]
}
# Create the Logistic Regression model
lr_model = LogisticRegression(    
    n_jobs=-1, 
    random_state=Config.RANDOM_SEED,
    solver="liblinear",
)
random_search_cv = model_selection.RandomizedSearchCV(
    estimator=lr_model,
    param_distributions=lr_model_params,
    scoring="f1_micro",
    n_jobs=-1    
)    
X_train = normalize_data(df_train, cont_col_names, noncont_col_names)
y_train = df_train[Config.TARGET_COL_NAME+"_encoded"]
random_search_cv.fit(X_train, y_train)
print(f"best params = {random_search_cv.best_params_}")
print(f"best score = {random_search_cv.best_score_}")

best params = {'penalty': 'l1', 'multi_class': 'auto', 'max_iter': 100, 'C': 10.0}
best score = 0.6777327935222672


In [18]:
fold_metrics_model = []
test_preds = {}
model_params = {'C': 1.0, 'penalty': 'l1', 'max_iter': 200, 'multi_class': 'ovr', 'solver': 'liblinear'}

for fold in range(Config.NUM_FOLDS):
    X_train, y_train, X_val, y_val = get_fold_data(
        fold=fold, 
        df=df_train, 
        cont_col_names=cont_col_names, 
        noncont_col_names=noncont_col_names,
        target_col_name=Config.TARGET_COL_NAME+"_encoded"
    )
    fold_f1_score, model, fold_val_pred_proba = run_training(X_train, y_train, X_val, y_val, params=model_params)
    print(f"fold {fold } f1 score = {fold_f1_score}")    
    # add the validation probability predictions for the fold to a new column in train data
    df_train.loc[df_train.kfold == fold, tgt_proba_cols] = fold_val_pred_proba    
    X_test = normalize_data(df_test, cont_col_names, noncont_col_names)
    fold_test_preds = model.predict(X_test)
    pred_col_name = f"fold_{fold}_test_preds"
    test_preds[pred_col_name] = fold_test_preds    
    fold_metrics_model.append((round(fold_f1_score, 4), model))

fold 0 f1 score = 0.6518218623481782
fold 1 f1 score = 0.7206477732793523
fold 2 f1 score = 0.6923076923076923
fold 3 f1 score = 0.6761133603238867
fold 4 f1 score = 0.6882591093117408


In [19]:
df_train

Unnamed: 0,id,hospital_number,rectal_temp,pulse,respiratory_rate,nasogastric_reflux_ph,packed_cell_volume,total_protein,abdomo_protein,lesion_1,...,abdomo_appearance_cloudy,abdomo_appearance_serosanguious,surgical_lesion_no,surgical_lesion_yes,cp_data_no,cp_data_yes,outcome_encoded,outcome_proba_died,outcome_proba_euthanized,outcome_proba_lived
0,753,535381,39.4,86.0,21.0,4.0,48.0,75.0,2.0,3205,...,1,0,0,1,1,0,1,0.003714,0.359330,0.636956
1,582,535029,37.5,112.0,12.0,2.0,54.0,57.0,2.0,4205,...,0,1,0,1,1,0,1,0.072363,0.368163,0.559474
2,548,529461,38.5,72.0,44.0,4.5,53.0,8.6,4.3,2112,...,1,0,0,1,0,1,0,0.466856,0.051546,0.481597
3,113,534157,38.4,40.0,16.0,2.0,52.0,77.0,2.0,2209,...,0,1,0,1,1,0,1,0.012770,0.543790,0.443440
4,174,529777,38.9,40.0,24.0,5.3,36.0,6.0,5.4,2206,...,0,0,0,1,0,1,2,0.209471,0.001933,0.788596
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1230,1044,529340,37.9,60.0,30.0,4.5,35.0,6.6,1.5,5206,...,1,0,0,1,0,1,0,0.167593,0.010994,0.821413
1231,1095,5275212,38.0,96.0,28.0,4.5,69.0,8.1,4.5,2205,...,0,1,0,1,1,0,0,0.802745,0.160523,0.036732
1232,1130,529475,38.4,52.0,30.0,4.0,37.0,6.6,5.0,3111,...,1,0,0,1,0,1,2,0.043094,0.003586,0.953320
1233,860,528570,36.1,50.0,50.0,3.0,35.0,6.5,3.6,2209,...,1,0,0,1,0,1,0,0.196837,0.031486,0.771677
