<a href="https://colab.research.google.com/github/anabarrerar/Machine_Learning/blob/main/Ensemble_Boosting/lgbm_xgb_catboost.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install and Import libraries

In [None]:
#!pip install catboost

In [None]:
#!pip install xgboost

In [None]:
#!pip install lightgbm

In [None]:
#Importing machine learning algorithms
import lightgbm as lgb
import xgboost as xgb
import catboost as cb

#Importing other packages 
import timeit
import pandas as pd
import numpy as np

#Importing packages for machine learning operations
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import log_loss

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the training set from train.csv in the current working directory.
train = pd.read_csv("train.csv")

In [None]:
# Map the 'EJ' labels 'A'/'B' to 0/1; any other value is left untouched by replace().
train['EJ'] = train['EJ'].replace(to_replace={'A': 0, 'B': 1})

In [None]:
# The last column is the target; the features are every column between the
# first one and the target.
# NOTE(review): the skipped first column is presumably a row identifier - confirm.
label = train.columns[-1]
features = train.columns[1:-1]

In [None]:
# Feature matrix and target vector.
X = train[features]
y = train[label]

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=2023,
)

In [None]:
# Metric evaluation
def metrics(y_pred_test):
    """Print the log loss of ``y_pred_test`` measured against the notebook-level ``y_test``."""
    print('Log Loss:', log_loss(y_test, y_pred_test))

In [None]:
# Train a model, log training and prediction time, and report its performance
def run_model(model, description, key, cat_features=None):
    """Fit ``model`` on the training split, time it, and print its test log loss.

    Uses the notebook-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``
    and delegates the final report to ``metrics``.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit`` and ``predict_proba``.
    description : str
        Label printed before the timings.
    key : str
        Selects the extra arguments passed to ``fit``:
        ``'LGB'`` forwards ``cat_features`` as ``categorical_feature``;
        ``'CAT'`` forwards ``cat_features`` and also passes the test split as
        ``eval_set`` with ``use_best_model=True``;
        any other value fits with no extra arguments.
    cat_features : list, optional
        Categorical feature names. ``None`` (the default) means no
        categorical features.
    """
    # A fresh list per call: a mutable default would be shared between calls.
    cat_features = [] if cat_features is None else list(cat_features)

    # Only the keyword arguments of fit() differ between the libraries.
    if key == 'LGB':
        fit_kwargs = {'categorical_feature': cat_features}
    elif key == 'CAT':
        # NOTE(review): the test split is also used to pick the best iteration,
        # so the loss reported below for CatBoost is optimistic.
        fit_kwargs = {
            'eval_set': (X_test, y_test),
            'cat_features': cat_features,
            'use_best_model': True,
        }
    else:
        fit_kwargs = {}

    # Description
    print("Description:", description)

    # Training session
    start = timeit.default_timer()
    model.fit(X_train, y_train, **fit_kwargs)
    stop = timeit.default_timer()
    print("Training time:", stop - start)

    # Prediction session
    # Log loss is defined on predicted probabilities. Hard labels from
    # predict() would be scored as fully confident probabilities and inflate
    # the loss, so the class probabilities are used instead.
    start = timeit.default_timer()
    y_pred_test = model.predict_proba(X_test)
    stop = timeit.default_timer()
    print("Prediction time:", stop - start)

    # Performance evaluation
    metrics(y_pred_test)

## LightGBM

In [None]:
# Baseline: LightGBM with default hyperparameters, 'EJ' passed as a categorical feature.
model_lgb = lgb.LGBMClassifier()
run_model(
    model_lgb,
    description='Default LightGBM with categorical support',
    key='LGB',
    cat_features=["EJ"],
)

Description: Default LightGBM with categorical support
Training time: 0.17611814399998593
Prediction time: 0.002652114999989408
Log Loss: 3.8756616547437797


## XGBoost

In [None]:
# Baseline: XGBoost with default hyperparameters.
model_xgb = xgb.XGBClassifier()
run_model(
    model_xgb,
    description='Default XGBoost',
    key='XGB',
)

Description: Default XGBoost
Training time: 1.4759554290000096
Prediction time: 0.00368457600001193
Log Loss: 2.906746241057835


## CatBoost

In [None]:
# Baseline: CatBoost with default hyperparameters, 'EJ' passed as a categorical feature.
model_cat = cb.CatBoostClassifier()
run_model(
    model_cat,
    description='Default Catboost with categorical support',
    key='CAT',
    cat_features=["EJ"],
)

Description: Default Catboost with categorical support
Learning rate set to 0.025741
0:	learn: 0.6688296	test: 0.6730320	best: 0.6730320 (0)	total: 58.5ms	remaining: 58.4s
1:	learn: 0.6377709	test: 0.6479359	best: 0.6479359 (1)	total: 67.8ms	remaining: 33.8s
2:	learn: 0.6181995	test: 0.6315047	best: 0.6315047 (2)	total: 77.2ms	remaining: 25.7s
3:	learn: 0.5940528	test: 0.6138874	best: 0.6138874 (3)	total: 86.6ms	remaining: 21.6s
4:	learn: 0.5747082	test: 0.5980559	best: 0.5980559 (4)	total: 96.1ms	remaining: 19.1s
5:	learn: 0.5495636	test: 0.5773529	best: 0.5773529 (5)	total: 107ms	remaining: 17.7s
6:	learn: 0.5307664	test: 0.5632692	best: 0.5632692 (6)	total: 116ms	remaining: 16.5s
7:	learn: 0.5102392	test: 0.5491974	best: 0.5491974 (7)	total: 125ms	remaining: 15.6s
8:	learn: 0.4879853	test: 0.5346828	best: 0.5346828 (8)	total: 135ms	remaining: 14.9s
9:	learn: 0.4725601	test: 0.5218807	best: 0.5218807 (9)	total: 144ms	remaining: 14.3s
10:	learn: 0.4601873	test: 0.5131602	best: 0.51316

## Hyperparameter tuning

### LightGBM 

In [None]:
# Exhaustive 5-fold grid search over LightGBM hyperparameters, scored by log loss.
start = timeit.default_timer()

# NOTE(review): LightGBM documents its binary log-loss metric as
# 'binary_logloss'; confirm that 'log_loss' is an accepted metric name.
lgb_estimator = lgb.LGBMClassifier(boosting_type='gbdt', objective='binary', metric='log_loss')
param_dist = {"max_depth": [3,4,7],
               "learning_rate" : [0.01,0.05,0.08, 0.1],
               "num_leaves": [50,100,200],
               "n_estimators": [100,500,1000]
              }

# Tune the estimator configured above; it is the only LightGBM estimator
# defined in this cell, so the cell runs on a fresh kernel.
grid_search = GridSearchCV(lgb_estimator, n_jobs=-1, param_grid=param_dist, cv = 5, scoring="neg_log_loss", verbose=5)
grid_search.fit(X_train,y_train)
print(grid_search.best_estimator_)

stop = timeit.default_timer()
print('Time: ', stop - start) 


In [None]:
# Hyperparameters used for the tuned LightGBM model.
params = dict(
    max_depth=4,
    learning_rate=0.08,
    num_leaves=100,
    n_estimators=1000,
)

In [None]:
# Tuned LightGBM, with 'EJ' passed as a categorical feature.
model_lgb_cat_tun = lgb.LGBMClassifier(
    boosting_type='gbdt',
    objective='binary',
    metric='log_loss',
    **params,
)
run_model(
    model_lgb_cat_tun,
    description='Tuned LightGBM with categorical support',
    key='LGB',
    cat_features=["EJ"],
)
     

### XGBoost

In [None]:
from xgboost import XGBClassifier

# Exhaustive 3-fold grid search over XGBoost hyperparameters.
start = timeit.default_timer()
xgb_model = xgb.XGBClassifier()
param_dist = {"max_depth": [2,5,7,10],
              "min_child_weight" : [2,6,10,],
              "n_estimators": [100, 500, 1000],
              "learning_rate" : [0.01,0.05,0.08, 0.1, 0.5, 0.8]}

# Score by log loss so the search optimises the metric this notebook reports.
# With no `scoring`, GridSearchCV uses the estimator's default score (accuracy
# for classifiers), which disagrees with the LightGBM search.
grid_search = GridSearchCV(xgb_model, param_grid=param_dist, cv = 3,
                           scoring="neg_log_loss",
                           verbose=10, n_jobs=-1)
grid_search.fit(X_train, y_train)
print(grid_search.best_estimator_)

stop = timeit.default_timer()
print('Time: ', stop - start)  

In [None]:
# Hyperparameters used for the tuned XGBoost model.
params = dict(
    max_depth=5,
    learning_rate=0.8,
    min_child_weight=6,
    n_estimators=1000,
)

In [None]:
# XGBoost rebuilt with the tuned hyperparameters held in `params`.
model_xgb_tun = xgb.XGBClassifier(**params)
run_model(
    model_xgb_tun,
    description='Tuned XGBoost',
    key='XGB',
)

### CatBoost

In [None]:
# Exhaustive 5-fold grid search over CatBoost hyperparameters.
start = timeit.default_timer()

# verbose=0 silences CatBoost's per-iteration log: the grid below holds 300
# combinations, i.e. 1500 fits, and the log would otherwise bury the result.
cb_model = cb.CatBoostClassifier(verbose=0)

# Named `param_dist` like the other searches, so the grid does not share a
# name with the tuned-parameter dict `params` used when building final models.
param_dist = {'depth': [2, 6, 10],
              'learning_rate' : [0.01, 0.05, 0.08, 0.1,0.5],
              'l2_leaf_reg': [1, 3, 5, 7, 9],
              'iterations': [500, 700, 900, 1000]}

# Score by log loss so the search optimises the metric this notebook reports.
# With no `scoring`, GridSearchCV uses the estimator's default score.
grid_search = GridSearchCV(cb_model, param_grid=param_dist, cv = 5, scoring="neg_log_loss")
grid_search.fit(X_train, y_train)
print(grid_search.best_params_)

stop = timeit.default_timer()
print('Time: ', stop - start) 

In [None]:
# Hyperparameters used for the tuned CatBoost model.
params = dict(
    depth=10,
    learning_rate=0.5,
    iterations=1000,
    l2_leaf_reg=5,
)

In [None]:
# Tuned CatBoost with categorical feature support: 'EJ' is passed as a
# categorical feature, so the printed description says so.
model_cat_tun = cb.CatBoostClassifier(**params)
run_model(model_cat_tun, 'Tuned Catboost with categorical support', key='CAT', cat_features=["EJ"])