# LightGBM Classifier

**Model Template**

Costa Rica <br/>
Belinda Brown, belindabrownr04@gmail.com <br/>
April, 2021 <br/>

In [None]:
# Python 3.6 
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # data visualization
import argparse
import os
import glob
import joblib
from azureml.core import Run
from utils import load_data
import seaborn as sns # statistical data visualization

## Read data (input)

In [None]:
# Load the input CSV into a DataFrame, dropping (and reporting) malformed rows.
import pandas as pd

csv_path = "./file_name.csv"
read_options = dict(sep=',', encoding='utf-8', engine='python')

try:
    # pandas >= 1.3: `on_bad_lines` replaces `error_bad_lines`/`warn_bad_lines`
    # (the old flags were removed in pandas 2.0). 'warn' skips each malformed
    # row and emits a warning, matching `error_bad_lines=False` combined with
    # the old default `warn_bad_lines=True`.
    pd_df = pd.read_csv(csv_path, on_bad_lines='warn', **read_options)
except TypeError:
    # Older pandas rejects the unknown `on_bad_lines` keyword; use the legacy flag.
    pd_df = pd.read_csv(csv_path, error_bad_lines=False, **read_options)

print("Data Frame Shape:  ", pd_df.shape)

## View summary of dataset

In [None]:
# Print a concise summary of the DataFrame (columns, non-null counts, dtypes)
pd_df.info()

## Check the distribution of target variable

In [None]:
# Count how many rows fall into each class of the target to spot class imbalance.
# NOTE(review): 'target_column' is a literal column name here, not the
# `target_column` variable declared further down -- presumably a template
# placeholder; confirm against the real dataset.
pd_df['target_column'].value_counts()

## Declare feature vector and target variable

In [None]:
# Group the dataset columns by role (target / numerical / categorical / excluded).
# NOTE(review): `mapped_dataset` is not defined anywhere in this notebook as
# shown, so this cell raises NameError unless it is created beforehand.
# NOTE(review): each lookup indexes with a 2-tuple key, e.g.
# ('column_name_0', 'column_name_1'); what that returns depends on the type of
# `mapped_dataset` -- confirm it yields the column name(s) expected by
# `pd_df[...]` below. The names look like template placeholders.
target_column = mapped_dataset['column_name_0', 'column_name_1']
numerical_columns = mapped_dataset['column_name_2', 'column_name_5']
categorical_columns = mapped_dataset['column_name_6', 'column_name_7']
exclude_columns = mapped_dataset['column_name_8', 'column_name_9']

## Split dataset into training and test set

In [None]:
# Feature matrix: only the numerical columns are used
# (categorical_columns and exclude_columns are not selected here).
X = pd_df[numerical_columns]
# Target to predict.
y = pd_df[target_column]

In [None]:
# Hold out 30% of the rows as the test set; the fixed seed keeps the split reproducible
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

## LightGBM Model Development and Training

In [None]:
# Create a LightGBM classifier; no arguments are passed, so the library
# defaults are used for every hyperparameter.
import lightgbm as lgb
clf = lgb.LGBMClassifier()

In [None]:
# Train the classifier on the training split only
clf.fit(X_train, y_train)

In [None]:
# Show the shape of every split, one per line, as a sanity check
split_shapes = [part.shape for part in (X_train, y_train, X_test, y_test)]
for shape in split_shapes:
    print(shape)

## Model Prediction

In [None]:
# Get the current Azure ML run (azureml.core.Run) so metrics can be logged to it later
run = Run.get_context()

In [None]:
# Predict class labels for the held-out test set
y_pred=clf.predict(X_test)

## Accuracy

In [None]:
# Accuracy on the held-out test set: fraction of test rows predicted correctly.
from sklearn.metrics import accuracy_score

# Computed once and reused for the report line below; accuracy is symmetric in
# its two arguments, so this equals the value the message is formatted with.
accuracy = accuracy_score(y_test, y_pred)
print('LightGBM Model accuracy score: {0:0.4f}'.format(accuracy))

In [None]:
# Log the test-set accuracy to the Azure ML run.
# The metric is computed from `y_pred` (test predictions), which already exists
# at this point; `y_pred_train` is only created in a later cell. The built-in
# float() is used because the `np.float` alias was removed in NumPy 1.24.
run.log('accuracy', float(accuracy_score(y_test, y_pred)))

## Compare train and test set accuracy

In [None]:
# Predict on the training data itself, to compare against the test-set accuracy
y_pred_train = clf.predict(X_train)

In [None]:
# Accuracy of the model on the data it was trained on
train_accuracy = accuracy_score(y_train, y_pred_train)
print('Training-set accuracy score: {0:0.4f}'.format(train_accuracy))

## Check for Overfitting

In [None]:
# Score the classifier on both splits; a training score far above the test
# score is a sign of overfitting.
train_score = clf.score(X_train, y_train)
test_score = clf.score(X_test, y_test)

print('Training set score: {:.4f}'.format(train_score))
print('Test set score: {:.4f}'.format(test_score))

## Confusion Matrix

In [None]:
# Print the confusion matrix and slice it into its four cells.
#
# scikit-learn lays the matrix out with the TRUE labels on the rows and the
# PREDICTED labels on the columns, both in sorted label order. For a binary
# 0/1 target (1 = positive) that gives:
#
#     cm[0, 0] = TN    cm[0, 1] = FP
#     cm[1, 0] = FN    cm[1, 1] = TP
#
# NOTE(review): the four-way breakdown only makes sense for a binary target
# whose positive class sorts last -- confirm for the real dataset.

from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
print('Confusion matrix\n\n', cm)
print('\nTrue Positives(TP) = ', cm[1,1])
print('\nTrue Negatives(TN) = ', cm[0,0])
print('\nFalse Positives(FP) = ', cm[0,1])
print('\nFalse Negatives(FN) = ', cm[1,0])

In [None]:
# Visualize the confusion matrix as an annotated seaborn heatmap.
#
# `cm` comes from sklearn's confusion_matrix(y_test, y_pred): rows are the
# actual classes and columns are the predicted classes, both in sorted label
# order (0 before 1), so the axis labels must follow that layout.
# NOTE(review): labels assume a binary 0/1 target with 1 as the positive class.

cm_matrix = pd.DataFrame(data=cm, columns=['Predict Negative:0', 'Predict Positive:1'],
                                 index=['Actual Negative:0', 'Actual Positive:1'])

sns.heatmap(cm_matrix, annot=True, fmt='d', cmap='YlGnBu')

In [None]:
# Plot the confusion matrix of the fitted classifier twice: raw counts, then
# normalized over the true labels (each row sums to 1).
try:
    # scikit-learn >= 1.0: plot_confusion_matrix was deprecated in 1.0 and
    # removed in 1.2 in favour of ConfusionMatrixDisplay.from_estimator.
    from sklearn.metrics import ConfusionMatrixDisplay
    plot_cm = ConfusionMatrixDisplay.from_estimator
except (ImportError, AttributeError):
    # Older scikit-learn: fall back to the legacy helper.
    from sklearn.metrics import plot_confusion_matrix as plot_cm

np.set_printoptions(precision=2)

# (plot title, `normalize` argument) for each figure
titles_options = [("Confusion matrix, without normalization", None),
                  ("Normalized confusion matrix", 'true')]
for title, normalize in titles_options:
    # Tick labels come from the classes the model was fitted on; the
    # previously referenced `scalar_v` is not defined in this notebook.
    disp = plot_cm(clf, X_test, y_test,
                   display_labels=clf.classes_,
                   cmap=plt.cm.Blues,
                   normalize=normalize)
    disp.ax_.set_title(title)

    print(title)
    print(disp.confusion_matrix)

## Classification Metrics

In [None]:
# Text report of the main classification metrics on the test set
from sklearn.metrics import classification_report

report = classification_report(y_test, y_pred)
print(report)

## Save the model pkl

In [None]:
# Make sure the outputs folder exists; exist_ok=True makes re-running the cell safe
os.makedirs('./outputs/', exist_ok=True)

In [None]:
# Serialize the fitted classifier with joblib.
# note: a file saved in the outputs folder is automatically uploaded into the experiment record
model_path = './outputs/model_name.pkl'
joblib.dump(clf, model_path)

## References 

[1] From https://www.kaggle.com/prashant111/lightgbm-classifier-in-python#Classification-Metrices <br/>
[2] From https://docs.microsoft.com/en-us/azure/machine-learning/tutorial-train-models-with-aml <br/>
[3] From https://github.com/microsoft/LightGBM/blob/master/examples/python-guide/advanced_example.py <br/>