***Import all the libraries***

In [1]:
import lightgbm as lgb
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt  
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold

from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN


***Load the datasets and convert the confirmation dates to integer type.***

In [29]:
# Read the pre-processed training and test case tables.
train_df = pd.read_csv('../data/cases_train_processed.csv')
test_df = pd.read_csv('../data/cases_test_processed.csv')

# Re-express every confirmation date as a YYYYMMDD integer so the column is
# numeric instead of object-typed. The same conversion is applied to both frames.
for cases_frame in (train_df, test_df):
    confirmation_dates = pd.to_datetime(cases_frame["date_confirmation"])
    cases_frame["date_confirmation"] = confirmation_dates.dt.strftime("%Y%m%d").astype(int)

In [30]:
# Label-encode the categorical columns of the training frame into integer codes.
#
# A separate encoder is fitted for every column and kept in `label_encoders`
# (column name -> fitted LabelEncoder). Re-fitting one shared encoder on each
# column in turn would discard every mapping except the last one, leaving no way
# to encode other data consistently or to decode the integer codes afterwards.
categoricalFeatures = ['sex', 'province', 'country','key','additional_information', 'source']
label_encoders = {}
for feat in categoricalFeatures:
    # `le` is rebound on each pass, so after the loop it still refers to the
    # encoder fitted on the last column, as it did before.
    le = LabelEncoder()
    train_df[feat] = le.fit_transform(train_df[feat])
    label_encoders[feat] = le

In [31]:
# Class distribution of the target column before any resampling.
train_df["outcome"].value_counts()

nonhospitalized    149990
hospitalized       125000
recovered           88137
deceased             4499
Name: outcome, dtype: int64

In [32]:
# Feature matrix: every column except the label.
X = train_df.drop(columns=['outcome'])
# Target vector: the outcome label of each case.
y = train_df['outcome']

In [33]:
# Hold out 20% of the rows for validation and train on the remaining 80%.
# The fixed random_state makes the split repeatable; no `stratify` argument is
# passed, so class proportions in the two parts are left to chance.
training_data, validation_data, training_truth, validation_truth = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=11,
)

In [34]:
# Resample the training split with SMOTE (seeded for repeatability).
# The result is bound back to training_data / training_truth, so every later
# cell trains on the resampled rows; the validation split is not touched here.
# NOTE(review): the categorical columns are integer label codes at this point;
# presumably SMOTE treats them as continuous values - confirm whether a
# categorical-aware sampler would be more appropriate.
oversample = SMOTE(random_state=0)
training_data, training_truth = oversample.fit_resample(training_data, training_truth)

In [35]:
# Class distribution of the training labels after resampling.
training_truth.value_counts()

deceased           119928
nonhospitalized    119928
hospitalized       119928
recovered          119928
Name: outcome, dtype: int64

In [36]:
# Class distribution of the validation labels (this split was not resampled).
validation_truth.value_counts()

nonhospitalized    30062
hospitalized       24908
recovered          17643
deceased             913
Name: outcome, dtype: int64

# 1. K-Nearest Neighbours Classifier
### Testing with the oversampled training data

In [49]:
# Build a 9-neighbour KNN classifier with weights='distance' and fit it on the
# resampled training split.
knn = KNeighborsClassifier(n_neighbors = 9, weights = 'distance')
knn.fit(training_data, training_truth)

KNeighborsClassifier(n_neighbors=9, weights='distance')

In [50]:
# Predict with the fitted KNN model on the data it was trained on and on the
# held-out validation split, so the two sets of scores can be compared.
training_prediction = knn.predict(training_data)
validation_prediction = knn.predict(validation_data)

In [51]:
# Overall accuracy of the KNN predictions on each split.
training_accuracy = metrics.accuracy_score(y_true=training_truth, y_pred=training_prediction)
validation_accuracy = metrics.accuracy_score(y_true=validation_truth, y_pred=validation_prediction)

# Per-class precision / recall / F1 / support tables.
scores_training = metrics.classification_report(y_true=training_truth, y_pred=training_prediction)
scores_validation = metrics.classification_report(y_true=validation_truth, y_pred=validation_prediction)

print("K-Nearest Neighbours Model Predictions:\n")

# Training-split results first, then the validation-split results.
print('TRAINING\nAccuracy score: {0:0.5f}'.format(training_accuracy))
print('Classification report: \n',scores_training)
print('\nVALIDATION\nAccuracy score: {0:0.5f}'.format(validation_accuracy))
print('Classification report: \n',scores_validation)

K-Nearest Neighbours Model Predictions:

TRAINING
Accuracy score: 0.78873
Classification report: 
                  precision    recall  f1-score   support

       deceased       0.69      0.78      0.73    119928
   hospitalized       0.74      0.68      0.71    119928
nonhospitalized       1.00      1.00      1.00    119928
      recovered       0.74      0.70      0.72    119928

       accuracy                           0.79    479712
      macro avg       0.79      0.79      0.79    479712
   weighted avg       0.79      0.79      0.79    479712


VALIDATION
Accuracy score: 0.79084
Classification report: 
                  precision    recall  f1-score   support

       deceased       0.05      0.49      0.09       913
   hospitalized       0.85      0.66      0.74     24908
nonhospitalized       0.99      0.99      0.99     30062
      recovered       0.73      0.66      0.70     17643

       accuracy                           0.79     73526
      macro avg       0.66      0.70 

# 2. Random Forests Classifier
### 2.1 Building the model
*Using the same training and validation dataset split from KNN, the Random Forests classifier is built and saved as a pickle.*

# 3. LightGBM Classifier
### 3.1 Building the model
*First, convert all the categorical features into the category type which is used by LightGBM for processing categorical data.*

*Split the dataframe into the training data and validation data after separating the outcomes column from the rest of the dataset.*

In [37]:
#'min_data_in_leaf': [20, 30, 50, 100, 300]

def calc_deceasedRecall(truth, prediction):
    """Return the recall of the 'deceased' class.

    Args:
        truth: ground-truth outcome labels.
        prediction: predicted outcome labels, aligned with ``truth``.

    Returns:
        The recall computed for the 'deceased' label only.
    """
    # Select the class by name. With average=None alone, element 0 is whichever
    # label sorts first among those present, which is only 'deceased' as long as
    # that class appears in the fold being scored.
    return metrics.recall_score(truth, prediction, labels=['deceased'], average=None)[0]

def calc_deceasedPrecision(truth, prediction):
    """Return the precision of the 'deceased' class.

    Args:
        truth: ground-truth outcome labels.
        prediction: predicted outcome labels, aligned with ``truth``.

    Returns:
        The precision computed for the 'deceased' label only.
    """
    # Select the class by name. With average=None alone, element 0 is whichever
    # label sorts first among those present, which is only 'deceased' as long as
    # that class appears in the fold being scored.
    return metrics.precision_score(truth, prediction, labels=['deceased'], average=None)[0]

def calc_deceasedF1(truth, prediction):
    """Return the F1-score of the 'deceased' class.

    Args:
        truth: ground-truth outcome labels.
        prediction: predicted outcome labels, aligned with ``truth``.

    Returns:
        The F1-score computed for the 'deceased' label only.
    """
    # Select the class by name. With average=None alone, element 0 is whichever
    # label sorts first among those present, which is only 'deceased' as long as
    # that class appears in the fold being scored.
    return metrics.f1_score(truth, prediction, labels=['deceased'], average=None)[0]

In [38]:

# Scorers computed for every grid-search candidate, keyed by the name under
# which they show up in cv_results_ (mean_test_<name>, rank_test_<name>).
# Two target the 'deceased' class specifically; two summarise all classes.
scoring_metrics = dict(
    f1_deceased=metrics.make_scorer(calc_deceasedF1),
    recall_deceased=metrics.make_scorer(calc_deceasedRecall),
    overall_accuracy=metrics.make_scorer(metrics.accuracy_score),
    overall_recall=metrics.make_scorer(metrics.recall_score, average='macro'),
)

# Hyper-parameter grid explored by the grid search: 3 x 3 x 2 = 18 combinations.
param_grid = {
    'num_leaves': [60,90,120],
    'n_estimators': [100,200,300],
    # boosting type is currently left out of the search:
    # 'boosting_type': ['gbdt', 'dart', 'goss'],
    'min_data_in_leaf': [60,80]
    }
    # searching this grid took about 1.5 hours

In [39]:
# Exhaustive 3-fold grid search over `param_grid` with a default LightGBM
# classifier as the base estimator. All scorers in `scoring_metrics` are
# recorded; the final refit uses the candidate with the best 'f1_deceased'.
# NOTE(review): training_data has already been SMOTE-resampled at this point,
# so the CV folds are drawn from resampled rows; the CV scores may therefore be
# optimistic compared with the untouched validation split - confirm.
lgb_model = lgb.LGBMClassifier()
lgb_grid_search = GridSearchCV(
    estimator=lgb_model,
    param_grid=param_grid,
    scoring=scoring_metrics,
    refit='f1_deceased',
    cv=3,
    n_jobs=-1,
    verbose=-1,
)
lgb_grid_search.fit(training_data, training_truth)



GridSearchCV(cv=3, estimator=LGBMClassifier(), n_jobs=-1,
             param_grid={'min_data_in_leaf': [60, 80],
                         'n_estimators': [100, 200, 300],
                         'num_leaves': [60, 90, 120]},
             refit='f1_deceased',
             scoring={'f1_deceased': make_scorer(calc_deceasedF1),
                      'overall_accuracy': make_scorer(accuracy_score),
                      'overall_recall': make_scorer(recall_score, average=macro),
                      'recall_deceased': make_scorer(calc_deceasedRecall)},
             verbose=-1)

In [40]:
# Condense the grid-search results into one table: the three tuned parameters
# followed by the mean cross-validation score of every metric.
reported_columns = [
    'param_num_leaves',
    'param_n_estimators',
    'param_min_data_in_leaf',
    'mean_test_f1_deceased',
    'rank_test_f1_deceased',
    'mean_test_recall_deceased',
    'mean_test_overall_accuracy',
    'mean_test_overall_recall',
]
lgbm_results = pd.DataFrame(lgb_grid_search.cv_results_)[reported_columns]
# best params (rank 1 on f1_deceased): num_leaves = 120, n_estimators = 300, min_data_in_leaf = 60


In [41]:
# Display the grid-search summary table.
lgbm_results

Unnamed: 0,param_num_leaves,param_n_estimators,param_min_data_in_leaf,mean_test_f1_deceased,rank_test_f1_deceased,mean_test_recall_deceased,mean_test_overall_accuracy,mean_test_overall_recall
0,60,100,60,0.736306,17,0.77454,0.781033,0.781033
1,90,100,60,0.748635,15,0.793109,0.789301,0.789301
2,120,100,60,0.756528,11,0.797737,0.794746,0.794746
3,60,200,60,0.756363,12,0.796495,0.794668,0.794668
4,90,200,60,0.761691,9,0.804341,0.799311,0.799311
5,120,200,60,0.764905,5,0.804816,0.802513,0.802513
6,60,300,60,0.761737,8,0.800188,0.799388,0.799388
7,90,300,60,0.766705,3,0.811996,0.803784,0.803784
8,120,300,60,0.769125,1,0.811045,0.805519,0.805519
9,60,100,80,0.73554,18,0.770095,0.779726,0.779726


In [42]:
# Persist the grid-search summary as CSV; index=False leaves the row index out of the file.
lgbm_results.to_csv('../results/smote_lgbm.csv', index=False)

*Create and fit the LightGBM Classifier and save it as a pickle file.*

In [43]:
# Fit LightGBM on the resampled training split using the best parameters found
# by the grid search (num_leaves=120, n_estimators=300, min_data_in_leaf=60).
lgbm_model = lgb.LGBMClassifier(num_leaves = 120, n_estimators=300,min_data_in_leaf=60)
# unused alternative kept for reference:
#fit_params={'feature_name': 'auto', 'categorical_feature': 'auto'}
lgbm_model.fit(training_data, training_truth)

LGBMClassifier(min_data_in_leaf=60, n_estimators=300, num_leaves=120)

In [46]:
# Fit a LightGBM classifier with default parameters on the resampled training split.
# NOTE(review): this rebinds `lgbm_model`, so when the notebook is run top to
# bottom the tuned model fitted in the previous cell is discarded and every later
# cell (predictions, reports, confusion matrices) uses this default model.
lgbm_model = lgb.LGBMClassifier()
lgbm_model.fit(training_data, training_truth)

LGBMClassifier()

In [47]:
# Predict with whichever model `lgbm_model` currently refers to.
# Predictions on the (resampled) training data:
training_prediction = lgbm_model.predict(training_data)
# Predictions on the held-out validation data:
validation_prediction = lgbm_model.predict(validation_data)

In [None]:
# Score the current LightGBM predictions against the labels they were made for.
# This cell used to reference `training_truth1` / `validation_truth1`, which are
# not defined anywhere in the notebook and raised NameError on a fresh run.
training_accuracy = metrics.accuracy_score(training_truth, training_prediction)
scores_training = metrics.classification_report(training_truth,training_prediction)
validation_accuracy = metrics.accuracy_score(validation_truth, validation_prediction)
scores_validation = metrics.classification_report(validation_truth,validation_prediction)
print("LightGBM Model Predictions:\n")
print('TRAINING\nAccuracy score: {0:0.5f}'.format(training_accuracy))
print('Classification report: \n',scores_training)
print('\nVALIDATION\nAccuracy score: {0:0.5f}'.format(validation_accuracy))
print('Classification report: \n',scores_validation)

# TODO: the original cell was labelled "SCORES WITHOUT OVERSAMPLING" for
# lgb.LGBMClassifier(boosting_type='gbdt',num_leaves = 120, n_estimators=300,min_data_in_leaf=60).
# A true non-oversampled baseline needs its own model fitted on the pre-SMOTE
# training split; `training_prediction` here covers the resampled rows.

### 3.2 Evaluating the model
*The metrics used to evaluate the model are the Accuracy score, Precision, Recall, F1-score, and the support count.*

In [45]:
# Evaluation on the SMOTE-resampled training split and the validation split.
# Per the original note, the saved output of this cell was produced with
# lgb.LGBMClassifier(num_leaves = 120, n_estimators=300,min_data_in_leaf=60).

# Overall accuracy on each split.
training_accuracy = metrics.accuracy_score(y_true=training_truth, y_pred=training_prediction)
validation_accuracy = metrics.accuracy_score(y_true=validation_truth, y_pred=validation_prediction)

# Per-class precision / recall / F1 / support tables.
scores_training = metrics.classification_report(y_true=training_truth, y_pred=training_prediction)
scores_validation = metrics.classification_report(y_true=validation_truth, y_pred=validation_prediction)

print("LightGBM Model Predictions:\n")
print('TRAINING\nAccuracy score: {0:0.5f}'.format(training_accuracy))
print('Classification report: \n',scores_training)
print('\nVALIDATION\nAccuracy score: {0:0.5f}'.format(validation_accuracy))
print('Classification report: \n',scores_validation)


LightGBM Model Predictions:

TRAINING
Accuracy score: 0.82156
Classification report: 
                  precision    recall  f1-score   support

       deceased       0.76      0.81      0.79    119928
   hospitalized       0.79      0.71      0.75    119928
nonhospitalized       1.00      1.00      1.00    119928
      recovered       0.74      0.77      0.75    119928

       accuracy                           0.82    479712
      macro avg       0.82      0.82      0.82    479712
   weighted avg       0.82      0.82      0.82    479712


VALIDATION
Accuracy score: 0.82590
Classification report: 
                  precision    recall  f1-score   support

       deceased       0.07      0.43      0.12       913
   hospitalized       0.86      0.71      0.78     24908
nonhospitalized       0.99      0.99      0.99     30062
      recovered       0.74      0.73      0.74     17643

       accuracy                           0.83     73526
      macro avg       0.67      0.72      0.66   

In [48]:
# Evaluation of the latest LightGBM predictions on both splits.
# Per the original note, the saved output of this cell was produced with
# lgb.LGBMClassifier() (default parameters).

# Overall accuracy on each split.
training_accuracy = metrics.accuracy_score(y_true=training_truth, y_pred=training_prediction)
validation_accuracy = metrics.accuracy_score(y_true=validation_truth, y_pred=validation_prediction)

# Per-class precision / recall / F1 / support tables.
scores_training = metrics.classification_report(y_true=training_truth, y_pred=training_prediction)
scores_validation = metrics.classification_report(y_true=validation_truth, y_pred=validation_prediction)

print("LightGBM Model Predictions:\n")
print('TRAINING\nAccuracy score: {0:0.5f}'.format(training_accuracy))
print('Classification report: \n',scores_training)
print('\nVALIDATION\nAccuracy score: {0:0.5f}'.format(validation_accuracy))
print('Classification report: \n',scores_validation)

LightGBM Model Predictions:

TRAINING
Accuracy score: 0.77084
Classification report: 
                  precision    recall  f1-score   support

       deceased       0.70      0.75      0.72    119928
   hospitalized       0.73      0.64      0.68    119928
nonhospitalized       0.99      0.99      0.99    119928
      recovered       0.67      0.70      0.69    119928

       accuracy                           0.77    479712
      macro avg       0.77      0.77      0.77    479712
   weighted avg       0.77      0.77      0.77    479712


VALIDATION
Accuracy score: 0.79466
Classification report: 
                  precision    recall  f1-score   support

       deceased       0.07      0.50      0.12       913
   hospitalized       0.84      0.65      0.74     24908
nonhospitalized       0.99      0.99      0.99     30062
      recovered       0.69      0.68      0.69     17643

       accuracy                           0.79     73526
      macro avg       0.65      0.71      0.63   

*We also plot confusion matrices to give better insight into the predictions.*

In [None]:
def _draw_confusion_matrix(model, data, truth, axis, title):
    """Draw the confusion matrix of `model` on (`data`, `truth`) into `axis`.

    `metrics.plot_confusion_matrix` was deprecated in scikit-learn 1.0 and
    removed in 1.2; `ConfusionMatrixDisplay.from_estimator` is its replacement.
    The old helper is only used when the new one is not available.
    """
    if hasattr(metrics.ConfusionMatrixDisplay, 'from_estimator'):
        metrics.ConfusionMatrixDisplay.from_estimator(
            model, data, truth, cmap = plt.cm.Blues, ax = axis, values_format = '.6g')
    else:
        metrics.plot_confusion_matrix(
            model, data, truth, cmap = plt.cm.Blues, ax = axis, values_format = '.6g')
    axis.set_title(title)


# Training (left) and validation (right) confusion matrices side by side.
fig, ax = plt.subplots(figsize = (20, 15), nrows = 1, ncols = 2)
_draw_confusion_matrix(lgbm_model, training_data, training_truth, ax[0],
                       'LightGBM - Confusion Matrix of Training Data')
_draw_confusion_matrix(lgbm_model, validation_data, validation_truth, ax[1],
                       'LightGBM - Confusion Matrix of Validation Data')

# figure settings
fig.tight_layout()
fig.subplots_adjust(top=0.4)
fig.subplots_adjust(right=0.9)
#fig.savefig('../plots/lgbm_cm.png', bbox_inches='tight', pad_inches=0.3)

***Based on the evaluation metrics (the confusion matrix and the per-class support), the outcome class labels are imbalanced. This is likely a major contributing factor in the misclassifications, and it is something we hope to fix in the next milestone.***