In [56]:
import lightgbm as lgb
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

***First, load the dataset and drop the irrelevant columns***

In [25]:
# Load the processed training cases and apply light preprocessing in one chain:
#   1. read the CSV,
#   2. discard latitude_x / longitude_x (flagged by the author as
#      inconsistent/irrelevant),
#   3. re-encode date_confirmation as a YYYYMMDD integer so it is numeric
#      rather than an object column.
df = (
    pd.read_csv('../data/cases_train_processed.csv')
    .drop(columns=['latitude_x', 'longitude_x'])
    .assign(
        date_confirmation=lambda frame: (
            pd.to_datetime(frame["date_confirmation"])
            .dt.strftime("%Y%m%d")
            .astype(int)
        )
    )
)
df.head(5)

Unnamed: 0,age,sex,province,country,date_confirmation,additional_information,source,outcome,key,latitude_y,longitude_y,confirmed_sum,death_sum,recovered_sum,incidence_rate_avg,active_sum,Case-Fatality_Ratio
0,44.0,Unknown,Delhi,India,20200526,0,1,recovered,"Delhi, India",28.646519,77.10898,238828.0,4907.0,201671.0,1276.409575,32250.0,2.054617
1,44.0,Unknown,Uttar Pradesh,India,20200520,0,1,hospitalized,"Uttar Pradesh, India",26.925425,80.560982,342788.0,4869.0,270094.0,144.099577,67825.0,1.420411
2,44.0,Unknown,Maharashtra,India,20200526,0,1,hospitalized,"Maharashtra, India",19.449759,76.108221,1167496.0,31791.0,834432.0,948.072083,301273.0,2.723007
3,24.0,female,Baden-Wurttemberg,Germany,20200315,0,0,nonhospitalized,"Baden-Wurttemberg, Germany",48.6616,9.3501,46779.0,1871.0,41228.0,422.592353,3680.0,3.999658
4,44.0,Unknown,Gujarat,India,20200520,0,1,hospitalized,"Gujarat, India",22.694884,71.590923,120336.0,3286.0,100974.0,188.400627,16076.0,2.730687


## LightGBM Model


In [28]:
# Cast every object-typed (or already categorical) column to the pandas
# 'category' dtype. LightGBM's scikit-learn wrapper can consume such columns
# directly as categorical features, so no manual encoding is done here.
categorical_columns = [
    name
    for name, dtype in df.dtypes.items()
    if dtype == 'object' or dtype.name == 'category'
]
for name in categorical_columns:
    df[name] = df[name].astype('category')

In [44]:
# Sanity check: list each column's dtype after the category conversion.
df.dtypes

age                        float64
sex                       category
province                  category
country                   category
date_confirmation            int64
additional_information       int64
source                       int64
outcome                   category
key                       category
latitude_y                 float64
longitude_y                float64
confirmed_sum              float64
death_sum                  float64
recovered_sum              float64
incidence_rate_avg         float64
active_sum                 float64
Case-Fatality_Ratio        float64
dtype: object

In [48]:
# 'outcome' is the prediction target; every other column is a feature.
X = df.drop(columns=['outcome'])
y = df['outcome']

# Hold out 20% of the rows for validation. The fixed random_state keeps the
# partition the same across re-runs.
training_data, validation_data, training_truth, validation_truth = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.20,
    random_state=11,
)
training_data.head(5)

Unnamed: 0,age,sex,province,country,date_confirmation,additional_information,source,key,latitude_y,longitude_y,confirmed_sum,death_sum,recovered_sum,incidence_rate_avg,active_sum,Case-Fatality_Ratio
15005,44.0,Unknown,Tamil Nadu,India,20200516,0,1,"Tamil Nadu, India",11.006091,78.400624,530908.0,8685.0,475717.0,682.039258,46506.0,1.635877
175052,44.0,Unknown,Maharashtra,India,20200522,0,1,"Maharashtra, India",19.449759,76.108221,1167496.0,31791.0,834432.0,948.072083,301273.0,2.723007
335676,36.0,female,Santa Cruz,Argentina,20201004,0,0,"Santa Cruz, Argentina",-38.4161,-63.6167,622934.0,12799.0,478077.0,1378.30134,132058.0,2.054632
292089,44.0,Unknown,Maharashtra,India,20200523,0,1,"Maharashtra, India",19.449759,76.108221,1167496.0,31791.0,834432.0,948.072083,301273.0,2.723007
311460,36.0,female,Bogota DC,Colombia,20200426,0,0,"Bogota DC, Colombia",5.349139,-74.190797,758398.0,24039.0,627685.0,1218.239013,106674.0,3.169708


In [49]:
# Train a LightGBM classifier with the library's default hyper-parameters
# on the training split. `fit` is left as the last expression so the cell
# displays the fitted estimator.
clf = lgb.LGBMClassifier()
clf.fit(X=training_data, y=training_truth)

LGBMClassifier()

In [52]:
# Predicted labels for both partitions: the training rows (to gauge fit) and
# the held-out validation rows (to gauge generalisation).
training_prediction = clf.predict(training_data)
validation_prediction = clf.predict(validation_data)

In [54]:
# Accuracy of the fitted classifier on the rows it was trained on.
# accuracy_score takes (y_true, y_pred) in that order; the score is computed
# once and the stored value is reused in the report line rather than being
# recomputed inside the print call.
accuracy2 = accuracy_score(training_truth, training_prediction)
print('LightGBM Training accuracy score: {0:0.4f}'.format(accuracy2))

LightGBM Training accuracy score: 0.8441


In [53]:
# Accuracy of the fitted classifier on the held-out validation rows.
# accuracy_score takes (y_true, y_pred) in that order; the score is computed
# once and the stored value is reused in the report line rather than being
# recomputed inside the print call.
accuracy = accuracy_score(validation_truth, validation_prediction)
print('LightGBM Validation accuracy score: {0:0.4f}'.format(accuracy))

LightGBM Validation accuracy score: 0.8414
