In [1]:
import numpy as np
import pandas as pd
import datetime as dt
import sklearn
import seaborn as sns
from scipy import stats
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score as rs
from sklearn.metrics import precision_score as ps
from sklearn.metrics import f1_score as fs
from sklearn.metrics import log_loss
encoder = preprocessing.LabelEncoder()

In [2]:
tdata = pd.read_csv('train.csv')
tdata = tdata.drop('id',axis=1)
tdata = tdata.fillna(np.nan,axis=0)
tdata['age'] = tdata['age'].fillna(value=tdata['age'].mean())
tdata['location'] = encoder.fit_transform(tdata['location'].astype(str))
tdata['country'] = encoder.fit_transform(tdata['country'].astype(str))
tdata['gender'] = encoder.fit_transform(tdata['gender'].astype(str))
tdata[['symptom1']] = encoder.fit_transform(tdata['symptom1'].astype(str))
tdata[['symptom2']] = encoder.fit_transform(tdata['symptom2'].astype(str))
tdata[['symptom3']] = encoder.fit_transform(tdata['symptom3'].astype(str))
tdata[['symptom4']] = encoder.fit_transform(tdata['symptom4'].astype(str))
tdata[['symptom5']] = encoder.fit_transform(tdata['symptom5'].astype(str))
tdata[['symptom6']] = encoder.fit_transform(tdata['symptom6'].astype(str))

In [3]:
tdata.head(100)

Unnamed: 0,location,country,gender,age,sym_on,hosp_vis,vis_wuhan,from_wuhan,death,symptom1,symptom2,symptom3,symptom4,symptom5,symptom6
0,38,2,1,61.0,12/20/2019,12/27/2019,0,1,1,9,9,11,0,0,0
1,38,2,1,69.0,12/30/2019,1/3/2020,0,1,1,9,9,11,0,0,0
2,38,2,1,89.0,,,0,1,1,9,9,11,0,0,0
3,38,2,1,89.0,1/13/2020,1/13/2020,0,1,1,9,9,11,0,0,0
4,38,2,1,66.0,1/10/2020,1/16/2020,0,1,1,9,9,11,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,27,12,0,37.0,1/26/2020,1/29/2020,0,1,0,9,9,11,0,0,0
96,27,12,0,73.0,,,0,1,0,9,9,11,0,0,0
97,27,12,1,31.0,1/28/2020,1/30/2020,1,0,0,9,9,11,0,0,0
98,27,12,0,47.0,1/30/2020,1/30/2020,1,0,0,5,9,11,0,0,0


In [4]:
tdata['sym_on'] = pd.to_datetime(tdata['sym_on'])
tdata['hosp_vis'] = pd.to_datetime(tdata['hosp_vis'])
tdata['sym_on']= tdata['sym_on'].map(dt.datetime.toordinal)
tdata['hosp_vis']= tdata['hosp_vis'].map(dt.datetime.toordinal)
tdata['diff_sym_hos']= tdata['hosp_vis'] - tdata['sym_on']

In [5]:
tdata = tdata.drop(['sym_on','hosp_vis'],axis=1)

In [6]:
tdata.head()

Unnamed: 0,location,country,gender,age,vis_wuhan,from_wuhan,death,symptom1,symptom2,symptom3,symptom4,symptom5,symptom6,diff_sym_hos
0,38,2,1,61.0,0,1,1,9,9,11,0,0,0,7
1,38,2,1,69.0,0,1,1,9,9,11,0,0,0,4
2,38,2,1,89.0,0,1,1,9,9,11,0,0,0,0
3,38,2,1,89.0,0,1,1,9,9,11,0,0,0,0
4,38,2,1,66.0,0,1,1,9,9,11,0,0,0,6


In [7]:
print(tdata.isna().sum())


location        0
country         0
gender          0
age             0
vis_wuhan       0
from_wuhan      0
death           0
symptom1        0
symptom2        0
symptom3        0
symptom4        0
symptom5        0
symptom6        0
diff_sym_hos    0
dtype: int64


In [8]:
from sklearn.metrics import recall_score as rs
from sklearn.metrics import precision_score as ps
from sklearn.metrics import f1_score as fs
from sklearn.metrics import balanced_accuracy_score as bas
from sklearn.metrics import confusion_matrix as cm

In [9]:
rf = RandomForestClassifier(bootstrap=True, class_weight=None,
                       criterion='gini', max_depth=2, max_features='auto',
                       max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=2, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)
#classifier = AdaBoostClassifier(rf,50,0.01,'SAMME.R',10)

In [10]:
X = tdata[['location','country','gender','age','vis_wuhan','from_wuhan','symptom1','symptom2','symptom3','symptom4','symptom5','symptom6','diff_sym_hos']]
Y = tdata['death']

In [11]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size=0.2,random_state=0)
rf.fit(X_train,Y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=2, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=2, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [13]:
pred = rf.predict(X_test)

recall = rs(Y_test,pred)
precision = ps(Y_test,pred)
f1 = fs(Y_test,pred)
accuracy = rf.score(X_test,Y_test)

In [18]:
print('*** Evaluation metrics for test dataset ***\n')
print('Recall Score: ',recall)
print('Precision Score: ',precision)
print('F1 Score: ',f1)
print('Accuracy: ',accuracy)

*** Evaluation metrics for test dataset ***

Recall Score:  0.6666666666666666
Precision Score:  1.0
F1 Score:  0.8
Accuracy:  0.9111111111111111
