In [1]:
import pandas as pd
import numpy as np
import warnings

from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

pd.set_option('display.max_columns', None)
warnings.filterwarnings('ignore')



In [2]:
''' reading data '''
df = pd.read_csv('https://firebasestorage.googleapis.com/v0/b/budi-23.appspot.com/o/dataset%2FCovid-19%20ICU%20Patients%20Analysis%20%26%20Visualization%2Fcovid.csv?alt=media&token=b216a797-2633-4e81-8bf1-4dea6ce290ab', parse_dates=[3, 4])

In [3]:
''' displaying first 5 rows of data '''
df.head()

Unnamed: 0,id,sex,patient_type,entry_date,date_symptoms,date_died,intubed,pneumonia,age,pregnancy,diabetes,copd,asthma,inmsupr,hypertension,other_disease,cardiovascular,obesity,renal_chronic,tobacco,contact_other_covid,covid_res,icu
0,16169f,2,1,2020-04-05,2020-02-05,9999-99-99,97,2,27,97,2,2,2,2,2,2,2,2,2,2,2,1,97
1,1009bf,2,1,2020-03-19,2020-03-17,9999-99-99,97,2,24,97,2,2,2,2,2,2,2,2,2,2,99,1,97
2,167386,1,2,2020-06-04,2020-01-04,9999-99-99,2,2,54,2,2,2,2,2,2,2,2,1,2,2,99,1,2
3,0b5948,2,2,2020-04-17,2020-10-04,9999-99-99,2,1,30,97,2,2,2,2,2,2,2,2,2,2,99,1,2
4,0d01b5,1,2,2020-04-13,2020-04-13,22-04-2020,2,2,60,2,1,2,2,2,1,2,1,2,2,2,99,1,2


In [4]:
''' shape of data '''
df.shape

(566602, 23)

In [5]:
''' checking null valeus '''
df.isnull().sum()

id                     0
sex                    0
patient_type           0
entry_date             0
date_symptoms          0
date_died              0
intubed                0
pneumonia              0
age                    0
pregnancy              0
diabetes               0
copd                   0
asthma                 0
inmsupr                0
hypertension           0
other_disease          0
cardiovascular         0
obesity                0
renal_chronic          0
tobacco                0
contact_other_covid    0
covid_res              0
icu                    0
dtype: int64

In [6]:
''' dropping unwanted rows '''

df.drop(['id','patient_type','pregnancy','contact_other_covid','other_disease'],inplace=True,axis=1)

In [7]:
''' creating new column '''

df['new_column'] = (df['entry_date'] - df['date_symptoms']).dt.days
df['dead']='1'
df.loc[df['date_died'] == '9999-99-99','dead']='0'

''' dropping columns '''
df.drop(['entry_date','date_symptoms','date_died'],inplace=True,axis=1)

''' changing datatype '''
df = df.astype('int8')

In [8]:
''' dropping null values '''

df = df.loc[(df['sex'] <= 2) & (df['intubed'] <= 2) & (df['pneumonia'] <= 2) & (df['diabetes'] <= 2) & 
            (df['copd'] <= 2) & (df['asthma'] <= 2) & (df['inmsupr'] <= 2) & (df['hypertension'] <= 2) & 
            (df['cardiovascular'] <= 2) & (df['obesity'] <= 2) & (df['renal_chronic'] <= 2) & (df['tobacco'] <= 2) &
            (df['covid_res'] <= 2) & (df['dead'] <= 2) & (df['icu'] <= 2) & (df['new_column'] >= 0), 
            ['sex','age','inmsupr','pneumonia','diabetes','asthma','copd','hypertension','cardiovascular',
             'renal_chronic','obesity','tobacco','new_column','covid_res','intubed','icu','dead']]

In [9]:
''' changing values of NO from  2 to 0'''

df.loc[df['sex']==2,'sex']=0

df.loc[df['inmsupr']==2,'inmsupr']=0

df.loc[df['pneumonia']==2,'pneumonia']=0

df.loc[df['diabetes']==2,'diabetes']=0

df.loc[df['asthma']==2,'asthma']=0

df.loc[df['copd']==2,'copd']=0

df.loc[df['hypertension']==2,'hypertension']=0

df.loc[df['cardiovascular']==2,'cardiovascular']=0

df.loc[df['renal_chronic']==2,'renal_chronic']=0

df.loc[df['obesity']==2,'obesity']=0

df.loc[df['tobacco']==2,'tobacco']=0

df.loc[df['intubed']==2,'intubed']=0

df.loc[df['icu']==2,'icu']=0

df.loc[df['covid_res']==2,'covid_res']=0

In [10]:
''' new df looks like '''
df.head()

Unnamed: 0,sex,age,inmsupr,pneumonia,diabetes,asthma,copd,hypertension,cardiovascular,renal_chronic,obesity,tobacco,new_column,covid_res,intubed,icu,dead
3,0,30,0,1,0,0,0,0,0,0,0,0,86,1,0,0,0
4,1,60,0,0,1,0,0,1,1,0,0,0,0,1,0,0,1
5,0,47,0,1,1,0,0,0,0,0,0,0,0,1,0,1,1
6,0,63,0,0,0,0,0,1,0,0,0,0,9,1,0,0,0
9,1,39,0,1,0,0,0,0,0,0,1,0,5,1,1,0,0


In [11]:
''' considering only patients who are tested positive to covid '''
df = df.loc[df['covid_res'] == 1, ['sex','age','inmsupr','pneumonia','diabetes','asthma','copd','hypertension',
                                   'cardiovascular','renal_chronic','obesity','tobacco','new_column',
                                   'covid_res','intubed','icu','dead']]

In [12]:
''' independent and dependent features '''
X = df.drop(['icu','covid_res','dead','intubed'], axis=1)
y = df['icu']

In [13]:
''' value counts of y '''
y.value_counts()

0    49279
1     4276
Name: icu, dtype: int64

In [14]:
''' SMOTE to overcome the situation of imbalanced data '''
smote = SMOTE(random_state=42)

''' fit on X and y'''
X_smote, y_smote = smote.fit_resample(X, y)

In [15]:
''' again value counts of y '''
y_smote.value_counts()

AttributeError: ignored

In [16]:
''' train test split '''
X_train, X_test, y_train, y_test = train_test_split(X_smote, y_smote, test_size=0.25, random_state=0)

In [17]:
print('X_train shape: ', X_train.shape)
print('X_test shape: ', X_test.shape)

X_train shape:  (73918, 13)
X_test shape:  (24640, 13)


In [18]:
def evaluation_fun():
    print("train Accuracy = {}".format(accuracy_score(y_train, model.predict(X_train))))
    print("test Accuracy = {}".format(accuracy_score(y_test, results)))
    print("Confusion Matrix")
    print(confusion_matrix(y_test, results))
    print("Classification Report")
    print(classification_report(y_test, results))

In [19]:
''' Random Forest '''
model = RandomForestClassifier()

''' fit on data '''
model.fit(X_train, y_train)

''' prediction '''
results = model.predict(X_test)

In [20]:
''' results of rf '''
evaluation_fun()

train Accuracy = 0.8973862929191807
test Accuracy = 0.833887987012987
Confusion Matrix
[[ 9458  2761]
 [ 1332 11089]]
Classification Report
              precision    recall  f1-score   support

           0       0.88      0.77      0.82     12219
           1       0.80      0.89      0.84     12421

    accuracy                           0.83     24640
   macro avg       0.84      0.83      0.83     24640
weighted avg       0.84      0.83      0.83     24640



In [21]:
''' xgboost '''
model = XGBClassifier()

''' fit on data '''
model.fit(X_train, y_train)

''' prediction '''
results = model.predict(X_test)

In [22]:
evaluation_fun()

train Accuracy = 0.6994236857057821
test Accuracy = 0.7015016233766234
Confusion Matrix
[[8230 3989]
 [3366 9055]]
Classification Report
              precision    recall  f1-score   support

           0       0.71      0.67      0.69     12219
           1       0.69      0.73      0.71     12421

    accuracy                           0.70     24640
   macro avg       0.70      0.70      0.70     24640
weighted avg       0.70      0.70      0.70     24640



In [23]:
''' KNN without hyperparameter tunning '''
model = KNeighborsClassifier()

''' fit on data '''
model.fit(X_train, y_train)

''' prediction '''
results = model.predict(X_test)

In [24]:
evaluation_fun()

train Accuracy = 0.820612029546254
test Accuracy = 0.7657061688311688
Confusion Matrix
[[ 7799  4420]
 [ 1353 11068]]
Classification Report
              precision    recall  f1-score   support

           0       0.85      0.64      0.73     12219
           1       0.71      0.89      0.79     12421

    accuracy                           0.77     24640
   macro avg       0.78      0.76      0.76     24640
weighted avg       0.78      0.77      0.76     24640



In [25]:
''' ADA BOOST without hyper parameter tunning'''
model = AdaBoostClassifier()

''' fit on data '''
model.fit(X_train, y_train)

''' prediction '''
results = model.predict(X_test)

In [26]:
evaluation_fun()

train Accuracy = 0.6859222381557942
test Accuracy = 0.6905438311688312
Confusion Matrix
[[7973 4246]
 [3379 9042]]
Classification Report
              precision    recall  f1-score   support

           0       0.70      0.65      0.68     12219
           1       0.68      0.73      0.70     12421

    accuracy                           0.69     24640
   macro avg       0.69      0.69      0.69     24640
weighted avg       0.69      0.69      0.69     24640



In [27]:
''' Gradient Boost with hyperparameter tunnig '''
model = GradientBoostingClassifier(max_features='auto', loss='deviance',learning_rate=0.3, 
                                   max_depth=8,min_samples_leaf=3,min_samples_split=0.1, n_estimators=400, subsample=0.4)

''' fit on data '''
model.fit(X_train, y_train)

''' prediction '''
results = model.predict(X_test)

In [28]:
evaluation_fun()

train Accuracy = 0.721867474769339
test Accuracy = 0.7153814935064935
Confusion Matrix
[[8470 3749]
 [3264 9157]]
Classification Report
              precision    recall  f1-score   support

           0       0.72      0.69      0.71     12219
           1       0.71      0.74      0.72     12421

    accuracy                           0.72     24640
   macro avg       0.72      0.72      0.72     24640
weighted avg       0.72      0.72      0.72     24640



In [29]:
''' Gradient Boosting without hyper parameter tunning '''
model = GradientBoostingClassifier()

''' fit on data '''
model.fit(X_train, y_train)

''' prediction '''
results = model.predict(X_test)

In [30]:
evaluation_fun()

train Accuracy = 0.6997348413106416
test Accuracy = 0.7023944805194805
Confusion Matrix
[[8266 3953]
 [3380 9041]]
Classification Report
              precision    recall  f1-score   support

           0       0.71      0.68      0.69     12219
           1       0.70      0.73      0.71     12421

    accuracy                           0.70     24640
   macro avg       0.70      0.70      0.70     24640
weighted avg       0.70      0.70      0.70     24640



In [31]:
''' KNN with hyper parameter tunning '''
model = KNeighborsClassifier(n_neighbors=5,weights='distance',p=1,metric='minkowski')

''' fit on data '''
model.fit(X_train, y_train)

''' prediction '''
results = model.predict(X_test)

In [32]:
evaluation_fun()

train Accuracy = 0.8881192672961931
test Accuracy = 0.7883522727272727
Confusion Matrix
[[ 8443  3776]
 [ 1439 10982]]
Classification Report
              precision    recall  f1-score   support

           0       0.85      0.69      0.76     12219
           1       0.74      0.88      0.81     12421

    accuracy                           0.79     24640
   macro avg       0.80      0.79      0.79     24640
weighted avg       0.80      0.79      0.79     24640



In [33]:
''' ADA Boost with hyper parameter tunning '''
model = AdaBoostClassifier(n_estimators= 9000)

''' fit on data '''
model.fit(X_train, y_train)

''' prediction '''
results = model.predict(X_test)

In [34]:
evaluation_fun()

train Accuracy = 0.6934035011769799
test Accuracy = 0.6976055194805195
Confusion Matrix
[[8011 4208]
 [3243 9178]]
Classification Report
              precision    recall  f1-score   support

           0       0.71      0.66      0.68     12219
           1       0.69      0.74      0.71     12421

    accuracy                           0.70     24640
   macro avg       0.70      0.70      0.70     24640
weighted avg       0.70      0.70      0.70     24640



In [35]:
''' SVM '''
model = SVC()

''' fit on data '''
model.fit(X_train, y_train)

''' prediction '''
results = model.predict(X_test)

In [36]:
evaluation_fun()

train Accuracy = 0.6654536107578668
test Accuracy = 0.6699269480519481
Confusion Matrix
[[6951 5268]
 [2865 9556]]
Classification Report
              precision    recall  f1-score   support

           0       0.71      0.57      0.63     12219
           1       0.64      0.77      0.70     12421

    accuracy                           0.67     24640
   macro avg       0.68      0.67      0.67     24640
weighted avg       0.68      0.67      0.67     24640

