In [1]:
import numpy as np
from path import Path
import pandas as pd
from collections import Counter

In [2]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

In [3]:
# Read the COVID case data from the project's Resources folder.
file_path = Path('Resources/Data/covid_case_data.csv')
# Positional slice: keep every row except the final two.
# NOTE(review): the reason the last two rows are discarded is not visible here
# (presumably trailing non-data rows) -- confirm against the CSV.
df = pd.read_csv(file_path).iloc[:-2]
df.head()

Unnamed: 0,id,death,date,sex,age_group,race_ethnicity,hospital,med_cond
0,607,0,2020-10-01,Female,30 - 39 Years,"White, Non-Hispanic",0,1
1,483,0,2020-10-01,Female,60 - 69 Years,"White, Non-Hispanic",0,1
2,488,0,2020-10-01,Female,40 - 49 Years,"White, Non-Hispanic",0,0
3,564,0,2020-10-01,Male,50 - 59 Years,Hispanic/Latino,0,1
4,6128,0,2020-10-01,Female,50 - 59 Years,Hispanic/Latino,0,1


In [4]:
# Remove the 'id' and 'date' columns; every other column is carried forward.
df_2 = df.drop(columns=['id', 'date'])
df_2.head()

Unnamed: 0,death,sex,age_group,race_ethnicity,hospital,med_cond
0,0,Female,30 - 39 Years,"White, Non-Hispanic",0,1
1,0,Female,60 - 69 Years,"White, Non-Hispanic",0,1
2,0,Female,40 - 49 Years,"White, Non-Hispanic",0,0
3,0,Male,50 - 59 Years,Hispanic/Latino,0,1
4,0,Female,50 - 59 Years,Hispanic/Latino,0,1


In [5]:
# One-hot encode the categorical columns: pd.get_dummies replaces each listed
# column with one indicator column per category level (this is not label encoding).
categorical_columns = ["sex", "age_group", "race_ethnicity"]
df_2_encoded = pd.get_dummies(df_2, columns=categorical_columns)
df_2_encoded.head()

Unnamed: 0,death,hospital,med_cond,sex_Female,sex_Male,sex_Other,age_group_0 - 9 Years,age_group_10 - 19 Years,age_group_20 - 29 Years,age_group_30 - 39 Years,...,age_group_60 - 69 Years,age_group_70 - 79 Years,age_group_80+ Years,"race_ethnicity_American Indian/Alaska Native, Non-Hispanic","race_ethnicity_Asian, Non-Hispanic","race_ethnicity_Black, Non-Hispanic",race_ethnicity_Hispanic/Latino,"race_ethnicity_Multiple/Other, Non-Hispanic","race_ethnicity_Native Hawaiian/Other Pacific Islander, Non-Hispanic","race_ethnicity_White, Non-Hispanic"
0,0,0,1,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
1,0,0,1,1,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
2,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,0,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [6]:
# The target (y) is the "death" column; the features (X) are all remaining columns.
target_column = "death"
X = df_2_encoded.drop(target_column, axis=1)
y = df_2_encoded[target_column]

In [7]:
# Check the balance of our target values: count the rows in each class of y
# to see how unevenly the two outcomes are represented.
y.value_counts()

0    1263089
1      43001
Name: death, dtype: int64

In [8]:
# Hold out a test partition. stratify=y preserves the class ratio of the target
# in both partitions, and random_state pins the shuffle so the split is repeatable.
from sklearn.model_selection import train_test_split

split_parts = train_test_split(X, y, stratify=y, random_state=1)
X_train, X_test, y_train, y_test = split_parts

# Class counts in the training partition
Counter(y_train)

Counter({0: 947316, 1: 32251})

In [9]:
# Fit a standard RandomForestClassifier on the training data as-is.
# Note: this is scikit-learn's plain random forest, not imblearn's
# BalancedRandomForestClassifier, so no resampling is applied here.
from sklearn.ensemble import RandomForestClassifier

# fit() returns the fitted estimator, so construction and training can be chained.
rf_model = RandomForestClassifier(n_estimators=100, random_state=1).fit(X_train, y_train)

In [10]:
# Predict on the held-out set and score the random forest with balanced accuracy
from sklearn.metrics import balanced_accuracy_score

y_pred = rf_model.predict(X_test)
acc_score = balanced_accuracy_score(y_true=y_test, y_pred=y_pred)

In [11]:
# Build the confusion matrix for the current predictions
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, y_pred)

# Wrap the raw counts in a labelled DataFrame (rows: actual, columns: predicted)
row_labels = ["Actual recover", "Actual death"]
column_labels = ["Predicted recover", "Predicted death"]
cm_df = pd.DataFrame(cm, index=row_labels, columns=column_labels)

cm_df

Unnamed: 0,Predicted recover,Predicted death
Actual recover,313826,1947
Actual death,8184,2566


In [12]:
# Summarise the random forest evaluation: confusion matrix, balanced accuracy,
# and the imbalanced classification report.
from imblearn.metrics import classification_report_imbalanced

# Displaying results
print("Random Forest Analysis")
print("Confusion Matrix")
display(cm_df)
# acc_score was computed with balanced_accuracy_score, so it is labelled as
# balanced accuracy; plain accuracy would be a different number on imbalanced data.
print(f"Balanced Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report_imbalanced(y_test, y_pred))

Random Forest Analysis
Confusion Matrix


Unnamed: 0,Predicted recover,Predicted death
Actual recover,313826,1947
Actual death,8184,2566


Accuracy Score : 0.6162659263841209
Classification Report
                   pre       rec       spe        f1       geo       iba       sup

          0       0.97      0.99      0.24      0.98      0.49      0.26    315773
          1       0.57      0.24      0.99      0.34      0.49      0.22     10750

avg / total       0.96      0.97      0.26      0.96      0.49      0.25    326523



In [13]:
# List the features sorted in descending order by feature importance.
# Read the importances once and reuse them, pairing each value with the
# matching column name from X.
importances = rf_model.feature_importances_
sorted(zip(importances, X.columns), reverse=True)

[(0.5683913782852699, 'hospital'),
 (0.21339478454829125, 'age_group_80+ Years'),
 (0.05345397539097045, 'med_cond'),
 (0.045042326539763276, 'age_group_70 - 79 Years'),
 (0.015117661603869945, 'race_ethnicity_White, Non-Hispanic'),
 (0.014228084312568225, 'age_group_30 - 39 Years'),
 (0.01313697741654295, 'age_group_60 - 69 Years'),
 (0.012714517344022703, 'age_group_20 - 29 Years'),
 (0.011905273783964342, 'age_group_40 - 49 Years'),
 (0.010657436725235534, 'age_group_50 - 59 Years'),
 (0.008394113995159607, 'age_group_10 - 19 Years'),
 (0.00665261717295539, 'race_ethnicity_Hispanic/Latino'),
 (0.006341011799251918, 'race_ethnicity_Asian, Non-Hispanic'),
 (0.005418749546089109, 'sex_Male'),
 (0.004774722382531126, 'sex_Female'),
 (0.0034955391570468163, 'age_group_0 - 9 Years'),
 (0.0034538566304913366, 'race_ethnicity_Black, Non-Hispanic'),
 (0.0017307299367817998, 'race_ethnicity_Multiple/Other, Non-Hispanic'),
 (0.0009094590543586346,
  'race_ethnicity_Native Hawaiian/Other Pacifi

In [15]:
# Fit imblearn's EasyEnsembleClassifier on the training data
from imblearn.ensemble import EasyEnsembleClassifier

# fit() returns the fitted estimator, so construction and training can be chained.
ee_model = EasyEnsembleClassifier(n_estimators=100, random_state=1).fit(X_train, y_train)

In [16]:
# Predict on the held-out set with the easy ensemble and compute balanced accuracy.
# This overwrites the y_pred / acc_score values produced for the random forest.
y_pred = ee_model.predict(X_test)
acc_score = balanced_accuracy_score(y_true=y_test, y_pred=y_pred)

In [17]:
# Build the confusion matrix for the current predictions
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, y_pred)

# Wrap the raw counts in a labelled DataFrame (rows: actual, columns: predicted)
row_labels = ["Actual recover", "Actual death"]
column_labels = ["Predicted recover", "Predicted death"]
cm_df = pd.DataFrame(cm, index=row_labels, columns=column_labels)

cm_df

Unnamed: 0,Predicted recover,Predicted death
Actual recover,292317,23456
Actual death,1016,9734


In [18]:
# Summarise the easy ensemble evaluation: confusion matrix, balanced accuracy,
# and the imbalanced classification report.
from imblearn.metrics import classification_report_imbalanced

# Displaying results
print("Easy Ensemble Analysis")
print("Confusion Matrix")
display(cm_df)
# acc_score was computed with balanced_accuracy_score, so it is labelled as
# balanced accuracy; plain accuracy would be a different number on imbalanced data.
print(f"Balanced Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report_imbalanced(y_test, y_pred))

Easy Ensemble Analysis
Confusion Matrix


Unnamed: 0,Predicted recover,Predicted death
Actual recover,292317,23456
Actual death,1016,9734


Accuracy Score : 0.9156035818783275
Classification Report
                   pre       rec       spe        f1       geo       iba       sup

          0       1.00      0.93      0.91      0.96      0.92      0.84    315773
          1       0.29      0.91      0.93      0.44      0.92      0.84     10750

avg / total       0.97      0.93      0.91      0.94      0.92      0.84    326523

