In [1]:
import numpy as np
from path import Path
import pandas as pd
from collections import Counter

In [2]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

In [3]:
# Read the case-level COVID data from the project's Resources folder.
# NOTE(review): the final two rows of the CSV are discarded here; presumably
# they are trailer rows rather than real cases — confirm against the file.
file_path = Path('Resources/Data/covid_case_data.csv')
df = pd.read_csv(file_path).iloc[:-2]
df.head()

Unnamed: 0,id,death,date,sex,age_group,race_ethnicity,hospital,med_cond
0,607,0,2020-10-01,Female,30 - 39 Years,"White, Non-Hispanic",0,1
1,483,0,2020-10-01,Female,60 - 69 Years,"White, Non-Hispanic",0,1
2,488,0,2020-10-01,Female,40 - 49 Years,"White, Non-Hispanic",0,0
3,564,0,2020-10-01,Male,50 - 59 Years,Hispanic/Latino,0,1
4,6128,0,2020-10-01,Female,50 - 59 Years,Hispanic/Latino,0,1


In [4]:
# Remove the 'id', 'date' and 'hospital' columns; what remains is the target
# (death) plus sex, age_group, race_ethnicity and med_cond.
df_2 = df.drop(columns=['id', 'date', 'hospital'])
df_2.head()

Unnamed: 0,death,sex,age_group,race_ethnicity,med_cond
0,0,Female,30 - 39 Years,"White, Non-Hispanic",1
1,0,Female,60 - 69 Years,"White, Non-Hispanic",1
2,0,Female,40 - 49 Years,"White, Non-Hispanic",0
3,0,Male,50 - 59 Years,Hispanic/Latino,1
4,0,Female,50 - 59 Years,Hispanic/Latino,1


In [5]:
# One-hot encode the categorical columns: pd.get_dummies creates one indicator
# column per category value (this is not label encoding).
df_2_encoded = pd.get_dummies(
    data=df_2,
    columns=["sex", "age_group", "race_ethnicity"],
)
df_2_encoded.head()

Unnamed: 0,death,med_cond,sex_Female,sex_Male,sex_Other,age_group_0 - 9 Years,age_group_10 - 19 Years,age_group_20 - 29 Years,age_group_30 - 39 Years,age_group_40 - 49 Years,...,age_group_60 - 69 Years,age_group_70 - 79 Years,age_group_80+ Years,"race_ethnicity_American Indian/Alaska Native, Non-Hispanic","race_ethnicity_Asian, Non-Hispanic","race_ethnicity_Black, Non-Hispanic",race_ethnicity_Hispanic/Latino,"race_ethnicity_Multiple/Other, Non-Hispanic","race_ethnicity_Native Hawaiian/Other Pacific Islander, Non-Hispanic","race_ethnicity_White, Non-Hispanic"
0,0,1,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
1,0,1,1,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
2,0,0,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
3,0,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [6]:
# Split the encoded frame into the feature matrix X and the target vector y:
# "death" is the label, every other column is a predictor.
X = df_2_encoded.drop("death", axis=1)
y = df_2_encoded["death"]

In [7]:
# Check the balance of our target values.
# The per-class counts show how skewed the target is, which is why the split
# further down is stratified and balanced metrics are used for evaluation.
y.value_counts()

0    1263089
1      43001
Name: death, dtype: int64

In [8]:
# Hold out a test set. Stratifying on y keeps the class ratio the same in both
# partitions, and the fixed random_state makes the split repeatable.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=1
)
# Class counts in the training partition
Counter(y_train)

Counter({0: 947316, 1: 32251})

In [9]:
# Fit a plain scikit-learn random forest on the training split.
# No resampling or class weighting is applied here: this is
# RandomForestClassifier, not imblearn's BalancedRandomForestClassifier.
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(n_estimators=100, random_state=1).fit(X_train, y_train)

In [10]:
# Predict on the held-out set with the random forest and compute its balanced
# accuracy (balanced_accuracy_score is imported in the metrics import cell
# near the top of the notebook).
y_pred = rf_model.predict(X_test)
acc_score = balanced_accuracy_score(y_true=y_test, y_pred=y_pred)

In [11]:
# Build the confusion matrix for the current predictions and wrap it in a
# labelled DataFrame (rows = actual class, columns = predicted class).
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)

cm_df = pd.DataFrame(
    data=cm,
    index=["Actual recover", "Actual death"],
    columns=["Predicted recover", "Predicted death"],
)

cm_df

Unnamed: 0,Predicted recover,Predicted death
Actual recover,315598,175
Actual death,10497,253


In [12]:
# Summarise the random forest evaluation: confusion matrix, balanced accuracy
# and imbalanced-learn's per-class classification report.
print("Random Forest Analysis")
print("Confusion Matrix")
display(cm_df)
# acc_score was computed with balanced_accuracy_score, so label it as balanced
# accuracy rather than plain accuracy.
print(f"Balanced Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report_imbalanced(y_test, y_pred))

Random Forest Analysis
Confusion Matrix


Unnamed: 0,Predicted recover,Predicted death
Actual recover,315598,175
Actual death,10497,253


Accuracy Score : 0.5114903440718638
Classification Report
                   pre       rec       spe        f1       geo       iba       sup

          0       0.97      1.00      0.02      0.98      0.15      0.03    315773
          1       0.59      0.02      1.00      0.05      0.15      0.02     10750

avg / total       0.96      0.97      0.06      0.95      0.15      0.03    326523



In [13]:
# Pair each feature name with the fitted forest's feature_importances_ value
# and list the pairs from most to least important.
importances = rf_model.feature_importances_
sorted(zip(importances, X.columns), reverse=True)

[(0.5340761068663309, 'age_group_80+ Years'),
 (0.11865599577350011, 'age_group_70 - 79 Years'),
 (0.11436017712339754, 'med_cond'),
 (0.02830408319937846, 'race_ethnicity_White, Non-Hispanic'),
 (0.024790499668427737, 'age_group_60 - 69 Years'),
 (0.024649791139355854, 'age_group_20 - 29 Years'),
 (0.021600836758174737, 'age_group_30 - 39 Years'),
 (0.021046429395097176, 'age_group_10 - 19 Years'),
 (0.021017197964154705, 'age_group_50 - 59 Years'),
 (0.020316710508936322, 'age_group_40 - 49 Years'),
 (0.014175088665419746, 'race_ethnicity_Asian, Non-Hispanic'),
 (0.012703707584350118, 'sex_Male'),
 (0.012466022998750291, 'sex_Female'),
 (0.009946620591482104, 'race_ethnicity_Hispanic/Latino'),
 (0.008684543627761531, 'age_group_0 - 9 Years'),
 (0.007834145938747216, 'race_ethnicity_Black, Non-Hispanic'),
 (0.0033157156645116813, 'race_ethnicity_Multiple/Other, Non-Hispanic'),
 (0.001034871830155369,
  'race_ethnicity_Native Hawaiian/Other Pacific Islander, Non-Hispanic'),
 (0.0009353

In [14]:
# Fit imbalanced-learn's EasyEnsembleClassifier on the training split.
from imblearn.ensemble import EasyEnsembleClassifier

ee_model = EasyEnsembleClassifier(n_estimators=100, random_state=1).fit(X_train, y_train)

In [15]:
# Re-score with the easy-ensemble model; y_pred and acc_score are overwritten
# with this model's test-set predictions and balanced accuracy.
y_pred = ee_model.predict(X_test)
acc_score = balanced_accuracy_score(y_true=y_test, y_pred=y_pred)

In [16]:
# Build the confusion matrix for the current predictions and wrap it in a
# labelled DataFrame (rows = actual class, columns = predicted class).
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)

cm_df = pd.DataFrame(
    data=cm,
    index=["Actual recover", "Actual death"],
    columns=["Predicted recover", "Predicted death"],
)

cm_df

Unnamed: 0,Predicted recover,Predicted death
Actual recover,257866,57907
Actual death,1331,9419


In [17]:
# Summarise the easy-ensemble evaluation: confusion matrix, balanced accuracy
# and imbalanced-learn's per-class classification report.
print("Easy Ensemble Analysis")
print("Confusion Matrix")
display(cm_df)
# acc_score was computed with balanced_accuracy_score, so label it as balanced
# accuracy rather than plain accuracy.
print(f"Balanced Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report_imbalanced(y_test, y_pred))

Easy Ensemble Analysis
Confusion Matrix


Unnamed: 0,Predicted recover,Predicted death
Actual recover,257866,57907
Actual death,1331,9419


Accuracy Score : 0.8464021567156095
Classification Report
                   pre       rec       spe        f1       geo       iba       sup

          0       0.99      0.82      0.88      0.90      0.85      0.71    315773
          1       0.14      0.88      0.82      0.24      0.85      0.72     10750

avg / total       0.97      0.82      0.87      0.88      0.85      0.71    326523

