In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Warning: this is a big dataset, so it'll take a while to run

In [3]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [4]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

In [5]:
# Name of the outcome column, kept as a one-element list.
target = ['hosp_yn']

# Dataset fields: location and demographic columns, followed by the outcome column.
columns = [
    'res_state',
    'res_county',
    'age_group',
    'sex',
    'race',
    'ethnicity',
    *target,
]

In [6]:
# Read the Virginia COVID case file (path is relative to the working directory).
df = pd.read_csv(filepath_or_buffer='virginia_covid.csv')

# Show the first five rows as a quick sanity check of the loaded columns.
df.head(n=5)

Unnamed: 0,res_state,res_county,age_group,sex,race,ethnicity,hosp_yn
0,VA,NORFOLK CITY,18 to 49 years,Female,Multiple/Other,Hispanic/Latino,No
1,VA,FREDERICK,18 to 49 years,Female,White,Hispanic/Latino,No
2,VA,FAIRFAX,18 to 49 years,Male,Black,Hispanic/Latino,No
3,VA,CULPEPER,18 to 49 years,Male,White,Hispanic/Latino,No
4,VA,ALBEMARLE,18 to 49 years,Male,White,Hispanic/Latino,No


In [7]:
# Split the data 

In [8]:
# Target: the `hosp_yn` column (values 'No' / 'Yes')
y = df['hosp_yn']

# Features: every other column, with the categorical fields one-hot encoded
X = pd.get_dummies(df.drop(columns='hosp_yn'))

In [9]:
# Class balance check: count how many rows carry each hosp_yn value.
y.value_counts()

No     244784
Yes     18054
Name: hosp_yn, dtype: int64

In [10]:
from sklearn.model_selection import train_test_split

# Partition features and target into train/test sets. No test_size is passed,
# so the library default applies; the fixed seed makes the partition repeatable.
# NOTE(review): the split is not stratified — given the class imbalance,
# consider passing stratify=y (this would change the reported numbers).
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, random_state=1)

In [11]:
# Balanced Random Forest Classifier

In [12]:
from imblearn.ensemble import BalancedRandomForestClassifier

# Build a seeded, 100-estimator balanced random forest and fit it on the
# training split in one chained expression (fit returns the estimator).
Random_forest = BalancedRandomForestClassifier(
    random_state=1,
    n_estimators=100,
).fit(X_train, y_train)

In [13]:
from sklearn.metrics import balanced_accuracy_score

# Predict labels for the held-out rows with the fitted forest ...
y_pred = Random_forest.predict(X_test)

# ... and report the balanced accuracy of those predictions.
balanced_accuracy_score(y_true=y_test, y_pred=y_pred)

0.7498587026640607

In [14]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of true test labels against the current y_pred.
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[47726, 13475],
       [ 1263,  3246]], dtype=int64)

In [15]:
from imblearn.metrics import classification_report_imbalanced

# Per-class imbalanced-classification report for the current y_pred,
# printed so the text table keeps its fixed-width layout.
print(classification_report_imbalanced(y_true=y_test, y_pred=y_pred))

                   pre       rec       spe        f1       geo       iba       sup

         No       0.97      0.78      0.72      0.87      0.75      0.56     61201
        Yes       0.19      0.72      0.78      0.31      0.75      0.56      4509

avg / total       0.92      0.78      0.72      0.83      0.75      0.56     65710



In [16]:
# Pair each encoded feature column with its importance from the fitted forest,
# then rank the (importance, name) pairs from largest to smallest.
feature_names = X.columns
sorted(
    (
        (weight, label)
        for weight, label in zip(Random_forest.feature_importances_, feature_names)
    ),
    reverse=True,
)

[(0.3352667779766742, 'age_group_65+ years'),
 (0.2686010014279663, 'age_group_18 to 49 years'),
 (0.11089467545366466, 'age_group_0 - 17 years'),
 (0.06740097228690599, 'age_group_50 to 64 years'),
 (0.01956831312723142, 'race_Black'),
 (0.01566076556824353, 'race_White'),
 (0.008024519565596328, 'sex_Male'),
 (0.007892213131191783, 'sex_Female'),
 (0.007154657038938565, 'ethnicity_Hispanic/Latino'),
 (0.0065418065346626525, 'res_county_FAIRFAX'),
 (0.0058731581685863895, 'ethnicity_Non-Hispanic/Latino'),
 (0.005312135979755417, 'res_county_PORTSMOUTH CITY'),
 (0.004681435359065472, 'res_county_NORFOLK CITY'),
 (0.0046325631725234, 'res_county_MONTGOMERY'),
 (0.0045884885456824185, 'res_county_PRINCE WILLIAM'),
 (0.00403897922176762, 'res_county_LOUDOUN'),
 (0.004010094525307934, 'res_county_VIRGINIA BEACH CITY'),
 (0.003699432056188894, 'race_Multiple/Other'),
 (0.0035073526046306707, 'res_county_ALEXANDRIA CITY'),
 (0.003418924595114255, 'res_county_CHESTERFIELD'),
 (0.0033008164819

In [17]:
# Easy Ensemble AdaBoost Classifier

In [19]:
from imblearn.ensemble import EasyEnsembleClassifier

# Build a seeded, 100-estimator EasyEnsemble model and fit it on the
# training split in one chained expression (fit returns the estimator).
easy = EasyEnsembleClassifier(
    random_state=1,
    n_estimators=100,
).fit(X_train, y_train)

In [20]:
# Predict labels for the held-out rows with the fitted EasyEnsemble model.
# Note: this overwrites the y_pred produced earlier by the random forest.
y_pred = easy.predict(X_test)

# Report the balanced accuracy of the EasyEnsemble predictions.
balanced_accuracy_score(y_true=y_test, y_pred=y_pred)

0.7518190146506658

In [21]:
# Confusion matrix of true test labels against the current y_pred.
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[47233, 13968],
       [ 1209,  3300]], dtype=int64)

In [22]:
# Per-class imbalanced-classification report for the current y_pred,
# printed so the text table keeps its fixed-width layout.
print(classification_report_imbalanced(y_true=y_test, y_pred=y_pred))

                   pre       rec       spe        f1       geo       iba       sup

         No       0.98      0.77      0.73      0.86      0.75      0.57     61201
        Yes       0.19      0.73      0.77      0.30      0.75      0.56      4509

avg / total       0.92      0.77      0.73      0.82      0.75      0.57     65710

