In [1]:
# Tools
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Data Imputation Libraries
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Machine Learning Models for Prediction and Classification
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn import neighbors
from sklearn.cluster import KMeans

# Ranking, Scoring and error metrics.
import sklearn.metrics as metrics
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_curve, auc


In [2]:
# Reading the dataset
df = pd.read_csv('dataset_master.csv')
df.head()

Unnamed: 0,FIPS,SVI,VulnerabilityClass,RankPoverty,RankUnemployment,RankIncome,RankHSDiploma,VulnerabilityHome,RankSeniors,RankMinors,...,UnemploymentRate2020,MedianIncome2019,Date,Cases,State,County,Deaths,%_Vaccinated,Vaccinations,Day
0,1001,0.4354,0.3631,0.5401,0.2745,0.286,0.4397,0.581,0.185,0.7529,...,4.9,58233.0,2020-12-13,3233,AL,Autauga,41,0.0,0.0,0
1,1003,0.2162,0.2232,0.2239,0.3121,0.2057,0.3209,0.199,0.6428,0.4323,...,5.6,59871.0,2020-12-13,10489,AL,Baldwin,141,0.0,0.0,0
2,1005,0.9959,0.978,0.9631,0.9217,0.9481,0.9701,0.9153,0.4893,0.3327,...,7.0,35972.0,2020-12-13,1264,AL,Barbour,30,0.0,0.0,0
3,1007,0.6003,0.7694,0.443,0.7895,0.8987,0.7351,0.1203,0.32,0.2846,...,6.6,47918.0,2020-12-13,1398,AL,Bibb,39,0.0,0.0,0
4,1009,0.4242,0.6143,0.4723,0.2611,0.7561,0.8405,0.3187,0.4715,0.6406,...,4.1,52902.0,2020-12-13,3663,AL,Blount,47,0.0,0.0,0


In [3]:
# Listing out the columns for feature selection
df.columns

Index(['FIPS', 'SVI', 'VulnerabilityClass', 'RankPoverty', 'RankUnemployment',
       'RankIncome', 'RankHSDiploma', 'VulnerabilityHome', 'RankSeniors',
       'RankMinors', 'RankDisabilities', 'RankSingleParent',
       'VulnerabilityMinority', 'RankNonWhite', 'RankLimitedEnglish',
       'VulnerabilityHousing', 'RankMultiUnit', 'RankMobileHomes',
       'RankOvercrowding', 'RankNoVehicle', 'RankGroupHome',
       '%_HesitantUnsure', '%_Hesitant', '%_StronglyHesitant', '%_Hispanic',
       '%_Native', '%_Asian', '%_Black', '%_Islander', '%_White', 'VotesGOP',
       'VotesDEM', 'TotalVotes', '%_GOP', '%_DEM', 'OCRI', 'CEI', 'HSI',
       'LGRV', 'TotalWorkforce', 'Employed2020', 'Unemployed2020',
       'UnemploymentRate2020', 'MedianIncome2019', 'Date', 'Cases', 'State',
       'County', 'Deaths', '%_Vaccinated', 'Vaccinations', 'Day'],
      dtype='object')

In [4]:
# Checking for missing data.
df.isna().sum()

FIPS                     0
SVI                      0
VulnerabilityClass       0
RankPoverty              0
RankUnemployment         0
RankIncome               0
RankHSDiploma            0
VulnerabilityHome        0
RankSeniors              0
RankMinors               0
RankDisabilities         0
RankSingleParent         0
VulnerabilityMinority    0
RankNonWhite             0
RankLimitedEnglish       0
VulnerabilityHousing     0
RankMultiUnit            0
RankMobileHomes          0
RankOvercrowding         0
RankNoVehicle            0
RankGroupHome            0
%_HesitantUnsure         0
%_Hesitant               0
%_StronglyHesitant       0
%_Hispanic               0
%_Native                 0
%_Asian                  0
%_Black                  0
%_Islander               0
%_White                  0
VotesGOP                 0
VotesDEM                 0
TotalVotes               0
%_GOP                    0
%_DEM                    0
OCRI                     0
CEI                      0
H

In [5]:
cases_features = ['RankMobileHomes', 'RankMultiUnit', 'RankLimitedEnglish', 'VulnerabilityHousing', 'RankLimitedEnglish',
                 'RankNonWhite', 'RankOvercrowding', 'RankNoVehicle', 'RankGroupHome', 'RankDisabilities', 'RankHSDiploma', 'RankIncome',
                 'RankMinors', 'VotesGOP', 'VotesDEM', 'LGRV', 'MedianIncome2019',
                 '%_HesitantUnsure', '%_Hesitant', '%_StronglyHesitant', '%_Hispanic', '%_Native', '%_Asian', 
                  '%_Black', '%_Islander', '%_White', 'Unemployed2020', 'HSI', 'Vaccinations']


In [6]:
#Splitting the dataframe into training and testing dataframes.
training_cases = df[cases_features].copy()
testing_cases = df['Cases'].copy()

In [7]:
training_cases.head()

Unnamed: 0,RankMobileHomes,RankMultiUnit,RankLimitedEnglish,VulnerabilityHousing,RankLimitedEnglish.1,RankNonWhite,RankOvercrowding,RankNoVehicle,RankGroupHome,RankDisabilities,...,%_StronglyHesitant,%_Hispanic,%_Native,%_Asian,%_Black,%_Islander,%_White,Unemployed2020,HSI,Vaccinations
0,0.7408,0.6017,0.5113,0.3741,0.5113,0.6336,0.2964,0.4846,0.1525,0.7905,...,0.1272,0.0283,0.0025,0.0103,0.19,0.0001,0.746,1262.0,0.635086,0.0
1,0.5339,0.9713,0.3582,0.3359,0.3582,0.5158,0.2604,0.1328,0.3018,0.3524,...,0.1169,0.0456,0.0065,0.0092,0.0917,0.0,0.8307,5425.0,0.660598,0.0
2,0.928,0.2416,0.7052,0.9889,0.7052,0.8965,0.8198,0.8685,0.9449,0.9064,...,0.1274,0.0436,0.0029,0.0048,0.4744,0.0,0.4581,605.0,0.342439,0.0
3,0.9207,0.4317,0.227,0.7189,0.227,0.639,0.0981,0.5441,0.9214,0.6074,...,0.139,0.0257,0.0013,0.0012,0.2214,0.0,0.7453,573.0,0.622412,0.0
4,0.8816,0.1512,0.717,0.1741,0.717,0.4206,0.3703,0.242,0.1165,0.3763,...,0.1416,0.0926,0.0007,0.0037,0.0153,0.0004,0.8689,1008.0,0.671843,0.0


In [8]:
testing_cases.head()

0     3233
1    10489
2     1264
3     1398
4     3663
Name: Cases, dtype: int64

In [9]:
# Data preparation (train-test-split)
X_train, X_test, y_train, y_test = train_test_split(training_cases, testing_cases, test_size=0.2)

In [10]:
# Baseline model (Linear Regression) - Count level COVID-19 Cases.
linear_regression_cases = LinearRegression()
linear_regression_cases.fit(X_train, y_train)
y_pred = linear_regression_cases.predict(X_test)
lrc_r2 = r2_score(y_test, y_pred)
print("Linear Regression Model's Score (R2): ", lrc_r2)

Linear Regression Model's Score (R2):  0.9433042701941339


In [11]:
# Advanced model - 1 

# Model Hyperparameters
number_of_neighbors_val = 5
leaf_size_val = 5

for i in range(10):
    knn_cases = neighbors.KNeighborsRegressor(n_neighbors = number_of_neighbors_val, leaf_size = leaf_size_val)
    knn_cases.fit(X_train, y_train)
    knn_pred = knn_cases.predict(X_test)
    knn_error = mean_squared_error(y_test, knn_pred, squared = False)
    print('------ Accuracy & Error ------ for N: ', number_of_neighbors_val, " & L: ", leaf_size_val)
    print(r2_score(y_test, knn_pred),"  &  ", knn_error)
    print('------------------------------')
    leaf_size_val = leaf_size_val + 5
    number_of_neighbors_val = number_of_neighbors_val + 1
    
    


------ Accuracy & Error ------ for N:  5  & L:  5
0.9943575626263759   &   2648.2781971940385
------------------------------
------ Accuracy & Error ------ for N:  6  & L:  10
0.9944032217157861   &   2637.5413845896855
------------------------------
------ Accuracy & Error ------ for N:  7  & L:  15
0.9944811048842102   &   2619.1254560631264
------------------------------
------ Accuracy & Error ------ for N:  8  & L:  20
0.9945385183174502   &   2605.4663708691605
------------------------------
------ Accuracy & Error ------ for N:  9  & L:  25
0.9943989767996203   &   2638.541425983969
------------------------------
------ Accuracy & Error ------ for N:  10  & L:  30
0.9941054137723826   &   2706.8045252555276
------------------------------
------ Accuracy & Error ------ for N:  11  & L:  35
0.9940416269972537   &   2721.410618084209
------------------------------
------ Accuracy & Error ------ for N:  12  & L:  40
0.9938716525150924   &   2759.9545004885995
-----------------------

In [15]:
# Advanced model - 2

# Model Hyperparameters

N, M = 15, 13
for i in range(10):
    
    rf_cases = RandomForestRegressor(n_estimators = N, max_depth = M)
    rf_cases.fit(X_train, y_train)
    rf_pred = rf_cases.predict(X_test)

    rf_error = mean_squared_error(y_test, rf_pred, squared = False)
    print('------ Accuracy & Error ------ for N: ', N, " & M: ", M)
    print(r2_score(y_test, rf_pred),"  &  ", rf_error)
    print('------------------------------')
    N = N + 3
    M = M + 1


------ Accuracy & Error ------ for N:  15  & M:  13
0.9946071942928899   &   2589.033191891876
------------------------------
------ Accuracy & Error ------ for N:  18  & M:  14
0.9948246835138399   &   2536.2887072121116
------------------------------
------ Accuracy & Error ------ for N:  21  & M:  15
0.994964642112095   &   2501.7586096967643
------------------------------
------ Accuracy & Error ------ for N:  24  & M:  16
0.995037265062624   &   2483.652155723929
------------------------------
------ Accuracy & Error ------ for N:  27  & M:  17
0.9950904698452918   &   2470.302837700064
------------------------------
------ Accuracy & Error ------ for N:  30  & M:  18
0.9951106471086836   &   2465.221366835189
------------------------------
------ Accuracy & Error ------ for N:  33  & M:  19
0.9951329006622072   &   2459.604825942835
------------------------------
------ Accuracy & Error ------ for N:  36  & M:  20
0.9951272790114783   &   2461.024875950277
-----------------------