In [1]:
import pandas as pd
import datetime
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score,train_test_split, KFold, cross_val_predict
from sklearn.metrics import mean_squared_error,r2_score,roc_curve,auc,precision_recall_curve, accuracy_score, \
recall_score, precision_score, confusion_matrix, mean_absolute_error
from sklearn.tree import DecisionTreeRegressor,DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, ParameterGrid, StratifiedKFold, RandomizedSearchCV
from sklearn.ensemble import GradientBoostingRegressor,GradientBoostingClassifier, BaggingRegressor,BaggingClassifier,RandomForestRegressor,RandomForestClassifier,AdaBoostRegressor,AdaBoostClassifier
from sklearn.linear_model import LinearRegression,LogisticRegression
from sklearn.neighbors import KNeighborsRegressor
import itertools as it
import time as time
import xgboost as xgb

In [2]:
# Read the vaccination and death CSV files from the local data/ folder.
vaccine_csv = 'data/COVID-19_Vaccinations_in_the_United_States_Jurisdiction_20240111.csv'
death_csv = 'data/deaths_with_abbs.csv'

vaccine_data = pd.read_csv(vaccine_csv)
death_data = pd.read_csv(death_csv)

In [3]:
# Parse 'Date' once, store the parsed values back, and derive the calendar
# year and month columns from the same parsed index.
parsed_dates = pd.DatetimeIndex(vaccine_data['Date'])
vaccine_data['Date'] = parsed_dates
vaccine_data['year'] = parsed_dates.year
vaccine_data['month'] = parsed_dates.month

## change from states to regions

In [4]:
# Two-letter jurisdiction codes grouped into the regions used in this notebook.
east = 'CT MA ME NH NJ NY PA RI VT'.split()
midwest = 'IA IL IN KS MI MN MO ND NE OH SD WI'.split()
south = 'AL AR DC DE FL GA KY LA MD MS NC OK SC TN TX VA WV'.split()
west = 'AK AZ CA CO HI ID MT NM NV OR UT WA WY'.split()
pr = ['PR']  # kept as its own single-member group

In [5]:
def _vaccine_region(code):
    """Return the region label for a Location code, or 'Other' when it is in no list."""
    # Groups are checked in this order; the first list containing the code wins.
    ordered_groups = (
        ("East", east),
        ("Midwest", midwest),
        ("South", south),
        ("West", west),
        ("PR", pr),
    )
    for label, members in ordered_groups:
        if code in members:
            return label
    return 'Other'


# Label every vaccination row with its region.
vaccine_data["region"] = vaccine_data["Location"].apply(_vaccine_region)

In [6]:
# Drop rows whose Location is 'US' and rows that were not assigned to any region.
is_us_row = vaccine_data.Location == 'US'
is_unassigned = vaccine_data.region == 'Other'
vaccine_data = vaccine_data[~is_us_row & ~is_unassigned]

## change month to season

In [7]:
# Calendar month numbers that make up each season.
fall = list(range(9, 12))
winter = [12, 1, 2]
spring = list(range(3, 6))
summer = list(range(6, 9))

In [8]:
# Build a month -> season lookup from the season lists (first list containing
# a month wins), then label each row; months found in no list become 'Other'.
season_by_month = {}
for season_label, season_months in (
    ("fall", fall),
    ("winter", winter),
    ("spring", spring),
    ("summer", summer),
):
    for month_number in season_months:
        season_by_month.setdefault(month_number, season_label)

vaccine_data["season"] = vaccine_data["month"].apply(
    lambda month_number: season_by_month.get(month_number, 'Other')
)

## merge datasets

In [9]:
# Remove the rows whose State is the nationwide "United States" entry.
death_data = death_data[death_data["State"].ne("United States")]

In [10]:
# Region lists redefined for the deaths table: same codes as before, plus the
# full names 'New York City' and 'District of Columbia'.
east = ['CT', 'MA', 'ME', 'NH', 'NJ', 'NY', 'New York City', 'PA', 'RI', 'VT']
midwest = ['IA', 'IL', 'IN', 'KS', 'MI', 'MN', 'MO', 'ND', 'NE', 'OH', 'SD', 'WI']
south = [
    'AL', 'AR', 'DC', 'District of Columbia', 'DE', 'FL', 'GA', 'KY', 'LA',
    'MD', 'MS', 'NC', 'OK', 'SC', 'TN', 'TX', 'VA', 'WV',
]
west = ['AK', 'AZ', 'CA', 'CO', 'HI', 'ID', 'MT', 'NM', 'NV', 'OR', 'UT', 'WA', 'WY']

In [11]:
# Flatten the region lists into one Location -> region dictionary (the first
# list containing a value wins), then label each death row. Locations that
# appear in no list, including missing ones, become 'Other'.
region_by_location = {}
for region_label, region_members in (
    ("East", east),
    ("Midwest", midwest),
    ("South", south),
    ("West", west),
):
    for member in region_members:
        region_by_location.setdefault(member, region_label)

death_data["region"] = death_data["Location"].map(region_by_location).fillna('Other')

In [12]:
# Assign the region directly for jurisdictions identified by their State name.
for state_name, region_label in (
    ("District of Columbia", "South"),
    ("New York City", "East"),
    ("Puerto Rico", "PR"),
):
    death_data.loc[death_data.State == state_name, 'region'] = region_label

In [13]:
# Keep only rows whose Year is a string without a '/'; rows with a missing
# Year are dropped too (na=True marks them as "contains", so they are excluded).
# Year is then stored as an integer.
has_plain_year = ~death_data["Year"].str.contains('/', na=True)
death_data = death_data[has_plain_year]
death_data = death_data.assign(Year=death_data['Year'].astype(int))
death_data.head()

Unnamed: 0,Data as of,Start Date,End Date,Group,Year,Month,MMWR Week,Week Ending Date,State,COVID-19 Deaths,Total Deaths,Percent of Expected Deaths,Pneumonia Deaths,Pneumonia and COVID-19 Deaths,Influenza Deaths,"Pneumonia, Influenza, or COVID-19 Deaths",Footnote,Location,region
211,01/11/2024,01/05/2020,01/11/2020,By Week,2020,,2.0,01/11/2020,Alabama,0.0,1127.0,96.0,79.0,0.0,10.0,89.0,,AL,South
212,01/11/2024,01/12/2020,01/18/2020,By Week,2020,,3.0,01/18/2020,Alabama,0.0,1039.0,91.0,62.0,0.0,,65.0,One or more data cells have counts between 1-9...,AL,South
213,01/11/2024,01/19/2020,01/25/2020,By Week,2020,,4.0,01/25/2020,Alabama,,1056.0,94.0,62.0,0.0,,70.0,One or more data cells have counts between 1-9...,AL,South
214,01/11/2024,01/26/2020,02/01/2020,By Week,2020,,5.0,02/01/2020,Alabama,0.0,1026.0,94.0,56.0,0.0,14.0,70.0,,AL,South
215,01/11/2024,02/02/2020,02/08/2020,By Week,2020,,6.0,02/08/2020,Alabama,0.0,1120.0,101.0,61.0,0.0,10.0,71.0,,AL,South


In [14]:
# Set the Location code for jurisdictions identified by their State name.
# NOTE(review): 'New York City' receives the code 'NY', so it shares that code
# with any rows already labelled 'NY'; a join keyed on Location would then match
# both sets of rows for the same week -- confirm this duplication is intended.
location_by_state = {
    "Puerto Rico": "PR",
    "District of Columbia": "DC",
    "New York City": "NY",
}
for state_name, location_code in location_by_state.items():
    death_data.loc[death_data.State == state_name, 'Location'] = location_code

In [15]:
# Left-join the death rows onto the vaccination rows on week, location and year.
# Both frames have a 'region' column, so the result carries region_x / region_y.
vaccine_keys = ['MMWR_week', 'Location', 'year']
death_keys = ['MMWR Week', 'Location', 'Year']
combined_data = pd.merge(
    vaccine_data,
    death_data,
    how='left',
    left_on=vaccine_keys,
    right_on=death_keys,
)

In [16]:
# Retain only the merged rows that have a COVID-19 death count.
has_death_count = combined_data['COVID-19 Deaths'].notna()
covid_deaths = combined_data[has_death_count]
covid_deaths.shape

(27091, 131)

In [17]:
# Show every column name of the merged table as a plain list.
covid_deaths.columns.tolist()

['Date',
 'MMWR_week',
 'Location',
 'Distributed',
 'Distributed_Janssen',
 'Distributed_Moderna',
 'Distributed_Pfizer',
 'Distributed_Novavax',
 'Distributed_Unk_Manuf',
 'Dist_Per_100K',
 'Distributed_Per_100k_5Plus',
 'Distributed_Per_100k_12Plus',
 'Distributed_Per_100k_18Plus',
 'Distributed_Per_100k_65Plus',
 'Administered',
 'Administered_5Plus',
 'Administered_12Plus',
 'Administered_18Plus',
 'Administered_65Plus',
 'Administered_Janssen',
 'Administered_Moderna',
 'Administered_Pfizer',
 'Administered_Novavax',
 'Administered_Unk_Manuf',
 'Admin_Per_100K',
 'Admin_Per_100k_5Plus',
 'Admin_Per_100k_12Plus',
 'Admin_Per_100k_18Plus',
 'Admin_Per_100k_65Plus',
 'Recip_Administered',
 'Administered_Dose1_Recip',
 'Administered_Dose1_Pop_Pct',
 'Administered_Dose1_Recip_5Plus',
 'Administered_Dose1_Recip_5PlusPop_Pct',
 'Administered_Dose1_Recip_12Plus',
 'Administered_Dose1_Recip_12PlusPop_Pct',
 'Administered_Dose1_Recip_18Plus',
 'Administered_Dose1_Recip_18PlusPop_Pct',
 'Ad

In [18]:
# Remove the columns that came from the deaths table (everything except the
# COVID-19 death count itself) together with the duplicate 'region_y' column.
columns_to_drop = [
    'Data as of', 'Start Date', 'End Date', 'Group', 'Year', 'Month',
    'MMWR Week', 'Week Ending Date', 'State',
    'Total Deaths', 'Percent of Expected Deaths', 'Pneumonia Deaths',
    'Pneumonia and COVID-19 Deaths', 'Influenza Deaths',
    'Pneumonia, Influenza, or COVID-19 Deaths',
    'Footnote', 'region_y',
]
covid_deaths = covid_deaths.drop(columns=columns_to_drop)

In [19]:
# Rename the death-count column to a name without spaces or hyphens.
covid_deaths = covid_deaths.rename({'COVID-19 Deaths': 'covid19_deaths'}, axis='columns')

In [20]:
# One-hot encode the region and season columns.
categorical_columns = ['region_x', 'season']
covid_deaths = pd.get_dummies(covid_deaths, columns=categorical_columns)

In [21]:
# Correlate every numeric/boolean column with the COVID-19 death count.
# Non-numeric columns (such as 'Date' and 'Location') are excluded explicitly:
# since pandas 2.0 `corrwith` defaults to numeric_only=False and no longer
# drops them silently, so passing the full frame can raise there. Selecting
# the columns up front gives the same result on older and newer pandas.
numeric_columns = covid_deaths.select_dtypes(include=['number', 'bool'])
correlations = numeric_columns.corrwith(covid_deaths["covid19_deaths"])

# One row per feature, shown from most positively to most negatively correlated.
correlations_df = pd.DataFrame({'feature': correlations.index, 'correlation': correlations.values})
correlations_df.sort_values(by = ['correlation'], ascending = False)

Unnamed: 0,feature,correlation
109,covid19_deaths,1.000000
71,Additional_Doses_5Plus,0.719059
88,Second_Booster_65Plus,0.703073
105,Bivalent_Booster_65Plus,0.684440
86,Second_Booster_50Plus,0.675014
...,...,...
26,Admin_Per_100k_65Plus,-0.210792
74,Additional_Doses_12Plus_Vax_Pct,-0.214502
47,Series_Complete_65PlusPop_Pct,-0.218909
37,Administered_Dose1_Recip_65PlusPop_Pct,-0.224179


In [22]:
# Print the correlation row of each region dummy column, one frame per feature.
region_features = (
    'region_x_East',
    'region_x_Midwest',
    'region_x_South',
    'region_x_West',
    'region_x_PR',
)
for region_feature in region_features:
    print(correlations_df.loc[correlations_df['feature'] == region_feature])

           feature  correlation
110  region_y_East    -0.037023
              feature  correlation
111  region_y_Midwest    -0.040433
            feature  correlation
113  region_y_South     0.105632
           feature  correlation
114  region_y_West    -0.028365
         feature  correlation
112  region_y_PR    -0.054015


In [23]:
# Print the correlation row of each season dummy column, one frame per feature.
season_features = ('season_fall', 'season_winter', 'season_spring', 'season_summer')
for season_feature in season_features:
    print(correlations_df.loc[correlations_df['feature'] == season_feature])

         feature  correlation
115  season_fall     0.026296
           feature  correlation
118  season_winter     0.273261
           feature  correlation
116  season_spring    -0.210485
           feature  correlation
117  season_summer    -0.101769


In [24]:
# Keep features whose absolute correlation with the death count is at least
# the threshold, and display them from highest to lowest correlation.
min_abs_correlation = 0.05
is_correlated = correlations_df['correlation'].abs() >= min_abs_correlation
correlated_features = correlations_df.loc[is_correlated]
correlated_features.sort_values(by='correlation', ascending=False)

Unnamed: 0,feature,correlation
109,covid19_deaths,1.000000
71,Additional_Doses_5Plus,0.719059
88,Second_Booster_65Plus,0.703073
105,Bivalent_Booster_65Plus,0.684440
86,Second_Booster_50Plus,0.675014
...,...,...
116,season_spring,-0.210485
26,Admin_Per_100k_65Plus,-0.210792
74,Additional_Doses_12Plus_Vax_Pct,-0.214502
47,Series_Complete_65PlusPop_Pct,-0.218909


In [25]:
# Save the prepared table; the DataFrame index is written out as the first column.
covid_deaths.to_csv('covid_deaths2.csv', index=True)