**Vaccination Checkpoint Data Extraction**

Andy Orfalea

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score
# Let pandas show up to 500 columns when displaying a DataFrame instead of
# truncating wide frames.
pd.set_option('display.max_columns', 500)

In [2]:
# Read the county-level vaccination CSV from the local source_data folder.
raw_csv_path = './source_data/COVID-19_Vaccinations_in_the_United_States_County.csv'
df = pd.read_csv(raw_csv_path)

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [3]:
# Convert the 'Date' column to pandas datetimes so rows can be selected by date later.
parsed_dates = pd.to_datetime(df['Date'])
df['Date'] = parsed_dates

In [None]:
# List the dtype of every column.
df.dtypes

In [4]:
# Append '_<Recip_State>' to each county name (e.g. 'Mason County_MI').
# NOTE: this overwrites 'Recip_County' in place, so re-running the cell on the
# same frame appends the state suffix a second time.
state_suffix = '_' + df['Recip_State']
df['Recip_County'] = df['Recip_County'] + state_suffix

In [5]:
# Recip_State codes to exclude; the intent is to keep only the 50 states + DC.
drop_states = ['AS','FM','GU','MH','MP','PR','PW','VI','UNK']
# Vectorized membership test: keep every row whose state code is not listed above.
df = df[~df['Recip_State'].isin(drop_states)]

In [None]:
# Mean of every numeric column per Metro_status group, transposed so the groups
# become columns. numeric_only=True is required on pandas >= 2.0, where calling
# mean() on a frame that still contains string columns raises a TypeError.
df.groupby('Metro_status').mean(numeric_only=True).T

In [None]:
# One row per county holding the mean of each numeric column over all dates,
# ordered by Completeness_pct. numeric_only=True keeps the aggregation working on
# pandas >= 2.0 (string columns would otherwise raise) and guarantees that only
# numeric features reach the scaler and clustering steps that consume cluster_df.
cluster_df = (
    df.groupby('Recip_County')
      .mean(numeric_only=True)
      .sort_values(by = 'Completeness_pct')
)

In [None]:
# Discard every county row that has at least one null; the clustering models
# fitted below cannot handle missing values.
cluster_df = cluster_df.dropna()

In [None]:
# Standardize all columns of cluster_df, then fit DBSCAN with its default
# hyper-parameters on the scaled matrix.
ss = StandardScaler().fit(cluster_df)
X_scaled = ss.transform(cluster_df)
dbscan = DBSCAN().fit(X_scaled)
dbscan  # display the fitted estimator

In [None]:
# Cluster label DBSCAN assigned to each row of X_scaled
# (scikit-learn uses -1 for points it treats as noise).
dbscan.labels_

In [None]:
# Fit a 3-cluster KMeans on the scaled features; the fixed random_state makes
# the assignment reproducible between runs.
km = KMeans(n_clusters=3, random_state=42).fit(X_scaled)
km  # display the fitted estimator

In [None]:
# Store each county's KMeans label in a new 'cluster' column.
# NOTE: cluster_df now contains 'cluster', so re-running the scaling cell after
# this one would feed the label back in as a feature.
cluster_df = cluster_df.assign(cluster=km.labels_)

In [None]:
# Mean of every column within each KMeans cluster.
cluster_df.groupby('cluster').mean()

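### Filter to four metrics: first-dose recipients (`Administered_Dose1_Recip`), first-dose population percentage (`Administered_Dose1_Pop_Pct`), series-complete population percentage (`Series_Complete_Pop_Pct`), and booster-dose percentage (`Booster_Doses_Vax_Pct`). The resulting DataFrame should hold one set of these columns for each of three checkpoint dates: 03/01/2021, 09/01/2021, and 03/01/2022 (column suffixes `030121`, `090121`, `030122`).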

In [7]:
# Source metric columns mapped to the base names they get in the checkpoint
# table; each base name is suffixed with the checkpoint label (MMDDYY).
checkpoint_metrics = {
    'Administered_Dose1_Recip': 'Dose1_Recip',
    'Administered_Dose1_Pop_Pct': 'Dose1_Pop_Pct',
    'Series_Complete_Pop_Pct': 'Series_Complete_Pop_Pct',
    'Booster_Doses_Vax_Pct': 'Booster_Doses_Vax_Pct',
}


def _checkpoint_slice(source_df, date, label, keep_fips=False):
    """Return the tracked metrics for a single checkpoint date.

    Parameters
    ----------
    source_df : pandas.DataFrame
        Frame with a 'Date' column, 'Recip_County' (plus 'FIPS' when
        ``keep_fips`` is true) and the columns named in ``checkpoint_metrics``.
    date : str
        Date compared against ``source_df['Date']`` to select the rows.
    label : str
        Suffix appended to every metric column, e.g. '030121'.
    keep_fips : bool, default False
        Also carry the 'FIPS' column (needed on only one slice, since the
        slices are merged afterwards).

    Returns
    -------
    pandas.DataFrame
        Rows for ``date`` with the identifier column(s) followed by the
        metric columns renamed to ``<base name>_<label>``.
    """
    id_cols = ['FIPS', 'Recip_County'] if keep_fips else ['Recip_County']
    wanted = id_cols + list(checkpoint_metrics)
    snapshot = source_df.loc[source_df['Date'] == date, wanted]
    renamed = {src: f'{base}_{label}' for src, base in checkpoint_metrics.items()}
    return snapshot.rename(columns=renamed)


# One slice per checkpoint date; FIPS is kept on the first slice only.
df_030121 = _checkpoint_slice(df, '03/01/2021', '030121', keep_fips=True)
df_090121 = _checkpoint_slice(df, '09/01/2021', '090121')
df_030122 = _checkpoint_slice(df, '03/01/2022', '030122')

# Inner-join the three slices on the county key (named explicitly rather than
# relying on pandas inferring the shared column), so a county missing from any
# checkpoint is dropped. The county name then becomes the index.
data_3dates_df = (
    df_030121
    .merge(df_090121, on='Recip_County')
    .merge(df_030122, on='Recip_County')
    .set_index('Recip_County')
)
data_3dates_df.head()

Unnamed: 0_level_0,FIPS,Dose1_Recip_030121,Dose1_Pop_Pct_030121,Series_Complete_Pop_Pct_030121,Booster_Doses_Vax_Pct_030121,Dose1_Recip_090121,Dose1_Pop_Pct_090121,Series_Complete_Pop_Pct_090121,Booster_Doses_Vax_Pct_090121,Dose1_Recip_030122,Dose1_Pop_Pct_030122,Series_Complete_Pop_Pct_030122,Booster_Doses_Vax_Pct_030122
Recip_County,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Mason County_MI,26105,5795.0,19.9,12.1,,,0.0,51.7,,17777.0,61.0,60.1,58.2
Colusa County_CA,6011,2666.0,12.4,3.8,,11806.0,54.8,45.8,,14465.0,67.1,59.7,33.6
Henry County_AL,1067,2369.0,13.8,6.1,,6804.0,39.5,30.7,,9335.0,54.3,44.9,33.8
Pulaski County_KY,21199,8231.0,12.7,6.1,,28835.0,44.4,38.2,,33758.0,52.0,45.6,40.9
Franklin city_VA,51620,0.0,0.0,0.0,,0.0,0.0,0.0,,0.0,0.0,0.0,0.0


In [8]:
# (rows, columns) of each checkpoint slice and of the merged table, to see how
# many counties survive the merge.
df_030121.shape,df_090121.shape, df_030122.shape, data_3dates_df.shape

((3191, 6), (3191, 5), (3193, 5), (3191, 13))

In [9]:
# Impute missing values with the number 0. Filling with the string '0' would
# turn the float metric columns into mixed float/str object columns and break
# later arithmetic on them.
# NOTE(review): this follows the original assumption that NaN stands in for 0
# in this dataset — confirm, since some rows have NaN counts next to non-zero
# percentages.
data_3dates_df = data_3dates_df.fillna(0)

In [10]:
# Number of nulls remaining in each column after imputation.
data_3dates_df.isnull().sum()

FIPS                              0
Dose1_Recip_030121                0
Dose1_Pop_Pct_030121              0
Series_Complete_Pop_Pct_030121    0
Booster_Doses_Vax_Pct_030121      0
Dose1_Recip_090121                0
Dose1_Pop_Pct_090121              0
Series_Complete_Pop_Pct_090121    0
Booster_Doses_Vax_Pct_090121      0
Dose1_Recip_030122                0
Dose1_Pop_Pct_030122              0
Series_Complete_Pop_Pct_030122    0
Booster_Doses_Vax_Pct_030122      0
dtype: int64

In [11]:
# Lower-case the county labels with the vectorized string accessor; unlike a
# per-element lambda it leaves a missing label as missing instead of raising.
data_3dates_df.index = data_3dates_df.index.str.lower()

In [12]:
# Remove the literal text ' county' from the labels (e.g. 'mason county_mi'
# becomes 'mason_mi'; labels without it, such as 'franklin city_va', are left
# unchanged). regex=False forces a plain substring replacement instead of
# depending on the default, which differs between pandas versions.
data_3dates_df.index = data_3dates_df.index.str.replace(' county', '', regex=False)

In [13]:
# Write the cleaned checkpoint table (county label as the index column) to the
# clean_data folder.
checkpoints_csv_path = './clean_data/checkpoints_rev.csv'
data_3dates_df.to_csv(checkpoints_csv_path)