In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score
pd.set_option('display.max_columns', 500)

In [2]:
# Load the county-level COVID-19 vaccination table from a path relative to the notebook.
# NOTE(review): the preview of this frame shows FIPS holding both numeric codes and 'UNK';
# consider passing dtype={'FIPS': str} so codes keep any leading zeros — confirm before changing.
df = pd.read_csv('./source_data/COVID-19_Vaccinations_in_the_United_States_County.csv')

In [3]:
# Dimensions of the loaded table as (rows, columns).
df.shape

(1486676, 53)

In [4]:
# Preview the first rows to check column names and value formats.
df.head()

Unnamed: 0,Date,FIPS,MMWR_week,Recip_County,Recip_State,Completeness_pct,Administered_Dose1_Recip,Administered_Dose1_Pop_Pct,Administered_Dose1_Recip_5Plus,Administered_Dose1_Recip_5PlusPop_Pct,Administered_Dose1_Recip_12Plus,Administered_Dose1_Recip_12PlusPop_Pct,Administered_Dose1_Recip_18Plus,Administered_Dose1_Recip_18PlusPop_Pct,Administered_Dose1_Recip_65Plus,Administered_Dose1_Recip_65PlusPop_Pct,Series_Complete_Yes,Series_Complete_Pop_Pct,Series_Complete_5Plus,Series_Complete_5PlusPop_Pct,Series_Complete_12Plus,Series_Complete_12PlusPop_Pct,Series_Complete_18Plus,Series_Complete_18PlusPop_Pct,Series_Complete_65Plus,Series_Complete_65PlusPop_Pct,Booster_Doses,Booster_Doses_Vax_Pct,Booster_Doses_12Plus,Booster_Doses_12Plus_Vax_Pct,Booster_Doses_18Plus,Booster_Doses_18Plus_Vax_Pct,Booster_Doses_50Plus,Booster_Doses_50Plus_Vax_Pct,Booster_Doses_65Plus,Booster_Doses_65Plus_Vax_Pct,SVI_CTGY,Series_Complete_Pop_Pct_SVI,Series_Complete_5PlusPop_Pct_SVI,Series_Complete_12PlusPop_Pct_SVI,Series_Complete_18PlusPop_Pct_SVI,Series_Complete_65PlusPop_Pct_SVI,Metro_status,Series_Complete_Pop_Pct_UR_Equity,Series_Complete_5PlusPop_Pct_UR_Equity,Series_Complete_12PlusPop_Pct_UR_Equity,Series_Complete_18PlusPop_Pct_UR_Equity,Series_Complete_65PlusPop_Pct_UR_Equity,Census2019,Census2019_5PlusPop,Census2019_12PlusPop,Census2019_18PlusPop,Census2019_65PlusPop
0,03/10/2022,19041,10,Clay County,IA,97.4,8985.0,56.1,8985.0,59.9,8781.0,64.5,8322.0,67.2,3149.0,93.0,8256,51.5,8256.0,55.0,8088.0,59.4,7680,62.1,2948,87.1,4356.0,52.8,4356.0,53.9,4302.0,56.0,3448.0,68.9,2320.0,78.7,B,8.0,8.0,8.0,8.0,8.0,Non-metro,8.0,8.0,8.0,8.0,8.0,16016.0,15005.0,13610.0,12375.0,3386.0
1,03/10/2022,UNK,10,Unknown County,KY,94.0,195520.0,0.0,195475.0,0.0,192474.0,0.0,184407.0,0.0,51898.0,0.0,152031,0.0,152021.0,0.0,149745.0,0.0,143320,0.0,38454,0.0,56751.0,37.3,56748.0,37.9,55827.0,39.0,41922.0,51.5,25568.0,66.5,,,,,,,,,,,,,,,,,
2,03/10/2022,28101,10,Newton County,MS,98.1,12815.0,61.0,12811.0,65.3,12529.0,71.2,11843.0,75.4,3763.0,95.0,11098,52.8,11098.0,56.6,10927.0,62.1,10348,65.9,3484,95.0,3832.0,34.5,3832.0,35.1,3754.0,36.3,3057.0,47.2,1956.0,56.1,D,16.0,16.0,16.0,16.0,16.0,Non-metro,8.0,8.0,8.0,8.0,8.0,21018.0,19614.0,17586.0,15708.0,3585.0
3,03/10/2022,UNK,10,Unknown County,GA,89.7,942846.0,0.0,941952.0,0.0,898355.0,0.0,839911.0,0.0,128901.0,0.0,591326,0.0,591191.0,0.0,563339.0,0.0,524225,0.0,71397,0.0,134445.0,22.7,134345.0,23.8,129064.0,24.6,72983.0,34.7,31647.0,44.3,,,,,,,,,,,,,,,,,
4,03/10/2022,29133,10,Mississippi County,MO,91.3,6385.0,48.4,6385.0,51.1,6327.0,55.4,6101.0,58.5,1981.0,82.8,5636,42.8,5636.0,45.1,5604.0,49.1,5425,52.0,1781,74.4,2064.0,36.6,2064.0,36.8,2052.0,37.8,1549.0,45.4,878.0,49.3,D,15.0,15.0,15.0,16.0,16.0,Non-metro,7.0,7.0,7.0,8.0,8.0,13180.0,12485.0,11412.0,10436.0,2393.0


In [5]:
# Convert the 'Date' strings to datetimes.
# The format is given explicitly (MM/DD/YYYY, as in the previewed rows, e.g. '03/10/2022')
# so parsing does not depend on per-value inference: it is unambiguous about month-vs-day
# order and much faster on a table of this size. A value that does not match raises.
df['Date'] = pd.to_datetime(df['Date'], format='%m/%d/%Y')

In [None]:
# Inspect per-column dtypes (e.g. to check how 'Date' and 'FIPS' are typed).
df.dtypes

In [6]:
# County names repeat across states, so make the label unique by appending
# the state code: '<county>_<state>'.
# Gotcha: this overwrites 'Recip_County', so re-running the cell appends the
# state suffix a second time.
df['Recip_County'] = df['Recip_County'].str.cat(df['Recip_State'], sep='_')

In [7]:
# Jurisdiction codes to exclude (territories and the 'UNK' placeholder), so that
# only the 50 states + DC remain.
drop_states = ['AS','FM','GU','MH','MP','PR','PW','VI','UNK']
# Keep every row whose state code is not in the exclusion list.
df = df[~df['Recip_State'].isin(drop_states)]

In [8]:
# Average every numeric column within each Metro_status group; transposed so the
# many metrics read as rows and the groups as columns.
# numeric_only=True is required because the frame also holds string and datetime
# columns (county, state, FIPS, Date, ...): pandas >= 2.0 raises a TypeError on them
# instead of silently dropping them as older versions did.
df.groupby('Metro_status').mean(numeric_only=True).T

Metro_status,Metro,Non-metro
MMWR_week,24.388521,24.388521
Completeness_pct,79.315544,77.248979
Administered_Dose1_Recip,107986.335913,8199.864311
Administered_Dose1_Pop_Pct,36.854922,31.303182
Administered_Dose1_Recip_5Plus,174974.654839,12651.318842
Administered_Dose1_Recip_5PlusPop_Pct,63.66178,53.126906
Administered_Dose1_Recip_12Plus,108296.50765,8252.353173
Administered_Dose1_Recip_12PlusPop_Pct,41.766313,35.159114
Administered_Dose1_Recip_18Plus,101350.948589,7870.875607
Administered_Dose1_Recip_18PlusPop_Pct,43.472565,37.10531


In [None]:
# Collapse the per-date records to one row per county: the mean of every numeric
# column over all dates, ordered by mean Completeness_pct.
# numeric_only=True keeps this working on pandas >= 2.0, where averaging the
# string/datetime columns raises a TypeError instead of dropping them.
cluster_df = (
    df.groupby('Recip_County')
      .mean(numeric_only=True)
      .sort_values(by='Completeness_pct')
)

In [None]:
# Discard any county row that has a missing value; the clustering estimators
# fitted on this frame (e.g. DBSCAN) cannot handle nulls.
cluster_df = cluster_df.dropna()

In [None]:
# Standardize every feature first so that columns with large magnitudes
# (raw counts) do not dominate the distance computations used by the clusterers.
ss = StandardScaler()
X_scaled = ss.fit(cluster_df).transform(cluster_df)

# Density-based clustering with scikit-learn's default hyperparameters.
# NOTE(review): the defaults are untuned for this feature space — confirm they
# yield meaningful clusters rather than mostly noise.
dbscan = DBSCAN()
dbscan.fit(X_scaled)

In [None]:
# Cluster label per row of X_scaled from the fitted DBSCAN model
# (scikit-learn marks noise points with -1).
dbscan.labels_

In [None]:
# Partition the standardized county features into 3 clusters.
# random_state fixes the centroid initialization. n_init is pinned to 10 because
# scikit-learn's default changed across versions (10 -> 'auto'); stating it keeps
# the number of restarts, and therefore the resulting labels, the same everywhere
# and avoids the FutureWarning emitted by intermediate versions.
km = KMeans(n_clusters=3, n_init=10, random_state=42)
km.fit(X_scaled)

In [None]:
# Attach each county's k-means cluster label as a 'cluster' column.
# Rows of cluster_df are in the same order as the rows of X_scaled the model was fit on.
cluster_df = cluster_df.assign(cluster=km.labels_)

In [None]:
# Profile the clusters: mean of every feature within each k-means cluster.
cluster_df.groupby('cluster').mean()

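### Filter to only Administered Dose 1 Recipients, Administered Dose 1 Pop Pct, Series Complete Pop Pct, and Booster Doses Vax Pct. The resulting DataFrame should have one set of these columns for each of 3 specific checkpoint dates: 03/01/2021, 09/01/2021, and 03/01/2022 (column suffixes `030121`, `090121`, `030122`).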

In [None]:
# Build one wide table holding the same four metrics captured on three checkpoint dates.
# NOTE(review): the section heading asks for "Series Complete Pop PCT", but the column
# selected here is 'Series_Complete_Pop_Pct_SVI'. Confirm the SVI variant (rather than
# 'Series_Complete_Pop_Pct') is intended; it is kept so the output column names are unchanged.
CHECKPOINT_METRICS = (
    'Administered_Dose1_Recip',
    'Administered_Dose1_Pop_Pct',
    'Series_Complete_Pop_Pct_SVI',
    'Booster_Doses_Vax_Pct',
)


def checkpoint_snapshot(frame, date, suffix, metrics=CHECKPOINT_METRICS):
    """Return the rows of `frame` recorded on `date`, reduced to the checkpoint columns.

    Parameters
    ----------
    frame : DataFrame with 'Date', 'Recip_County' and every column named in `metrics`.
    date : value compared against frame['Date'] with ``==`` (e.g. '03/01/2021').
    suffix : label appended to each metric column as ``<metric>_<suffix>``.
    metrics : names of the metric columns to keep.

    Returns
    -------
    DataFrame with 'Recip_County' followed by the renamed metric columns.
    """
    on_date = frame['Date'] == date
    snapshot = frame.loc[on_date, ['Recip_County'] + list(metrics)]
    return snapshot.rename(columns={metric: f'{metric}_{suffix}' for metric in metrics})


df_030121 = checkpoint_snapshot(df, '03/01/2021', '030121')
df_090121 = checkpoint_snapshot(df, '09/01/2021', '090121')
df_030122 = checkpoint_snapshot(df, '03/01/2022', '030122')

# After renaming, 'Recip_County' is the only column the three tables share, so it is the
# join key; naming it explicitly guards against an accidental join on other columns.
# Inner joins keep only counties that have a row on all three dates.
data_3dates_df = (
    df_030121
    .merge(df_090121, on='Recip_County', how='inner')
    .merge(df_030122, on='Recip_County', how='inner')
    .set_index('Recip_County')  # index by the county label
)
data_3dates_df.head()

In [None]:
# Compare the shapes of the three per-date tables with the merged table. Because the merge
# keeps only counties present on all three dates, a merged row count below the per-date
# counts means counties were dropped (assuming one row per county per date — TODO confirm).
df_030121.shape,df_090121.shape, df_030122.shape, data_3dates_df.shape

In [None]:
# In this data, NaNs are stand-ins for zero, so impute them with 0.
# The fill value must be the number 0, not the string '0': a string fill turns the
# numeric columns into mixed object dtype and breaks later arithmetic and aggregation.
data_3dates_df = data_3dates_df.fillna(0)

In [None]:
# Sanity check: number of missing values remaining in each column.
data_3dates_df.isnull().sum()

In [None]:
# Save the checkpoint table; the Recip_County index is written as the first CSV column.
# NOTE(review): assumes the ./clean_data/ directory already exists.
data_3dates_df.to_csv('./clean_data/checkpoint_cols.csv')