In [2]:
## Data Cleaning // Alanis

In [40]:
# Dependencies
import pandas as pd
import warnings
# NOTE(review): with no category/module argument this suppresses every warning
# for the rest of the notebook, including pandas' SettingWithCopyWarning and
# deprecation notices — consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

In [42]:
# Load the raw vaccination records from the project's Data folder.
vaccines = pd.read_csv(filepath_or_buffer='Data/Covid-19_Vaccinations_Data.csv')

# Preview the first five rows to confirm the file parsed as expected.
vaccines.head(n=5)

Unnamed: 0,Date,FIPS,MMWR_week,Recip_County,Recip_State,Completeness_pct,Administered_Dose1_Recip,Administered_Dose1_Pop_Pct,Administered_Dose1_Recip_5Plus,Administered_Dose1_Recip_5PlusPop_Pct,...,Census2019_18PlusPop,Census2019_65PlusPop,Bivalent_Booster_5Plus,Bivalent_Booster_5Plus_Pop_Pct,Bivalent_Booster_12Plus,Bivalent_Booster_12Plus_Pop_Pct,Bivalent_Booster_18Plus,Bivalent_Booster_18Plus_Pop_Pct,Bivalent_Booster_65Plus,Bivalent_Booster_65Plus_Pop_Pct
0,05/10/2023,55129,19,Washburn County,WI,96.7,11123.0,70.8,11097.0,73.9,...,12758.0,4304.0,3978.0,26.5,3955.0,28.6,3899.0,30.6,2588.0,60.1
1,05/10/2023,19173,19,Taylor County,IA,97.3,3149.0,51.4,3145.0,55.0,...,4687.0,1392.0,841.0,14.7,841.0,16.3,834.0,17.8,588.0,42.2
2,05/10/2023,36059,19,Nassau County,NY,97.5,1391226.0,95.0,1384503.0,95.0,...,1065968.0,246690.0,236202.0,18.4,231748.0,19.8,224931.0,21.1,109947.0,44.6
3,05/10/2023,48281,19,Lampasas County,TX,98.9,11678.0,54.5,11660.0,57.4,...,16799.0,4322.0,1951.0,9.6,1947.0,10.5,1925.0,11.5,1229.0,28.4
4,05/10/2023,26145,19,Saginaw County,MI,94.0,104075.0,54.6,,,...,149676.0,37414.0,15618.0,8.7,15448.0,9.4,15112.0,10.1,7921.0,21.2


In [83]:
# Keep only the columns needed for the first-dose analysis: the report date,
# the county/state identifiers, and the dose-1 recipient count and percentage.
#
# .copy() makes vaccines_df an independent DataFrame instead of a slice of
# `vaccines`; the following cells rename and reassign its columns, which on a
# slice triggers pandas' SettingWithCopyWarning and is not guaranteed to stick.
vaccines_df = vaccines[[
    'Date',
    'Recip_County',
    'Recip_State',
    'Administered_Dose1_Recip',
    'Administered_Dose1_Pop_Pct',
]].copy()
vaccines_df.head()

Unnamed: 0,Date,Recip_County,Recip_State,Administered_Dose1_Recip,Administered_Dose1_Pop_Pct
0,05/10/2023,Washburn County,WI,11123.0,70.8
1,05/10/2023,Taylor County,IA,3149.0,51.4
2,05/10/2023,Nassau County,NY,1391226.0,95.0
3,05/10/2023,Lampasas County,TX,11678.0,54.5
4,05/10/2023,Saginaw County,MI,104075.0,54.6


In [85]:
# Rename columns to shorter names.
# Note: the "Administered_Dose_*" names refer to the dose-1 source columns
# (Administered_Dose1_Recip / Administered_Dose1_Pop_Pct).
#
# The result is reassigned rather than renamed with inplace=True: vaccines_df
# was produced by selecting columns from `vaccines`, and mutating such a
# selection in place is what pandas' SettingWithCopyWarning warns about.
vaccines_df = vaccines_df.rename(columns={
    'Recip_County': 'County',
    'Recip_State': 'State',
    'Administered_Dose1_Recip': 'Administered_Dose_Count',
    'Administered_Dose1_Pop_Pct': 'Administered_Dose_Percent'
    })
vaccines_df.head()

Unnamed: 0,Date,County,State,Administered_Dose_Count,Administered_Dose_Percent
0,05/10/2023,Washburn County,WI,11123.0,70.8
1,05/10/2023,Taylor County,IA,3149.0,51.4
2,05/10/2023,Nassau County,NY,1391226.0,95.0
3,05/10/2023,Lampasas County,TX,11678.0,54.5
4,05/10/2023,Saginaw County,MI,104075.0,54.6


In [87]:
# Convert 'Date' to datetime format.
# The source values are MM/DD/YYYY strings (e.g. 05/10/2023); passing the format
# explicitly removes any day/month ambiguity and spares pandas from inferring
# the format across the whole column.
# assign() returns a new frame with 'Date' replaced in its original position,
# so no column is set on a frame that may be a slice of `vaccines`.
vaccines_df = vaccines_df.assign(
    Date=pd.to_datetime(vaccines_df['Date'], format='%m/%d/%Y')
)

In [89]:
# Check format: display each column's dtype to confirm that 'Date' is now
# datetime64[ns] rather than object (string).
vaccines_df.dtypes

Date                         datetime64[ns]
County                               object
State                                object
Administered_Dose_Count             float64
Administered_Dose_Percent           float64
dtype: object

In [103]:
# Summary statistics for the two dose columns.
# A `min` of 0 reveals zero-valued entries; `count` excludes NaN, so a count
# that differs between the columns reveals missing values as well.
print(
    vaccines_df.loc[:, ['Administered_Dose_Count', 'Administered_Dose_Percent']].describe()
)

       Administered_Dose_Count  Administered_Dose_Percent
count             1.904494e+06               1.938696e+06
mean              5.574474e+04               3.950979e+01
std               2.195632e+05               2.494104e+01
min               0.000000e+00               0.000000e+00
25%               2.630000e+03               1.930000e+01
50%               9.073500e+03               4.360000e+01
75%               2.889000e+04               5.700000e+01
max               8.444862e+06               1.000000e+02


In [105]:
# Filter for non-zeroes: keep a row when at least one of the two dose metrics
# is strictly positive. Comparisons against NaN evaluate to False, so a row with
# a missing value in one column is kept only if the other column is positive.
# NOTE(review): because the conditions are OR-ed, a row with a zero in one
# column and a positive value in the other is retained — switch to `&` if the
# intent is to require both metrics to be non-zero.
#
# .copy() gives an independent frame; the next cell adds a 'Month' column to it,
# which would otherwise raise pandas' SettingWithCopyWarning.
filtered_doses_df = vaccines_df[
    (vaccines_df['Administered_Dose_Count'] > 0)
    | (vaccines_df['Administered_Dose_Percent'] > 0)
].copy()

In [107]:
# Create a new column for the month (a pandas monthly Period, e.g. 2020-12).
# assign() builds a new frame instead of setting a column on the result of a
# boolean filter, which avoids pandas' SettingWithCopyWarning and keeps the
# cell safe to re-run.
filtered_doses_df = filtered_doses_df.assign(
    Month=filtered_doses_df['Date'].dt.to_period('M')
)

# Group by Month, County, and State, then calculate the average of every
# remaining column for each group.
# NOTE(review): 'Date' is one of the remaining columns, so the result also
# carries the mean timestamp of each group — drop it before saving if it is
# not wanted downstream.
monthly_avg = (
    filtered_doses_df
    .groupby(['Month', 'County', 'State'])
    .mean()
    .reset_index()
)

In [109]:
# Preview the first five rows of the monthly averages.
monthly_avg.head(n=5)

Unnamed: 0,Month,County,State,Date,Administered_Dose_Count,Administered_Dose_Percent
0,2020-12,Brevard County,FL,2020-12-30 00:00:00,5632.0,0.933333
1,2020-12,Broward County,FL,2020-12-29 12:00:00,17868.75,0.925
2,2020-12,Collier County,FL,2020-12-30 12:00:00,2520.5,0.65
3,2020-12,Cook County,IL,2020-12-27 12:00:00,43518.0,0.8375
4,2020-12,Davidson County,TN,2020-12-30 12:00:00,12752.5,1.85


In [111]:
# Write the monthly averages to disk as CSV, omitting the row index.
monthly_avg.to_csv(path_or_buf='Data/Vaccine_Data.csv', index=False)