In [1]:
## Data Cleaning
# COVID Vaccination Rates

In [2]:
# Dependencies
import pandas as pd
import io

In [3]:
# Read in data
from google.colab import files
uploaded = files.upload()

Saving COVID-19_Vaccinations_Data.csv to COVID-19_Vaccinations_Data.csv


In [4]:
# Read in CSV
vaccines = pd.read_csv(io.BytesIO(uploaded['COVID-19_Vaccinations_Data.csv']), low_memory=False)
vaccines.head()

Unnamed: 0,Date,FIPS,MMWR_week,Recip_County,Recip_State,Completeness_pct,Administered_Dose1_Recip,Administered_Dose1_Pop_Pct,Administered_Dose1_Recip_5Plus,Administered_Dose1_Recip_5PlusPop_Pct,...,Census2019_18PlusPop,Census2019_65PlusPop,Bivalent_Booster_5Plus,Bivalent_Booster_5Plus_Pop_Pct,Bivalent_Booster_12Plus,Bivalent_Booster_12Plus_Pop_Pct,Bivalent_Booster_18Plus,Bivalent_Booster_18Plus_Pop_Pct,Bivalent_Booster_65Plus,Bivalent_Booster_65Plus_Pop_Pct
0,05/10/2023,55129,19,Washburn County,WI,96.7,11123.0,70.8,11097.0,73.9,...,12758.0,4304.0,3978.0,26.5,3955.0,28.6,3899.0,30.6,2588.0,60.1
1,05/10/2023,19173,19,Taylor County,IA,97.3,3149.0,51.4,3145.0,55.0,...,4687.0,1392.0,841.0,14.7,841.0,16.3,834.0,17.8,588.0,42.2
2,05/10/2023,36059,19,Nassau County,NY,97.5,1391226.0,95.0,1384503.0,95.0,...,1065968.0,246690.0,236202.0,18.4,231748.0,19.8,224931.0,21.1,109947.0,44.6
3,05/10/2023,48281,19,Lampasas County,TX,98.9,11678.0,54.5,11660.0,57.4,...,16799.0,4322.0,1951.0,9.6,1947.0,10.5,1925.0,11.5,1229.0,28.4
4,05/10/2023,26145,19,Saginaw County,MI,94.0,104075.0,54.6,,,...,149676.0,37414.0,15618.0,8.7,15448.0,9.4,15112.0,10.1,7921.0,21.2


In [5]:
# Select relevant columns
vaccines_df = vaccines[['Date', 'Recip_County', 'Recip_State', 'Administered_Dose1_Recip']]
vaccines_df.head()

Unnamed: 0,Date,Recip_County,Recip_State,Administered_Dose1_Recip
0,05/10/2023,Washburn County,WI,11123.0
1,05/10/2023,Taylor County,IA,3149.0
2,05/10/2023,Nassau County,NY,1391226.0
3,05/10/2023,Lampasas County,TX,11678.0
4,05/10/2023,Saginaw County,MI,104075.0


In [6]:
# Rename columns
vaccines_df.rename(columns={
    'Recip_County': 'County',
    'Recip_State': 'State',
    'Administered_Dose1_Recip': 'Administered_Dose_Count'
    }, inplace=True)
vaccines_df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  vaccines_df.rename(columns={


Unnamed: 0,Date,County,State,Administered_Dose_Count
0,05/10/2023,Washburn County,WI,11123.0
1,05/10/2023,Taylor County,IA,3149.0
2,05/10/2023,Nassau County,NY,1391226.0
3,05/10/2023,Lampasas County,TX,11678.0
4,05/10/2023,Saginaw County,MI,104075.0


In [7]:
# Convert 'Date' to datetime format
vaccines_df['Date'] = pd.to_datetime(vaccines_df['Date'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  vaccines_df['Date'] = pd.to_datetime(vaccines_df['Date'])


In [8]:
# Check format
vaccines_df.dtypes

Unnamed: 0,0
Date,datetime64[ns]
County,object
State,object
Administered_Dose_Count,float64


In [9]:
# Check to see if any values are blank (zeroes)
print(vaccines_df[['Administered_Dose_Count']].describe())

       Administered_Dose_Count
count             1.904494e+06
mean              5.574474e+04
std               2.195632e+05
min               0.000000e+00
25%               2.630000e+03
50%               9.073500e+03
75%               2.889000e+04
max               8.444862e+06


In [10]:
# Filter for non-zeroes
filtered_doses_df = vaccines_df[(vaccines_df['Administered_Dose_Count'] > 0)]

In [12]:
filtered_doses_df.head()

Unnamed: 0,Date,County,State,Administered_Dose_Count
0,2023-05-10,Washburn County,WI,11123.0
1,2023-05-10,Taylor County,IA,3149.0
2,2023-05-10,Nassau County,NY,1391226.0
3,2023-05-10,Lampasas County,TX,11678.0
4,2023-05-10,Saginaw County,MI,104075.0


In [13]:
# Create a new column for the month
filtered_doses_df['Year'] = filtered_doses_df['Date'].dt.year

# Group by Month, Recip_County, and Recip_State, then calculate the total count of vaccinated recipients
# monthly_avg = filtered_doses_df.groupby(['Month', 'County', 'State']).max().reset_index()
yearly_max = filtered_doses_df.groupby(['Year', 'County', 'State'])['Administered_Dose_Count'].max().reset_index()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_doses_df['Year'] = filtered_doses_df['Date'].dt.year


In [16]:
# View data frame
yearly_max.head()

Unnamed: 0,Year,County,State,Administered_Dose_Count
0,2020,Brevard County,FL,6266.0
1,2020,Broward County,FL,21075.0
2,2020,Collier County,FL,3110.0
3,2020,Cook County,IL,56411.0
4,2020,Davidson County,TN,13706.0


In [18]:
# Assuming df is your DataFrame
yearly_max.to_csv('Vaccine_Data.csv', index=False)

# Download the CSV file
files.download('Vaccine_Data.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>