# Which COVID-19 Vaccine is Best for You?
## DS4A Team 55 Preliminary EDA


In [1]:
import base64
import math
import os
import zipfile

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sodapy import Socrata

In [2]:
# Render every figure at 300 DPI so plots come out larger and sharper.
import matplotlib as mpl
mpl.rcParams.update({"figure.dpi": 300})

## state_vax.csv
Vaccine administration data for U.S. states and the District of Columbia, from 12/13/2020 to the present, updated daily.

In [3]:
# Download the state vaccination dataset ("unsk-b7fc") from data.cdc.gov.
# The app token is read from the SOCRATA_APP_TOKEN environment variable so the
# credential is not committed with the notebook.
# NOTE(review): when the variable is unset, None is passed as the token --
# confirm the anonymous (throttled) access this implies is acceptable.
client = Socrata(
    domain="data.cdc.gov",
    app_token=os.environ.get("SOCRATA_APP_TOKEN"),
    timeout=10,
)
results = client.get_all("unsk-b7fc")

# convert to pandas DataFrame
df = pd.DataFrame.from_records(results)

# fixed seed so the preview rows are the same on every run
df.sample(10, random_state=42)

Unnamed: 0,date,mmwr_week,location,distributed,distributed_janssen,distributed_moderna,distributed_pfizer,distributed_unk_manuf,dist_per_100k,distributed_per_100k_12plus,...,administered_dose1_recip_5pluspop_pct,series_complete_5plus,series_complete_5pluspop_pct,administered_5plus,admin_per_100k_5plus,distributed_per_100k_5plus,series_complete_moderna_5plus,series_complete_pfizer_5plus,series_complete_janssen_5plus,series_complete_unk_manuf_5plus
13320,2021-07-07T00:00:00.000,27,RI,1515025,61600,617460,835965,0,143013,163347,...,0.0,0,0.0,0,0,0,,,,
21002,2021-03-11T00:00:00.000,10,PA,5157695,143900,2646300,2367495,0,40288,0,...,0.0,0,0.0,0,0,0,,,,
1654,2022-01-04T00:00:00.000,1,IA,5658495,279800,2121240,3257455,0,179346,211431,...,69.5,1866260,63.1,4662129,157534,191202,696085.0,1012798.0,157096.0,281.0
25261,2021-01-04T00:00:00.000,1,WA,418100,0,0,0,0,5491,0,...,0.0,0,0.0,0,0,0,,,,
20630,2021-03-17T00:00:00.000,11,AK,499625,9300,237800,252525,0,68297,0,...,0.0,0,0.0,0,0,0,,,,
12277,2021-07-23T00:00:00.000,29,OH,12709325,655500,5357180,6696645,0,108728,127084,...,0.0,0,0.0,0,0,0,,,,
13434,2021-07-05T00:00:00.000,27,FL,25002165,1694500,10155500,13152165,0,116410,133730,...,0.0,0,0.0,0,0,0,,,,
167,2022-01-27T00:00:00.000,4,CT,7984815,336700,3031040,4617075,0,223960,257081,...,95.0,2722384,80.5,7063236,208751,235987,927336.0,1567517.0,227092.0,439.0
11942,2021-07-28T00:00:00.000,30,AR,2968260,178900,1352240,1437120,0,98358,116072,...,0.0,0,0.0,0,0,0,,,,
4865,2021-11-14T00:00:00.000,46,MH,65910,11700,47700,6510,0,84782,114888,...,35.6,21506,31.1,44501,64310,95248,18482.0,753.0,2270.0,1.0


In [47]:
# Preview the frame ordered from the earliest to the latest date.
# NOTE(review): this cell carries execution count 47 and its saved output shows
# column names that only exist after the rename cell further down -- it looks
# like it was run out of order; confirm it still belongs at this position.
df.sort_values("date")

Unnamed: 0,date,state,admin_12plus,admin_18plus,admin_65plus,admin_janssen,admin_moderna,admin_pfizer,admin_unk,admin_per_100k,...,additional_doses,additional_doses_vax_pct,additional_doses_18plus,additional_doses_18plus_vax_pct,additional_doses_65plus,additional_doses_65plus_vax_pct,additional_doses_moderna,additional_doses_pfizer,additional_doses_janssen,additional_doses_unk_manuf
26641,2020-12-14,ID,0,0,0,0,0,0,0,0,...,0,0.0,0,0.0,0,0.0,0,0,0,0
26607,2020-12-14,DC,0,0,0,0,0,0,0,0,...,0,0.0,0,0.0,0,0.0,0,0,0,0
26606,2020-12-14,WA,0,0,0,0,0,0,0,0,...,0,0.0,0,0.0,0,0.0,0,0,0,0
26605,2020-12-14,MD,0,0,0,0,0,0,0,0,...,0,0.0,0,0.0,0,0.0,0,0,0,0
26603,2020-12-14,WY,0,0,0,0,0,0,0,0,...,0,0.0,0,0.0,0,0.0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38,2022-01-29,NC,15280397,14370812,4197440,502124,5869748,9328269,918,149704,...,1459952,23.9,1425205,25.6,561289,37.6,644258,789929,25748,17
37,2022-01-29,MI,14308849,13527676,4234979,455337,5740554,8489547,2048,147068,...,2795353,48.1,2733582,51.5,1102711,71.9,1224362,1526512,44394,85
36,2022-01-29,NJ,15522639,14461353,3626978,552989,6084931,9364673,1492,180182,...,2703990,41.9,2622000,45.3,851946,63.8,1183220,1464654,56074,42
34,2022-01-29,GA,13719253,12890570,3377194,328603,5504584,8143312,36712,131983,...,1874057,33.4,1837097,35.7,656243,52.9,846122,1004435,22310,1190


In [4]:
# df.shape

In [5]:
# Columns discarded before analysis (rationale per the original author):
#   - every distribution column
#   - "administered": the total for 1st dose and complete series
#   - all 5plus columns: very recently approved and sporadic across states
#   - additional doses for 50plus: the age group doesn't show up elsewhere in the dataset
df = df.drop(
    columns=[
        # week number and distribution figures
        "mmwr_week",
        "distributed",
        "distributed_janssen",
        "distributed_moderna",
        "distributed_pfizer",
        "distributed_unk_manuf",
        "dist_per_100k",
        "distributed_per_100k_12plus",
        "distributed_per_100k_18plus",
        "distributed_per_100k_65plus",
        "distributed_per_100k_5plus",
        # overall administered totals
        "administered",
        "recip_administered",
        # 5plus age group
        "administered_dose1_recip_5plus",
        "administered_dose1_recip_5pluspop_pct",
        "series_complete_5plus",
        "series_complete_5pluspop_pct",
        "administered_5plus",
        "admin_per_100k_5plus",
        "series_complete_moderna_5plus",
        "series_complete_pfizer_5plus",
        "series_complete_janssen_5plus",
        "series_complete_unk_manuf_5plus",
        # 50plus additional doses
        "additional_doses_50plus",
        "additional_doses_50plus_vax_pct",
    ]
)

In [6]:
# Shorten the column names for readability.
# The API labels some columns differently from the CSV downloaded directly from
# data.cdc.gov; the values match up, so those columns are renamed as well.
# NOTE(review): the "..._12pluspop" and "..._12pluspop_pct" spellings (likewise
# 18/65) map to the same target name -- if both spellings were ever present at
# once the result would contain duplicate column labels; confirm only one is.
rename_cols = {
    # location
    "location": "state",
    # doses administered, by age group and manufacturer
    "administered_12plus": "admin_12plus",
    "administered_18plus": "admin_18plus",
    "administered_65plus": "admin_65plus",
    "administered_janssen": "admin_janssen",
    "administered_moderna": "admin_moderna",
    "administered_pfizer": "admin_pfizer",
    "administered_unk_manuf": "admin_unk",
    # first-dose recipients
    "administered_dose1_recip": "admin_dose1",
    "administered_dose1_pop_pct": "admin_dose1_pop_pct",
    "administered_dose1_recip_1": "admin_dose1_12plus",
    "administered_dose1_recip_2": "admin_dose1_12plus_pop_pct",
    "administered_dose1_recip_3": "admin_dose1_18plus",
    "administered_dose1_recip_4": "admin_dose1_18plus_pop_pct",
    "administered_dose1_recip_5": "admin_dose1_65plus",
    "administered_dose1_recip_6": "admin_dose1_65plus_pop_pct",
    # completed series
    "series_complete_yes": "series_complete",
    "series_complete_12pluspop": "series_complete_12plus_pop_pct",
    "series_complete_18pluspop": "series_complete_18plus_pop_pct",
    "series_complete_65pluspop": "series_complete_65plus_pop_pct",
    "series_complete_12pluspop_pct": "series_complete_12plus_pop_pct",
    "series_complete_18pluspop_pct": "series_complete_18plus_pop_pct",
    "series_complete_65pluspop_pct": "series_complete_65plus_pop_pct",
    # completed series, unknown manufacturer
    "series_complete_unk_manuf": "series_complete_unk",
    "series_complete_unk_manuf_1": "series_complete_unk_12plus",
    "series_complete_unk_manuf_2": "series_complete_unk_18plus",
    "series_complete_unk_manuf_3": "series_complete_unk_65plus",
}
df = df.rename(columns=rename_cols)

In [7]:
# df.columns

In [8]:
# see data types of df
# all types = object, need to change to ints and floats
# df.dtypes

In [9]:
# List the columns that contain at least one missing value.
# Per the original investigation, the NaNs sit in the additional-dose columns
# and occur when the other additional-dose columns are 0.
df.columns[df.isna().any()]

Index(['additional_doses', 'additional_doses_vax_pct',
       'additional_doses_18plus', 'additional_doses_18plus_vax_pct',
       'additional_doses_65plus', 'additional_doses_65plus_vax_pct',
       'additional_doses_moderna', 'additional_doses_pfizer',
       'additional_doses_janssen', 'additional_doses_unk_manuf'],
      dtype='object')

In [10]:
# Missing additional-dose values are filled with 0.
nan_col = [
    "additional_doses",
    "additional_doses_vax_pct",
    "additional_doses_18plus",
    "additional_doses_18plus_vax_pct",
    "additional_doses_65plus",
    "additional_doses_65plus_vax_pct",
    "additional_doses_moderna",
    "additional_doses_pfizer",
    "additional_doses_janssen",
    "additional_doses_unk_manuf",
]
df[nan_col] = df[nan_col].fillna(0)

In [11]:
# Cast the count columns to integers and the percentage columns to floats.
# (An earlier cell notes that every column arrives from the API as object dtype.)
to_int = [
    # doses administered
    "admin_12plus", "admin_18plus", "admin_65plus",
    "admin_janssen", "admin_moderna", "admin_pfizer", "admin_unk",
    "admin_per_100k", "admin_per_100k_12plus", "admin_per_100k_18plus", "admin_per_100k_65plus",
    # first-dose recipients
    "admin_dose1", "admin_dose1_12plus", "admin_dose1_18plus", "admin_dose1_65plus",
    # completed series
    "series_complete_12plus", "series_complete_18plus", "series_complete", "series_complete_65plus",
    "series_complete_janssen", "series_complete_moderna", "series_complete_pfizer", "series_complete_unk",
    "series_complete_janssen_12plus", "series_complete_moderna_12plus",
    "series_complete_pfizer_12plus", "series_complete_unk_12plus",
    "series_complete_janssen_18plus", "series_complete_moderna_18plus",
    "series_complete_pfizer_18plus", "series_complete_unk_18plus",
    "series_complete_janssen_65plus", "series_complete_moderna_65plus",
    "series_complete_pfizer_65plus", "series_complete_unk_65plus",
    # additional doses
    "additional_doses", "additional_doses_18plus", "additional_doses_65plus",
    "additional_doses_moderna", "additional_doses_pfizer",
    "additional_doses_janssen", "additional_doses_unk_manuf",
]

# Each percentage column is listed exactly once: a repeated label would make
# the column selection carry duplicate column names.
to_float = [
    "admin_dose1_pop_pct", "admin_dose1_12plus_pop_pct",
    "admin_dose1_18plus_pop_pct", "admin_dose1_65plus_pop_pct",
    "series_complete_pop_pct", "series_complete_12plus_pop_pct",
    "series_complete_18plus_pop_pct", "series_complete_65plus_pop_pct",
    "additional_doses_vax_pct", "additional_doses_18plus_vax_pct",
    "additional_doses_65plus_vax_pct",
]

# One astype call with a column -> dtype mapping; a column missing from df
# raises KeyError, exactly as selecting it by label would.
df = df.astype({**dict.fromkeys(to_int, "int"), **dict.fromkeys(to_float, "float")})

In [12]:
# checking that data types have been converted correctly
# df.dtypes

In [13]:
# Keep only the 50 states plus DC: every other location code in the data,
# including the national "US" total, is removed.
drop_location = ["AS", "BP2", "DD2", "FM", "GU",
                 "IH2", "MH", "MP", "PR", "RP",
                 "US", "VA2", "VI", "LTC"]

# .copy() makes the filtered frame independent of the original, so later
# column assignments on df do not trigger pandas' chained-assignment warning.
df = df[~df["state"].isin(drop_location)].copy()

# checking that location vals = 51, including DC
df.state.nunique()

51

In [14]:
# Parse the "date" column into pandas datetimes.
df["date"] = df["date"].pipe(pd.to_datetime)

In [15]:
# Keep only the rows reported for New York.
df_ny = df[df["state"].eq("NY")]

In [16]:
# Build a frame of day-over-day changes instead of running totals.
# diff(periods=-1) subtracts the NEXT row from each row, so the rows must be
# ordered newest-first for the result to be "this day minus the previous day".
# The order is enforced here instead of relying on the order the API returned.
df_ny_daily = (
    df_ny.drop(columns="state")
    .set_index("date")
    .sort_index(ascending=False)
    .diff(periods=-1)
)
df_ny_daily.head(50)

Unnamed: 0_level_0,admin_12plus,admin_18plus,admin_65plus,admin_janssen,admin_moderna,admin_pfizer,admin_unk,admin_per_100k,admin_per_100k_12plus,admin_per_100k_18plus,...,additional_doses,additional_doses_vax_pct,additional_doses_18plus,additional_doses_18plus_vax_pct,additional_doses_65plus,additional_doses_65plus_vax_pct,additional_doses_moderna,additional_doses_pfizer,additional_doses_janssen,additional_doses_unk_manuf
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-01-29,61485.0,55269.0,7659.0,1403.0,20333.0,47749.0,117.0,357.0,366.0,359.0,...,35979.0,0.2,32669.0,0.2,5271.0,0.2,13211.0,22050.0,712.0,6.0
2022-01-28,58849.0,53267.0,7770.0,1242.0,20191.0,45159.0,90.0,343.0,351.0,345.0,...,33572.0,0.2,30765.0,0.2,5327.0,0.2,12986.0,20009.0,569.0,8.0
2022-01-27,56316.0,50669.0,7246.0,1249.0,18474.0,44217.0,102.0,329.0,336.0,328.0,...,31769.0,0.2,28954.0,0.2,4967.0,0.1,12033.0,19226.0,507.0,3.0
2022-01-26,66938.0,59658.0,8305.0,1499.0,22891.0,50887.0,71.0,388.0,399.0,387.0,...,38505.0,0.2,34484.0,0.2,5864.0,0.2,14411.0,23470.0,620.0,4.0
2022-01-25,42522.0,38668.0,5631.0,1076.0,15599.0,30033.0,64.0,240.0,254.0,251.0,...,23604.0,0.1,21547.0,0.2,3730.0,0.1,9704.0,13407.0,490.0,3.0
2022-01-24,40555.0,33767.0,3418.0,504.0,13459.0,31291.0,10.0,233.0,241.0,219.0,...,26433.0,0.2,21501.0,0.1,2487.0,0.1,9295.0,16897.0,241.0,0.0
2022-01-23,59130.0,49305.0,5272.0,1078.0,20226.0,48412.0,27.0,358.0,353.0,319.0,...,39135.0,0.2,32453.0,0.3,3893.0,0.1,14350.0,24278.0,505.0,2.0
2022-01-22,69000.0,61743.0,8104.0,1563.0,24917.0,51470.0,122.0,402.0,411.0,401.0,...,41549.0,0.3,37577.0,0.2,5844.0,0.2,16647.0,24168.0,733.0,1.0
2022-01-21,69283.0,62101.0,8413.0,1741.0,24804.0,52842.0,96.0,408.0,413.0,402.0,...,40908.0,0.2,37008.0,0.3,6024.0,0.2,16351.0,23825.0,731.0,1.0
2022-01-20,76876.0,69401.0,9547.0,1509.0,26537.0,58824.0,130.0,448.0,459.0,450.0,...,44969.0,0.3,40904.0,0.3,6843.0,0.2,17486.0,26694.0,784.0,5.0


In [17]:
# Largest single-row change in Pfizer doses. It looks like an outlier, but the
# original author recomputed it by hand from the raw dataset and found it correct.
print(df_ny_daily.loc[:, "admin_pfizer"].max())

581745.0


In [18]:
# One row per calendar month, to align with how the other datasets are reported.
# Rows are grouped by their "YYYY-MM" label and tail(1) keeps the last row of
# each group in the frame's current row order. With the newest-first order seen
# in the outputs above, that row is the earliest reported date of the month.
df_ny_m = df_ny.set_index("date")
df_ny_m = (
    df_ny_m.groupby(df_ny_m.index.strftime("%Y-%m"))
    .tail(1)
    .reset_index()
    .sort_values("date")
)
# df_ny_m.head(10)

In [19]:
# Earliest and latest dates present in the New York vaccination data
# (the source is updated daily).
print(df_ny["date"].min(), df_ny["date"].max(), sep="\n")

2020-12-14 00:00:00
2022-01-29 00:00:00


# covid_cases
COVID-19 case records for four states: NY, CA, FL, and TX

In [20]:
# Read the case-surveillance CSV straight out of its zip archive.
# low_memory=False makes pandas read the file in one pass rather than in chunks.
with zipfile.ZipFile("data/covid_cases_4states.csv.zip") as z, \
        z.open("covid_cases_4states.csv") as f:
    covid_df = pd.read_csv(f, low_memory=False)

covid_df.head()

Unnamed: 0,case_month,res_state,state_fips_code,res_county,county_fips_code,age_group,sex,race,ethnicity,case_positive_specimen_interval,case_onset_interval,process,exposure_yn,current_status,symptom_status,hosp_yn,icu_yn,death_yn,underlying_conditions_yn
0,2021-10,NY,36,CLINTON,36019.0,65+ years,Female,Missing,,,,Missing,Missing,Laboratory-confirmed case,Missing,Missing,Missing,No,
1,2021-07,FL,12,BAKER,12003.0,50 to 64 years,Female,,,,0.0,Missing,Missing,Laboratory-confirmed case,Symptomatic,No,Missing,No,
2,2020-09,NY,36,STEUBEN,36101.0,50 to 64 years,Female,Missing,,0.0,,Missing,Missing,Laboratory-confirmed case,Missing,Missing,Missing,No,
3,2021-02,FL,12,MANATEE,12081.0,18 to 49 years,Female,Unknown,Hispanic/Latino,,,Missing,Missing,Laboratory-confirmed case,Missing,Missing,Missing,Missing,
4,2021-03,FL,12,MARION,12083.0,50 to 64 years,Female,Unknown,Hispanic/Latino,,,Missing,Missing,Laboratory-confirmed case,Missing,Missing,Missing,Missing,


In [21]:
covid_df.columns  # display the column names of the case-surveillance frame

Index(['case_month', 'res_state', 'state_fips_code', 'res_county',
       'county_fips_code', 'age_group', 'sex', 'race', 'ethnicity',
       'case_positive_specimen_interval', 'case_onset_interval', 'process',
       'exposure_yn', 'current_status', 'symptom_status', 'hosp_yn', 'icu_yn',
       'death_yn', 'underlying_conditions_yn'],
      dtype='object')

In [22]:
# covid_df.dtypes

In [23]:
# (rows, columns) of the case-surveillance frame
covid_df.shape

(11685325, 19)

In [24]:
# Data cleaning: parse "case_month" into pandas datetimes.
covid_df["case_month"] = pd.to_datetime(covid_df.case_month)

In [25]:
# Discard the columns that are not used in this analysis.
covid_df = covid_df.drop(
    columns=[
        "case_positive_specimen_interval",
        "case_onset_interval",
        "process",
        "exposure_yn",
        "current_status",
    ]
)

In [26]:
# Rename columns for readability and so the names are uniform across datasets.
covid_rename = {
    "case_month": "date",
    "res_state": "state",
    "res_county": "county",
}
covid_df = covid_df.rename(columns=covid_rename)

In [27]:
# Convert the categorical text columns to strings.
to_str = [
    "state", "county",
    "age_group", "sex", "race", "ethnicity",
    "symptom_status", "hosp_yn", "icu_yn", "death_yn",
    "underlying_conditions_yn",
]
covid_df[to_str] = covid_df[to_str].astype("str")

In [28]:
# List the columns that still contain at least one missing value.
covid_df.columns[covid_df.isna().any()]

Index(['county_fips_code'], dtype='object')

In [29]:
# Collapse every "no information" placeholder into the single label "Not Available".

# county_fips_code still holds real NaNs; fill them directly.
covid_df["county_fips_code"] = covid_df["county_fips_code"].fillna("Not Available")

# Placeholder spellings and the label each one is replaced with.
rename_covid_nan = {
    "nan": "Not Available",
    "Missing": "Not Available",
    "Unknown": "Not Available",
}
rename_missing_unk = {
    "Missing": "Not Available",
    "Unknown": "Not Available",
}

# Columns where "nan", "Missing" and "Unknown" are all replaced.
for col in ["race", "ethnicity", "sex", "death_yn", "underlying_conditions_yn"]:
    covid_df[col] = covid_df[col].replace(rename_covid_nan).astype(str)

# county: only the "nan" string is replaced.
covid_df["county"] = covid_df["county"].replace({"nan": "Not Available"})

# age_group: "nan" and "Missing" are replaced.
covid_df["age_group"] = (
    covid_df["age_group"]
    .replace({"nan": "Not Available", "Missing": "Not Available"})
    .astype(str)
)

# Remaining status columns: "Missing" and "Unknown" are replaced.
rename_col = ["symptom_status", "hosp_yn", "icu_yn"]
covid_df[rename_col] = covid_df[rename_col].replace(rename_missing_unk).astype(str)

In [30]:
# Show ten rows of the cleaned frame, chosen with a fixed seed.
covid_df.sample(n=10, random_state=42)

Unnamed: 0,date,state,state_fips_code,county,county_fips_code,age_group,sex,race,ethnicity,symptom_status,hosp_yn,icu_yn,death_yn,underlying_conditions_yn
11639060,2021-12-01,TX,48,Not Available,Not Available,18 to 49 years,Male,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available
7365273,2021-01-01,CA,6,KERN,6029.0,18 to 49 years,Female,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available
2826409,2020-05-01,CA,6,MONTEREY,6053.0,18 to 49 years,Male,White,Hispanic/Latino,Symptomatic,No,Not Available,Not Available,Not Available
3894756,2021-08-01,NY,36,ORANGE,36071.0,18 to 49 years,Female,Not Available,Hispanic/Latino,Symptomatic,No,Not Available,No,Not Available
8602615,2021-08-01,NY,36,SUFFOLK,36103.0,65+ years,Male,Not Available,Not Available,Not Available,Not Available,Not Available,No,Not Available
10347080,2021-09-01,FL,12,MIAMI-DADE,12086.0,65+ years,Female,White,Hispanic/Latino,Not Available,No,Not Available,No,Not Available
5423655,2021-09-01,NY,36,MONROE,36055.0,0 - 17 years,Female,Black,Non-Hispanic/Latino,Not Available,Not Available,Not Available,No,Not Available
5897489,2020-12-01,CA,6,ALAMEDA,6001.0,18 to 49 years,Male,Multiple/Other,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available
9556751,2021-01-01,NY,36,QUEENS,36081.0,18 to 49 years,Female,White,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available
1140622,2021-09-01,NY,36,CHAUTAUQUA,36013.0,18 to 49 years,Female,Not Available,Not Available,Not Available,Not Available,Not Available,No,Not Available


In [31]:
# covid_df.describe(include = 'all',datetime_is_numeric=True) #describe and include string columns

In [32]:
#covid cases trend
# trend_df = covid_df.groupby(['date']).size().reset_index(name = 'case_count').set_index('date')
# trend_df['case_count_per_100k'] = trend_df['case_count']/100000
# plt.plot(trend_df.index,trend_df['case_count_per_100k'])
# plt.title('COVID-19 Cases Have Spiked Twice And Is Trending Upwards')
# plt.xlabel('Date')
# plt.ylabel('Case Count (Hundred Thousand)')
# plt.xticks(rotation=90)
# plt.show()

In [34]:
# trend_df.head()

In [35]:
# comparing cases with vaccine administration rates
# case_vax_ax = ny_m_plt.plot(x="date", y=["admin_per_100k_12plus", "admin_per_100k_18plus", "admin_per_100k_65plus"], kind="line")
# trend_df.plot(y="case_count", kind="line", ax=case_vax_ax, title="COVID-19 Cases and Vaccinations per Age Group");                            

When comparing the COVID case trends with the trends in vaccine administration across each age group, it can be seen that the number of cases starts to decline sharply as the population receives the vaccine. The case_count trend line was already falling when the vaccines were made available. However, once more of the population was vaccinated, case numbers stayed low until the Omicron variant began spreading around November 2021 and through the time of writing (mid-January 2022).

In [36]:
# case_comp_ax = daily_plt.plot(x="date", y='series_complete', kind="line")
# trend_df.plot(y="case_count", kind="line", ax=case_comp_ax, title="COVID-19 Cases and Complete Vaccine Series");

In [37]:
# covid_65plus = covid_df.loc[covid_df["age_group"] == "65+ years"]


In [38]:
# df_covid65 = covid_65plus.groupby(['date']).size().reset_index(name = 'case_count').set_index('date')

# df_covid65.plot(title="Trend in COVID-19 Cases for Age 65+");

In [39]:
# covid65_ax = ny_m_plt.plot(x="date", y="admin_per_100k_65plus", kind="line")
# df_covid65.plot(y="case_count", kind="line", ax=covid65_ax, title="COVID-19 Cases and Vaccinations for Age 65+");

# Analyzing the Impact of Different Factors on Case Trends
### Applying multiple regression to a new data frame containing case counts for each state by month, vaccine administration per month for each manufacturer, and analyzing demographic factors

In [40]:
# Number of case records per (date, state) pair; each row of covid_df counts once.
covid_4state = (
    covid_df.groupby(["date", "state"])
    .size()
    .reset_index(name="cases")
)

# Keep only the rows dated after 2020-12-01.
covid_4state = covid_4state[covid_4state["date"] > "2020-12-01"]


In [41]:
# preview the first ten rows of the per-date, per-state case counts
covid_4state.head(10)

Unnamed: 0,date,state,cases
48,2021-01-01,CA,877185
49,2021-01-01,FL,207612
50,2021-01-01,NY,333399
51,2021-01-01,TX,5057
52,2021-02-01,CA,182380
53,2021-02-01,FL,165000
54,2021-02-01,NY,199327
55,2021-02-01,TX,2108
56,2021-03-01,CA,83365
57,2021-03-01,FL,330204


In [42]:
# Restrict the vaccination data to the four states present in the case data.
keep_state = ["NY", "CA", "FL", "TX"]
df_4state = df[df["state"].isin(keep_state)]


# df_4state_m = df_4state.set_index("date")
# df_4state_m = df_4state_m.groupby(df_4state_m.index.strftime("%Y-%m")).tail(1)
# df_4state_m = df_4state_m.reset_index().sort_values(by="date", ascending=True)
# manuf_4state_m = df_4state_m[["date","state", "admin_janssen", "admin_moderna", "admin_pfizer", "admin_unk"]]

In [43]:
# Rows dated on the first day of a month, ordered by date.
df_4state_m = df_4state.loc[df_4state["date"].dt.is_month_start].sort_values("date")

# Per-manufacturer administered-dose columns, plus the date/state keys.
# NOTE(review): an earlier cell derives daily values from these columns with
# diff(), which suggests they are running totals -- confirm that cumulative
# figures (not per-month doses) are what the later merge should carry.
manuf_4state_m = df_4state_m.loc[
    :, ["date", "state", "admin_janssen", "admin_moderna", "admin_pfizer", "admin_unk"]
]


In [44]:
# preview the first ten month-start rows of per-manufacturer dose counts
manuf_4state_m.head(10)

Unnamed: 0,date,state,admin_janssen,admin_moderna,admin_pfizer,admin_unk
25508,2021-01-01,TX,0,221764,301502,0
25477,2021-01-01,FL,0,104492,133296,21
25504,2021-01-01,CA,0,60934,313563,0
25495,2021-01-01,NY,0,82248,146240,11
23484,2021-02-01,CA,0,1600685,1852599,0
23482,2021-02-01,TX,0,1297536,1198696,0
23471,2021-02-01,FL,0,971820,1032236,1322
23464,2021-02-01,NY,0,943356,984098,211
21656,2021-03-01,FL,0,2509245,2533476,5239
21649,2021-03-01,TX,0,2828841,2743511,144


In [45]:
# Attach the month-start manufacturer dose counts to the case counts; the inner
# join keeps only (date, state) pairs present in both frames.
covid_4state = covid_4state.merge(manuf_4state_m, how="inner", on=["date", "state"])