In [1]:
import pandas as pd
import numpy as np
#import total people vaccinated dataset from OurWorldinData
sys_df = pd.read_csv("us-covid-19-total-people-vaccinated.csv")
#regions in the vaccination dataset that didn't match one of fifty United States
unwanted_regions = ["American Samoa", "Bureau of Prisons", "Dept of Defense", "Federated States of Micronesia", "Guam", "Long Term Care", "Indian Health Svc", "Marshall Islands", "Northern Mariana Islands", "Puerto Rico", "Republic of Palau", "Virgin Islands", "Veterans Health", "United States"]
clean_df = sys_df[sys_df['Entity'].isin(unwanted_regions) == False]

print("Scenario One: reaching herd immunity looking purely at vaccination")

us_pop = 332915073 #US population value from macrotrends

#Creates entry for each day that sums all vaccinations across the US
nt_df = clean_df.groupby('Day').sum()

nt_df = nt_df.reset_index()

nt_df_og_tail = nt_df.tail()

#sentiment analysis data was collected on April 28th and manually inputted into these variables

negative_tweet_percent = .4

positive_tweet_percent = .3

neutral_tweet_percent = .3

#prediction model loop
while nt_df.iloc[-1]['people_vaccinated'] < us_pop: 
    #grab last ten days of entries
    ten_days = nt_df.iloc[-9:]['people_vaccinated']

    #create array containing the differences in total US vaccinations between entries for the past ten days
    ten_days_inc = ten_days.diff()
    
    #average increase in the total number of people vaccinated in the past ten days
    ten_day_ave = np.mean(ten_days_inc)
    
    
    #sentiment penalty system that runs for the first month of predictions
    if (negative_tweet_percent > positive_tweet_percent) and (pd.to_datetime(nt_df.iloc[-1]['Day']) < pd.to_datetime('2021-05-28')): 
        ten_day_ave = ten_day_ave * 0.98

    #next day prediction = previous day value + average increase from past ten days
    next_day_val = nt_df.iloc[-1]['people_vaccinated'] + ten_day_ave
    
    #reformat next day date for new prediction entry
    next_day_date = (pd.to_datetime(nt_df.iloc[-1]['Day']) + pd.to_timedelta(np.ceil(1), unit="D")).date()

    nt_df = nt_df.append({'Day': next_day_date, 'people_vaccinated': next_day_val}, ignore_index=True)

#Create a new column containing the percent of the US vaccinated so far
nt_df['percent_vaccinated'] = nt_df['people_vaccinated'] / us_pop

nt_df.tail()

Scenario One: reaching herd immunity assuming that all people who get vaccinated includes all the people that had Covid in the past


Unnamed: 0,Day,people_vaccinated,percent_vaccinated
220,2021-08-24,328364900.0,0.986332
221,2021-08-25,329875900.0,0.990871
222,2021-08-26,331387000.0,0.99541
223,2021-08-27,332898100.0,0.999949
224,2021-08-28,334409200.0,1.004488


In [2]:
print("Looking at scenario one's dataset predictions, we can see that the country reaches 100% herd immunity on August 1st, 2021")

Looking at scenario one's dataset predictions, we can see that the country reaches 100% herd immunity on August 1st, 2021


In [3]:
#Data Setup
#people_vac_df: holds data for total people vaccinated in US from 2021-01-12 to 2021-04-04
#people_case_df: holds data for total people with covid in US from 2020-01-22 to 2021-04-09

#step 1: get both dataframes to line up on dates by slicing people_case_df to become 2021-01-12 to 2021-04-04
#step 2: create new dataframe holding data from both dataframes
#step 3: create new column that does people_vaccinated + .2 * people_with_covid for every row
#step 4: run predictions on this new combined column for more accurate prediction

In [4]:
print("Scenario Two: reaching herd immunity by looking at vaccinations AND covid cases")

us_pop = 332915073 #population value from macrotrends

#read in same vaccination dataset from earlier and a new covid cases dataset from the CDC
covid_cases = pd.read_csv('United_States_COVID-19_Cases_and_Deaths_by_State_over_Time.csv')
sys_df = pd.read_csv("us-covid-19-total-people-vaccinated.csv")
#remove entries that don't correspond to the fifty states
unwanted_regions = ["American Samoa", "Bureau of Prisons", "Dept of Defense", "Federated States of Micronesia", "Guam", "Long Term Care", "Indian Health Svc", "Marshall Islands", "Northern Mariana Islands", "Puerto Rico", "Republic of Palau", "Virgin Islands", "Veterans Health", "United States"]
clean_df = sys_df[sys_df['Entity'].isin(unwanted_regions) == False]

#create a single entry for each day containing the total number of Covid cases across the US
total_cases = covid_cases.groupby('submission_date').sum()
total_cases = total_cases.reset_index()
total_cases['submission_date'] = pd.to_datetime(total_cases['submission_date'])
total_cases = total_cases.sort_values(by="submission_date")

#filter covid cases dataset down to the matching dates of the vaccination dataset
start_date = '2021-01-12'
end_date = '2021-04-04'
mask = (total_cases['submission_date'] >= start_date) & (total_cases['submission_date'] <= end_date)
total_cases_df = total_cases[(total_cases['submission_date'] >= start_date) & (total_cases['submission_date'] <= end_date)]

total_cases_df = total_cases_df.rename(columns={"submission_date": "Day"})

total_vax = clean_df.groupby('Day').sum()

total_vax = total_vax.reset_index()

total_vax['Day'] = pd.to_datetime(total_vax['Day'])

#merge the total number of covid cases and people vaccinated for each day columns
combined_df = total_vax.merge(total_cases_df, on='Day')

combined_df = combined_df.sort_values(by="Day")

#create a new column containing the total number of Covid cases + the total number of vaccinations for each day
combined_df['adjusted_vax'] = combined_df['people_vaccinated'] + combined_df['tot_cases']

combined_df = combined_df[['Day','adjusted_vax']]

combined_df.tail()

#sentiment analysis data was collected on April 28th and manually inputted into these variables

negative_tweet_percent = .4

positive_tweet_percent = .3

neutral_tweet_percent = .3

#prediction loop works exactly the same as above scenario's model
while combined_df.iloc[-1]['adjusted_vax'] < us_pop: 
    ten_days = combined_df.iloc[-9:]['adjusted_vax']

    ten_days_inc = ten_days.diff()

    ten_day_ave = np.mean(ten_days_inc)
    
    if (negative_tweet_percent > positive_tweet_percent) and (pd.to_datetime(combined_df.iloc[-1]['Day']) < pd.to_datetime('2021-05-28')): 
        ten_day_ave = ten_day_ave * 0.98

    next_day_val = combined_df.iloc[-1]['adjusted_vax'] + ten_day_ave

    next_day_date = (pd.to_datetime(combined_df.iloc[-1]['Day']) + pd.to_timedelta(np.ceil(1), unit="D")).date()
        
    combined_df = combined_df.append({'Day': next_day_date, 'adjusted_vax': next_day_val}, ignore_index=True)

combined_df['percent_vaccinated'] = combined_df['adjusted_vax'] / us_pop

combined_df.tail()



Scenario Two: reaching herd immunity assuming that 20% of people vaccinated had Covid


Unnamed: 0,Day,adjusted_vax,percent_vaccinated
195,2021-07-30,327097900.0,0.982527
196,2021-07-31,328657400.0,0.987211
197,2021-08-01,330216900.0,0.991895
198,2021-08-02,331776400.0,0.99658
199,2021-08-03,333335900.0,1.001264


In [5]:
print("Looking at scenario two's dataset predictions, we can see that the country reaches 100% herd immunity on July 28th, 2021")

Looking at scenario two's dataset predictions, we can see that the country reaches 100% herd immunity on July 28th, 2021
