# Measuring and predicting the efficiency of COVID-19 vaccine administration in the US

Packages needed:

In [None]:
import pandas as pd             # to use Dataframes
# import matplotlib.pyplot as plt # to plot results
import numpy as np              # to perform mathematical operations
import plotly.express as px     # for interactive plots
import datetime

Our dataset:
https://github.com/owid/covid-19-data/tree/master/public/data/vaccinations

In [None]:
# Download the OWID per-state vaccination table straight from GitHub and preview it
df_owid_orig = pd.read_csv(
    'https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/vaccinations/us_state_vaccinations.csv'
)
df_owid_orig.head()

Unnamed: 0,date,location,total_vaccinations,total_distributed,people_vaccinated,people_fully_vaccinated_per_hundred,total_vaccinations_per_hundred,people_fully_vaccinated,people_vaccinated_per_hundred,distributed_per_hundred,daily_vaccinations_raw,daily_vaccinations,daily_vaccinations_per_million,share_doses_used
0,2021-01-12,Alabama,78134.0,377025.0,70861.0,0.15,1.59,7270.0,1.45,7.69,,,,0.207
1,2021-01-13,Alabama,84040.0,378975.0,74792.0,0.19,1.71,9245.0,1.53,7.73,5906.0,5906.0,1205.0,0.222
2,2021-01-14,Alabama,92300.0,435350.0,80480.0,,1.88,,1.64,8.88,8260.0,7083.0,1445.0,0.212
3,2021-01-15,Alabama,100567.0,444650.0,86956.0,0.28,2.05,13488.0,1.77,9.07,8267.0,7478.0,1525.0,0.226
4,2021-01-16,Alabama,,,,,,,,,7557.0,7498.0,1529.0,


*   **date**: date of the observation.
*   **location**: name of the state or federal entity.
*   **total_vaccinations**: total number of doses administered. This is counted as a single dose, and may not equal the total number of people vaccinated, depending on the specific dose regime (e.g. people receive multiple doses). If a person receives one dose of the vaccine, this metric goes up by 1. If they receive a second dose, it goes up by 1 again.
*   **total_distributed**: cumulative counts of COVID-19 vaccine doses recorded as shipped in CDC's Vaccine Tracking System.
*   **people_vaccinated**: total number of people who received at least one vaccine dose. If a person receives the first dose of a 2-dose vaccine, this metric goes up by 1. If they receive the second dose, the metric stays the same.
*   **people_fully_vaccinated**: total number of people who received all doses prescribed by the vaccination protocol. If a person receives the first dose of a 2-dose vaccine, this metric stays the same. If they receive the second dose, the metric goes up by 1.
*   **daily_vaccinations_raw**: daily change in the total number of doses administered. It is only calculated for consecutive days. This is a raw measure provided for data checks and transparency, but we strongly recommend that any analysis on daily vaccination rates be conducted using daily_vaccinations instead.
*   **daily_vaccinations**: new doses administered per day (7-day smoothed). For countries that don't report data on a daily basis, we assume that doses changed equally on a daily basis over any periods in which no data was reported. This produces a complete series of daily figures, which is then averaged over a rolling 7-day window.
*   **share_doses_used**: share of vaccination doses administered among those recorded as shipped in CDC's Vaccine Tracking System.

Index our data by location, date and create a new 'days_since' column that will describe the number of days since the first recorded vaccine.

In [None]:
# Work with a copy, not the original dataset
df_owid = df_owid_orig.copy()

# Parse the date strings once so rows can be ordered and indexed by real timestamps
df_owid['datetime'] = pd.to_datetime(df_owid['date'])

# sort_values returns a new frame (it is not in-place by default), so the result
# has to be kept for the rows to actually end up ordered by state and date
df_owid = df_owid.sort_values(by=['location', 'datetime'])

# Store the location and date as indices for the dataframe; drop=False keeps
# them available as ordinary columns too (the plots below use them)
df_owid = df_owid.set_index(['location', 'datetime'], drop=False)
df_owid = df_owid.rename_axis(['location_index', 'datetime_index'], axis='index')

# Count the number of days since beginning of recording - helps with regression model.
# Anchor on the earliest date in the whole table rather than on whichever row is first.
df_owid['days_since'] = (df_owid['datetime'] - df_owid['datetime'].min()).dt.days

# TODO: decide how missing values should be handled (forward fill vs. dropping rows)

df_owid

Unnamed: 0_level_0,Unnamed: 1_level_0,date,location,total_vaccinations,total_distributed,people_vaccinated,people_fully_vaccinated_per_hundred,total_vaccinations_per_hundred,people_fully_vaccinated,people_vaccinated_per_hundred,distributed_per_hundred,daily_vaccinations_raw,daily_vaccinations,daily_vaccinations_per_million,share_doses_used,datetime,days_since
location_index,datetime_index,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
Alabama,2021-01-12,2021-01-12,Alabama,78134.0,377025.0,70861.0,0.15,1.59,7270.0,1.45,7.69,,,,0.207,2021-01-12,0
Alabama,2021-01-13,2021-01-13,Alabama,84040.0,378975.0,74792.0,0.19,1.71,9245.0,1.53,7.73,5906.0,5906.0,1205.0,0.222,2021-01-13,1
Alabama,2021-01-14,2021-01-14,Alabama,92300.0,435350.0,80480.0,,1.88,,1.64,8.88,8260.0,7083.0,1445.0,0.212,2021-01-14,2
Alabama,2021-01-15,2021-01-15,Alabama,100567.0,444650.0,86956.0,0.28,2.05,13488.0,1.77,9.07,8267.0,7478.0,1525.0,0.226,2021-01-15,3
Alabama,2021-01-16,2021-01-16,Alabama,,,,,,,,,7557.0,7498.0,1529.0,,2021-01-16,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Wyoming,2021-05-01,2021-05-01,Wyoming,358222.0,483195.0,197208.0,28.01,61.89,162127.0,34.07,83.49,2165.0,2476.0,4278.0,0.741,2021-05-01,109
Wyoming,2021-05-02,2021-05-02,Wyoming,358512.0,483195.0,197340.0,28.05,61.94,162328.0,34.10,83.49,290.0,2484.0,4292.0,0.742,2021-05-02,110
Wyoming,2021-05-03,2021-05-03,Wyoming,358545.0,483195.0,197374.0,28.05,61.95,162356.0,34.10,83.49,33.0,1409.0,2435.0,0.742,2021-05-03,111
Wyoming,2021-05-04,2021-05-04,Wyoming,361199.0,483595.0,198293.0,28.34,62.41,163997.0,34.26,83.56,2654.0,1786.0,3086.0,0.747,2021-05-04,112


We can now select the data for a certain location and drop rows with NaN cells:

In [None]:
# for state in df_owid.index.levels[0]:
#     print(df_owid.loc[state].index[0])

In [None]:
# Keep only the 'days_since' column; the result is a Series that still carries
# the (location, date) index of df_owid
df_owid_narrow = df_owid['days_since']

In [None]:
# ListofStates = ['Alabama', 'Texas', 'New York State']

In [None]:
# train_date_limit = datetime.datetime.strptime('2021-03-15', '%Y-%m-%d')

In [None]:
# for Stateidx, State in enumerate(ListofStates):
#     TrainData = df_owid_narrow.loc[State][dates[0]:dates[1]]
#     TestData = df_owid_narrow.loc[State][dates[2]:dates[3]]

In [None]:
# old way of selecting state and date

# Take an explicit copy of the Texas rows: assigning a column on the bare
# .loc[...] slice triggers pandas' SettingWithCopyWarning and may not stick
df_state = df_owid.loc['Texas'].copy()

# Re-base the day counter on the first Texas observation
df_state['days_since'] = (df_state['datetime'] - df_state['datetime'].iloc[0]).dt.days

# Drop every row with a missing value in any column (this already covers rows
# missing 'people_vaccinated_per_hundred', so one dropna() is enough)
df_state = df_state.dropna()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [None]:
df_state

Unnamed: 0_level_0,date,location,total_vaccinations,total_distributed,people_vaccinated,people_fully_vaccinated_per_hundred,total_vaccinations_per_hundred,people_fully_vaccinated,people_vaccinated_per_hundred,distributed_per_hundred,daily_vaccinations_raw,daily_vaccinations,daily_vaccinations_per_million,share_doses_used,datetime,days_since
datetime_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2021-01-13,2021-01-13,Texas,978042.0,1984225.0,861072.0,0.40,3.37,115965.0,2.97,6.84,66581.00,66581.0,2296.0,0.493,2021-01-13,1
2021-01-15,2021-01-15,Texas,1160242.0,2105600.0,1003412.0,0.54,4.00,155660.0,3.46,7.26,116618.00,82927.0,2860.0,0.551,2021-01-15,3
2021-01-19,2021-01-19,Texas,1403989.0,2105600.0,1222952.0,0.62,4.84,179801.0,4.22,7.26,60936.75,70361.0,2427.0,0.667,2021-01-19,7
2021-01-20,2021-01-20,Texas,1471746.0,2551125.0,1282998.0,0.65,5.08,187503.0,4.42,8.80,67757.00,70529.0,2432.0,0.577,2021-01-20,8
2021-01-21,2021-01-21,Texas,1544551.0,2754800.0,1337706.0,0.71,5.33,205538.0,4.61,9.50,72805.00,71561.0,2468.0,0.561,2021-01-21,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-05-01,2021-05-01,Texas,19090255.0,25514685.0,11164737.0,27.46,65.84,7960964.0,38.50,87.99,204584.00,186869.0,6445.0,0.748,2021-05-01,109
2021-05-02,2021-05-02,Texas,19205116.0,25514685.0,11199365.0,27.72,66.23,8037727.0,38.62,87.99,114861.00,175147.0,6040.0,0.753,2021-05-02,110
2021-05-03,2021-05-03,Texas,19276649.0,25514685.0,11226347.0,27.87,66.48,8081630.0,38.72,87.99,71533.00,168987.0,5828.0,0.756,2021-05-03,111
2021-05-04,2021-05-04,Texas,19350193.0,26000725.0,11253017.0,28.04,66.73,8130177.0,38.81,89.67,73544.00,161661.0,5575.0,0.744,2021-05-04,112


In [None]:
# def train_test_dates:

#     print('Please insert dates in "YYYY-MM-DD" format:\ne.g. "January 4, 2005" is "2005-01-04"')

#     dates = [[]]*4
#     date_input_str = ['Train data from:',
#                       'Train data to  :',
#                       'Test  data from:',
#                       'Test  data to  :']

#     for idx, val in enumerate(train_test_dates):
#         while True:
#             try:
#                 train_test_dates[idx] = datetime.datetime.strptime(input(date_input_str[idx]), '%Y-%m-%d')
#             except ValueError:
#                 print('Please provide a date in "YYYY-MM-DD" format')
#             else:
#                 break
    
#     return dates

In [None]:
# def split_date_range(X, y, dates):
#     X_train = (df.index.levels[1] - df.index.levels[1][0]).days.values.reshape(-1, 1)

## Visual Models

View share_doses_used as a function of datetime for a particular state(s) of the user's choice

In [None]:
# Interactive selection of one or more states, then a line plot of
# share_doses_used over time for the selected states.

State1 = []
state_input1 = ''
# Lower-cased state names, in the same order as the location level of the index,
# so user input can be matched case-insensitively
AllStates_lower = [state.lower() for state in df_owid.index.levels[0]]

# Loop until the user types "done" AFTER at least one state has been chosen.
# (A `while state_input1 != 'done'` condition would leave the loop on "done"
# even when nothing has been selected yet.)
while True:
    if len(State1) < 1:
        state_input1 = input('Please enter a state: ')
    else:
        state_input1 = input('Please enter a another state or "done": ')
    if state_input1.lower() in AllStates_lower:
        # Map the user's spelling back to the canonical name used in the data
        state_curr = df_owid.index.levels[0][AllStates_lower.index(state_input1.lower())]
        State1.append(state_curr)
    elif state_input1 == 'done':
        if len(State1) < 1:
            print('You have not selected a state yet.')
        else:
            print('\nDone. List of States:', ', '.join(State1))
            break
    else:
        print('"{}" is not a valid state. Please enter a valid state.'.format(state_input1))

#ListofCols = ['share_doses_used', 'people_vaccinated_per_hundred']

# Stack the rows of every selected state, in selection order; the empty slice
# keeps the column layout and lets concat work on a single pass
state_info = pd.concat(
    [df_owid.iloc[0:0]] + [df_owid[df_owid['location'] == state] for state in State1],
    axis=0,
)

fig = px.line(state_info, x="datetime", y="share_doses_used", color="location",
              title="Share of distributed doses used, by state")

fig.update_traces(mode="markers+lines")
fig.show()

Please enter a state: ohio
Please enter a another state or "done": alabama
Please enter a another state or "done": texas
Please enter a another state or "done": new york state
Please enter a another state or "done": done

Done. List of States: Ohio, Alabama, Texas, New York State


View people_vaccinated_per_hundred as a function of datetime for a particular state(s) of the user's choice

In [None]:
# Interactive selection of one or more states, then a line plot of
# people_vaccinated_per_hundred over time for the selected states.

State2 = []
state_input2 = ''
# Lower-cased state names, in the same order as the location level of the index,
# so user input can be matched case-insensitively
AllStates_lower = [state.lower() for state in df_owid.index.levels[0]]

# Loop until the user types "done" AFTER at least one state has been chosen.
# (A `while state_input2 != 'done'` condition would leave the loop on "done"
# even when nothing has been selected yet.)
while True:
    if len(State2) < 1:
        state_input2 = input('Please enter a state: ')
    else:
        state_input2 = input('Please enter a another state or "done": ')
    if state_input2.lower() in AllStates_lower:
        # Map the user's spelling back to the canonical name used in the data
        state_curr = df_owid.index.levels[0][AllStates_lower.index(state_input2.lower())]
        State2.append(state_curr)
    elif state_input2 == 'done':
        if len(State2) < 1:
            print('You have not selected a state yet.')
        else:
            print('\nDone. List of States:', ', '.join(State2))
            break
    else:
        print('"{}" is not a valid state. Please enter a valid state.'.format(state_input2))

#ListofCols = ['share_doses_used', 'people_vaccinated_per_hundred']

# Stack the rows of every selected state, in selection order. The empty seed is
# taken from df_owid itself so this cell does not depend on another cell's frame.
state_info2 = pd.concat(
    [df_owid.iloc[0:0]] + [df_owid[df_owid['location'] == state] for state in State2],
    axis=0,
)

fig = px.line(state_info2, x="datetime", y="people_vaccinated_per_hundred", color="location",
              title="People vaccinated per hundred, by state")

fig.update_traces(mode="markers+lines")
fig.show()

Please enter a state: alabama
Please enter a another state or "done": texas
Please enter a another state or "done": done

Done. List of States: Alabama, Texas


In [None]:
### UNSURE IF THIS CODE IS NECESSARY NOW BASED ON THE UPPER CODE
# Ask for a single state and pull its rows out of df_owid.
state2 = input('What state? ')
if state2.lower() in AllStates_lower:
    # The membership test is case-insensitive, so translate the input back to the
    # canonical spelling before filtering; comparing the raw input against the
    # 'location' column returns an empty frame for e.g. "alabama".
    state_curr = df_owid.index.levels[0][AllStates_lower.index(state2.lower())]
    state_case2 = df_owid[df_owid['location'] == state_curr]
else:
    print('"{}" is not a valid state. Please enter a valid state.'.format(state2))
    # Keep state_case2 defined (as an empty frame) so the display below and any
    # later cell do not fail with a NameError
    state_case2 = df_owid.iloc[0:0]
state_case2

What state? alabama


Unnamed: 0_level_0,Unnamed: 1_level_0,date,location,total_vaccinations,total_distributed,people_vaccinated,people_fully_vaccinated_per_hundred,total_vaccinations_per_hundred,people_fully_vaccinated,people_vaccinated_per_hundred,distributed_per_hundred,daily_vaccinations_raw,daily_vaccinations,daily_vaccinations_per_million,share_doses_used,datetime,days_since
location_index,datetime_index,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1


In [None]:
### UNSURE IF THIS CODE IS NECESSARY NOW BASED ON THE UPPER CODE
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Build four traces (two metrics x two state frames) and show them side by side.
# NOTE(review): `state_case1` is not defined in any cell visible here, and the
# recorded output of this cell is a NameError - define it (presumably the Texas
# rows, judging by the trace names) before running this cell.
# NOTE(review): the trace names are hard-coded, while `state_case2` comes from
# user input above, so the labels may not match the data - verify.
first_line=go.Scatter(x=state_case1['datetime'], y=state_case1['share_doses_used'], name='Texas_share')
second_line=go.Scatter(x=state_case2['datetime'], y=state_case2['share_doses_used'], name='Alabama_share')
third_line=go.Scatter(x=state_case1['datetime'], y=state_case1['people_vaccinated_per_hundred'], name='Texas_percent')
fourth_line=go.Scatter(x=state_case2['datetime'], y=state_case2['people_vaccinated_per_hundred'], name='Alabama_percent')

# One row, two columns: share of doses used on the left, people vaccinated on the right
fig = make_subplots(rows=1,cols=2)
fig.add_trace(first_line, row=1, col=1)
fig.add_trace(second_line, row=1, col=1)
fig.add_trace(third_line, row=1, col=2)
fig.add_trace(fourth_line, row=1, col=2)
fig.update_layout(hovermode='x')
# Fixed figure size and margins (pixels)
fig.update_layout(
    autosize=False,
    width=1000,
    height=500,
    margin=dict(
        l=50,
        r=50,
        b=100,
        t=100,
        pad=4
    ))
fig.show()

NameError: ignored

In [None]:
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# An unfinished `for state in List` statement used to open this cell; with no
# colon and no body it made the whole cell a SyntaxError, so it is removed here.
# TODO: loop over the selected states instead of hard-coding two frames.
# NOTE(review): `state_case1` is not defined in any cell visible here - define it
# (presumably the Texas rows, judging by the trace names) before running this cell.

first_line=go.Scatter(x=state_case1['datetime'], y=state_case1['share_doses_used'], name='Texas_share')
second_line=go.Scatter(x=state_case2['datetime'], y=state_case2['share_doses_used'], name='Alabama_share')
third_line=go.Scatter(x=state_case1['datetime'], y=state_case1['people_vaccinated_per_hundred'], name='Texas_percent')
fourth_line=go.Scatter(x=state_case2['datetime'], y=state_case2['people_vaccinated_per_hundred'], name='Alabama_percent')

# Two stacked rows sharing the date axis: share of doses used on top,
# people vaccinated per hundred below
fig = make_subplots(rows=2,cols=1,shared_xaxes=True)
fig.add_trace(first_line, row=1, col=1)
fig.add_trace(second_line, row=1, col=1)
fig.add_trace(third_line, row=2, col=1)
fig.add_trace(fourth_line, row=2, col=1)

In [None]:
# Last day of the training window. (The name previously had a stray "done"
# fused onto it, which left `train_date_end` undefined for the next line.)
train_date_end = datetime.datetime.strptime('2021-03-15', '%Y-%m-%d')
# Train on the five days leading up to (and including) the end date
train_date_start = train_date_end - datetime.timedelta(days=5)

In [None]:
from sklearn.linear_model import LinearRegression

def getLinreg(X_trn, y_trn):
    """Fit an ordinary least-squares line to the training data.

    X_trn is the feature matrix and y_trn the target values, both passed
    straight to ``LinearRegression.fit``. Returns the fitted model.
    """
    # fit() returns the estimator itself, so build and fit in one expression
    return LinearRegression().fit(X_trn, y_trn)

In [None]:
# Metrics we want to model
ListofCols = ['share_doses_used', 'people_vaccinated_per_hundred']

# Rows of the current `state` that fall inside the training window
# (`state` is whatever an earlier cell last assigned to that name)
TrainData = df_owid.loc[state][train_date_start:train_date_end]

# Regressor: day counter; targets: the selected metrics
X_train = TrainData.loc[:, 'days_since']
Y_train = TrainData.loc[:, ListofCols]

In [None]:
# Interactive selection of the states to forecast.

ListofStates = []
state_input = ''
# Lower-cased state names, in the same order as the location level of the index,
# so user input can be matched case-insensitively
AllStates_lower = [name.lower() for name in df_owid.index.levels[0]]

# Loop until the user types "done" AFTER at least one state has been chosen.
# (A `while state_input != 'done'` condition would leave the loop on "done"
# even when nothing has been selected yet.)
while True:
    if len(ListofStates) < 1:
        state_input = input('Please enter a state: ')
    else:
        state_input = input('Please enter a another state or "done": ')
    if state_input.lower() in AllStates_lower:
        # Map the user's spelling back to the canonical name used in the data
        state = df_owid.index.levels[0][AllStates_lower.index(state_input.lower())]
        ListofStates.append(state)
    elif state_input == 'done':
        if len(ListofStates) < 1:
            print('You have not selected a state yet.')
        else:
            print('\nDone. List of States:', ', '.join(ListofStates))
            break
    else:
        print('"{}" is not a valid state. Please enter a valid state.'.format(state_input))

Please enter a state: Texas
Please enter a another state or "done": Alabama
Please enter a another state or "done": Ohio
Please enter a another state or "done": done

Done. List of States: Texas, Alabama, Ohio


In [None]:
# First entry of the datetime level of the index
# (presumably the earliest date, since index levels are normally stored sorted - verify)
earliest_date = df_owid.index.levels[1][0]

In [None]:
# Linear forecast of every column in ListofCols, for every state in ListofStates,
# from train_date_end up to `days_ahead` days later.

days_ahead = 200
forecast_dates = pd.date_range(train_date_end, train_date_end + datetime.timedelta(days=days_ahead))

ListofCols_pred = [ 'PREDICTED'+' '+col for col in ListofCols ]

y_p_state_list = []

for state in ListofStates:
    # Train on THIS state's rows. Fitting once on a shared X_train / Y_train
    # gives every state the same forecast.
    state_train = df_owid.loc[state].loc[train_date_start:train_date_end]
    # The regression cannot handle missing values, so drop incomplete rows
    state_train = state_train.dropna(subset=['days_since'] + ListofCols)
    X_state = state_train[['days_since']].to_numpy()  # 2-D, one feature column

    state_forecast = {}
    for col, col_pred in zip(ListofCols, ListofCols_pred):
        slope = getLinreg(X_state, state_train[col]).coef_[0]
        # Anchor the line on the last observed value; .iloc makes the positional
        # lookup explicit on a date-indexed Series
        last_value = state_train[col].iloc[-1]
        state_forecast[col_pred] = [ last_value + slope*day for day in range(days_ahead+1) ]

    y_p_state_list.append(pd.DataFrame(state_forecast, index=forecast_dates))

# One frame indexed by (state, date) with one 'PREDICTED <col>' column per metric
Y_pred = pd.concat(y_p_state_list, keys=ListofStates)

In [None]:
Y_pred['PREDICTED share_doses_used']

Texas  2021-03-15    0.7790
       2021-03-16    0.7922
       2021-03-17    0.8054
       2021-03-18    0.8186
       2021-03-19    0.8318
                      ...  
Ohio   2021-09-27    3.3662
       2021-09-28    3.3794
       2021-09-29    3.3926
       2021-09-30    3.4058
       2021-10-01    3.4190
Name: PREDICTED share_doses_used, Length: 603, dtype: float64

In [None]:
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Plot observed and forecast values, one subplot row per column in ListofCols.
#
# ListofStates / ListofCols / ListofCols_pred are reused from the forecast cell:
# Y_pred is keyed by exactly those states, so redefining the list here with a
# state that was never forecast (e.g. 'New York State') makes Y_pred.loc[state]
# raise KeyError.

# One row per metric (an extra row would only add an empty subplot)
fig = make_subplots(rows=len(ListofCols), cols=1, shared_xaxes=True)

for row, (col, col_pred) in enumerate(zip(ListofCols, ListofCols_pred), start=1):

    for state in ListofStates:
        observed = df_owid.loc[state]
        data = go.Scatter(x=observed['datetime'], y=observed[col], name=state+' '+col)
        fig.add_trace(data, row=row, col=1)

        predicted = Y_pred.loc[state]
        data_pred = go.Scatter(x=predicted.index, y=predicted[col_pred], name=state+' '+col_pred)
        fig.add_trace(data_pred, row=row, col=1)

    # Label each row with the metric it shows
    fig.update_xaxes(title_text="Date", row=row, col=1)
    fig.update_yaxes(title_text=col, row=row, col=1)

fig.update_layout(hovermode='x')
# Fixed figure size and margins (pixels)
fig.update_layout(
    autosize=False,
    width=1000,
    height=1000,
    margin=dict(
        l=50,
        r=50,
        b=100,
        t=100,
        pad=4
    ))
fig.show()

KeyError: ignored

In [None]:
import pandas as pd                               # to organize data in Dataframes
import datetime                                   # to work with time series
import plotly.graph_objects as go                 # to generate interactive plots
from plotly.subplots import make_subplots         # to make interactive subplots
from sklearn.linear_model import LinearRegression # to predict linear forecasts

In [None]:
def clean_df(df_owid_orig):
    """Return a cleaned, indexed copy of the OWID state vaccination table.

    The input frame is not modified. The returned frame

    * has a parsed ``datetime`` column,
    * is sorted by location and date,
    * is indexed by ``(location_index, datetime_index)`` while keeping
      ``location`` and ``datetime`` as ordinary columns,
    * has a ``days_since`` column counting days from the earliest date in
      the table,
    * has missing values forward-filled within each location.

    Returns ``(df_owid, ListofCols)`` where ``ListofCols`` names the columns
    examined in the forecast.
    """

    # Work with a copy, not the original dataset
    df_owid = df_owid_orig.copy()

    # Parse the dates first so sorting and indexing use real timestamps
    df_owid['datetime'] = pd.to_datetime(df_owid['date'])

    # sort_values is not in-place: keep the returned frame
    df_owid = df_owid.sort_values(by=['location', 'datetime'])

    # Store the location and date as indices for the dataframe
    df_owid = df_owid.set_index(['location', 'datetime'], drop=False)
    df_owid = df_owid.rename_axis(['location_index', 'datetime_index'], axis='index')

    # Count the number of days since beginning of recording - helps with regression model
    df_owid['days_since'] = (df_owid['datetime'] - df_owid['datetime'].min()).dt.days

    # Forward fill missing values with the previous available day's value.
    # Lets us avoid errors with the linear regression and is acceptable for
    # small stretches of missing data. The fill is done per location so the
    # first rows of one state never inherit the last values of the previous one.
    df_owid = df_owid.groupby(level='location_index').ffill()

    # Examine these columns in the forecast
    ListofCols = ['share_doses_used', 'people_vaccinated_per_hundred']

    return df_owid, ListofCols

In [None]:
# Get user input of US states to examine

def getStateInput(df_owid):
    """Interactively collect a list of states to examine.

    Valid names are the values of the first index level of ``df_owid``.
    Input is matched case-insensitively and surrounding whitespace is
    ignored. Prompting continues until the user enters "done" after
    selecting at least one state.

    Returns the selected names, spelled as in the index, in entry order.
    """

    # Lower-cased name -> canonical name; setdefault keeps the first spelling
    # should two names ever collide after lower-casing
    states_by_lower = {}
    for name in df_owid.index.levels[0]:
        states_by_lower.setdefault(name.lower(), name)

    ListofStates = []

    # `while True` rather than `while state_input != 'done'`: the latter ends
    # the loop on "done" even when no state has been selected yet
    while True:
        if not ListofStates:
            state_input = input('Please enter a state: ')
        else:
            state_input = input('Please enter a another state or "done": ')
        state_input = state_input.strip()
        key = state_input.lower()

        if key in states_by_lower:
            ListofStates.append(states_by_lower[key])
        elif key == 'done':
            if not ListofStates:
                print('You have not selected a state yet.')
            else:
                print('\nDone. List of States:', ', '.join(ListofStates))
                break
        else:
            print('"{}" is not a valid state. Please enter a valid state.'.format(state_input))

    return ListofStates

In [None]:
# Select Dates for Forecast - End date for training and end date for forecast

def getForecastDates():
    """Interactively ask for the training end date and the forecast end date.

    Both dates are entered as "Month Day, Year" (e.g. "March 15, 2021").
    The training end date must lie between January 18, 2021 and five days
    before today; the forecast end date must be later than the training
    end date. The user is re-prompted until both inputs are valid.

    Returns ``(train_date_end, forecast_date_end, forecast_dates)`` where
    the first two are ``datetime.datetime`` objects and ``forecast_dates``
    is the daily ``pd.date_range`` from the first to the second, inclusive.
    """

    date_format = '%B %d, %Y'

    # Select End Date for Forecast Training

    train_date_earliest = datetime.datetime.strptime('2021-01-18', '%Y-%m-%d')
    train_date_latest = datetime.datetime.today() - datetime.timedelta(5)

    earliest_str = datetime.datetime.strftime(train_date_earliest, date_format)
    latest_str = datetime.datetime.strftime(train_date_latest, date_format)

    print('Select an end date to train the model.')
    print('e.g. "Based on trends up to date March 15, 2021, we can forecast the following trajectory."')

    train_prompt = ('Please provide a date from {} to {} in the format "Month Day, Year": ').format(earliest_str, latest_str)
    while True:
        train_date_end_input = input(train_prompt)
        try:
            train_date_end = datetime.datetime.strptime(train_date_end_input, date_format)
        except ValueError:
            print(train_date_end_input, 'is not in a valid format\nPlease provide a valid date in the format "Month Day, Year".')
            continue
        # Enforce the advertised range: a date that merely parses is not enough
        if not train_date_earliest <= train_date_end <= train_date_latest:
            print('{} is outside the range {} to {}.'.format(train_date_end_input, earliest_str, latest_str))
            continue
        print('Done. Using', train_date_end_input, 'as end date for forecast training.')
        break

    # Select Furthest Future Date to Forecast

    print('Select an end date for forecast.')
    print('e.g. "We will predict the trends up to June 4, 2021.')

    while True:
        forecast_date_end_input = input('Please enter a date to end the forecast ("Month Day, Year"): ')
        try:
            forecast_date_end = datetime.datetime.strptime(forecast_date_end_input, date_format)
        except ValueError:
            print(forecast_date_end_input, 'is not a valid date.')
            print('Please choose a valid date in the format "Month Day, Year".')
            continue
        # The forecast has to reach beyond the training window
        if forecast_date_end <= train_date_end:
            print('Please choose a date after the train date of {}.'.format(train_date_end_input))
            continue
        print('Done. Selected', forecast_date_end_input,'as end date for forecast.')
        break

    forecast_dates = pd.date_range(train_date_end, forecast_date_end)

    return train_date_end, forecast_date_end, forecast_dates

In [None]:
def getLinreg(X_trn, y_trn):
    """Fit an ordinary least-squares line to the training data.

    X_trn is the feature matrix and y_trn the target values, both passed
    straight to ``LinearRegression.fit``. Returns the fitted model.
    """
    # fit() returns the estimator itself, so build and fit in one expression
    return LinearRegression().fit(X_trn, y_trn)

In [None]:
# Forecast the selected columns' data using the train date limit and forecast span

def forecaster(df_owid, forecast_dates, ListofStates, ListofCols, train_date_start=None):
    """Extrapolate each selected column linearly for each selected state.

    Parameters
    ----------
    df_owid : DataFrame indexed by (state, date) with a ``days_since``
        column and the columns named in ``ListofCols``.
    forecast_dates : DatetimeIndex of the dates to forecast. Its first entry
        is used as the last day of the training data.
    ListofStates : state names (first index level of ``df_owid``).
    ListofCols : columns to forecast.
    train_date_start : optional first day of the training data; ``None``
        (the default) uses everything up to the training end date.

    Returns
    -------
    (Y_pred, ListofCols_pred) : ``Y_pred`` is indexed by (state, date) and has
        one ``'PREDICTED <col>'`` column per entry of ``ListofCols``;
        ``ListofCols_pred`` lists those column names in the same order.

    Raises
    ------
    ValueError if ``forecast_dates`` is empty.
    """

    days_ahead = len(forecast_dates)
    if days_ahead == 0:
        raise ValueError('forecast_dates must contain at least one date')

    ListofCols_pred = [ 'PREDICTED'+' '+col for col in ListofCols ]

    # The training window ends where the forecast starts, so it is read from the
    # argument rather than from notebook globals
    train_date_end = forecast_dates[0]

    y_p_state_list = []

    for state in ListofStates:

        TrainData = df_owid.loc[state].loc[train_date_start:train_date_end]
        X_train = TrainData[['days_since']].to_numpy()  # 2-D, one feature column

        state_forecast = {}
        for col, col_pred in zip(ListofCols, ListofCols_pred):
            y_train = TrainData[col]
            # NOTE(review): [-15:-1] fits on the 14 rows before the last one and
            # leaves the last training row out of the fit - confirm whether
            # [-15:] was intended.
            slope = getLinreg(X_train[-15:-1], y_train.iloc[-15:-1]).coef_[0]
            # Anchor the line on the last observed value; .iloc makes the
            # positional lookup explicit on a date-indexed Series
            last_value = y_train.iloc[-1]
            state_forecast[col_pred] = [ last_value + slope*day for day in range(days_ahead) ]

        # One frame per state, whatever the number of columns
        y_p_state_list.append(pd.DataFrame(state_forecast, index=forecast_dates))

    Y_pred = pd.concat(y_p_state_list, keys=ListofStates)

    return Y_pred, ListofCols_pred

In [None]:
# Plot the data for each state, comparing actual and forecasted values
# This uses the plotly package for interactive graphs 

def plotStateData(df_owid, ListofStates, ListofCols, Y_pred, ListofCols_pred):
    """Plot observed and forecast values for every state, one row per column.

    Parameters
    ----------
    df_owid : DataFrame indexed by (state, date) with a ``datetime`` column
        and the columns named in ``ListofCols``.
    ListofStates : states to draw.
    ListofCols : observed columns, one subplot row each.
    Y_pred : forecast frame indexed by (state, date).
    ListofCols_pred : forecast column names, parallel to ``ListofCols``.

    Shows the figure and returns it.
    """

    # One row per metric (an extra row would only add an empty subplot)
    fig = make_subplots(rows=len(ListofCols), cols=1, shared_xaxes=True)

    for row, (col, col_pred) in enumerate(zip(ListofCols, ListofCols_pred), start=1):

        for state in ListofStates:
            observed = df_owid.loc[state]
            data = go.Scatter(x=observed['datetime'], y=observed[col], name=state+' '+col)
            fig.add_trace(data, row=row, col=1)

            predicted = Y_pred.loc[state]
            data_pred = go.Scatter(x=predicted.index, y=predicted[col_pred], name=state+' '+col_pred)
            fig.add_trace(data_pred, row=row, col=1)

        # Axis titles follow the columns actually plotted instead of being hard-coded
        fig.update_xaxes(title_text="Date", row=row, col=1)
        fig.update_yaxes(title_text=col, row=row, col=1)

    fig.update_layout(hovermode='x')
    # Fixed figure size and margins (pixels)
    fig.update_layout(
        autosize=False,
        width=1000,
        height=1000,
        margin=dict(
            l=50,
            r=50,
            b=100,
            t=100,
            pad=4
        ))
    fig.show()

    return fig

In [None]:
# Build the indexed, forward-filled working frame and the list of columns to forecast
df_owid, ListofCols = clean_df(df_owid_orig)

In [None]:
# getStateInput needs the indexed frame to know which state names are valid;
# calling it without an argument raises TypeError
ListofStates = getStateInput(df_owid)

TypeError: ignored

In [None]:
# Ask for the training end date and the forecast end date; forecast_dates is the
# daily range between the two
train_date_end, forecast_date_end, forecast_dates = getForecastDates()

Select an end date to train the model.
e.g. "Based on trends up to date March 15, 2021, we can forecast the following trajectory."
Please provide a date from January 18, 2021 to May 01, 2021 in the format "Month Day, Year": April 20, 2021
Done. Using April 20, 2021 as end date for forecast training.
Select an end date for forecast.
e.g. "We will predict the trends up to June 4, 2021.
Please enter a date to end the forecast ("Month Day, Year"): April 20, 2021
Please choose a date after the train date of April 20, 2021.
Please enter a date to end the forecast ("Month Day, Year"): April 30, 2021
Done. Selected April 30, 2021 as end date for forecast.


In [None]:
# Explicit list of columns to forecast; this replaces the list returned by clean_df
ListofCols = ['share_doses_used', 'people_vaccinated_per_hundred']

# Linear forecast per state and column over forecast_dates
Y_pred, ListofCols_pred = forecaster(df_owid, forecast_dates, ListofStates, ListofCols)

In [None]:
# Draw observed vs. forecast values for the selected states and keep the figure
fig = plotStateData(df_owid, ListofStates, ListofCols, Y_pred, ListofCols_pred)