In [1]:
# Import libraries
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split, TimeSeriesSplit, RandomizedSearchCV, GridSearchCV
import seaborn as sns
import matplotlib.pyplot as plt
from xgboost import plot_importance, plot_tree

# Silence every warning category for the remainder of the session.
# NOTE(review): a blanket "ignore" filter also hides deprecation notices from
# the libraries used below; consider narrowing it once the notebook is stable.
import warnings
warnings.simplefilter(action="ignore")

# Apply the FiveThirtyEight matplotlib theme to all subsequent figures.
plt.style.use('fivethirtyeight')

### Starting with Lag Features

In [5]:
# Load the post-EDA dataset (path is relative to the notebook's working
# directory) and preview the first rows.
DATA_FILE = 'df_post_eda.csv'
df = pd.read_csv(DATA_FILE)
df.head()

Unnamed: 0,Date,MMWR_week,Location,Distributed,Distributed_Janssen,Distributed_Moderna,Distributed_Pfizer,Distributed_Unk_Manuf,Dist_Per_100K,Distributed_Per_100k_5Plus,...,day_of_week,quarter,dayofyear,dayofmonth,weekofyear,season,holidays,holiday_season,vax_announcement,Additional_Doses_Vax_Pct_1_w_lag
0,2023-05-10,19,NE,5481710,152400,1647380,2905630,0,283379,303944.0,...,Wednesday,2,130,10,19,spring,Normal day,False,False,
1,2023-05-10,19,LA,10282120,330500,3807980,5164550,0,221178,236516.0,...,Wednesday,2,130,10,19,spring,Normal day,False,False,55.8
2,2023-05-10,19,GA,28727475,869100,9763000,14773655,0,270569,288404.0,...,Wednesday,2,130,10,19,spring,Normal day,False,False,43.3
3,2023-05-10,19,WY,1281755,49300,490040,585605,0,221466,235691.0,...,Wednesday,2,130,10,19,spring,Normal day,False,False,44.3
4,2023-05-10,19,CO,17769135,501900,5402640,9029715,0,308560,327449.0,...,Wednesday,2,130,10,19,spring,Normal day,False,False,47.3


In [6]:
# Column that the model is going to predict.
pred_var = 'Additional_Doses_Vax_Pct'

# Parse the Date strings into datetimes and promote the column to the index,
# producing a new frame instead of mutating the loaded one in place.
df = df.assign(Date=pd.to_datetime(df['Date'])).set_index('Date')

In [7]:
# Collapse the per-location rows into one row per Date by averaging every
# numeric (and boolean) column across all rows that share that date.
# numeric_only=True is required because df still carries string columns
# (e.g. Location, day_of_week, season, holidays): pandas >= 2.0 raises a
# TypeError on them instead of silently dropping them as older versions did.
grouped_data = df.groupby(df.index).mean(numeric_only=True)
grouped_data.head()

Unnamed: 0_level_0,MMWR_week,Distributed,Distributed_Janssen,Distributed_Moderna,Distributed_Pfizer,Distributed_Unk_Manuf,Dist_Per_100K,Distributed_Per_100k_5Plus,Distributed_Per_100k_12Plus,Distributed_Per_100k_18Plus,...,Additional_Doses_Unk_Manuf,month,year,quarter,dayofyear,dayofmonth,weekofyear,holiday_season,vax_announcement,Additional_Doses_Vax_Pct_1_w_lag
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-11-03,44.0,16631880.0,818798.4375,6480709.0,9242425.0,89947.65625,146818.0625,152028.625,167037.25,186038.703125,...,249.390625,11.0,2021.0,4.0,307.0,3.0,44.0,0.0,0.0,10.367188
2021-11-10,45.0,17145370.0,836476.5625,6642025.0,9666864.0,0.0,151460.53125,156997.78125,173446.453125,194278.203125,...,243.15625,11.0,2021.0,4.0,314.0,10.0,45.0,1.0,0.0,13.204688
2021-11-17,46.0,17683020.0,853621.875,6841471.0,9987929.0,0.0,155606.890625,161287.484375,178169.640625,199577.8125,...,272.46875,11.0,2021.0,4.0,321.0,17.0,46.0,0.0,1.0,15.907813
2021-11-24,47.0,18114900.0,874050.0,6908257.0,10332590.0,0.0,159194.265625,165021.546875,182287.59375,204172.5625,...,310.875,11.0,2021.0,4.0,328.0,24.0,47.0,1.0,0.0,18.985937
2021-12-01,48.0,18226990.0,882312.5,6931029.0,10413640.0,0.0,160201.109375,166069.109375,183443.203125,205459.15625,...,332.1875,12.0,2021.0,4.0,335.0,1.0,48.0,0.0,1.0,20.384375


In [8]:
def process(df, target='Additional_Doses_Vax_Pct'):
    """Pair the current-period target with one-step-lagged copies of every column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose rows are in chronological order. It must contain the
        column named by ``target``. The frame is not modified.
    target : str, default 'Additional_Doses_Vax_Pct'
        Name of the column kept un-shifted as the value to predict.

    Returns
    -------
    pandas.DataFrame
        The ``target`` column followed by one ``lag1_<col>`` column for every
        column of ``df`` (including the target itself), each holding the
        previous row's value. Missing values are replaced with 0.

    Notes
    -----
    The first row has no predecessor, so all of its ``lag1_*`` values come out
    as 0 after the fill. NOTE(review): those zeros are placeholders, not real
    observations -- confirm that keeping this row is intended.
    """
    # Shift every column down by one row and tag it as a lag-1 feature.
    lagged = df.shift(1).add_prefix('lag1_')
    # Take the target from the frame that was passed in (not from a notebook
    # global) so the function works for any input frame.
    combined = df[[target]].join(lagged)
    return combined.fillna(0)
# Build the lagged modelling frame from the per-date averages, discard the
# lag of the pre-computed 1-week-lag column, and turn the Date index back
# into an ordinary column.
df = (
    process(grouped_data)
    .drop(columns=['lag1_Additional_Doses_Vax_Pct_1_w_lag'])
    .reset_index()
)
df.head()

Unnamed: 0,Date,Additional_Doses_Vax_Pct,lag1_MMWR_week,lag1_Distributed,lag1_Distributed_Janssen,lag1_Distributed_Moderna,lag1_Distributed_Pfizer,lag1_Distributed_Unk_Manuf,lag1_Dist_Per_100K,lag1_Distributed_Per_100k_5Plus,...,lag1_Additional_Doses_Janssen,lag1_Additional_Doses_Unk_Manuf,lag1_month,lag1_year,lag1_quarter,lag1_dayofyear,lag1_dayofmonth,lag1_weekofyear,lag1_holiday_season,lag1_vax_announcement
0,2021-11-03,10.332812,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2021-11-10,13.185937,44.0,16631880.0,818798.4375,6480709.0,9242425.0,89947.65625,146818.0625,152028.625,...,4213.171875,249.390625,11.0,2021.0,4.0,307.0,3.0,44.0,0.0,0.0
2,2021-11-17,15.871875,45.0,17145370.0,836476.5625,6642025.0,9666864.0,0.0,151460.53125,156997.78125,...,8073.71875,243.15625,11.0,2021.0,4.0,314.0,10.0,45.0,1.0,0.0
3,2021-11-24,18.675,46.0,17683020.0,853621.875,6841471.0,9987929.0,0.0,155606.890625,161287.484375,...,11678.015625,272.46875,11.0,2021.0,4.0,321.0,17.0,46.0,0.0,1.0
4,2021-12-01,20.689062,47.0,18114900.0,874050.0,6908257.0,10332590.0,0.0,159194.265625,165021.546875,...,15452.34375,310.875,11.0,2021.0,4.0,328.0,24.0,47.0,1.0,0.0


In [9]:
# Split by row position without shuffling: the leading 70% of rows form the
# training set and the remaining 30% form the test set.
TRAIN_FRACTION = 0.7
train_size = int(TRAIN_FRACTION * len(df))
train = df.iloc[:train_size]
test = df.iloc[train_size:]

# Report the (rows, columns) of each partition.
print(train.shape, test.shape)

(56, 70) (24, 70)
