In [1]:
# Import libraries
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

  from pandas import MultiIndex, Int64Index


In [6]:
# Import data
df = pd.read_csv('df_post_eda.csv')

# Predictive variable
pred_var = 'Additional_Doses_Vax_Pct' 

In [5]:
df.columns

Index(['Date', 'MMWR_week', 'Location', 'Distributed', 'Distributed_Janssen',
       'Distributed_Moderna', 'Distributed_Pfizer', 'Distributed_Unk_Manuf',
       'Dist_Per_100K', 'Distributed_Per_100k_5Plus',
       'Distributed_Per_100k_12Plus', 'Distributed_Per_100k_18Plus',
       'Distributed_Per_100k_65Plus', 'Administered', 'Administered_5Plus',
       'Administered_12Plus', 'Administered_18Plus', 'Administered_65Plus',
       'Administered_Janssen', 'Administered_Moderna', 'Administered_Pfizer',
       'Administered_Unk_Manuf', 'Admin_Per_100K', 'Admin_Per_100k_5Plus',
       'Admin_Per_100k_12Plus', 'Admin_Per_100k_18Plus',
       'Admin_Per_100k_65Plus', 'Recip_Administered',
       'Administered_Dose1_Pop_Pct', 'Administered_Dose1_Recip_5PlusPop_Pct',
       'Administered_Dose1_Recip_12PlusPop_Pct',
       'Administered_Dose1_Recip_18PlusPop_Pct',
       'Administered_Dose1_Recip_65PlusPop_Pct', 'Series_Complete_Pop_Pct',
       'Series_Complete_5PlusPop_Pct', 'Series_Complet

In [7]:
# Additional feature engineering
df = df.drop(columns='is_segmented_outlier')

# Tune with for-loop
df['rolling_mean'] = df[pred_var].shift().rolling(window=3).mean()
df['rolling_std'] = df[pred_var].shift().rolling(window=3).std()

In [8]:
# splitting the data (try 70,15,15)
train_size = int(len(df) * 0.8)

# Create two x_test and x_test2
train, test = df.iloc[:train_size], df.iloc[train_size:]

In [12]:
# Set up XGBoost data structures
X_train, y_train = train.drop(pred_var, axis=1), train[pred_var]
X_test, y_test = test.drop(pred_var, axis=1), test[pred_var]

# Convert categorical columns to one-hot encoded columns
X_train = pd.get_dummies(X_train)
X_test = pd.get_dummies(X_test)

# Ensure columns in X_train and X_test are the same after one-hot encoding
missing_cols = set(X_train.columns) - set(X_test.columns)
    # Create a new DataFrame missing_data with the same index as X_test, columns as missing_cols, and fill it with 
    # zeros.
missing_data = pd.DataFrame(0, index=X_test.index, columns=list(missing_cols))
    # Concatenate X_test and missing_data side by side (using axis=1) and then reorder the columns to match 
    # X_train.
X_test = pd.concat([X_test, missing_data], axis=1)[X_train.columns]

data_train = xgb.DMatrix(X_train, label=y_train)
data_test = xgb.DMatrix(X_test, label=y_test)


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


In [17]:
# Parameters for the model
params = {
    'objective': 'reg:squarederror',
    'max_depth': 5,
    'learning_rate': 0.01
}

# Training the model with the specified number of boosting rounds
# Put one test group in evals and one just after model is trained
model = xgb.train(params, data_train, num_boost_round=1000, evals=[(data_train, 'train'), (data_test, 'test')])


[0]	train-rmse:47.79882	test-rmse:30.43308
[1]	train-rmse:47.32259	test-rmse:30.12923
[2]	train-rmse:46.85111	test-rmse:29.82266
[3]	train-rmse:46.38437	test-rmse:29.52281
[4]	train-rmse:45.92225	test-rmse:29.22581
[5]	train-rmse:45.46477	test-rmse:28.93159
[6]	train-rmse:45.01186	test-rmse:28.64017
[7]	train-rmse:44.56349	test-rmse:28.35367
[8]	train-rmse:44.11959	test-rmse:28.06803
[9]	train-rmse:43.68013	test-rmse:27.78554
[10]	train-rmse:43.24505	test-rmse:27.50787
[11]	train-rmse:42.81433	test-rmse:27.23454
[12]	train-rmse:42.38789	test-rmse:26.96070
[13]	train-rmse:41.96573	test-rmse:26.69288
[14]	train-rmse:41.54778	test-rmse:26.42637
[15]	train-rmse:41.13400	test-rmse:26.16076
[16]	train-rmse:40.72435	test-rmse:25.90125
[17]	train-rmse:40.31880	test-rmse:25.64059
[18]	train-rmse:39.91730	test-rmse:25.38654
[19]	train-rmse:39.51982	test-rmse:25.12804
[20]	train-rmse:39.12629	test-rmse:24.87904
[21]	train-rmse:38.73671	test-rmse:24.63009
[22]	train-rmse:38.35099	test-rmse:24.3847

[187]	train-rmse:7.37835	test-rmse:4.34481
[188]	train-rmse:7.30526	test-rmse:4.30119
[189]	train-rmse:7.23289	test-rmse:4.25840
[190]	train-rmse:7.16124	test-rmse:4.21494
[191]	train-rmse:7.09031	test-rmse:4.17246
[192]	train-rmse:7.02010	test-rmse:4.13026
[193]	train-rmse:6.95057	test-rmse:4.08924
[194]	train-rmse:6.88175	test-rmse:4.04810
[195]	train-rmse:6.81361	test-rmse:4.00820
[196]	train-rmse:6.74613	test-rmse:3.96899
[197]	train-rmse:6.67933	test-rmse:3.92961
[198]	train-rmse:6.61320	test-rmse:3.89160
[199]	train-rmse:6.54772	test-rmse:3.85342
[200]	train-rmse:6.48293	test-rmse:3.81795
[201]	train-rmse:6.41876	test-rmse:3.78110
[202]	train-rmse:6.35522	test-rmse:3.74398
[203]	train-rmse:6.29233	test-rmse:3.70738
[204]	train-rmse:6.23005	test-rmse:3.67043
[205]	train-rmse:6.16841	test-rmse:3.63348
[206]	train-rmse:6.10736	test-rmse:3.59804
[207]	train-rmse:6.04695	test-rmse:3.56419
[208]	train-rmse:5.98712	test-rmse:3.52933
[209]	train-rmse:5.92788	test-rmse:3.49484
[210]	train

[378]	train-rmse:1.13267	test-rmse:1.75580
[379]	train-rmse:1.12198	test-rmse:1.75654
[380]	train-rmse:1.11140	test-rmse:1.75710
[381]	train-rmse:1.10091	test-rmse:1.75791
[382]	train-rmse:1.09053	test-rmse:1.75867
[383]	train-rmse:1.08030	test-rmse:1.75860
[384]	train-rmse:1.07015	test-rmse:1.75954
[385]	train-rmse:1.06010	test-rmse:1.76052
[386]	train-rmse:1.05018	test-rmse:1.76051
[387]	train-rmse:1.04032	test-rmse:1.76150
[388]	train-rmse:1.03058	test-rmse:1.76156
[389]	train-rmse:1.02094	test-rmse:1.76232
[390]	train-rmse:1.01141	test-rmse:1.76312
[391]	train-rmse:1.00195	test-rmse:1.76317
[392]	train-rmse:0.99259	test-rmse:1.76368
[393]	train-rmse:0.98333	test-rmse:1.76389
[394]	train-rmse:0.97418	test-rmse:1.76406
[395]	train-rmse:0.96512	test-rmse:1.76415
[396]	train-rmse:0.95614	test-rmse:1.76508
[397]	train-rmse:0.94724	test-rmse:1.76521
[398]	train-rmse:0.93846	test-rmse:1.76550
[399]	train-rmse:0.92975	test-rmse:1.76586
[400]	train-rmse:0.92114	test-rmse:1.76611
[401]	train

[569]	train-rmse:0.24976	test-rmse:1.85883
[570]	train-rmse:0.24847	test-rmse:1.85893
[571]	train-rmse:0.24721	test-rmse:1.85932
[572]	train-rmse:0.24598	test-rmse:1.86010
[573]	train-rmse:0.24470	test-rmse:1.86026
[574]	train-rmse:0.24345	test-rmse:1.86035
[575]	train-rmse:0.24221	test-rmse:1.86067
[576]	train-rmse:0.24094	test-rmse:1.86098
[577]	train-rmse:0.23974	test-rmse:1.86113
[578]	train-rmse:0.23854	test-rmse:1.86146
[579]	train-rmse:0.23741	test-rmse:1.86222
[580]	train-rmse:0.23629	test-rmse:1.86260
[581]	train-rmse:0.23515	test-rmse:1.86271
[582]	train-rmse:0.23396	test-rmse:1.86300
[583]	train-rmse:0.23285	test-rmse:1.86308
[584]	train-rmse:0.23168	test-rmse:1.86336
[585]	train-rmse:0.23056	test-rmse:1.86383
[586]	train-rmse:0.22951	test-rmse:1.86416
[587]	train-rmse:0.22841	test-rmse:1.86443
[588]	train-rmse:0.22737	test-rmse:1.86455
[589]	train-rmse:0.22630	test-rmse:1.86481
[590]	train-rmse:0.22532	test-rmse:1.86549
[591]	train-rmse:0.22432	test-rmse:1.86560
[592]	train

[760]	train-rmse:0.15332	test-rmse:1.88412
[761]	train-rmse:0.15321	test-rmse:1.88416
[762]	train-rmse:0.15303	test-rmse:1.88421
[763]	train-rmse:0.15289	test-rmse:1.88431
[764]	train-rmse:0.15275	test-rmse:1.88433
[765]	train-rmse:0.15262	test-rmse:1.88438
[766]	train-rmse:0.15248	test-rmse:1.88452
[767]	train-rmse:0.15235	test-rmse:1.88453
[768]	train-rmse:0.15215	test-rmse:1.88453
[769]	train-rmse:0.15204	test-rmse:1.88456
[770]	train-rmse:0.15187	test-rmse:1.88467
[771]	train-rmse:0.15174	test-rmse:1.88460
[772]	train-rmse:0.15162	test-rmse:1.88464
[773]	train-rmse:0.15150	test-rmse:1.88461
[774]	train-rmse:0.15138	test-rmse:1.88465
[775]	train-rmse:0.15126	test-rmse:1.88473
[776]	train-rmse:0.15116	test-rmse:1.88468
[777]	train-rmse:0.15100	test-rmse:1.88478
[778]	train-rmse:0.15088	test-rmse:1.88483
[779]	train-rmse:0.15069	test-rmse:1.88482
[780]	train-rmse:0.15055	test-rmse:1.88486
[781]	train-rmse:0.15045	test-rmse:1.88492
[782]	train-rmse:0.15036	test-rmse:1.88496
[783]	train

[951]	train-rmse:0.13612	test-rmse:1.88955
[952]	train-rmse:0.13606	test-rmse:1.88968
[953]	train-rmse:0.13600	test-rmse:1.88967
[954]	train-rmse:0.13593	test-rmse:1.88957
[955]	train-rmse:0.13587	test-rmse:1.88957
[956]	train-rmse:0.13580	test-rmse:1.88962
[957]	train-rmse:0.13573	test-rmse:1.88964
[958]	train-rmse:0.13568	test-rmse:1.88949
[959]	train-rmse:0.13564	test-rmse:1.88952
[960]	train-rmse:0.13556	test-rmse:1.88963
[961]	train-rmse:0.13549	test-rmse:1.88960
[962]	train-rmse:0.13538	test-rmse:1.88969
[963]	train-rmse:0.13531	test-rmse:1.88968
[964]	train-rmse:0.13526	test-rmse:1.88968
[965]	train-rmse:0.13518	test-rmse:1.88965
[966]	train-rmse:0.13512	test-rmse:1.88967
[967]	train-rmse:0.13506	test-rmse:1.88971
[968]	train-rmse:0.13499	test-rmse:1.88969
[969]	train-rmse:0.13493	test-rmse:1.88973
[970]	train-rmse:0.13484	test-rmse:1.88981
[971]	train-rmse:0.13478	test-rmse:1.88977
[972]	train-rmse:0.13471	test-rmse:1.88915
[973]	train-rmse:0.13464	test-rmse:1.88928
[974]	train

In [19]:
# Forecasting and Evaluation

y_pred = model.predict(data_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

# Compute RMSE
rmse = np.sqrt(np.mean((y_pred - y_test)**2))
print(f"Root Mean Squared Error (RMSE): {rmse}")

Mean Squared Error: 3.563267669060175
Root Mean Squared Error (RMSE): 1.8876619583654737
