In [7]:
import pandas as pd
import matplotlib.pyplot as plt
import warnings
from division_aggregation_function import division_aggregation
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, KNNImputer
from sklearn.preprocessing import StandardScaler
# Silences every warning for the rest of the kernel session: no category,
# message or module filter is supplied, so the "ignore" action applies to all.
# NOTE(review): a blanket ignore also hides diagnostics that later cells may
# raise (e.g. pandas chained-assignment or sklearn convergence warnings, if
# any are emitted) -- consider narrowing it to the category actually intended.
warnings.filterwarnings("ignore")

In [12]:
# Load the raw COVID table; the first CSV column becomes the row index.
df = pd.read_csv("../data/covid_data1.csv", index_col=0)

# Tag every row with the region that division_aggregation returns for its
# location key.
df['region'] = df['location_key'].apply(division_aggregation)

# Keep only the columns whose share of missing values is at most 10%.
# .copy() turns the selection into an independent frame, so the column
# assignments below write to `df` itself instead of to a slice of the
# unfiltered table (the pattern behind pandas' SettingWithCopyWarning).
missing_share = df.isna().sum() / len(df)
df = df.loc[:, (missing_share <= 0.1).values].copy()

# Parse the date column once, then expose its calendar parts as separate
# numeric columns.
df['date'] = pd.to_datetime(df['date'])
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['day'] = df['date'].dt.day

In [15]:
# Preview the first five rows of the filtered frame.
df.head(n=5)

Unnamed: 0,date,location_key,new_confirmed,new_deceased,cumulative_confirmed,cumulative_deceased,new_hospitalized_patients,cumulative_hospitalized_patients,current_hospitalized_patients,current_intensive_care_patients,...,new_vaccine_doses_administered_moderna,cumulative_vaccine_doses_administered_moderna,new_persons_fully_vaccinated_janssen,cumulative_persons_fully_vaccinated_janssen,new_vaccine_doses_administered_janssen,cumulative_vaccine_doses_administered_janssen,region,year,month,day
0,2020-03-06,US_AK,0.0,0.0,0.0,0.0,0.0,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,Pacific,2020,3,6
1,2020-03-07,US_AK,0.0,0.0,0.0,0.0,0.0,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,Pacific,2020,3,7
2,2020-03-08,US_AK,0.0,0.0,0.0,0.0,0.0,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,Pacific,2020,3,8
3,2020-03-09,US_AK,0.0,0.0,0.0,0.0,1.0,1.0,,,...,0.0,0.0,0.0,0.0,0.0,0.0,Pacific,2020,3,9
4,2020-03-10,US_AK,0.0,0.0,0.0,0.0,0.0,1.0,,,...,0.0,0.0,0.0,0.0,0.0,0.0,Pacific,2020,3,10


In [16]:
# Identifier columns plus the target, set aside so they can be re-attached to
# the imputed predictors afterwards.
y_data = df.loc[:, ['date', 'region', 'location_key', 'new_confirmed']]

# Predictors: every column except the timestamp and the target.
x_data = df.drop(['date', 'new_confirmed'], axis=1)

# One-hot encode whatever non-numeric columns remain.
xddum = pd.get_dummies(data=x_data)

# Standardise each encoded column; the fitted scaler is kept because a later
# cell uses it to invert the scaling.
scaler = StandardScaler()
scaledx = scaler.fit_transform(X=xddum)

In [17]:
# Iterative imputation on the unscaled, one-hot encoded predictors.
datanum = xddum.copy()
mice = IterativeImputer(max_iter = 50, random_state = 0)
imputedvals = mice.fit_transform(datanum)
# fit_transform hands back a bare array, so restore the column labels AND the
# original row index. Without the index the frame would receive a fresh
# 0..n-1 index and could misalign with `df` / `y_data` in any index-based
# join or concat.
imputed_datanum1 = pd.DataFrame(imputedvals, columns = xddum.columns, index = xddum.index)

In [18]:
# KNN imputation, run on the standardised matrix (presumably because the
# neighbour search is distance-based and would otherwise be dominated by
# large-valued columns).
datanum = scaledx.copy()
knn = KNNImputer(n_neighbors = 10)
imputedvals2 = knn.fit_transform(datanum)
# Re-label the imputed array with the encoded column names and the original
# row index so the frame stays aligned with `xddum` / `y_data`.
knn_datanum = pd.DataFrame(imputedvals2, columns = xddum.columns, index = xddum.index)

In [22]:
# Undo the standardisation so the imputed values are back on their original
# scale. The row index of the encoded predictors is restored explicitly:
# pd.concat(axis=1) below aligns on index labels, so a fresh 0..n-1 index
# would only line up with y_data if df's own index happened to be 0..n-1.
xdatanew = pd.DataFrame(
    scaler.inverse_transform(knn_datanum),
    columns = xddum.columns,
    index = xddum.index,
)

# NOTE(review): positional drops -- presumably the trailing 9 columns are the
# region dummies and the trailing 63 are the location_key + region dummies.
# Confirm against xddum.columns: any change in the encoded columns silently
# shifts these slices.
xdata_other = xdatanew.drop(columns = xdatanew.columns[-63:])
xdata_xg = xdatanew.drop(columns = xdatanew.columns[-9:])

# Re-attach date / region / location_key / target to each predictor set.
data_all = pd.concat([y_data, xdata_other], axis = 1)
data_xg = pd.concat([y_data, xdata_xg], axis = 1)

In [41]:
# Multivariate
def _region_subset(frame, region_name):
    """Return the rows of `frame` for one region, without all-zero columns.

    Rows are selected where the 'region' column equals `region_name`; any
    column whose every value in that selection equals 0 is then dropped.
    """
    rows = frame[frame['region'] == region_name]
    all_zero = (rows == 0).all()
    return rows.loc[:, ~all_zero]


enc_data_multi = _region_subset(data_xg, "East North Central")
esc_data_multi = _region_subset(data_xg, "East South Central")
mid_atlantic_data_multi = _region_subset(data_xg, "Mid-Atlantic")
mountain_data_multi = _region_subset(data_xg, "Mountain")
new_england_data_multi = _region_subset(data_xg, "New England")
pacific_data_multi = _region_subset(data_xg, "Pacific")
south_atlantic_data_multi = _region_subset(data_xg, "South Atlantic")
wnc_data_multi = _region_subset(data_xg, "West North Central")
wsc_data_multi = _region_subset(data_xg, "West South Central")


In [43]:
# Day offsets used to build the "<n>_day_shift" lag columns, one list per
# region (listed in the same order as the regional frames are created).
enc_lag = [1, 2, 7]             # East North Central
esc_lag = [1, 2, 3, 7]          # East South Central
mid_atlantic_lag = [1, 2, 8, 9]
mountain_lag = [1, 6, 7]
new_england_lag = [1, 7]
pacific_lag = [1, 3, 7, 10]
south_atlantic_lag = [1, 6, 7]
wnc_lag = [1, 6, 7]             # West North Central
wsc_lag = [1, 7]                # West South Central

def _add_lag_features(frame, lags, target='new_confirmed', group_col='location_key'):
    """Return a copy of `frame` with lagged and 7-day-average target columns.

    Each regional frame stacks the rows of several locations, so the lags and
    the rolling mean are computed per `group_col`. Shifting the whole frame
    at once would carry the last days of one location into the first days of
    the next.

    Parameters
    ----------
    frame : pandas.DataFrame
        Regional table containing `target` and `group_col`. Rows are assumed
        to already be in chronological order within each location -- TODO
        confirm (no sorting is done here, matching the previous behaviour).
    lags : list of int
        Day offsets; every offset adds a "<offset>_day_shift" column.
    target : str
        Column to lag and average.
    group_col : str
        Column identifying the individual series inside the region.

    Returns
    -------
    pandas.DataFrame
        New frame; the input is left untouched, which also avoids assigning
        columns onto a slice of `data_xg`.
    """
    out = frame.copy()
    per_location = frame.groupby(group_col, sort=False)[target]

    # Loop-invariant, so computed once instead of once per lag.
    # NOTE(review): the window ends on the current row, so the average
    # includes the same-day `target` value; if `target` is what gets predicted
    # downstream this leaks it -- consider shifting by one day. Confirm.
    weekly_mean = per_location.transform(lambda s: s.rolling(window= 7).mean())

    for position, lag in enumerate(lags):
        out[str(lag) + "_day_shift"] = per_location.shift(lag)
        if position == 0:
            # Inserted right after the first lag column to keep the column
            # order (and hence the CSV layout) the earlier loop produced.
            out["7_day_avg"] = weekly_mean
    return out


pacific_data_multi = _add_lag_features(pacific_data_multi, pacific_lag)
enc_data_multi = _add_lag_features(enc_data_multi, enc_lag)
esc_data_multi = _add_lag_features(esc_data_multi, esc_lag)
mid_atlantic_data_multi = _add_lag_features(mid_atlantic_data_multi, mid_atlantic_lag)
mountain_data_multi = _add_lag_features(mountain_data_multi, mountain_lag)
new_england_data_multi = _add_lag_features(new_england_data_multi, new_england_lag)
south_atlantic_data_multi = _add_lag_features(south_atlantic_data_multi, south_atlantic_lag)
wnc_data_multi = _add_lag_features(wnc_data_multi, wnc_lag)
wsc_data_multi = _add_lag_features(wsc_data_multi, wsc_lag)

In [45]:
# Write every regional frame to its own CSV in the xgboost dataset folder.
# to_csv is called with defaults, so the row index is written as well.
_regional_exports = (
    (pacific_data_multi, '../data/regional_datasets/xgboost/pacific_data_multi.csv'),
    (enc_data_multi, '../data/regional_datasets/xgboost/enc_data_multi.csv'),
    (esc_data_multi, '../data/regional_datasets/xgboost/esc_data_multi.csv'),
    (mid_atlantic_data_multi, '../data/regional_datasets/xgboost/mid_atlantic_data_multi.csv'),
    (mountain_data_multi, '../data/regional_datasets/xgboost/mountain_data_multi.csv'),
    (new_england_data_multi, '../data/regional_datasets/xgboost/new_england_data_multi.csv'),
    (south_atlantic_data_multi, '../data/regional_datasets/xgboost/south_atlantic_data_multi.csv'),
    (wnc_data_multi, '../data/regional_datasets/xgboost/wnc_data_multi.csv'),
    (wsc_data_multi, '../data/regional_datasets/xgboost/wsc_data_multi.csv'),
)
for _frame, _destination in _regional_exports:
    _frame.to_csv(_destination)
