In [11]:
import pandas as pd
import matplotlib.pyplot as plt
import warnings
from division_aggregation_function import division_aggregation
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, KNNImputer
from sklearn.preprocessing import StandardScaler
# Suppress every warning category for the rest of the session.
# NOTE(review): this is a blanket filter, so it also hides pandas/scikit-learn
# warnings (e.g. chained-assignment or deprecation notices) raised by later cells.
warnings.filterwarnings("ignore")

In [12]:
# Load the raw table; the first CSV column becomes the row index.
df = pd.read_csv("../data/covid_data1.csv", index_col=0)

# Derive a region label for each row from its location_key via the project helper.
df['region'] = df['location_key'].apply(division_aggregation)

# Keep only the columns whose share of missing values is at most 10%.
# Take an explicit copy: the assignments below add columns, and writing onto a
# column-subset selection is chained assignment (SettingWithCopyWarning, and
# unreliable under pandas copy-on-write).
sparse_enough = df.isna().sum() / len(df) <= 0.1
df = df.loc[:, sparse_enough.values].copy()

# Parse the date column and expose its calendar parts as numeric features.
df['date'] = pd.to_datetime(df['date'])
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['day'] = df['date'].dt.day

In [13]:
# Identifier columns plus the new_confirmed series, kept aside unscaled.
y_data = df.loc[:, ['date', 'region', 'location_key', 'new_confirmed']]

# Everything except the date and new_confirmed goes into the feature matrix.
x_data = df.drop(['date', 'new_confirmed'], axis=1)

# One-hot encode the non-numeric feature columns.
xddum = pd.get_dummies(x_data)

# Standardise the encoded features; the fitted scaler is reused later to invert this.
scaler = StandardScaler()
scaledx = scaler.fit_transform(xddum)

In [14]:
# Impute on a separate copy of the standardised matrix.
datanum = scaledx.copy()

# Fill missing entries from the 10 nearest rows.
knn = KNNImputer(n_neighbors=10)
imputedvals2 = knn.fit_transform(datanum)

# Re-attach the encoded feature names to the imputed array.
knn_datanum = pd.DataFrame(data=imputedvals2, columns=xddum.columns)

In [8]:
# Undo the standardisation so the imputed features are back in their original units.
# Reuse xddum's index (the same row labels y_data carries): without it the new frame
# gets a fresh 0..n-1 index and the axis=1 concat below would align on mismatched
# labels, yielding NaN-padded rows whenever the CSV index is not exactly 0..n-1.
xdatanew = pd.DataFrame(
    scaler.inverse_transform(knn_datanum),
    columns=xddum.columns,
    index=xddum.index,
)

# NOTE(review): the trailing 63 / 9 columns are presumably the one-hot blocks
# produced by get_dummies (9 region_* columns; 63 = location_key_* + region_*).
# TODO confirm against xddum.columns; these counts break silently if the data changes.
xdata_other = xdatanew.drop(columns=xdatanew.columns[-63:])
xdata_xg = xdatanew.drop(columns=xdatanew.columns[-9:])

# Put the identifier/target columns back next to each feature set.
data_all = pd.concat([y_data, xdata_other], axis=1)
data_xg = pd.concat([y_data, xdata_xg], axis=1)

In [36]:
def round_columns(df):
    """Round every ``location_key_*`` column to the nearest integer.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns named with the ``location_key_`` prefix should be
        rounded. The input is left untouched.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` in which each ``location_key_*`` column has been
        rounded and cast to ``int``; all other columns are unchanged.

    Raises
    ------
    ValueError
        Propagated from ``astype(int)`` if a ``location_key_*`` column still
        contains NaN or infinite values.
    """
    # Work on a copy so a caller passing a filtered slice neither has its data
    # mutated behind its back nor triggers pandas' chained-assignment warning.
    rounded = df.copy()
    for col in rounded.columns:
        # Non-string labels cannot carry the prefix; skip them instead of
        # failing on a missing .startswith attribute.
        if isinstance(col, str) and col.startswith('location_key_'):
            rounded[col] = rounded[col].round().astype(int)
    return rounded

In [43]:
# Multivariate
def _region_frame(region_name):
    """Build the multivariate frame for one region from ``data_xg``.

    Selects the rows whose ``region`` equals ``region_name``, rounds the
    ``location_key_*`` columns with ``round_columns``, then drops every column
    that is zero in all remaining rows (i.e. empty state columns).
    """
    subset = data_xg[data_xg['region'] == region_name]
    subset = round_columns(subset)
    # NOTE(review): the all-zero test is applied to every column, not only the
    # location_key_* ones, so any column that is entirely zero is dropped too.
    is_all_zero = (subset == 0).all()
    return subset.loc[:, ~is_all_zero]


enc_data_multi = _region_frame("East North Central")
esc_data_multi = _region_frame("East South Central")
mid_atlantic_data_multi = _region_frame("Mid-Atlantic")
mountain_data_multi = _region_frame("Mountain")
new_england_data_multi = _region_frame("New England")
pacific_data_multi = _region_frame("Pacific")
south_atlantic_data_multi = _region_frame("South Atlantic")
wnc_data_multi = _region_frame("West North Central")
wsc_data_multi = _region_frame("West South Central")


In [44]:
def apply_transformations(df):
    """Add lagged and rolling-mean features of ``new_confirmed`` to a frame.

    ``shift`` and ``rolling`` operate on row position, so the rows are expected
    to be in chronological order and to belong to a single location.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``region`` and ``new_confirmed`` columns. The region of
        the first row selects which lags are generated. The input is left
        untouched.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with one ``"<lag>_day_shift"`` column per lag
        configured for the region (none for an unknown region or an empty
        frame) and a ``"7_day_avg"`` column.
    """
    all_lags = {"Pacific": [1, 3, 7, 10], "East North Central": [1, 2, 7],
                "East South Central": [1, 2, 3, 7], "Mid-Atlantic": [1, 2, 8, 9],
                "Mountain": [1, 6, 7], "New England": [1, 7], "South Atlantic": [1, 6, 7],
                "West North Central": [1, 6, 7], "West South Central": [1, 7]}

    # Work on a copy: mutating the group object handed over by groupby.apply is
    # documented by pandas as producing unexpected results.
    out = df.copy()

    # An empty frame has no region; fall back to "no lags" instead of raising
    # IndexError on unique()[0].
    regions = out['region'].unique()
    region = regions[0] if len(regions) else None
    lags = all_lags.get(region, [])

    # Create lagged features
    for i in lags:
        out[str(i) + "_day_shift"] = out['new_confirmed'].shift(i)

    # Compute 7-day moving average
    # NOTE(review): the window includes the current row's new_confirmed; if
    # new_confirmed is the prediction target this feature leaks it. Confirm
    # whether a shift(1) before rolling was intended.
    out["7_day_avg"] = out['new_confirmed'].rolling(window=7).mean()
    return out

def _with_lag_features(frame):
    """Order rows by location then date and add per-location lag features.

    Each ``location_key`` group of the sorted frame is passed through
    ``apply_transformations``.
    """
    ordered = frame.sort_values(by=['location_key', 'date'])
    return ordered.groupby('location_key').apply(apply_transformations)


pacific_data_multi = _with_lag_features(pacific_data_multi)
enc_data_multi = _with_lag_features(enc_data_multi)
esc_data_multi = _with_lag_features(esc_data_multi)
mid_atlantic_data_multi = _with_lag_features(mid_atlantic_data_multi)
mountain_data_multi = _with_lag_features(mountain_data_multi)
new_england_data_multi = _with_lag_features(new_england_data_multi)
south_atlantic_data_multi = _with_lag_features(south_atlantic_data_multi)
wnc_data_multi = _with_lag_features(wnc_data_multi)
wsc_data_multi = _with_lag_features(wsc_data_multi)

In [45]:
# Write each regional frame to its own CSV under the xgboost dataset folder.
# The file stem is the same as the variable name holding the frame.
xgboost_outputs = {
    'pacific_data_multi': pacific_data_multi,
    'enc_data_multi': enc_data_multi,
    'esc_data_multi': esc_data_multi,
    'mid_atlantic_data_multi': mid_atlantic_data_multi,
    'mountain_data_multi': mountain_data_multi,
    'new_england_data_multi': new_england_data_multi,
    'south_atlantic_data_multi': south_atlantic_data_multi,
    'wnc_data_multi': wnc_data_multi,
    'wsc_data_multi': wsc_data_multi,
}
for file_stem, regional_frame in xgboost_outputs.items():
    regional_frame.to_csv('../data/regional_datasets/xgboost/' + file_stem + '.csv')
