In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from division_aggregation_function import division_aggregation
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, KNNImputer
from sklearn.preprocessing import StandardScaler

In [4]:
# Read every raw table from the current working directory.
# The names are unpacked in the same order as the file list below:
# index -> data, hospitalizations -> hosp, epidemiology -> epi, ...
data, hosp, epi, vax, emrg, weather, govresp = [
    pd.read_csv(csv_name)
    for csv_name in (
        "index.csv",
        "hospitalizations.csv",
        "epidemiology.csv",
        "vaccinations.csv",
        "lawatlas-emergency-declarations.csv",
        'weather.csv',
        'oxford-government-response.csv',
    )
]

In [6]:
# Location keys that the index table lists under the United States.
# Computed once and reused for every table instead of being re-derived per line.
us_keys = data[data.country_name == 'United States of America'].location_key

# Keep only the rows of each table whose location_key is one of those keys.
epi_us = epi[epi['location_key'].isin(us_keys)]
hosp_us = hosp[hosp['location_key'].isin(us_keys)]
vax_us = vax[vax['location_key'].isin(us_keys)]
emrg_us = emrg[emrg['location_key'].isin(us_keys)]
weather_us = weather[weather['location_key'].isin(us_keys)]
govresp_us = govresp[govresp['location_key'].isin(us_keys)]

In [7]:
# Location keys to exclude outright.
# NOTE(review): the length filter below admits keys of up to 5 characters, which
# suggests state keys look like 'US_CA'. If territories are keyed the same way
# (e.g. 'US_PR'), the two-letter codes here would never match them -- confirm.
terrs = ['US', 'AS', 'GU', 'MP', 'PR', 'VI']


def _state_level(frame):
    """Return rows whose location_key has at most 5 characters and is not in `terrs`."""
    short_key = frame.location_key.str.len() <= 5
    excluded = frame.location_key.isin(terrs)
    return frame[short_key & ~excluded]


epi_f = _state_level(epi_us)
hosp_f = _state_level(hosp_us)
vax_f = _state_level(vax_us)
emrg_f = _state_level(emrg_us)
weather_f = _state_level(weather_us)
govresp_f = _state_level(govresp_us)

In [28]:
merge_keys = ['date', 'location_key']

# Inner joins: only (date, location_key) pairs present in all four tables survive.
data1 = epi_f.merge(hosp_f, on=merge_keys)
data1 = data1.merge(weather_f, on=merge_keys)
data1 = data1.merge(govresp_f, on=merge_keys)

# Drop vaccination columns that are missing in 80 % or more of the rows.
vax_missing_share = vax_f.isna().sum() / len(vax_f)
vax_f = vax_f[vax_f.columns[(vax_missing_share < 0.8).values]]

# Left join so rows without any vaccination record are kept (with NaN).
data1 = data1.merge(vax_f, how='left', on=merge_keys)

In [None]:
# Vaccination columns to zero-fill for dates before the first vaccination record.
vax_zero_cols = ['new_persons_vaccinated',
       'cumulative_persons_vaccinated', 'new_persons_fully_vaccinated',
       'cumulative_persons_fully_vaccinated', 'new_vaccine_doses_administered',
       'cumulative_vaccine_doses_administered',
       'new_persons_fully_vaccinated_pfizer',
       'cumulative_persons_fully_vaccinated_pfizer',
       'new_vaccine_doses_administered_pfizer',
       'cumulative_vaccine_doses_administered_pfizer',
       'new_persons_fully_vaccinated_moderna',
       'cumulative_persons_fully_vaccinated_moderna',
       'new_vaccine_doses_administered_moderna',
       'cumulative_vaccine_doses_administered_moderna',
       'new_persons_fully_vaccinated_janssen',
       'cumulative_persons_fully_vaccinated_janssen',
       'new_vaccine_doses_administered_janssen',
       'cumulative_vaccine_doses_administered_janssen']

# Only touch columns that are actually in data1. The vaccination table was
# already pruned by a missingness threshold before the merge, so a listed column
# may be absent; assigning through .loc with an absent label would either raise
# or silently create a new column, depending on the pandas version.
present_vax_cols = [col for col in vax_zero_cols if col in data1.columns]

# Rows dated before the earliest date in the vaccination table.
# Series.min() skips missing values, unlike the builtin min().
before_first_vax = data1['date'] < vax_f['date'].min()

data1.loc[before_first_vax, present_vax_cols] = 0

In [None]:
# Keep only the columns that are missing in fewer than half of the rows.
missing_share = data1.isna().sum() / len(data1)
data_all = data1[data1.columns[(missing_share < 0.5).values]]

In [None]:
# Write the merged table to the working directory. The row index is written too
# (pandas' default, spelled out here), so it comes back as an unnamed first column.
data_all.to_csv('covid_data1.csv', index=True)

In [156]:
# Reload the merged table through a project-relative path (the same ../data
# folder the output cells write to) instead of a user-specific absolute path.
# 'Unnamed: 0' is the row index saved by to_csv; errors='ignore' keeps the cell
# working if the file was written without an index.
data_all = pd.read_csv('../data/covid_data1.csv').drop(columns = 'Unnamed: 0', errors = 'ignore')

# Map each location key to its region with the imported helper.
data_all['region'] = data_all['location_key'].apply(division_aggregation)

# Keep columns with at most 10 % missing values. The explicit .copy() makes the
# result an independent frame, so the column assignments below do not raise
# SettingWithCopyWarning.
missing_share = data_all.isna().sum() / len(data_all)
data_all = data_all[data_all.columns[(missing_share <= 0.1).values]].copy()

# Parse the date and derive calendar features from it.
data_all['date'] = pd.to_datetime(data_all['date'])
data_all['year'] = data_all['date'].dt.year
data_all['month'] = data_all['date'].dt.month
data_all['day'] = data_all['date'].dt.day

In [157]:
# Number of rows per region that go to the training split (first 85 %, truncated).
split_values = (data_all.groupby('region').size() * 0.85).astype(int)

# Collect the per-region pieces in lists and concatenate once at the end;
# growing a DataFrame with pd.concat inside the loop re-copies all accumulated
# rows on every iteration and starts from an empty frame.
train_parts = []
test_parts = []
for region, split in split_values.items():
    # Chronological order within the region; location_key breaks ties on a date.
    # NOTE(review): the cut is by row count, so the rows of a single date can end
    # up partly in train and partly in test -- confirm this is acceptable.
    region_data = data_all[data_all.region == region].sort_values(by = ['date', 'location_key'])
    train_parts.append(region_data.iloc[:split])
    test_parts.append(region_data.iloc[split:])

train = pd.concat(train_parts).reset_index(drop = True).sort_values(by = ['date', 'region', 'location_key'])
test = pd.concat(test_parts).reset_index(drop = True).sort_values(by = ['date', 'region', 'location_key'])

In [158]:
# Target plus identifying columns for the training split.
y_data1 = train.loc[:, ['date', 'region', 'location_key', 'new_confirmed']]

# Predictors: everything except the date and the target, with the remaining
# non-numeric columns one-hot encoded.
x_data1 = train.drop(columns = ['date', 'new_confirmed'])
xddum = pd.get_dummies(x_data1)

# Standardise before imputing so KNN distances are not dominated by
# large-valued columns.
scaler = StandardScaler()
scaler.fit(xddum)
scaledtrain = scaler.transform(xddum)
datatrain = scaledtrain.copy()

# Fill missing values from the 25 nearest rows.
knn = KNNImputer(n_neighbors = 25)
knn.fit(datatrain)
imputedvals = knn.transform(datatrain)
knn_datatrain = pd.DataFrame(imputedvals, columns = xddum.columns)

# Undo the scaling so the imputed frame is back on the original units.
xdatatrain = pd.DataFrame(scaler.inverse_transform(knn_datatrain), columns = xddum.columns)

In [159]:
# Target plus identifying columns for the test split.
y_data2 = test.loc[:, ['date', 'region', 'location_key', 'new_confirmed']]

# Predictors, one-hot encoded. Note that `xddum` and `knn` are reassigned here.
x_data2 = test.drop(columns = ['date', 'new_confirmed'])
xddum = pd.get_dummies(x_data2)

# NOTE(review): the scaler and the imputer are fitted on the test split itself,
# and the imputer uses 10 neighbours here versus 25 for the training split --
# confirm this is intended rather than reusing the training-fitted objects.
scaler2 = StandardScaler()
scaler2.fit(xddum)
scaledtest = scaler2.transform(xddum)
datatest = scaledtest.copy()

knn = KNNImputer(n_neighbors = 10)
knn.fit(datatest)
imputedvals2 = knn.transform(datatest)
knn_datatest = pd.DataFrame(imputedvals2, columns = xddum.columns)

# Undo the scaling so the imputed frame is back on the original units.
xdatatest = pd.DataFrame(scaler2.inverse_transform(knn_datatest), columns = xddum.columns)

In [160]:
# Stack train and test back together. Predictors and targets are re-indexed
# 0..n-1 in the same row order, so they line up positionally below.
xdatanew = pd.concat([xdatatrain, xdatatest], axis = 0, ignore_index = True)
y_data = pd.concat([y_data1, y_data2], axis = 0, ignore_index = True)

# Trailing columns are removed by position.
# NOTE(review): presumably the last 9 columns are the region dummies and the
# last 63 are all dummy columns (location_key + region) -- verify the counts
# against xdatanew.columns whenever the input data changes.
trailing_63 = xdatanew.columns[-63:]
trailing_9 = xdatanew.columns[-9:]
xdata_other = xdatanew.drop(columns = trailing_63)
xdata_xg = xdatanew.drop(columns = trailing_9)

# Re-attach date / region / location_key / target in front of each predictor set.
data_all = pd.concat([y_data, xdata_other], axis = 1)
data_xg = pd.concat([y_data, xdata_xg], axis = 1)

In [161]:
# Expected number of distinct location keys per region; used to check that a
# region has complete coverage on a given date.
numofstates = dict([
    ("New England", 6),
    ("Mid-Atlantic", 3),
    ("East North Central", 5),
    ("West North Central", 7),
    ("South Atlantic", 9),
    ("East South Central", 4),
    ("West South Central", 4),
    ("Mountain", 8),
    ("Pacific", 5),
])

In [162]:
def groupfilter(group):
    """Tell whether a (region, date) group covers every expected location.

    `group` is one group of a groupby whose first key is the region, so
    `group.name[0]` is the region label. The group passes when its number of
    distinct `location_key` values equals the count stored for that region in
    `numofstates`. A region missing from `numofstates` raises KeyError.
    """
    expected = numofstates[group.name[0]]
    observed = group['location_key'].nunique()
    return observed == expected

In [163]:
# Keep only the (region, date) groups that groupfilter accepts, i.e. days on
# which a region has its full set of location keys.
region_day_groups = data_all.groupby(['region', 'date'])
filtered_data = region_day_groups.filter(groupfilter)

In [164]:
# Sanity check: rows per (region, date) after filtering.
filtered_data.groupby(by = ['region', 'date']).size()

region              date      
East North Central  2020-03-05    5
                    2020-03-06    5
                    2020-03-07    5
                    2020-03-08    5
                    2020-03-09    5
                                 ..
West South Central  2022-06-30    4
                    2022-07-01    4
                    2022-07-02    4
                    2022-07-03    4
                    2022-07-04    4
Length: 7654, dtype: int64

In [165]:
# One group per (region, date); aggregated in the next cell.
grouped = filtered_data.groupby(by = ['region', 'date'])

In [166]:
# Columns added up across the locations of a region.
summed_cols = [
    'new_confirmed', 'new_deceased',
    'cumulative_confirmed', 'cumulative_deceased',
    'new_hospitalized_patients', 'cumulative_hospitalized_patients',
    'current_hospitalized_patients', 'current_intensive_care_patients',
    'new_persons_vaccinated', 'cumulative_persons_vaccinated',
    'new_persons_fully_vaccinated', 'cumulative_persons_fully_vaccinated',
    'new_vaccine_doses_administered',
    'cumulative_vaccine_doses_administered',
    'new_persons_fully_vaccinated_pfizer',
    'cumulative_persons_fully_vaccinated_pfizer',
    'new_vaccine_doses_administered_pfizer',
    'cumulative_vaccine_doses_administered_pfizer',
    'new_persons_fully_vaccinated_moderna',
    'cumulative_persons_fully_vaccinated_moderna',
    'new_vaccine_doses_administered_moderna',
    'cumulative_vaccine_doses_administered_moderna',
    'new_persons_fully_vaccinated_janssen',
    'cumulative_persons_fully_vaccinated_janssen',
    'new_vaccine_doses_administered_janssen',
    'cumulative_vaccine_doses_administered_janssen',
    'rainfall_mm',
]

# Columns averaged across the locations of a region.
averaged_cols = [
    'average_temperature_celsius', 'dew_point',
    'relative_humidity', 'school_closing', 'workplace_closing',
    'cancel_public_events', 'restrictions_on_gatherings',
    'public_transport_closing', 'stay_at_home_requirements',
    'restrictions_on_internal_movement', 'international_travel_controls',
    'income_support', 'debt_relief', 'public_information_campaigns',
    'testing_policy', 'contact_tracing', 'facial_coverings',
    'vaccination_policy', 'stringency_index',
]

# Build the column -> aggregator mapping in the order the output columns
# should appear: sums, then means, then the temperature extremes.
agg_spec = {col: 'sum' for col in summed_cols}
agg_spec.update({col: 'mean' for col in averaged_cols})
agg_spec['minimum_temperature_celsius'] = 'min'   # lowest minimum in the region
agg_spec['maximum_temperature_celsius'] = 'max'   # highest maximum in the region

regional_multi = grouped.agg(agg_spec)

In [167]:
# Save the regional aggregates. The index holds the (region, date) group keys,
# so it is written out with the data (pandas' default, spelled out here).
regional_multi.to_csv('../data/regional_multivariate_data.csv', index=True)

In [168]:
# Save the location-level frame, including its row index (pandas' default,
# spelled out here).
data_xg.to_csv('../data/regional_xgboost_data.csv', index=True)