In [1]:
import os

import pandas as pd
import numpy as np

from dotenv import load_dotenv
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sqlalchemy import create_engine
from sklearn.preprocessing import Normalizer
from sklearn.metrics import mean_absolute_error, median_absolute_error

# Grab covid data

In [2]:
dotenv_path = os.path.join(
    os.path.dirname(os.path.abspath('.')),
    '.env'
)
load_dotenv(dotenv_path, verbose=True)
conn_string = os.getenv('DATABASE_URL')
engine = create_engine(conn_string)

In [3]:
%load_ext sql

%sql $conn_string

### Grab all covid state data

In [4]:
sql = """
SELECT f.date_id, f.location_id, cases, recoveries, deaths, 
    cases_100k, testing_rate, hospitalization_rate,
    date, year, month, day_of_week, day_of_month,
    country, state, city, latitude, longitude, population
FROM covid_facts f JOIN date_dim d ON d.date_id = f.date_id
JOIN location_dim l ON l.location_id = f.location_id
WHERE country = 'US' AND city IS NULL
ORDER BY state
"""

us_df = pd.read_sql(sql, engine)

### Remove rows with null population

In [5]:
us_df = us_df.loc[pd.notnull(us_df.population)]

# Add region information for each state

In [6]:
states_df = pd.read_csv('https://raw.githubusercontent.com/cphalpert/census-regions/master/us%20census%20bureau%20regions%20and%20divisions.csv')
states_df = states_df.rename(columns=lambda col: col.lower())

In [8]:
us2_df = us_df.join(states_df.set_index('state'), on='state').sort_values(['state', 'date'])

In [14]:
us_northeast_df = us2_df[us2_df.region == 'Northeast']
us_south_df = us2_df[us2_df.region == 'South']
us_midwest_df = us2_df[us2_df.region == 'Midwest']
us_west_df = us2_df[us2_df.region == 'West']

In [15]:
us_northeast_df['cases_norm100k'] = us_northeast_df.cases / (us_northeast_df.population / 100_000)
us_northeast_df['recoveries_norm100k'] = us_northeast_df.recoveries / (us_northeast_df.population / 100_000)
us_northeast_df['deaths_norm100k'] = us_northeast_df.deaths / (us_northeast_df.population / 100_000)
us_south_df['cases_norm100k'] = us_south_df.cases / (us_south_df.population / 100_000)
us_south_df['recoveries_norm100k'] = us_south_df.recoveries / (us_south_df.population / 100_000)
us_south_df['deaths_norm100k'] = us_south_df.deaths / (us_south_df.population / 100_000)
us_midwest_df['cases_norm100k'] = us_midwest_df.cases / (us_midwest_df.population / 100_000)
us_midwest_df['recoveries_norm100k'] = us_midwest_df.recoveries / (us_midwest_df.population / 100_000)
us_midwest_df['deaths_norm100k'] = us_midwest_df.deaths / (us_midwest_df.population / 100_000)
us_west_df['cases_norm100k'] = us_west_df.cases / (us_west_df.population / 100_000)
us_west_df['recoveries_norm100k'] = us_west_df.recoveries / (us_west_df.population / 100_000)
us_west_df['deaths_norm100k'] = us_west_df.deaths / (us_west_df.population / 100_000)

In [79]:
us3_df = us_northeast_df.append(us_south_df.append(us_midwest_df).append(us_west_df))
us3_df = us3_df.drop(['year','month', 'country', 'state code', 'cases_100k', 'day_of_week', 'region', 'hospitalization_rate', 'longitude', 'division', 'location_id', 'day_of_month', 'city', 'latitude'], axis = 1)
us3_df.head()

Unnamed: 0,date_id,cases,recoveries,deaths,testing_rate,date,state,population,cases_norm100k,recoveries_norm100k,deaths_norm100k
1933,49,2,0.0,0.0,,2020-03-10,Connecticut,3565287.0,0.056096,0.0,0.0
1858,50,3,0.0,0.0,,2020-03-11,Connecticut,3565287.0,0.084145,0.0,0.0
1843,51,5,0.0,0.0,,2020-03-12,Connecticut,3565287.0,0.140241,0.0,0.0
1912,52,11,0.0,0.0,,2020-03-13,Connecticut,3565287.0,0.308531,0.0,0.0
1993,53,22,0.0,0.0,,2020-03-14,Connecticut,3565287.0,0.617061,0.0,0.0


In [155]:
def sliding_window(df, segment_time_frame, days_out):
    X = pd.DataFrame()
    y = []
    loop_count = 0
    groups = df.groupby('state')
    for name, group in groups:
        print(name)
        data_df = pd.DataFrame()
        for index, row in group.sort_values('date').drop(['state','date'], axis = 1).iterrows():
            loop_count = loop_count + 1
            data_df = data_df.append(row)
            if data_df.shape[0] >= segment_time_frame:
                #Calculate Features
                features = {}
                features['state'] = name
                features['date'] = group.date[index]
                for column in data_df.columns:
                    if column not in ['state', 'date']:
                        features['Max_' + column] = data_df[column].max()
                        features['Min_' + column] = data_df[column].min()
                        features['AVG_' + column] = data_df[column].mean()
                        if segment_time_frame > 1:
                            features[column + '_2'] = data_df[column].values[-1] - data_df[column].values[-2]
                            if segment_time_frame >= 3:
                                features[column + '_3'] = data_df[column].values[-1] - data_df[column].values[-3]
                                if segment_time_frame >= 5:
                                    features[column + '_5'] = data_df[column].values[-1] - data_df[column].values[-5]
                                    if segment_time_frame >= 7:
                                        features[column + '_7'] = data_df[column].values[-1] - data_df[column].values[-7]
                                        if segment_time_frame >= 8:
                                            features[column + '_' + str(segment_time_frame)] = data_df[column].values[-1] - data_df[column].values[-segment_time_frame]
                #Append Features
                X = X.append(features, ignore_index = True)
                data_df = data_df.iloc[1:,:]
                try:
                    y.append(group.sort_values('date').cases[index + days_out])
                except:
                    y.append(-1)
    return X.iloc[:, :], y[:]

In [119]:
X, y = sliding_window(us3_df, 15, 7)
X.head()
y[:5]

Alabama
Alaska
Arizona
Arkansas
California
Colorado
Connecticut
Delaware
District of Columbia
Florida
Georgia
Hawaii
Idaho
Illinois
Indiana
Iowa
Kansas
Kentucky
Louisiana
Maine
Maryland
Massachusetts
Michigan
Minnesota
Mississippi
Missouri
Montana
Nebraska
Nevada
New Hampshire
New Jersey
New Mexico
New York
North Carolina
North Dakota
Ohio
Oklahoma
Oregon
Pennsylvania
Rhode Island
South Carolina
South Dakota
Tennessee
Texas
Utah
Vermont
Virginia
Washington
West Virginia
Wisconsin
Wyoming


[158717, 4041, 230708, 13186, 173002]

In [120]:
for column in X.columns:
    if X[column].isnull().values.all():
        print(column)


In [121]:
#X2 = X2.drop([''], axis = 1)
X2 = X.fillna(X.mean())
X2['Target'] = y
X_val, X2 = X2[X2['Target'] == -1], X2[X2['Target'] != -1]
y_val, X_val, y2, X2 = X_val['Target'], X_val.drop('Target', axis = 1), X2['Target'], X2.drop('Target', axis = 1), 

In [140]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

X_train, X_test, y_train, y_test = train_test_split(X2, y2, test_size = .25)
print([column for column in X_train.columns if column not in ['date', 'state']])
transformed_X_train = pd.DataFrame(Normalizer().fit_transform(X_train[[column for column in X_train.columns if column not in ['date', 'state']]]), columns = [column for column in X_train.columns if column not in ['date', 'state']]) + 1
transformed_X_test = pd.DataFrame(Normalizer().transform(X_test[[column for column in X_test.columns if column not in ['date', 'state']]]), columns = [column for column in X_train.columns if column not in ['date', 'state']]) + 1

selected_col_idx = SelectKBest(chi2, k = 15).fit(transformed_X_train, y_train).get_support(indices = True)
selected_X_train = transformed_X_train.iloc[:, selected_col_idx]
selected_X_test = transformed_X_test.iloc[:, selected_col_idx]

['AVG_cases', 'AVG_cases_norm100k', 'AVG_date_id', 'AVG_deaths', 'AVG_deaths_norm100k', 'AVG_population', 'AVG_recoveries', 'AVG_recoveries_norm100k', 'AVG_testing_rate', 'Max_cases', 'Max_cases_norm100k', 'Max_date_id', 'Max_deaths', 'Max_deaths_norm100k', 'Max_population', 'Max_recoveries', 'Max_recoveries_norm100k', 'Max_testing_rate', 'Min_cases', 'Min_cases_norm100k', 'Min_date_id', 'Min_deaths', 'Min_deaths_norm100k', 'Min_population', 'Min_recoveries', 'Min_recoveries_norm100k', 'Min_testing_rate', 'cases_15', 'cases_2', 'cases_3', 'cases_5', 'cases_7', 'cases_norm100k_15', 'cases_norm100k_2', 'cases_norm100k_3', 'cases_norm100k_5', 'cases_norm100k_7', 'date_id_15', 'date_id_2', 'date_id_3', 'date_id_5', 'date_id_7', 'deaths_15', 'deaths_2', 'deaths_3', 'deaths_5', 'deaths_7', 'deaths_norm100k_15', 'deaths_norm100k_2', 'deaths_norm100k_3', 'deaths_norm100k_5', 'deaths_norm100k_7', 'population_15', 'population_2', 'population_3', 'population_5', 'population_7', 'recoveries_15', '

In [143]:
new_selected_X_train = selected_X_train.drop(['testing_rate_3', 'testing_rate_5', 'testing_rate_7', 'Min_testing_rate', 'Max_cases', 'Min_cases', 'Max_recoveries', 'Min_recoveries', 'AVG_testing_rate', 'Max_testing_rate'], axis = 1)
new_selected_X_test = selected_X_test.drop(['testing_rate_3', 'testing_rate_5', 'testing_rate_7', 'Min_testing_rate', 'Max_cases', 'Min_cases', 'Max_recoveries', 'Min_recoveries', 'AVG_testing_rate', 'Max_testing_rate'], axis = 1)
corr_matrix = new_selected_X_train.corr('spearman')
for row_idx, row in corr_matrix.iterrows():
    for col_idx, cor in row.iteritems():
        if col_idx != row_idx and cor > .8:
            print(str(row_idx) + ' ' + str(col_idx) + ' = ' + str(corr_matrix.loc[row_idx, col_idx]))
print(new_selected_X_train.columns)

Index(['AVG_cases', 'AVG_recoveries', 'cases_15', 'recoveries_15',
       'testing_rate_15'],
      dtype='object')


In [144]:
reg = linear_model.LinearRegression().fit(new_selected_X_train, y_train)
coef = {}
for idx, name in enumerate(new_selected_X_train.columns):
    coef[name] = reg.coef_[idx]
print(coef)

{'AVG_cases': 7374229.928063626, 'AVG_recoveries': -6648077.405275628, 'cases_15': -9888702.410170933, 'recoveries_15': 2084840.7868394887, 'testing_rate_15': -9020275.513910199}


In [145]:
y_pred = reg.predict(new_selected_X_test)
print("The Explained Variance: %.2f" % reg.score(new_selected_X_test, y_test))
print("The Mean Absolute Error: %.2f cases" % mean_absolute_error(y_test, y_pred))
print("The Median Absolute Error: %.2f cases" % median_absolute_error(y_test, y_pred))

The Explained Variance: 0.07
The Mean Absolute Error: 100090.61 cases
The Median Absolute Error: 71785.00 cases


In [146]:
X3 = new_selected_X_train.append(new_selected_X_test)
y3 = list(y_train) + list(y_test)

In [147]:
reg2 = linear_model.LinearRegression().fit(X3, y3)
coef = {}
for idx, name in enumerate(X3.columns):
    coef[name] = reg.coef_[idx]
print(coef)

{'AVG_cases': 7374229.928063626, 'AVG_recoveries': -6648077.405275628, 'cases_15': -9888702.410170933, 'recoveries_15': 2084840.7868394887, 'testing_rate_15': -9020275.513910199}


In [154]:
final_df = X3.copy()
final_df['Target'] = y3
final_df['state'] = list(X_train['state'].values) + list(X_test['state'].values)
final_df['date'] = list(X_train['date'].values) + list(X_test['date'].values)
final_df = final_df.sort_values(by = ['state', 'date'])
final_df.head(22)

Unnamed: 0,AVG_cases,AVG_recoveries,cases_15,recoveries_15,testing_rate_15,Target,state,date
4755,1.000199,1.0,1.000536,1.000834,1.000464,158717,Alabama,2020-04-17
5474,1.000236,1.0,1.000563,1.000834,1.000464,4041,Alabama,2020-04-18
160,1.000275,1.0,1.000585,1.000834,1.000464,230708,Alabama,2020-04-19
3212,1.000316,1.0,1.000605,1.000834,1.000464,13186,Alabama,2020-04-20
6317,1.000358,1.0,1.000632,1.000834,1.000464,173002,Alabama,2020-04-21
1636,1.000402,1.0,1.000665,1.000834,1.000464,8203,Alabama,2020-04-22
8102,1.000448,1.0,1.000685,1.000834,1.000464,190113,Alabama,2020-04-23
2113,1.000496,1.0,1.000708,1.000834,1.000464,138619,Alabama,2020-04-24
7380,1.000544,1.0,1.000723,1.000834,1.000464,7700,Alabama,2020-04-25
2005,1.000594,1.004744,1.000331,1.000834,1.000131,168620,Alabama,2020-04-26
