In [1]:
import os

import pandas as pd
import numpy as np

from dotenv import load_dotenv
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sqlalchemy import create_engine
from sklearn.preprocessing import Normalizer
from sklearn.metrics import mean_absolute_error, median_absolute_error

In [2]:
dotenv_path = os.path.join(
    os.path.dirname(os.path.abspath('.')),
    '.env'
)
load_dotenv(dotenv_path, verbose=True)
conn_string = os.getenv('DATABASE_URL')
engine = create_engine(conn_string)

In [3]:
%load_ext sql

%sql $conn_string

In [4]:
sql = """
SELECT f.date_id, f.location_id, cases, recoveries, deaths, 
    cases_100k, testing_rate, hospitalization_rate,
    date, year, month, day_of_week, day_of_month,
    country, state, city, latitude, longitude, population
FROM covid_facts f JOIN date_dim d ON d.date_id = f.date_id
JOIN location_dim l ON l.location_id = f.location_id
WHERE country = 'US' AND city IS NULL
ORDER BY state
"""

us_df = pd.read_sql(sql, engine)

In [5]:
print(us_df.shape)
us_df.head()

(15161, 19)


Unnamed: 0,date_id,location_id,cases,recoveries,deaths,cases_100k,testing_rate,hospitalization_rate,date,year,month,day_of_week,day_of_month,country,state,city,latitude,longitude,population
0,314,84000001,249524,161946.0,3578.0,5089.018668,32312.609049,,2020-11-30,2020,11,0,30,US,Alabama,,32.3182,-86.9023,4903185.0
1,156,84000001,33880,18866.0,896.0,677.233268,7535.061394,7.866048,2020-06-25,2020,6,3,25,US,Alabama,,32.3182,-86.9023,4903185.0
2,87,84000001,4557,,148.0,97.485613,807.183438,12.994968,2020-04-17,2020,4,4,17,US,Alabama,,32.3182,-86.9023,4903185.0
3,206,84000001,106096,41523.0,1893.0,2168.162123,16825.471607,11.716788,2020-08-14,2020,8,4,14,US,Alabama,,32.3182,-86.9023,4903185.0
4,270,84000001,172137,74238.0,2788.0,3501.030453,25469.057358,,2020-10-17,2020,10,5,17,US,Alabama,,32.3182,-86.9023,4903185.0


In [6]:
us_df = us_df.loc[pd.notnull(us_df.population)]
us_df.describe()

Unnamed: 0,date_id,location_id,cases,recoveries,deaths,cases_100k,testing_rate,hospitalization_rate,year,month,day_of_week,day_of_month,latitude,longitude,population
count,14347.0,14347.0,14347.0,12042.0,14318.0,14347.0,13608.0,5129.0,14347.0,14347.0,14347.0,14347.0,14347.0,14347.0,14347.0
mean,195.34272,76289190.0,93239.85,39686.85,2761.020813,1436.403103,22595.625605,12.143891,2020.0,7.592319,2.988151,15.803513,36.800569,-84.994608,6023471.0
std,75.854579,24254710.0,162524.4,87585.81,5215.40378,1490.061215,22908.35286,5.245396,0.0,2.508962,1.994206,8.672605,10.796163,49.68496,8981471.0
min,1.0,16.0,0.0,0.0,0.0,0.0,5.391708,1.41844,2020.0,1.0,0.0,1.0,-14.271,-170.132,55144.0
25%,132.0,84000010.0,5501.5,1599.0,122.0,269.530036,5834.146651,8.359942,2020.0,6.0,1.0,8.0,33.8569,-105.3111,1344212.0
50%,196.0,84000030.0,31513.0,8497.0,742.0,1002.899136,16100.979569,11.282093,2020.0,8.0,3.0,16.0,39.0598,-86.9023,3565287.0
75%,260.0,84000040.0,112121.0,42249.5,3010.75,2122.452916,30515.969012,15.268243,2020.0,10.0,5.0,23.0,42.2302,-76.8021,7278717.0
max,324.0,84000060.0,1482551.0,1074579.0,35266.0,11710.0,159983.42397,38.50119,2020.0,12.0,6.0,31.0,61.3707,145.6739,329466300.0


In [7]:
states_df = pd.read_csv('https://raw.githubusercontent.com/cphalpert/census-regions/master/us%20census%20bureau%20regions%20and%20divisions.csv')
print(states_df.shape)
states_df.head()

(51, 4)


Unnamed: 0,State,State Code,Region,Division
0,Alaska,AK,West,Pacific
1,Alabama,AL,South,East South Central
2,Arkansas,AR,South,West South Central
3,Arizona,AZ,West,Mountain
4,California,CA,West,Pacific


In [8]:
states_df = states_df.rename(columns=lambda col: col.lower())
print(states_df.shape)
states_df.head()

(51, 4)


Unnamed: 0,state,state code,region,division
0,Alaska,AK,West,Pacific
1,Alabama,AL,South,East South Central
2,Arkansas,AR,South,West South Central
3,Arizona,AZ,West,Mountain
4,California,CA,West,Pacific


In [9]:
states_df.region.unique()

array(['West', 'South', 'Northeast', 'Midwest'], dtype=object)

In [10]:
print(us_df.shape)
us2_df = us_df.join(states_df.set_index('state'), on='state').sort_values(['state', 'date'])
print(us2_df.shape)
us2_df.head()

(14347, 19)
(14347, 22)


Unnamed: 0,date_id,location_id,cases,recoveries,deaths,cases_100k,testing_rate,hospitalization_rate,date,year,...,day_of_month,country,state,city,latitude,longitude,population,state code,region,division
21,52,84000001,5,0.0,0.0,5.0,,,2020-03-13,2020,...,13,US,Alabama,,32.3182,-86.9023,4903185.0,AL,South,East South Central
155,53,84000001,6,0.0,0.0,6.0,,,2020-03-14,2020,...,14,US,Alabama,,32.3182,-86.9023,4903185.0,AL,South,East South Central
152,54,84000001,12,0.0,0.0,12.0,,,2020-03-15,2020,...,15,US,Alabama,,32.3182,-86.9023,4903185.0,AL,South,East South Central
221,55,84000001,29,0.0,0.0,29.0,,,2020-03-16,2020,...,16,US,Alabama,,32.3182,-86.9023,4903185.0,AL,South,East South Central
154,56,84000001,39,0.0,0.0,39.0,,,2020-03-17,2020,...,17,US,Alabama,,32.3182,-86.9023,4903185.0,AL,South,East South Central


In [11]:
us2_df['cases_norm100k'] = us2_df.cases / (us2_df.population / 100_000)
print(us2_df.shape)
us2_df.head()

(14347, 23)


Unnamed: 0,date_id,location_id,cases,recoveries,deaths,cases_100k,testing_rate,hospitalization_rate,date,year,...,country,state,city,latitude,longitude,population,state code,region,division,cases_norm100k
21,52,84000001,5,0.0,0.0,5.0,,,2020-03-13,2020,...,US,Alabama,,32.3182,-86.9023,4903185.0,AL,South,East South Central,0.101975
155,53,84000001,6,0.0,0.0,6.0,,,2020-03-14,2020,...,US,Alabama,,32.3182,-86.9023,4903185.0,AL,South,East South Central,0.122369
152,54,84000001,12,0.0,0.0,12.0,,,2020-03-15,2020,...,US,Alabama,,32.3182,-86.9023,4903185.0,AL,South,East South Central,0.244739
221,55,84000001,29,0.0,0.0,29.0,,,2020-03-16,2020,...,US,Alabama,,32.3182,-86.9023,4903185.0,AL,South,East South Central,0.591452
154,56,84000001,39,0.0,0.0,39.0,,,2020-03-17,2020,...,US,Alabama,,32.3182,-86.9023,4903185.0,AL,South,East South Central,0.795401


In [12]:
us2_df['recoveries_norm100k'] = us2_df.recoveries / (us2_df.population / 100_000)
print(us2_df.shape)
us2_df.head

(14347, 24)


<bound method NDFrame.head of        date_id  location_id  cases  recoveries  deaths   cases_100k  \
21          52     84000001      5         0.0     0.0     5.000000   
155         53     84000001      6         0.0     0.0     6.000000   
152         54     84000001     12         0.0     0.0    12.000000   
221         55     84000001     29         0.0     0.0    29.000000   
154         56     84000001     39         0.0     0.0    39.000000   
...        ...          ...    ...         ...     ...          ...   
14998      324     84000056  38223     33891.0   299.0  6604.303346   
15158       57          840      1       106.0     0.0     1.000000   
15159       58          840      1       108.0     0.0     1.000000   
15160       59          840      1       147.0     0.0     1.000000   
15157       60          840      1       171.0     0.0     1.000000   

       testing_rate  hospitalization_rate        date  year  ...    state  \
21              NaN                   Na

In [13]:
us2_df['deaths_norm100k'] = us2_df.deaths / (us2_df.population / 100_000)
print(us2_df.shape)
us2_df.head

(14347, 25)


<bound method NDFrame.head of        date_id  location_id  cases  recoveries  deaths   cases_100k  \
21          52     84000001      5         0.0     0.0     5.000000   
155         53     84000001      6         0.0     0.0     6.000000   
152         54     84000001     12         0.0     0.0    12.000000   
221         55     84000001     29         0.0     0.0    29.000000   
154         56     84000001     39         0.0     0.0    39.000000   
...        ...          ...    ...         ...     ...          ...   
14998      324     84000056  38223     33891.0   299.0  6604.303346   
15158       57          840      1       106.0     0.0     1.000000   
15159       58          840      1       108.0     0.0     1.000000   
15160       59          840      1       147.0     0.0     1.000000   
15157       60          840      1       171.0     0.0     1.000000   

       testing_rate  hospitalization_rate        date  year  ...  city  \
21              NaN                   NaN  

In [14]:
us3_df = pd.get_dummies(us2_df, columns = ['region'])
print(us2_df.shape)
print(us3_df.shape)
us3_df = us3_df.drop(['year','month','state code', 'day_of_week', 'longitude', 'division', 'location_id', 'day_of_month', 'city', 'latitude'], axis = 1)
us3_df.columns


(14347, 25)
(14347, 28)


Index(['date_id', 'cases', 'recoveries', 'deaths', 'cases_100k',
       'testing_rate', 'hospitalization_rate', 'date', 'country', 'state',
       'population', 'cases_norm100k', 'recoveries_norm100k',
       'deaths_norm100k', 'region_Midwest', 'region_Northeast', 'region_South',
       'region_West'],
      dtype='object')

# Segment Data

In [15]:
def flatten_df(df, group_fields):
    grouped = df.groupby(group_fields)
    flattened_df = pd.DataFrame()
    for name, group in grouped:
        row = {}
        row['cases'] = group.cases.sum()
        #row['recoveries'] = group.recoveries.sum()
        #row['deaths'] = group.deaths.sum()
        #row['cases_100k'] = group.cases_100k.sum()
        row['testing_rate'] = group.testing_rate.mean()
        #row['hospitalization_rate'] = group.hospitalization_rate.mean()
        #row['date'] = group.date.values[0]
        #row['population'] = group.population.sum()
        row['cases_norm100k'] = group.cases_norm100k.sum()
        row['recoveries_norm100k'] = group.recoveries_norm100k.sum()
        row['deaths_norm100k'] = group.deaths_norm100k.sum()
        for state in group.state.values:
            state_data =  group[group.state == state]
            row[state+'_cases_norm100k'] = state_data.cases_norm100k.values[0]
            row[state+'_recoveries_norm100k'] = state_data.recoveries_norm100k.values[0]
            row[state+'_deaths_norm100k'] = state_data.deaths_norm100k.values[0]
            row[state+'_testing_rate'] = state_data.testing_rate.values[0]
            #row[state+'_hospitalization_rate'] = state_data.cases_norm100k.values[0]
        flattened_df = flattened_df.append(row, ignore_index = True)
    return flattened_df

In [16]:
def sliding_window(df, segment_time_frame, days_out):
    X = pd.DataFrame()
    y = []
    data_df = pd.DataFrame()
    loop_count = 0
    for index, row in df.iterrows():
        loop_count = loop_count + 1
        data_df = data_df.append(row)
        if data_df.shape[0] >= segment_time_frame:
            #Calculate Features
            features = {}
            for column in data_df.columns:
                if column != 'cases':
                    features['Max_' + column] = data_df[column].max()
                    features['Min_' + column] = data_df[column].min()
                    features['AVG_' + column] = data_df[column].mean()
                    if segment_time_frame > 1:
                        features[column + '_2'] = data_df[column].values[-1] - data_df[column].values[-2]
                        if segment_time_frame >= 3:
                            features[column + '_3'] = data_df[column].values[-1] - data_df[column].values[-3]
                            if segment_time_frame >= 5:
                                features[column + '_5'] = data_df[column].values[-1] - data_df[column].values[-5]
                                if segment_time_frame >= 7:
                                    features[column + '_7'] = data_df[column].values[-1] - data_df[column].values[-7]
                                    if segment_time_frame >= 8:
                                        features[column + '_' + str(segment_time_frame)] = data_df[column].values[-1] - data_df[column].values[-segment_time_frame]
            #Append Features
            X = X.append(features, ignore_index = True)
            data_df = data_df.iloc[1:,:]
            try:
                y.append(df.cases[index + days_out])
            except:
                y.append(-1)
                break
    return X.iloc[:-1, :], y[:-1]

In [17]:
us_northeast_df = us3_df[us3_df.region_Northeast == 1].drop(['region_Midwest', 'region_Northeast',  'region_South', 'region_West'], axis = 1)
us_south_df = us3_df[us3_df.region_South== 1].drop(['region_Midwest', 'region_Northeast',  'region_South', 'region_West'], axis = 1)
us_midwest_df = us3_df[us3_df.region_Midwest == 1].drop(['region_Midwest', 'region_Northeast',  'region_South', 'region_West'], axis = 1)
us_west_df = us3_df[us3_df.region_West == 1].drop(['region_Midwest', 'region_Northeast',  'region_South', 'region_West'], axis = 1)

# Northeast Model

In [18]:
northeast_flattened_df = flatten_df(us_northeast_df, ['date'])
print(northeast_flattened_df.columns)
print(northeast_flattened_df.shape)
northeast_flattened_df.head()


Index(['Connecticut_cases_norm100k', 'Connecticut_deaths_norm100k',
       'Connecticut_recoveries_norm100k', 'Connecticut_testing_rate',
       'Maine_cases_norm100k', 'Maine_deaths_norm100k',
       'Maine_recoveries_norm100k', 'Maine_testing_rate',
       'Massachusetts_cases_norm100k', 'Massachusetts_deaths_norm100k',
       'Massachusetts_recoveries_norm100k', 'Massachusetts_testing_rate',
       'New Hampshire_cases_norm100k', 'New Hampshire_deaths_norm100k',
       'New Hampshire_recoveries_norm100k', 'New Hampshire_testing_rate',
       'New Jersey_cases_norm100k', 'New Jersey_deaths_norm100k',
       'New Jersey_recoveries_norm100k', 'New Jersey_testing_rate',
       'New York_cases_norm100k', 'New York_deaths_norm100k',
       'New York_recoveries_norm100k', 'New York_testing_rate',
       'Pennsylvania_cases_norm100k', 'Pennsylvania_deaths_norm100k',
       'Pennsylvania_recoveries_norm100k', 'Pennsylvania_testing_rate',
       'Rhode Island_cases_norm100k', 'Rhode Island_de

Unnamed: 0,Connecticut_cases_norm100k,Connecticut_deaths_norm100k,Connecticut_recoveries_norm100k,Connecticut_testing_rate,Maine_cases_norm100k,Maine_deaths_norm100k,Maine_recoveries_norm100k,Maine_testing_rate,Massachusetts_cases_norm100k,Massachusetts_deaths_norm100k,...,Rhode Island_testing_rate,Vermont_cases_norm100k,Vermont_deaths_norm100k,Vermont_recoveries_norm100k,Vermont_testing_rate,cases,cases_norm100k,deaths_norm100k,recoveries_norm100k,testing_rate
0,0.056096,0.0,0.0,,0.0,0.0,0.0,,1.334784,0.0,...,,0.160259,0.0,0.0,,302.0,3.280419,0.011258,0.014509,
1,0.084145,0.0,0.0,,0.0,0.0,0.0,,1.378309,0.0,...,,0.160259,0.0,0.0,,368.0,3.977245,0.011258,0.014509,
2,0.140241,0.0,0.0,,0.0,0.0,0.0,,1.56692,0.0,...,,0.320518,0.0,0.0,,505.0,5.125343,0.011258,0.014509,
3,0.308531,0.0,0.0,,0.074393,0.0,0.0,,1.784548,0.0,...,,0.320518,0.0,0.0,,648.0,7.061698,0.011258,0.014509,
4,0.617061,0.0,0.0,,0.223179,0.0,0.0,,2.002175,0.0,...,,0.801296,0.0,0.0,,836.0,9.889158,0.021539,0.014509,


In [19]:
northeast_X, northeast_y = sliding_window(northeast_flattened_df, 15, 7)
print(len(northeast_y))
print(northeast_X.shape)
print(northeast_y[:5])
northeast_X = northeast_X.fillna(northeast_X.mean())
print(northeast_X.shape)
northeast_X.head()

234
(234, 320)
[456252.0, 469860.0, 480675.0, 499110.0, 517459.0]
(234, 320)


Unnamed: 0,AVG_Connecticut_cases_norm100k,AVG_Connecticut_deaths_norm100k,AVG_Connecticut_recoveries_norm100k,AVG_Connecticut_testing_rate,AVG_Maine_cases_norm100k,AVG_Maine_deaths_norm100k,AVG_Maine_recoveries_norm100k,AVG_Maine_testing_rate,AVG_Massachusetts_cases_norm100k,AVG_Massachusetts_deaths_norm100k,...,recoveries_norm100k_15,recoveries_norm100k_2,recoveries_norm100k_3,recoveries_norm100k_5,recoveries_norm100k_7,testing_rate_15,testing_rate_2,testing_rate_3,testing_rate_5,testing_rate_7
0,75.141216,3.433104,0.0,1228.232117,11.659867,0.287653,4.121374,890.654165,80.401367,2.365856,...,177.958249,2.148918,12.329957,177.972757,177.972757,5070.487311,73.220709,178.279575,1460.450058,2184.18819
1,102.727588,5.056161,0.0,1272.779723,15.478709,0.406682,5.634032,984.130405,109.250104,3.437551,...,190.571668,12.613419,14.762337,190.586177,190.586177,5070.487311,117.583316,190.804025,1460.450058,2184.18819
2,132.423187,6.871817,0.0,1316.219424,19.426499,0.540589,7.285557,1040.659077,140.28479,4.509247,...,193.976224,3.404556,16.017976,28.347933,193.990733,5070.487311,83.563255,201.146571,379.426146,2184.18819
3,163.844687,8.809015,0.0,1356.118035,23.528035,0.684416,9.031314,1078.784947,173.455129,5.713454,...,203.330297,9.354073,12.758629,27.520966,203.344805,5070.487311,76.633068,160.196323,351.000349,2184.18819
4,196.640551,10.839707,0.0,1384.617043,27.723802,0.843121,10.925856,1106.261078,208.516413,7.071451,...,208.5005,5.170203,14.524276,30.542252,42.872209,5070.487311,86.756894,163.389963,364.536534,542.816109


from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

northeast_X_train, northeast_X_test, northeast_y_train, northeast_y_test = train_test_split(northeast_X, northeast_y, test_size = .25)
northeast_transformed_X_train = pd.DataFrame(Normalizer().fit_transform(northeast_X_train), columns = northeast_X_train.columns) + 1
northeast_transformed_X_test = pd.DataFrame(Normalizer().transform(northeast_X_test), columns = northeast_X_test.columns) + 1

selected_col_idx = SelectKBest(chi2, k = 15).fit(northeast_transformed_X_train, northeast_y_train).get_support(indices = True)
northeast_selected_X_train = northeast_transformed_X_train.iloc[:, selected_col_idx]
northeast_selected_X_test = northeast_transformed_X_test.iloc[:, selected_col_idx]

northeast_selected_X_train = northeast_selected_X_train.drop(['New York_testing_rate_15', 'Rhode Island_testing_rate_15', 'Max_cases_norm100k', 'AVG_cases_norm100k', 'Max_Massachusetts_testing_rate'], axis = 1)
northeast_selected_X_test = northeast_selected_X_test.drop(['New York_testing_rate_15', 'Rhode Island_testing_rate_15', 'Max_cases_norm100k', 'AVG_cases_norm100k', 'Max_Massachusetts_testing_rate'], axis = 1)
corr_matrix = northeast_selected_X_train.corr('spearman')
for row_idx, row in corr_matrix.iterrows():
    for col_idx, cor in row.iteritems():
        if col_idx != row_idx and cor > .8:
            print(str(row_idx) + ' ' + str(col_idx) + ' = ' + str(corr_matrix.loc[row_idx, col_idx]))
print(northeast_selected_X_test.columns)

In [20]:
CHOSEN_COLUMNS = ['AVG_Massachusetts_testing_rate', 'AVG_Rhode Island_testing_rate', 'Connecticut_testing_rate_15', 'Massachusetts_testing_rate_15', 'Max_Rhode Island_testing_rate', 'Min_Rhode Island_testing_rate', 'Min_cases_norm100k', 'Vermont_testing_rate_15', 'cases_norm100k_15', 'testing_rate_15']
northeast_X_train, northeast_X_test, northeast_y_train, northeast_y_test = train_test_split(northeast_X, northeast_y, test_size = .25)
northeast_transformed_X_train = pd.DataFrame(Normalizer().fit_transform(northeast_X_train), columns = northeast_X_train.columns)
northeast_transformed_X_test = pd.DataFrame(Normalizer().transform(northeast_X_test), columns = northeast_X_test.columns)
northeast_selected_X_train = northeast_transformed_X_train[CHOSEN_COLUMNS]
northeast_selected_X_test = northeast_transformed_X_test[CHOSEN_COLUMNS]

In [25]:
reg = linear_model.LinearRegression().fit(northeast_selected_X_train, northeast_y_train)
coef = {}
for idx, name in enumerate(northeast_selected_X_train.columns):
    coef[name] = reg.coef_[idx]
print(coef)

{'AVG_Massachusetts_testing_rate': 4107713.7155113053, 'AVG_Rhode Island_testing_rate': 107860.42071430851, 'Connecticut_testing_rate_15': 949679.0942750452, 'Massachusetts_testing_rate_15': -2758473.786415335, 'Max_Rhode Island_testing_rate': 17526.841207527264, 'Min_Rhode Island_testing_rate': 553227.6404521097, 'Min_cases_norm100k': -3124896.420579755, 'Vermont_testing_rate_15': 549188.9129537786, 'cases_norm100k_15': 3179712.8092456074, 'testing_rate_15': -2388055.5232510287}


In [26]:
y_pred = reg.predict(northeast_selected_X_test)
print("The Explained Variance: %.2f" % reg.score(northeast_selected_X_test, northeast_y_test))
print("The Mean Absolute Error: %.2f cases" % mean_absolute_error(northeast_y_test, y_pred))
print("The Median Absolute Error: %.2f cases" % median_absolute_error(northeast_y_test, y_pred))

The Explained Variance: 0.77
The Mean Absolute Error: 106712.21 cases
The Median Absolute Error: 58984.86 cases


# South Model

In [27]:
south_flattened_df = flatten_df(us_south_df, ['date'])
print(south_flattened_df.columns)
print(south_flattened_df.shape)
south_flattened_df.head()

Index(['Arkansas_cases_norm100k', 'Arkansas_deaths_norm100k',
       'Arkansas_recoveries_norm100k', 'Arkansas_testing_rate',
       'Delaware_cases_norm100k', 'Delaware_deaths_norm100k',
       'Delaware_recoveries_norm100k', 'Delaware_testing_rate',
       'District of Columbia_cases_norm100k',
       'District of Columbia_deaths_norm100k',
       'District of Columbia_recoveries_norm100k',
       'District of Columbia_testing_rate', 'Florida_cases_norm100k',
       'Florida_deaths_norm100k', 'Florida_recoveries_norm100k',
       'Florida_testing_rate', 'Georgia_cases_norm100k',
       'Georgia_deaths_norm100k', 'Georgia_recoveries_norm100k',
       'Georgia_testing_rate', 'Kentucky_cases_norm100k',
       'Kentucky_deaths_norm100k', 'Kentucky_recoveries_norm100k',
       'Kentucky_testing_rate', 'Louisiana_cases_norm100k',
       'Louisiana_deaths_norm100k', 'Louisiana_recoveries_norm100k',
       'Louisiana_testing_rate', 'Maryland_cases_norm100k',
       'Maryland_deaths_norm100k'

Unnamed: 0,Arkansas_cases_norm100k,Arkansas_deaths_norm100k,Arkansas_recoveries_norm100k,Arkansas_testing_rate,Delaware_cases_norm100k,Delaware_deaths_norm100k,Delaware_recoveries_norm100k,Delaware_testing_rate,District of Columbia_cases_norm100k,District of Columbia_deaths_norm100k,...,West Virginia_testing_rate,cases,cases_norm100k,deaths_norm100k,recoveries_norm100k,testing_rate,Alabama_cases_norm100k,Alabama_deaths_norm100k,Alabama_recoveries_norm100k,Alabama_testing_rate
0,0.0,0.0,0.0,,0.0,0.0,0.0,,0.708467,0.0,...,,95.0,1.709144,0.009312,0.0,,,,,
1,0.033137,0.0,0.0,,0.102694,0.0,0.0,,1.416934,0.0,...,,144.0,2.977917,0.009312,0.0,,,,,
2,0.19882,0.0,0.0,,0.102694,0.0,0.0,,1.416934,0.0,...,,216.0,4.020499,0.01873,0.0,,,,,
3,0.19882,0.0,0.0,,0.410777,0.0,0.0,,1.416934,0.0,...,,317.0,5.521583,0.01873,0.0,,0.101975,0.0,0.0,
4,0.39764,0.0,0.0,,0.616166,0.0,0.0,,1.416934,0.0,...,,476.0,7.974398,0.056613,0.0,,0.122369,0.0,0.0,


In [28]:
south_X, south_y = sliding_window(south_flattened_df, 15, 7)
print(len(south_y))
print(south_X.shape)
print(south_y[:5])
south_X = south_X.fillna(south_X.mean())
print(south_X.shape)
south_X.head()

234
(234, 576)
[161140.0, 167301.0, 175455.0, 182320.0, 188388.0]
(234, 576)


Unnamed: 0,AVG_Alabama_cases_norm100k,AVG_Alabama_deaths_norm100k,AVG_Alabama_recoveries_norm100k,AVG_Alabama_testing_rate,AVG_Arkansas_cases_norm100k,AVG_Arkansas_deaths_norm100k,AVG_Arkansas_recoveries_norm100k,AVG_Arkansas_testing_rate,AVG_Delaware_cases_norm100k,AVG_Delaware_deaths_norm100k,...,recoveries_norm100k_15,recoveries_norm100k_2,recoveries_norm100k_3,recoveries_norm100k_5,recoveries_norm100k_7,testing_rate_15,testing_rate_2,testing_rate_3,testing_rate_5,testing_rate_7
0,20.406804,0.52007,0.0,596.316504,10.130987,0.194402,2.617798,793.824604,37.545031,0.814708,...,221.279697,32.153374,46.260321,221.279697,221.279697,3377.430112,69.766205,133.470299,967.718875,1451.775542
1,25.594039,0.665188,0.0,628.92723,13.597084,0.267303,3.698053,806.200554,51.333451,1.129637,...,244.093191,22.813495,54.966869,244.093191,244.093191,3377.430112,31.863555,101.62976,967.718875,1451.775542
2,30.270412,0.811426,0.0,658.36379,17.173636,0.34904,4.908647,820.122772,65.498416,1.506183,...,275.058888,30.965697,53.779191,100.039512,275.058888,3377.430112,40.528768,72.392323,205.862622,1451.775542
3,34.448357,0.958561,0.0,683.167065,20.904826,0.430777,6.21865,834.676502,81.354414,1.923806,...,290.943911,15.885023,46.850719,101.817588,290.943911,3377.430112,42.904183,83.43295,185.062711,1451.775542
4,40.951613,1.166589,0.0,715.172764,24.744262,0.514723,7.771656,848.642035,98.702903,2.382507,...,369.522208,78.578298,94.46332,148.242511,194.502832,3377.430112,52.354426,95.258609,167.650932,301.12123


from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
for column in south_X.columns:
    if south_X[column].isnull().values.all():
        print(column)
south_X = south_X.drop(['Florida_recoveries_norm100k_15', 'Florida_recoveries_norm100k_7', 'Georgia_recoveries_norm100k_15', 'Georgia_recoveries_norm100k_7'], axis = 1)
print(south_X.isnull().values.any())
south_X_train, south_X_test, south_y_train, south_y_test = train_test_split(south_X, south_y, test_size = .25)
south_transformed_X_train = pd.DataFrame(Normalizer().fit_transform(south_X_train), columns = south_X_train.columns) + 1
south_transformed_X_test = pd.DataFrame(Normalizer().transform(south_X_test), columns = south_X_test.columns) + 1

selected_col_idx = SelectKBest(chi2, k = 15).fit(south_transformed_X_train, south_y_train).get_support(indices = True)
south_selected_X_train = south_transformed_X_train.iloc[:, selected_col_idx]
south_selected_X_test = south_transformed_X_test.iloc[:, selected_col_idx]

south_new_selected_X_train = south_selected_X_train.drop(['Louisiana_testing_rate_15', 'Tennessee_testing_rate_15', 'Oklahoma_testing_rate_15', 'testing_rate_15', 'Arkansas_testing_rate_15', 'Delaware_testing_rate_15', 'District of Columbia_testing_rate_15'], axis = 1)
#northeast_selected_X_test = northeast_selected_X_test.drop(['New York_testing_rate_15', 'Rhode Island_testing_rate_15', 'Max_cases_norm100k', 'AVG_cases_norm100k', 'Max_Massachusetts_testing_rate'], axis = 1)
corr_matrix = south_new_selected_X_train.corr('spearman')
for row_idx, row in corr_matrix.iterrows():
    for col_idx, cor in row.iteritems():
        if col_idx != row_idx and cor > .8:
            print(str(row_idx) + ' ' + str(col_idx) + ' = ' + str(corr_matrix.loc[row_idx, col_idx]))
print(south_new_selected_X_train.columns)

In [29]:
CHOSEN_COLUMNS = ['Florida_testing_rate_15', 'Kentucky_testing_rate_15', 'Maryland_testing_rate_15', 'Min_District of Columbia_testing_rate', 'Min_cases_norm100k', 'North Carolina_testing_rate_15', 'West Virginia_testing_rate_15', 'cases_norm100k_15']
south_X_train, south_X_test, south_y_train, south_y_test = train_test_split(south_X, south_y, test_size = .25)
south_selected_X_train = south_X_train[CHOSEN_COLUMNS]
south_selected_X_test = south_X_test[CHOSEN_COLUMNS]
south_transformed_X_train = pd.DataFrame(Normalizer().fit_transform(south_selected_X_train), columns = south_selected_X_train.columns)
south_transformed_X_test = pd.DataFrame(Normalizer().transform(south_selected_X_test), columns = south_selected_X_test.columns)

In [30]:
reg = linear_model.LinearRegression().fit(south_transformed_X_train, south_y_train)
coef = {}
for idx, name in enumerate(south_transformed_X_train.columns):
    coef[name] = reg.coef_[idx]
print(coef)

{'Florida_testing_rate_15': 1758818.5367659072, 'Kentucky_testing_rate_15': 4278223.350714569, 'Maryland_testing_rate_15': 1539257.0196117922, 'Min_District of Columbia_testing_rate': 16844518.012136456, 'Min_cases_norm100k': -7494493.273423731, 'North Carolina_testing_rate_15': -7081328.155664981, 'West Virginia_testing_rate_15': 1626994.5245874093, 'cases_norm100k_15': 2926973.9170898455}


In [31]:
y_pred = reg.predict(south_transformed_X_test)
print("The Explained Variance: %.2f" % reg.score(south_transformed_X_test, south_y_test))
print("The Mean Absolute Error: %.2f cases" % mean_absolute_error(south_y_test, y_pred))
print("The Median Absolute Error: %.2f cases" % median_absolute_error(south_y_test, y_pred))

The Explained Variance: 0.94
The Mean Absolute Error: 260359.23 cases
The Median Absolute Error: 164525.35 cases


# Midwest Model

In [32]:
midwest_flattened_df = flatten_df(us_midwest_df, ['date'])
print(midwest_flattened_df.columns)
print(midwest_flattened_df.shape)
midwest_flattened_df.head()

Index(['Illinois_cases_norm100k', 'Illinois_deaths_norm100k',
       'Illinois_recoveries_norm100k', 'Illinois_testing_rate', 'cases',
       'cases_norm100k', 'deaths_norm100k', 'recoveries_norm100k',
       'testing_rate', 'Indiana_cases_norm100k', 'Indiana_deaths_norm100k',
       'Indiana_recoveries_norm100k', 'Indiana_testing_rate',
       'Iowa_cases_norm100k', 'Iowa_deaths_norm100k',
       'Iowa_recoveries_norm100k', 'Iowa_testing_rate',
       'Kansas_cases_norm100k', 'Kansas_deaths_norm100k',
       'Kansas_recoveries_norm100k', 'Kansas_testing_rate',
       'Michigan_cases_norm100k', 'Michigan_deaths_norm100k',
       'Michigan_recoveries_norm100k', 'Michigan_testing_rate',
       'Minnesota_cases_norm100k', 'Minnesota_deaths_norm100k',
       'Minnesota_recoveries_norm100k', 'Minnesota_testing_rate',
       'Missouri_cases_norm100k', 'Missouri_deaths_norm100k',
       'Missouri_recoveries_norm100k', 'Missouri_testing_rate',
       'Nebraska_cases_norm100k', 'Nebraska_deaths

Unnamed: 0,Illinois_cases_norm100k,Illinois_deaths_norm100k,Illinois_recoveries_norm100k,Illinois_testing_rate,cases,cases_norm100k,deaths_norm100k,recoveries_norm100k,testing_rate,Indiana_cases_norm100k,...,Ohio_recoveries_norm100k,Ohio_testing_rate,South Dakota_cases_norm100k,South Dakota_deaths_norm100k,South Dakota_recoveries_norm100k,South Dakota_testing_rate,Wisconsin_cases_norm100k,Wisconsin_deaths_norm100k,Wisconsin_recoveries_norm100k,Wisconsin_testing_rate
0,0.007892,,,,1.0,0.007892,0.0,0.0,,,...,,,,,,,,,,
1,0.007892,,,,1.0,0.007892,0.0,0.0,,,...,,,,,,,,,,
2,0.007892,,,,1.0,0.007892,0.0,0.0,,,...,,,,,,,,,,
3,0.007892,,,,1.0,0.007892,0.0,0.0,,,...,,,,,,,,,,
4,0.007892,,,,1.0,0.007892,0.0,0.0,,,...,,,,,,,,,,


In [33]:
midwest_X, midwest_y = sliding_window(midwest_flattened_df, 15, 7)
print(len(midwest_y))
print(midwest_X.shape)
print(midwest_y[:5])
midwest_X = midwest_X.fillna(midwest_X.mean())
print(midwest_X.shape)
midwest_X.head()

241
(241, 416)
[81623.0, 85660.0, 89769.0, 95288.0, 100454.0]
(241, 416)


Unnamed: 0,AVG_Illinois_cases_norm100k,AVG_Illinois_deaths_norm100k,AVG_Illinois_recoveries_norm100k,AVG_Illinois_testing_rate,AVG_Indiana_cases_norm100k,AVG_Indiana_deaths_norm100k,AVG_Indiana_recoveries_norm100k,AVG_Indiana_testing_rate,AVG_Iowa_cases_norm100k,AVG_Iowa_deaths_norm100k,...,recoveries_norm100k_15,recoveries_norm100k_2,recoveries_norm100k_3,recoveries_norm100k_5,recoveries_norm100k_7,testing_rate_15,testing_rate_2,testing_rate_3,testing_rate_5,testing_rate_7
0,0.287252,0.000986,0.015783,27303.66553,0.248804,0.00557,0.0,16219.172119,0.534853,0.0,...,0.032958,0.0,0.0,0.0,0.0,3891.381491,278.495367,556.546182,1113.613988,1671.497105
1,0.371954,0.001754,0.014029,27303.66553,0.285526,0.008252,0.0,16219.172119,0.577554,0.0,...,0.0,-0.032958,-0.032958,-0.032958,-0.032958,3891.381491,278.495367,556.546182,1113.613988,1671.497105
2,0.593443,0.004735,0.012626,27303.66553,0.346097,0.010398,0.0,16219.172119,0.659256,0.0,...,0.0,0.0,-0.032958,-0.032958,-0.032958,3891.381491,278.495367,556.546182,1113.613988,1671.497105
3,0.900686,0.007892,0.011479,27303.66553,0.430764,0.012153,0.0,16219.172119,0.728985,0.0,...,0.0,0.0,0.0,-0.032958,-0.032958,3891.381491,278.495367,556.546182,1113.613988,1671.497105
4,1.296315,0.01118,0.010522,27303.66553,0.553309,0.016092,0.0,16219.172119,0.847842,0.0,...,0.0,0.0,0.0,-0.032958,-0.032958,3891.381491,278.495367,556.546182,1113.613988,1671.497105


from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
for column in midwest_X.columns:
    if midwest_X[column].isnull().values.all():
        print(column)
midwest_X = midwest_X.drop(['Illinois_recoveries_norm100k_15', 'Missouri_recoveries_norm100k_15'], axis = 1)
print(midwest_X.isnull().values.any())
midwest_X_train, midwest_X_test, midwest_y_train, midwest_y_test = train_test_split(midwest_X, midwest_y, test_size = .25)
midwest_transformed_X_train = pd.DataFrame(Normalizer().fit_transform(midwest_X_train), columns = midwest_X_train.columns) + 1
midwest_transformed_X_test = pd.DataFrame(Normalizer().transform(midwest_X_test), columns = midwest_X_test.columns) + 1

selected_col_idx = SelectKBest(chi2, k = 15).fit(midwest_transformed_X_train, midwest_y_train).get_support(indices = True)
midwest_selected_X_train = midwest_transformed_X_train.iloc[:, selected_col_idx]
midwest_selected_X_test = midwest_transformed_X_test.iloc[:, selected_col_idx]

midwest_new_selected_X_train = midwest_selected_X_train.drop(['Michigan_testing_rate_15', 'AVG_North Dakota_testing_rate', 'Illinois_testing_rate_15', 'North Dakota_testing_rate_15', 'Nebraska_testing_rate_15'], axis = 1)
#northeast_selected_X_test = northeast_selected_X_test.drop(['New York_testing_rate_15', 'Rhode Island_testing_rate_15', 'Max_cases_norm100k', 'AVG_cases_norm100k', 'Max_Massachusetts_testing_rate'], axis = 1)
corr_matrix = midwest_new_selected_X_train.corr('spearman')
for row_idx, row in corr_matrix.iterrows():
    for col_idx, cor in row.iteritems():
        if col_idx != row_idx and cor > .8:
            print(str(row_idx) + ' ' + str(col_idx) + ' = ' + str(corr_matrix.loc[row_idx, col_idx]))
#print(corr_matrix['Nebraska_testing_rate_15'].sum())
#print(corr_matrix['Wisconsin_testing_rate_15'].sum())
print(midwest_new_selected_X_train.columns)

In [34]:
CHOSEN_COLUMNS = ['Indiana_testing_rate_15', 'Max_North Dakota_testing_rate', 'Min_Illinois_testing_rate', 'Min_North Dakota_testing_rate', 'Minnesota_testing_rate_15', 'Missouri_testing_rate_15', 'North Dakota_testing_rate_7', 'Ohio_testing_rate_15', 'Wisconsin_testing_rate_15', 'testing_rate_15']
midwest_X_train, midwest_X_test, midwest_y_train, midwest_y_test = train_test_split(midwest_X, midwest_y, test_size = .25)
midwest_selected_X_train = midwest_X_train[CHOSEN_COLUMNS]
midwest_selected_X_test = midwest_X_test[CHOSEN_COLUMNS]
midwest_transformed_X_train = pd.DataFrame(Normalizer().fit_transform(midwest_selected_X_train), columns = midwest_selected_X_train.columns)
midwest_transformed_X_test = pd.DataFrame(Normalizer().transform(midwest_selected_X_test), columns = midwest_selected_X_test.columns)

In [35]:
reg = linear_model.LinearRegression().fit(midwest_transformed_X_train, midwest_y_train)
coef = {}
for idx, name in enumerate(midwest_transformed_X_train.columns):
    coef[name] = reg.coef_[idx]
print(coef)

{'Indiana_testing_rate_15': -778411.1311789437, 'Max_North Dakota_testing_rate': -10565200.074391007, 'Min_Illinois_testing_rate': -6484411.025975477, 'Min_North Dakota_testing_rate': -4210772.707265199, 'Minnesota_testing_rate_15': -3479016.679890118, 'Missouri_testing_rate_15': -8634741.309743261, 'North Dakota_testing_rate_7': -4227511.904817755, 'Ohio_testing_rate_15': -23865066.32276553, 'Wisconsin_testing_rate_15': 102298.72724636551, 'testing_rate_15': 1117876.1483829645}


In [36]:
y_pred = reg.predict(midwest_transformed_X_test)
print("The Explained Variance: %.2f" % reg.score(midwest_transformed_X_test, midwest_y_test))
print("The Mean Absolute Error: %.2f cases" % mean_absolute_error(midwest_y_test, y_pred))
print("The Median Absolute Error: %.2f cases" % median_absolute_error(midwest_y_test, y_pred))

The Explained Variance: 0.67
The Mean Absolute Error: 504090.82 cases
The Median Absolute Error: 276273.36 cases


# West Model

In [37]:
west_flattened_df = flatten_df(us_west_df, ['date'])
print(west_flattened_df.columns)
print(west_flattened_df.shape)
west_flattened_df.head()

Index(['Washington_cases_norm100k', 'Washington_deaths_norm100k',
       'Washington_recoveries_norm100k', 'Washington_testing_rate', 'cases',
       'cases_norm100k', 'deaths_norm100k', 'recoveries_norm100k',
       'testing_rate', 'Arizona_cases_norm100k', 'Arizona_deaths_norm100k',
       'Arizona_recoveries_norm100k', 'Arizona_testing_rate',
       'California_cases_norm100k', 'California_deaths_norm100k',
       'California_recoveries_norm100k', 'California_testing_rate',
       'Alaska_cases_norm100k', 'Alaska_deaths_norm100k',
       'Alaska_recoveries_norm100k', 'Alaska_testing_rate',
       'Colorado_cases_norm100k', 'Colorado_deaths_norm100k',
       'Colorado_recoveries_norm100k', 'Colorado_testing_rate',
       'Hawaii_cases_norm100k', 'Hawaii_deaths_norm100k',
       'Hawaii_recoveries_norm100k', 'Hawaii_testing_rate',
       'Idaho_cases_norm100k', 'Idaho_deaths_norm100k',
       'Idaho_recoveries_norm100k', 'Idaho_testing_rate',
       'Montana_cases_norm100k', 'Montana_

Unnamed: 0,Washington_cases_norm100k,Washington_deaths_norm100k,Washington_recoveries_norm100k,Washington_testing_rate,cases,cases_norm100k,deaths_norm100k,recoveries_norm100k,testing_rate,Arizona_cases_norm100k,...,Oregon_recoveries_norm100k,Oregon_testing_rate,Utah_cases_norm100k,Utah_deaths_norm100k,Utah_recoveries_norm100k,Utah_testing_rate,Wyoming_cases_norm100k,Wyoming_deaths_norm100k,Wyoming_recoveries_norm100k,Wyoming_testing_rate
0,0.013132,,,,1.0,0.013132,0.0,0.0,,,...,,,,,,,,,,
1,0.013132,,,,1.0,0.013132,0.0,0.0,,,...,,,,,,,,,,
2,0.013132,,,,1.0,0.013132,0.0,0.0,,,...,,,,,,,,,,
3,0.013132,,,,1.0,0.013132,0.0,0.0,,,...,,,,,,,,,,
4,0.013132,,,,4.0,0.031933,0.0,0.0,,0.013739,...,,,,,,,,,,


In [38]:
west_X, west_y = sliding_window(west_flattened_df, 15, 7)
print(len(west_y))
print(west_X.shape)
print(west_y[:5])
west_X = west_X.fillna(west_X.mean())
print(west_X.shape)
west_X.head()

244
(244, 448)
[4251.0, 54999.0, 57037.0, 59384.0, 61435.0]
(244, 448)


Unnamed: 0,AVG_Alaska_cases_norm100k,AVG_Alaska_deaths_norm100k,AVG_Alaska_recoveries_norm100k,AVG_Alaska_testing_rate,AVG_Arizona_cases_norm100k,AVG_Arizona_deaths_norm100k,AVG_Arizona_recoveries_norm100k,AVG_Arizona_testing_rate,AVG_California_cases_norm100k,AVG_California_deaths_norm100k,...,recoveries_norm100k_15,recoveries_norm100k_2,recoveries_norm100k_3,recoveries_norm100k_5,recoveries_norm100k_7,testing_rate_15,testing_rate_2,testing_rate_3,testing_rate_5,testing_rate_7
0,0.034174,0.0,0.0,39724.427308,0.063698,0.0,0.013739,12497.697306,0.270572,0.009111,...,0.042056,0.0,0.0,0.010123,0.042056,3324.610862,242.814386,485.119506,972.39034,1458.212626
1,0.054679,0.0,0.0,39724.427308,0.073273,0.0,0.013739,12497.697306,0.33787,0.010123,...,0.042056,0.0,0.0,0.010123,0.042056,3324.610862,242.814386,485.119506,972.39034,1458.212626
2,0.068348,0.0,0.0,39724.427308,0.086659,0.0,0.013739,12497.697306,0.420318,0.011208,...,0.042056,0.0,0.0,0.0,0.010123,3324.610862,242.814386,485.119506,972.39034,1458.212626
3,0.117169,0.0,0.0,39724.427308,0.100096,0.0,0.013739,12497.697306,0.516477,0.013603,...,0.042056,0.0,0.0,0.0,0.010123,3324.610862,242.814386,485.119506,972.39034,1458.212626
4,0.205045,0.0,0.0,39724.427308,0.118153,0.0,0.012212,12497.697306,0.608757,0.015748,...,0.0,-0.042056,-0.042056,-0.042056,-0.042056,3324.610862,242.814386,485.119506,972.39034,1458.212626


from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
for column in west_X.columns:
    if west_X[column].isnull().values.all():
        print(column)
west_X = west_X.drop(['California_recoveries_norm100k_15', 'Washington_recoveries_norm100k_15'], axis = 1)
print(west_X.isnull().values.any())
west_X_train, west_X_test, west_y_train, west_y_test = train_test_split(west_X, west_y, test_size = .25)
west_transformed_X_train = pd.DataFrame(Normalizer().fit_transform(west_X_train), columns = west_X_train.columns) + 1
west_transformed_X_test = pd.DataFrame(Normalizer().transform(west_X_test), columns = west_X_test.columns) + 1

selected_col_idx = SelectKBest(chi2, k = 15).fit(west_transformed_X_train, west_y_train).get_support(indices = True)
west_selected_X_train = west_transformed_X_train.iloc[:, selected_col_idx]
west_selected_X_test = west_transformed_X_test.iloc[:, selected_col_idx]

west_new_selected_X_train = west_selected_X_train.drop(['testing_rate_15', 'Min_Alaska_testing_rate', 'New Mexico_testing_rate_15', 'Utah_testing_rate_15', 'AVG_Alaska_testing_rate'], axis = 1)
#northeast_selected_X_test = northeast_selected_X_test.drop(['New York_testing_rate_15', 'Rhode Island_testing_rate_15', 'Max_cases_norm100k', 'AVG_cases_norm100k', 'Max_Massachusetts_testing_rate'], axis = 1)
corr_matrix = west_new_selected_X_train.corr('spearman')
for row_idx, row in corr_matrix.iterrows():
    for col_idx, cor in row.iteritems():
        if col_idx != row_idx and cor > .8:
            print(str(row_idx) + ' ' + str(col_idx) + ' = ' + str(corr_matrix.loc[row_idx, col_idx]))
#print(corr_matrix['AVG_Alaska_testing_rate'].sum())
#print(corr_matrix['Max_Alaska_testing_rate'].sum())
#print(corr_matrix['Nevada_testing_rate_15'].sum())
#print(corr_matrix['Utah_testing_rate_15'].sum())
print(west_new_selected_X_train.columns)

In [39]:
CHOSEN_COLUMNS = ['Alaska_testing_rate_15', 'California_testing_rate_15', 'Colorado_testing_rate_15', 'Max_Alaska_testing_rate', 'Max_New Mexico_testing_rate', 'Max_Utah_testing_rate', 'Min_California_testing_rate', 'Montana_testing_rate_15', 'Nevada_testing_rate_15', 'Wyoming_testing_rate_15']
west_X_train, west_X_test, west_y_train, west_y_test = train_test_split(west_X, west_y, test_size = .25)
west_selected_X_train = west_X_train[CHOSEN_COLUMNS]
west_selected_X_test = west_X_test[CHOSEN_COLUMNS]
west_transformed_X_train = pd.DataFrame(Normalizer().fit_transform(west_selected_X_train), columns = west_selected_X_train.columns)
west_transformed_X_test = pd.DataFrame(Normalizer().transform(west_selected_X_test), columns = west_selected_X_test.columns)

In [40]:
reg = linear_model.LinearRegression().fit(west_transformed_X_train, west_y_train)
coef = {}
for idx, name in enumerate(west_transformed_X_train.columns):
    coef[name] = reg.coef_[idx]
print(coef)

{'Alaska_testing_rate_15': -2522109.754428714, 'California_testing_rate_15': -1570351.792930963, 'Colorado_testing_rate_15': -1277950.7758332458, 'Max_Alaska_testing_rate': 6118860.368100989, 'Max_New Mexico_testing_rate': -4203721.770991845, 'Max_Utah_testing_rate': -893751.2193031098, 'Min_California_testing_rate': -3995361.8301948756, 'Montana_testing_rate_15': 2214569.5414572023, 'Nevada_testing_rate_15': 2444576.104345589, 'Wyoming_testing_rate_15': 1533936.2818068794}


In [41]:
y_pred = reg.predict(west_transformed_X_test)
print("The Explained Variance: %.2f" % reg.score(west_transformed_X_test, west_y_test))
print("The Mean Absolute Error: %.2f cases" % mean_absolute_error(west_y_test, y_pred))
print("The Median Absolute Error: %.2f cases" % median_absolute_error(west_y_test, y_pred))

The Explained Variance: 0.81
The Mean Absolute Error: 155487.98 cases
The Median Absolute Error: 91248.15 cases
