In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error
import tensorflow as tf
from tensorflow import keras
from keras import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Dropout

Using TensorFlow backend.


In [2]:
# Colab-only step: opens the browser upload dialog so the makeHistoricalData
# module file can be placed in the runtime's working directory. The next cell
# does `import makeHistoricalData`, so the upload has to happen first.
from google.colab import files
files.upload()   # used to upload makeHistoricalData file

{}

In [3]:
# Train one LSTM per county for every history length h in 1..7 and record the
# mean squared error on the last TEST_DAYS rows of that county.
#
# Results:
#   errors      -- dict mapping '<county_fips>_<h>' to that county's test MSE
#   mean_errors -- list with one entry per h: the mean test MSE over the
#                  counties that were actually evaluated
import makeHistoricalData

r = 14
target = 'confirmed'
TEST_DAYS = 14         # most recent rows of each county, held out for testing
VALIDATION_DAYS = 14   # rows immediately before the test window
errors = {}
mean_errors = [] # mean on counties for each h
# we are learning the model on each county, so we don't need fixed data (they won't help us)
# here we create a list of the names of the time-independent covariates to delete them later
fixed_covariates = ['age_40_59', 'some_college_or_higher', 'age_80_or_higher', 'less_than_high_school_diploma', 
                    'latitude', 'age_0_14', 'hospital_beds', 'diabetes', 'age_15_39', 'icu_beds', 'high_school_diploma_only', 
                    'Religious', 'airport_distance', 'smokers', 'total_college_population', 'area', 'female-percent', 'age_60_79', 
                    'ventilator_capacity', 'passenger_load', 'median_household_income', 'houses_density', 'party', 'total_population', 
                    'longitude', 'population_density', 'meat_plants']
# identifier / bookkeeping columns that must not be fed to the model
id_columns = ['date of day t', 'county_name', 'state_name', 'state_fips', 'county_fips']

# loop through all h
for h in range(1, 8):
    data = makeHistoricalData.makeHistoricalData(h, r, target, 'mrmr')
    counties = data.county_fips.unique()   # a list of all counties
    mean_error = 0
    evaluated_counties = 0   # counties that had enough rows to be trained and scored
    # loop through counties, each time training the model on one county
    for i in range(len(counties)):
        temporal = data.loc[data['county_fips'] == counties[i]].copy()
        temporal.reset_index(drop=True, inplace=True)

        # deleting unnecessary columns
        temporal = temporal.drop(id_columns, axis=1)
        temporal = temporal.drop(fixed_covariates, axis=1)

        totalNumberOfDays = len(temporal)

        # Without at least one training row the scalers cannot be fitted, so
        # such a county is skipped instead of crashing the whole run.
        if totalNumberOfDays <= TEST_DAYS + VALIDATION_DAYS:
            print('Skipping county_fips = ', counties[i], ' and h = ', h, ' (not enough rows)')
            continue

        # chronological split: the last TEST_DAYS rows are the test set, the
        # VALIDATION_DAYS rows before them are the validation set, and
        # everything earlier is the training set
        train_end = totalNumberOfDays - (TEST_DAYS + VALIDATION_DAYS)
        val_end = totalNumberOfDays - TEST_DAYS
        train = temporal.iloc[:train_end]
        validation = temporal.iloc[train_end:val_end]
        test = temporal.iloc[val_end:]

        ############################# Normalizing data
        # The scalers are fitted on the training split ONLY and then re-used
        # for validation and test. Fitting a fresh scaler per split would put
        # each split on a different scale, and inverse-transforming the
        # predictions with a scaler fitted on the test targets would leak the
        # true test range into the reported error.
        x_scaler = preprocessing.MinMaxScaler()
        X_train = x_scaler.fit_transform(train.drop('Target', axis=1).values)
        X_val = x_scaler.transform(validation.drop('Target', axis=1).values)
        X_test = x_scaler.transform(test.drop('Target', axis=1).values)

        y_scaler = preprocessing.MinMaxScaler()
        y_train = y_scaler.fit_transform(train['Target'].values.reshape(-1, 1))
        y_val = y_scaler.transform(validation['Target'].values.reshape(-1, 1))
        # the test target stays in its original units; it is only compared
        # against the inverse-transformed predictions
        y_test = test['Target'].values
        #############################

        ############################# reshaping data to feed to the model
        # LSTM input is (samples, timesteps, features) with a single timestep
        X_train = X_train.reshape((X_train.shape[0], 1, X_train.shape[1]))
        X_val = X_val.reshape((X_val.shape[0], 1, X_val.shape[1]))
        X_test = X_test.reshape((X_test.shape[0], 1, X_test.shape[1]))

        y_train = y_train.reshape((y_train.shape[0]), )
        y_val = y_val.reshape((y_val.shape[0]), )
        #############################

        print('Working on county_fips = ', counties[i], ' and h = ', h)

        ############################# lstm model
        model = Sequential()
        model.add(LSTM(4, return_sequences=True, input_shape=(1, X_train.shape[2])))  # returns a sequence of vectors of dimension 4
        model.add(LSTM(256, dropout=0.2,recurrent_dropout=0.2, return_sequences=True))
        model.add(LSTM(256, dropout=0.2,recurrent_dropout=0.2, return_sequences=True))
        model.add(LSTM(128, dropout=0.2,recurrent_dropout=0.2, return_sequences=True))
        model.add(LSTM(128))
        # Linear output: with the target scaled on the training range only,
        # validation/test targets may lie outside [0, 1], which a sigmoid
        # output could never reach.
        model.add(Dense(1, activation='linear'))

        model.compile (
            loss='mean_squared_error',
            optimizer=keras.optimizers.Adam(0.001)
        )

        print(X_train.shape, X_val.shape, X_test.shape)
        print(y_train.shape, y_val.shape, y_test.shape)

        history = model.fit (
            X_train, y_train,
            epochs=50,
            batch_size=128,
            validation_data=(X_val, y_val),
            verbose=1,
            shuffle=False   # keep chronological order of the samples
        )
        #############################

        # obtaining the predictions, mapped back to the original target units
        # with the scaler that was fitted on the training targets
        pred = model.predict(X_test)
        pred = y_scaler.inverse_transform(pred.reshape(-1, 1))

        # obtaining the errors (both sides rounded to whole counts)
        error = mean_squared_error(np.round(y_test.reshape(-1, 1)), np.round(pred))
        mean_error += error
        evaluated_counties += 1
        key = str(counties[i])+'_'+str(h)
        errors[key] = error

    # average only over counties that were evaluated; NaN marks an h for
    # which no county had enough data
    if evaluated_counties:
        mean_errors.append(mean_error / float(evaluated_counties))
    else:
        mean_errors.append(float('nan'))

(66, 1, 9) (14, 1, 9) (14, 1, 9)
(66,) (14,) (14,)
Train on 66 samples, validate on 14 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Working on county_fips =  13021  and h =  1


KeyboardInterrupt: ignored

In [0]:
import json

# Save the `errors` dictionary to disk as pretty-printed JSON
# (4-space indentation, keys written in sorted order).
with open('errors_per_county.json', mode='w') as errors_file:
    json.dump(errors, errors_file, indent=4, sort_keys=True)