## Data Preprocessing

In [1]:
# Importing the Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math

In [2]:
# Data Collection
def collect_data():
    """Download the Johns Hopkins CSSE COVID-19 tables and tidy them.

    Reads the three global time-series CSVs (confirmed, deaths, recovered)
    and the per-country summary CSV from GitHub, drops the columns this
    notebook does not use, renames the remaining ones to short lowercase
    names, and sums the time-series rows that share a country.

    Returns
    -------
    tuple
        ``(confirmed_global, deaths_global, recovered_global, country_cases)``
        as pandas DataFrames.
    """
    # Source repository:
    # https://github.com/CSSEGISandData/COVID-19/tree/master/csse_covid_19_data/csse_covid_19_time_series
    base_url = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/'
    series_files = ('time_series_covid19_confirmed_global.csv',
                    'time_series_covid19_deaths_global.csv',
                    'time_series_covid19_recovered_global.csv')

    # Fetch everything up front: the three time series first, then the summary table.
    raw_series = [pd.read_csv(base_url + name) for name in series_files]
    country_cases = pd.read_csv('https://raw.githubusercontent.com/CSSEGISandData/COVID-19/web-data/data/cases_country.csv')

    def _per_country(frame):
        # Drop province/coordinates, shorten the country column name, and
        # collapse all rows of the same country into one by summing them.
        return (
            frame
            .drop(columns=['Province/State', 'Lat', 'Long'])
            .rename(columns={"Country/Region": "country"})
            .groupby(['country'], as_index=False)
            .sum()
        )

    # Summary table: remove unused columns and rename the rest for easier access.
    country_cases = (
        country_cases
        .drop(columns=['Last_Update', 'Incident_Rate', 'People_Tested', 'People_Hospitalized', 'UID'])
        .rename(columns={
            "Country_Region": "country",
            "Confirmed": "confirmed",
            "Deaths": "deaths",
            "Recovered": "recovered",
            "Active": "active",
            "Mortality_Rate": "mortality",
        })
    )

    confirmed_global, deaths_global, recovered_global = (
        _per_country(frame) for frame in raw_series
    )

    # Manual correction of a value that was wrong in the original dataset.
    # NOTE(review): the row is addressed by index label 178 - confirm which
    # country that is and that the label is still right if the country list changes.
    confirmed_global.at[178, '5/20/20'] = 251667

    return (confirmed_global, deaths_global, recovered_global, country_cases)

In [3]:
def get_new_cases(country, confirmed=None):
    """Return the confirmed-case time series for one country.

    Parameters
    ----------
    country : str
        Value matched against the ``country`` column.
    confirmed : pandas.DataFrame, optional
        Wide table with a ``country`` column and one column per date. When
        omitted, the notebook-level ``confirmed_global`` frame is used, so
        existing ``get_new_cases(country)`` calls keep working.

    Returns
    -------
    pandas.DataFrame
        Columns ``date`` and ``cases`` with a fresh 0..n-1 index, one row per
        date column of the source table. The values are returned exactly as
        stored in the table; no differencing is applied here. The frame is
        empty when no row matches ``country``.
    """
    # Fall back to the notebook global only when no table is passed explicitly.
    source = confirmed_global if confirmed is None else confirmed
    # Wide (one column per date) -> long (one row per country/date pair).
    long_form = source.melt(id_vars=['country'], var_name='date', value_name='cases')
    selected = long_form.loc[long_form['country'] == country]
    return selected.drop(columns=['country']).reset_index(drop=True)

In [4]:
# Download and tidy all four tables once; get_new_cases reads confirmed_global as a global.
confirmed_global, deaths_global, recovered_global, country_cases = collect_data()
# Show the per-country confirmed-case table (one column per date)
confirmed_global

Unnamed: 0,country,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,1/28/20,1/29/20,1/30/20,...,9/8/20,9/9/20,9/10/20,9/11/20,9/12/20,9/13/20,9/14/20,9/15/20,9/16/20,9/17/20
0,Afghanistan,0,0,0,0,0,0,0,0,0,...,38520,38544,38572,38606,38641,38716,38772,38815,38855,38872
1,Albania,0,0,0,0,0,0,0,0,0,...,10553,10704,10860,11021,11185,11353,11520,11672,11816,11948
2,Algeria,0,0,0,0,0,0,0,0,0,...,46938,47216,47488,47752,48007,48254,48496,48734,48966,49194
3,Andorra,0,0,0,0,0,0,0,0,0,...,1261,1301,1301,1344,1344,1344,1438,1438,1483,1483
4,Angola,0,0,0,0,0,0,0,0,0,...,3033,3092,3217,3279,3335,3388,3439,3569,3675,3789
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
183,West Bank and Gaza,0,0,0,0,0,0,0,0,0,...,27363,27919,28664,29256,29906,30574,31362,32250,33006,33843
184,Western Sahara,0,0,0,0,0,0,0,0,0,...,10,10,10,10,10,10,10,10,10,10
185,Yemen,0,0,0,0,0,0,0,0,0,...,1994,1999,2003,2007,2009,2011,2013,2016,2019,2022
186,Zambia,0,0,0,0,0,0,0,0,0,...,12952,13112,13214,13323,13466,13539,13720,13819,13887,13928


In [5]:
# Country to model; must match a value in the 'country' column of confirmed_global
country_choice = 'India'
# Long-format frame with 'date' and 'cases' columns for the chosen country
dataset = get_new_cases(country_choice)

In [6]:
# Inspect the selected country's time series
dataset

Unnamed: 0,date,cases
0,1/22/20,0
1,1/23/20,0
2,1/24/20,0
3,1/25/20,0
4,1/26/20,0
...,...,...
235,9/13/20,4846427
236,9/14/20,4846427
237,9/15/20,5020359
238,9/16/20,5020359


In [7]:
# Chronological split: the first 80% of the rows train the model, the rest test it
split_ratio = 0.8
train_size = int(len(dataset) * split_ratio)
# Column 1 holds the case counts; the 1:2 slice keeps the array 2-D (one feature column)
cases_column = dataset.iloc[:, 1:2].values
training_set, test_set = cases_column[:train_size], cases_column[train_size:]

In [8]:
# Feature Scaling
# The scaler is fitted on the training rows only; the same fitted `sc` is reused
# later to transform the test inputs and to invert the model's predictions.
from sklearn.preprocessing import MinMaxScaler
sc = MinMaxScaler(feature_range = (0, 1))
training_set_scaled = sc.fit_transform(training_set)

In [9]:
# Sliding-window samples: each input is the previous `timesteps` scaled values
# and its target is the scaled value that immediately follows the window.
timesteps = 14
window_ends = range(timesteps, train_size)
X_train = np.array([training_set_scaled[end - timesteps:end, 0] for end in window_ends])
y_train = np.array([training_set_scaled[end, 0] for end in window_ends])

In [10]:
# Append a trailing feature axis of length 1: (samples, timesteps) -> (samples, timesteps, 1)
X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)

## Building the LSTM Model

In [11]:
# Importing the keras libraries and packages required
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout, Bidirectional

In [12]:
# Initialize the RNN as an empty Sequential model; layers are appended in the following cells
model = Sequential()

In [13]:
# Adding the first bidirectional LSTM layer (no Dropout layer is added in this cell).
# return_sequences = True passes the full sequence on to the next recurrent layer.
# NOTE(review): input_shape is given to the inner LSTM rather than to the
# Bidirectional wrapper - confirm Keras picks it up as the model's input shape.
# Re-running this cell appends another copy of the layer to `model`.
model.add(Bidirectional(LSTM(units = 200, return_sequences = True, input_shape = (X_train.shape[1], 1))))

In [14]:
# Three more bidirectional LSTM layers, each followed by 20% dropout.
# Only the last one returns a single vector instead of the full sequence,
# so that the Dense head receives one value set per sample.
for keep_sequence in (True, True, False):
    model.add(Bidirectional(LSTM(units = 200, activation = 'swish', return_sequences = keep_sequence)))
    model.add(Dropout(0.2))

# Single-unit output layer: one predicted (scaled) value per input window
model.add(Dense(units = 1))

In [15]:
# Adam optimizer with mean-squared-error loss, computed on the scaled targets in y_train
model.compile(optimizer = 'adam', loss = 'mse')

In [16]:
# Train for 25 epochs in mini-batches of 32 samples.
# NOTE(review): no random seed is set in the visible cells and no validation
# data is passed, so results may differ between runs - confirm this is intended.
model.fit(X_train, y_train, epochs = 25, batch_size = 32)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<tensorflow.python.keras.callbacks.History at 0x7ffa50593940>

In [17]:
# Build the model inputs for the test period.
# The case counts are selected by column name rather than by position so this
# cell can be re-run safely: `dataset` is overwritten here, and a positional
# `iloc[:, 1:2]` on the already-reduced one-column frame would select nothing.
dataset = dataset[['cases']]
# Start `timesteps` rows before the test period so the first test window has a full history
inputs = dataset.iloc[train_size - timesteps:].values
inputs = inputs.reshape(-1, 1)
# Reuse the scaler fitted on the training data (no re-fitting on test rows)
inputs = sc.transform(inputs)
# One window of the previous `timesteps` scaled values per test row
X_test = np.array([inputs[i - timesteps:i, 0] for i in range(timesteps, len(inputs))])
# Append the trailing feature axis: (samples, timesteps) -> (samples, timesteps, 1)
X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1], 1))

In [18]:
# Predict one scaled value per test window
predicted_cases = model.predict(X_test)
# Map the scaled predictions back to case counts with the training scaler
predicted_cases = sc.inverse_transform(predicted_cases)
# Display the predictions
predicted_cases

array([[1756962.5],
       [1816300.8],
       [1876620.5],
       [1938022.6],
       [2000399. ],
       [2063383.8],
       [2127156. ],
       [2191704.8],
       [2257258.2],
       [2323572. ],
       [2390407. ],
       [2457692.2],
       [2525906.2],
       [2594710. ],
       [2663547.5],
       [2732838.8],
       [2802253.5],
       [2871600. ],
       [2941402.8],
       [3011718.8],
       [3082093. ],
       [3152686.8],
       [3223295. ],
       [3293581. ],
       [3364036. ],
       [3433852.2],
       [3504452.2],
       [3575561. ],
       [3647152.8],
       [3719447.8],
       [3792848.8],
       [3866930.8],
       [3941511.5],
       [4016595.5],
       [4092219.5],
       [4168526.5],
       [4245836. ],
       [4324665.5],
       [4404059.5],
       [4484967.5],
       [4565845. ],
       [4647317.5],
       [4729480.5],
       [4811976.5],
       [4894656.5],
       [4972605.5],
       [5055243.5],
       [5132309. ]], dtype=float32)

In [19]:
def mape(y_true, y_pred):
    """Mean absolute percentage error, expressed in percent.

    Parameters
    ----------
    y_true : array-like
        Reference values. A zero entry makes the corresponding term (and hence
        the result) infinite or NaN, because each error is divided by it.
    y_pred : array-like
        Predicted values.

    Returns
    -------
    float
        ``mean(|y_true - y_pred| / |y_true|) * 100``.

    Notes
    -----
    When the two inputs hold the same number of elements but have different
    shapes (for example ``(n,)`` and ``(n, 1)``), both are flattened so values
    are compared pairwise; otherwise NumPy would broadcast them into an
    ``(n, n)`` matrix and average over every combination. Inputs of different
    sizes are still combined with ordinary NumPy broadcasting.
    """
    # Cast to float so integer inputs (including unsigned ones) subtract safely.
    actual = np.asarray(y_true, dtype=float)
    forecast = np.asarray(y_pred, dtype=float)
    if actual.shape != forecast.shape and actual.size == forecast.size:
        actual, forecast = actual.ravel(), forecast.ravel()
    return np.mean(np.abs((actual - forecast) / actual)) * 100
# Percentage error of the predictions against the held-out test rows
test_error = mape(test_set, predicted_cases)
print(f'Error is {test_error}%')

Error is 2.1322186810115435%


In [20]:
# Save the trained model under ./models/ (relative to the working directory);
# the folder name embeds the country and the test MAPE rounded to two decimals.
model.save(f'./models/{country_choice}___{round(test_error, 2)}___MAPE')

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
INFO:tensorflow:Assets written to: ./models/India___2.13___MAPE/assets
