## Data Preprocessing

In [1]:
# Importing the Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math

In [2]:
# Data Collection
def collect_data():
    """Download the JHU CSSE COVID-19 tables and tidy them for analysis.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(confirmed_global, deaths_global, recovered_global, country_cases)``.
        The first three hold one row per country (a ``country`` column followed
        by one column per date); ``country_cases`` is the per-country summary
        table with lower-cased column names.
    """
    # Data from the Johns Hopkins University dataset on GitHub
    # https://github.com/CSSEGISandData/COVID-19/tree/master/csse_covid_19_data/csse_covid_19_time_series
    series_url = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/'
    summary_url = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/web-data/data/cases_country.csv'

    def load_global_series(filename):
        """Read one global time-series CSV and reduce it to one row per country."""
        raw = pd.read_csv(series_url + filename)
        # Province and coordinates are not used; without them the per-province
        # rows of a country can simply be summed together.
        trimmed = raw.drop(columns = ['Province/State', 'Lat', 'Long'])
        renamed = trimmed.rename(columns = {"Country/Region": "country"})
        return renamed.groupby(['country'], as_index = False).sum()

    confirmed_global = load_global_series('time_series_covid19_confirmed_global.csv')
    deaths_global = load_global_series('time_series_covid19_deaths_global.csv')
    recovered_global = load_global_series('time_series_covid19_recovered_global.csv')

    # Per-country summary table: drop the unused columns, then shorten the names.
    unused_summary_columns = ['Last_Update', 'Incident_Rate', 'People_Tested', 'People_Hospitalized', 'UID']
    summary_names = {
        "Country_Region" : "country",
        "Confirmed": "confirmed",
        "Deaths": "deaths",
        "Recovered" : "recovered",
        "Active" : "active",
        "Mortality_Rate": "mortality"
    }
    country_cases = (
        pd.read_csv(summary_url)
        .drop(columns = unused_summary_columns)
        .rename(columns = summary_names)
    )

    # Manual correction of a value that was wrong in the upstream dataset.
    # NOTE(review): row label 178 is positional after the groupby above, so the
    # country it points at depends on the upstream country list - confirm it
    # still targets the intended row.
    confirmed_global.at[178, '5/20/20'] = 251667

    return (confirmed_global, deaths_global, recovered_global, country_cases)

In [3]:
# Download the tables once; they are kept as module-level frames because the
# get_new_* helpers defined below read them as globals.
confirmed_global, deaths_global, recovered_global, country_cases = collect_data()
# Bare last expression so the notebook renders the frame.
confirmed_global

Unnamed: 0,country,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,1/28/20,1/29/20,1/30/20,...,9/11/20,9/12/20,9/13/20,9/14/20,9/15/20,9/16/20,9/17/20,9/18/20,9/19/20,9/20/20
0,Afghanistan,0,0,0,0,0,0,0,0,0,...,38606,38641,38716,38772,38815,38855,38872,38883,38919,39044
1,Albania,0,0,0,0,0,0,0,0,0,...,11021,11185,11353,11520,11672,11816,11948,12073,12226,12385
2,Algeria,0,0,0,0,0,0,0,0,0,...,47752,48007,48254,48496,48734,48966,49194,49413,49623,49826
3,Andorra,0,0,0,0,0,0,0,0,0,...,1344,1344,1344,1438,1438,1483,1483,1564,1564,1564
4,Angola,0,0,0,0,0,0,0,0,0,...,3279,3335,3388,3439,3569,3675,3789,3848,3901,3991
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
183,West Bank and Gaza,0,0,0,0,0,0,0,0,0,...,29256,29906,30574,31362,32250,33006,33843,34401,35003,35686
184,Western Sahara,0,0,0,0,0,0,0,0,0,...,10,10,10,10,10,10,10,10,10,10
185,Yemen,0,0,0,0,0,0,0,0,0,...,2007,2009,2011,2013,2016,2019,2022,2024,2026,2026
186,Zambia,0,0,0,0,0,0,0,0,0,...,13323,13466,13539,13720,13819,13887,13928,14022,14070,14131


In [4]:
def _daily_new_counts(cumulative, country):
    """Convert one country's cumulative counts into day-over-day increments.

    Parameters
    ----------
    cumulative : pandas.DataFrame
        Wide table with a ``country`` column and one cumulative-count column
        per date, in chronological column order.
    country : str
        Value to match in the ``country`` column.

    Returns
    -------
    pandas.DataFrame
        Columns ``date`` (day counter 0..n-1; the original date labels are
        discarded) and ``cases`` (difference to the previous day). The first
        entry is always 0 because there is no previous day to compare with.
        The frame is empty when ``country`` matches no row.
    """
    # Filter first so only the matching row(s) are reshaped, not the whole table.
    country_rows = cumulative[cumulative['country'] == country]
    long_form = country_rows.melt(id_vars = ['country'], var_name = 'date', value_name = 'cases')
    # diff() leaves NaN in the first slot; report it as 0 new cases.
    daily = long_form['cases'].diff().fillna(0)
    day_numbers = list(range(len(daily)))
    return pd.DataFrame(data = {'date': day_numbers, 'cases': daily.to_numpy()}, index = day_numbers)


def get_new_cases(country, cumulative = None):
    """Daily new confirmed cases for ``country``.

    ``cumulative`` defaults to the module-level ``confirmed_global`` frame;
    pass a frame explicitly to avoid relying on notebook state.
    See ``_daily_new_counts`` for the returned layout.
    """
    if cumulative is None:
        cumulative = confirmed_global
    return _daily_new_counts(cumulative, country)


def get_new_deaths(country, cumulative = None):
    """Daily new deaths for ``country``.

    ``cumulative`` defaults to the module-level ``deaths_global`` frame.
    See ``_daily_new_counts`` for the returned layout.
    """
    if cumulative is None:
        cumulative = deaths_global
    return _daily_new_counts(cumulative, country)


def get_new_recoveries(country, cumulative = None):
    """Daily new recoveries for ``country``.

    ``cumulative`` defaults to the module-level ``recovered_global`` frame.
    See ``_daily_new_counts`` for the returned layout.
    """
    if cumulative is None:
        cumulative = recovered_global
    return _daily_new_counts(cumulative, country)

In [5]:
def create_time_series(country_name, param):
    """Build the daily series for one country with zero-count days removed.

    Parameters
    ----------
    country_name : str
        Country to extract, as spelled in the ``country`` column.
    param : str
        Which series to build: ``'confirmed'``, ``'deaths'`` or ``'recoveries'``.

    Returns
    -------
    pandas.DataFrame
        The frame produced by the matching ``get_new_*`` helper, restricted to
        rows whose ``cases`` value is not 0. The original row labels are kept,
        so the index has gaps where days were dropped.

    Raises
    ------
    ValueError
        If ``param`` is not one of the three supported names.
    """
    if param == 'confirmed':
        time_series = get_new_cases(country_name)
    elif param == 'deaths':
        time_series = get_new_deaths(country_name)
    elif param == 'recoveries':
        time_series = get_new_recoveries(country_name)
    else:
        # Fail here with a clear message instead of subscripting None below.
        raise ValueError(
            f"Unknown param {param!r}; expected 'confirmed', 'deaths' or 'recoveries'"
        )

    # Removing the zero values
    has_cases = time_series['cases'] != 0
    return time_series[has_cases]

# Daily new confirmed cases for India with zero-count days dropped; this frame
# is the input to the train/test split and scaling cells that follow.
dataset = create_time_series('India', 'confirmed')
dataset

Unnamed: 0,date,cases
8,8,1.0
11,11,1.0
12,12,1.0
40,40,2.0
42,42,23.0
...,...,...
237,237,90123.0
238,238,97894.0
239,239,96424.0
240,240,93337.0


In [6]:
# Making the training dataset and test dataset
split_ratio = 0.8
train_size = int(split_ratio * len(dataset))
# Chronological split (no shuffling): the first 80% of rows train the model, the
# rest are held out. The 'cases' column is selected by label rather than by
# position so this cell still yields (n, 1) arrays if it is re-run after
# `dataset` has been narrowed to that single column further down.
training_set = dataset[['cases']].iloc[:train_size].values
test_set = dataset[['cases']].iloc[train_size:].values

In [7]:
# Feature Scaling
from sklearn.preprocessing import MinMaxScaler
# Fit the scaler on the training split only. The same fitted `sc` is reused
# later to transform the test inputs and to invert the model's predictions.
sc = MinMaxScaler(feature_range = (0, 1))
training_set_scaled = sc.fit_transform(training_set)

In [8]:
timesteps = 14
# Sliding windows over the scaled training series: each sample is the previous
# `timesteps` values and its target is the value that immediately follows.
X_train = np.array([training_set_scaled[end - timesteps:end, 0]
                    for end in range(timesteps, train_size)])
y_train = np.array([training_set_scaled[end, 0]
                    for end in range(timesteps, train_size)])
# Add a trailing feature axis: (samples, timesteps) -> (samples, timesteps, 1).
X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))

In [9]:
# Keep only the case counts. Selecting by label (instead of `iloc[:, 1:2]`)
# makes this cell safe to re-run: a second positional slice of the already
# narrowed frame would leave it with no columns.
dataset = dataset[['cases']]
# Start `timesteps` rows before the test split so the first test window has a
# full history; this yields one window per row of the test split.
inputs = dataset.iloc[train_size - timesteps:].values
inputs = inputs.reshape(-1, 1)
# Reuse the scaler fitted on the training data (transform only, no refit).
inputs = sc.transform(inputs)
X_test = np.array([inputs[end - timesteps:end, 0]
                   for end in range(timesteps, len(inputs))])
# Add a trailing feature axis: (samples, timesteps) -> (samples, timesteps, 1).
X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1], 1))

## Building the LSTM Model

In [10]:
# Importing libraries and packages required
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout, Bidirectional

# Importing sklearn parameter grid for Hyperparameter tuning
from sklearn.model_selection import ParameterGrid

In [11]:
def mape(y_true, y_pred):
    """Mean absolute percentage error between two arrays, in percent.

    Parameters
    ----------
    y_true : array-like
        Observed values. Entries equal to 0 make the ratio undefined and
        propagate as inf/nan through the mean.
    y_pred : array-like
        Predicted values with the same number of elements as ``y_true``.

    Returns
    -------
    float
        ``mean(|y_true - y_pred| / |y_true|) * 100``.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    # An (n,) vs (n, 1) pair would otherwise broadcast to an n-by-n matrix and
    # silently average the wrong differences; align the shapes when the
    # element counts agree.
    if y_true.shape != y_pred.shape and y_true.size == y_pred.size:
        y_pred = y_pred.reshape(y_true.shape)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

In [12]:
# Setting the random seed for better model reproducibility
# NOTE(review): the seed is set once, when this cell runs; build_model does not
# reseed, so presumably successive calls (e.g. from hp_tuning) each continue
# the same random stream - confirm if per-run reproducibility is required.
tf.random.set_seed(365)

# The main model building function
def build_model(units, activation, batch_size, n_layers, bidirectional, epochs):
    """Build, train and score a stacked (optionally bidirectional) LSTM.

    The network has ``n_layers + 2`` recurrent layers followed by a single
    ``Dense(1)`` output. The first recurrent layer keeps the LSTM default
    activation; the remaining ones use ``activation``.

    Reads the module-level ``X_train``, ``y_train``, ``X_test``, ``test_set``
    and the fitted scaler ``sc``.

    Parameters
    ----------
    units : int
        Units in every LSTM layer.
    activation : str
        Activation for every LSTM layer except the first.
    batch_size : int
        Batch size passed to ``fit``.
    n_layers : int
        Number of intermediate sequence-returning LSTM layers.
    bidirectional : bool
        Wrap every LSTM layer in ``Bidirectional`` when true.
    epochs : int
        Number of training epochs.

    Returns
    -------
    tuple
        ``(model, error)`` where ``error`` is the MAPE (in percent) of the
        inverse-scaled predictions against ``test_set``.
    """
    def recurrent(layer):
        # Single switch point for the bidirectional option.
        return Bidirectional(layer) if bidirectional else layer

    model = Sequential()

    # Input layer: one feature per timestep, emits the full sequence.
    model.add(recurrent(LSTM(units=units, return_sequences=True, input_shape=(X_train.shape[1], 1))))

    # Intermediate layers keep passing full sequences down the stack.
    for _ in range(n_layers):
        model.add(recurrent(LSTM(units=units, activation=activation, return_sequences=True)))

    # Final recurrent layer returns only its last output, feeding the regressor.
    model.add(recurrent(LSTM(units=units, activation=activation)))
    model.add(Dense(units=1))

    model.compile(optimizer='adam', loss='mse')
    model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size)

    # Predictions come out in scaled space; map them back before scoring.
    scaled_predictions = model.predict(X_test)
    predicted_cases = sc.inverse_transform(scaled_predictions)

    return model, mape(test_set, predicted_cases)

In [13]:
def hp_tuning(param_grid=None, epochs=30):
    """Grid-search ``build_model`` and return the best configuration found.

    Parameters
    ----------
    param_grid : dict, optional
        Mapping accepted by ``ParameterGrid`` with the keys ``units``,
        ``activation``, ``batch_size``, ``n_layers`` and ``bidirectional``.
        Defaults to the grid originally hard-coded in this notebook.
    epochs : int, optional
        Training epochs per candidate (default 30).

    Returns
    -------
    tuple
        ``(model, err, arch)``: the trained model with the lowest error, that
        error, and the parameter dict that produced it. If no candidate
        produces a comparable error (empty grid, or every error is NaN) the
        result is ``(None, inf, None)``.

    Notes
    -----
    Every ``(model, error)`` pair is also appended to the module-level list
    ``results``. That list holds a reference to every trained model, so it
    grows with the size of the grid.
    """
    if param_grid is None:
        param_grid = {
            'units': [150, 200, 250, 300],
            'activation': ['relu', 'swish'],
            'batch_size': [16, 32],
            'n_layers': [2, 3, 4],
            'bidirectional': [True]
        }

    # `results` is not defined anywhere in the visible notebook, so on a fresh
    # kernel the append below used to raise NameError. Create the list on first
    # use, but keep appending to an existing one if the caller made it.
    trial_log = globals().setdefault('results', [])

    best_model = None
    # inf instead of a finite sentinel: a MAPE can legitimately exceed 1000%.
    best_error = float('inf')
    best_params = None

    for candidate in ParameterGrid(param_grid):
        print(candidate)
        model_in, err_in = build_model(
            candidate['units'],
            candidate['activation'],
            candidate['batch_size'],
            candidate['n_layers'],
            candidate['bidirectional'],
            epochs,
        )
        trial_log.append((model_in, err_in))
        print(f'Model has an error of {err_in}%')
        if err_in < best_error:
            best_model = model_in
            best_error = err_in
            best_params = candidate

    return best_model, best_error, best_params

In [14]:
# Train a single configuration for 100 epochs and show its test-set MAPE (percent).
model, error = build_model(
    units=150,
    activation='relu',
    batch_size=16,
    n_layers=3,
    bidirectional=True,
    epochs=100,
)
error

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

14.491445347918924