Universal Imports

In [None]:
import pandas as pd
import numpy as np 
import warnings
from statsmodels.tools.sm_exceptions import ConvergenceWarning
from statsmodels.tools.sm_exceptions import ValueWarning
warnings.simplefilter('ignore', ConvergenceWarning)
warnings.simplefilter('ignore', ValueWarning)
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
import seaborn as sns
from datetime import date, timedelta
from pandas.plotting import scatter_matrix
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from pandas.plotting import autocorrelation_plot
from statsmodels.tsa.arima.model import ARIMA
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from pmdarima import auto_arima
from yellowbrick.regressor import prediction_error
from yellowbrick.regressor import residuals_plot

In [None]:
# Display labels for the bar charts. The RMSE lists built later in this
# notebook list their values in this same country order.
countries = [
    "India",
    "USA",
    "United Kingdom",
    "Russia",
    "Iran",
]

# PART 1: COMPARING DIFFERENT MODELS


Extracting data from Our World in Data and placing it in a dataframe.

In [None]:
# Source CSV published by Our World in Data.
csv_data = 'https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/owid-covid-data.csv'

# Load only the columns the regression models below need.
df = pd.read_csv(csv_data, usecols=["location", "date", "new_cases", "new_tests", "new_deaths"])
df['date'] = pd.to_datetime(df['date'])

# Keep only rows where every selected column has a value.
df = df.dropna()

# Check the cleaning step explicitly instead of rendering the whole boolean
# frame returned by df.isna().
assert not df.isna().any().any(), "missing values remain after dropna()"

# Only the last expression of a cell is displayed, so the preview goes last.
df.head()

Splitting the data by country into features and target (new_cases)

In [None]:
def _country_frames(location):
    """Return ``(rows, features, target)`` for one location in ``df``.

    rows     -- every row of ``df`` whose ``location`` equals ``location``
    features -- ``rows`` without the ``location``, ``date`` and ``new_cases`` columns
    target   -- an independent copy of the ``new_cases`` column of ``rows``
    """
    rows = df[df["location"] == location]
    features = rows.drop(columns=["location", "date", "new_cases"])
    target = rows["new_cases"].copy()
    return rows, features, target


# India
data_ind, data_ind_features, data_ind_target = _country_frames("India")

# United States
data_usa, data_usa_features, data_usa_target = _country_frames("United States")

# United Kingdom
data_uk, data_uk_features, data_uk_target = _country_frames("United Kingdom")

# Russia
data_russia, data_russia_features, data_russia_target = _country_frames("Russia")

# Iran
data_iran, data_iran_features, data_iran_target = _country_frames("Iran")



Feature Scaling

In [None]:
# One scaler object is re-fitted for each country in turn, so after this cell
# it holds the statistics of the last country (Iran).
# NOTE(review): each fit uses the country's full feature table, i.e. before the
# train/test split performed later, so test rows influence the scaling.
scaler = StandardScaler()
(
    data_ind_features_sc,
    data_usa_features_sc,
    data_uk_features_sc,
    data_russia_features_sc,
    data_iran_features_sc,
) = (
    scaler.fit_transform(country_features)
    for country_features in (
        data_ind_features,
        data_usa_features,
        data_uk_features,
        data_russia_features,
        data_iran_features,
    )
)

Dividing into training set and test set

In [None]:
# train_test_split shuffles rows by default; fixing the seed makes that shuffle
# (and every RMSE computed from these splits) repeatable across kernel restarts.
SPLIT_SEED = 42
TEST_FRACTION = 0.2  # share of each country's rows held out for testing

train_india_x, test_india_x, train_india_y, test_india_y = train_test_split(
    data_ind_features_sc, data_ind_target, test_size=TEST_FRACTION, random_state=SPLIT_SEED
)
train_usa_x, test_usa_x, train_usa_y, test_usa_y = train_test_split(
    data_usa_features_sc, data_usa_target, test_size=TEST_FRACTION, random_state=SPLIT_SEED
)
train_uk_x, test_uk_x, train_uk_y, test_uk_y = train_test_split(
    data_uk_features_sc, data_uk_target, test_size=TEST_FRACTION, random_state=SPLIT_SEED
)
train_russia_x, test_russia_x, train_russia_y, test_russia_y = train_test_split(
    data_russia_features_sc, data_russia_target, test_size=TEST_FRACTION, random_state=SPLIT_SEED
)
train_iran_x, test_iran_x, train_iran_y, test_iran_y = train_test_split(
    data_iran_features_sc, data_iran_target, test_size=TEST_FRACTION, random_state=SPLIT_SEED
)

Creating Linear Regression Models for each country:   

In [None]:
# One independent, default-configured linear regression per country.
india_lreg, usa_lreg, russia_lreg, uk_lreg, iran_lreg = (
    LinearRegression() for _ in range(5)
)

Fitting Models

In [None]:
# Fit every country's model on that country's own training split.
for estimator, features, target in (
    (india_lreg, train_india_x, train_india_y),
    (usa_lreg, train_usa_x, train_usa_y),
    (russia_lreg, train_russia_x, train_russia_y),
    (uk_lreg, train_uk_x, train_uk_y),
    (iran_lreg, train_iran_x, train_iran_y),
):
    estimator.fit(features, target)

# fit() returns the estimator itself, so this shows the same cell output as
# ending the cell on the last fit() call.
iran_lreg

Defining a helper that calculates RMSE

In [None]:
def calcRMSE(model, x, y):
    """Return the root mean squared error of ``model`` on ``(x, y)``.

    model -- a fitted estimator exposing ``predict``
    x     -- feature matrix passed to ``model.predict``
    y     -- true target values compared against the predictions
    """
    mse = mean_squared_error(y, model.predict(x))
    return np.sqrt(mse)

Calculating RMSE of the Linear Regression models on the test sets

In [None]:
# Test-set RMSE of each country's linear regression model.
india_lreg_rmse = calcRMSE(india_lreg, test_india_x, test_india_y)
usa_lreg_rmse = calcRMSE(usa_lreg, test_usa_x, test_usa_y)
uk_lreg_rmse = calcRMSE(uk_lreg, test_uk_x, test_uk_y)
russia_lreg_rmse = calcRMSE(russia_lreg, test_russia_x, test_russia_y)
iran_lreg_rmse = calcRMSE(iran_lreg, test_iran_x, test_iran_y)

# Ordered India, USA, UK, Russia, Iran for the bar charts.
rmse_for_lreg = [
    india_lreg_rmse,
    usa_lreg_rmse,
    uk_lreg_rmse,
    russia_lreg_rmse,
    iran_lreg_rmse,
]

print(rmse_for_lreg)

Creating Random Forest Regressor Models

In [None]:
# Random forests are stochastic; a fixed seed keeps the fitted models and the
# RMSE comparison that follows reproducible across kernel restarts.
RFR_SEED = 42

india_rfr = RandomForestRegressor(random_state=RFR_SEED)
usa_rfr = RandomForestRegressor(random_state=RFR_SEED)
uk_rfr = RandomForestRegressor(random_state=RFR_SEED)
russia_rfr = RandomForestRegressor(random_state=RFR_SEED)
iran_rfr = RandomForestRegressor(random_state=RFR_SEED)

Fitting the models

In [None]:
# Fit every country's random forest on that country's own training split.
for forest, features, target in (
    (india_rfr, train_india_x, train_india_y),
    (usa_rfr, train_usa_x, train_usa_y),
    (uk_rfr, train_uk_x, train_uk_y),
    (russia_rfr, train_russia_x, train_russia_y),
    (iran_rfr, train_iran_x, train_iran_y),
):
    forest.fit(features, target)

# fit() returns the estimator itself, so this shows the same cell output as
# ending the cell on the last fit() call.
iran_rfr

Calculating RMSE

In [None]:
# Test-set RMSE of each country's random forest model.
india_rfr_rmse = calcRMSE(india_rfr, test_india_x, test_india_y)
usa_rfr_rmse = calcRMSE(usa_rfr, test_usa_x, test_usa_y)
uk_rfr_rmse = calcRMSE(uk_rfr, test_uk_x, test_uk_y)
russia_rfr_rmse = calcRMSE(russia_rfr, test_russia_x, test_russia_y)
iran_rfr_rmse = calcRMSE(iran_rfr, test_iran_x, test_iran_y)

# Ordered India, USA, UK, Russia, Iran for the bar charts.
rmse_for_rfr = [
    india_rfr_rmse,
    usa_rfr_rmse,
    uk_rfr_rmse,
    russia_rfr_rmse,
    iran_rfr_rmse,
]
print(rmse_for_rfr)

Visualizing the 2 algorithms and comparing them.

Generating predictions on the test set with the Linear Regression models.

In [None]:
# Test-set predictions of each country's linear regression model.
(
    pred_india_lreg,
    pred_usa_lreg,
    pred_uk_lreg,
    pred_russia_lreg,
    pred_iran_lreg,
) = (
    estimator.predict(features)
    for estimator, features in (
        (india_lreg, test_india_x),
        (usa_lreg, test_usa_x),
        (uk_lreg, test_uk_x),
        (russia_lreg, test_russia_x),
        (iran_lreg, test_iran_x),
    )
)

Plotting the curves of cases in the selected countries. 

In [None]:


# One figure per country: daily new cases against date.
for country_rows, line_colour, heading in (
    (data_ind, 'red', 'India'),
    (data_usa, 'blue', 'United States of America'),
    (data_uk, 'g', 'United Kingdom'),
    (data_russia, 'orange', 'Russia'),
    (data_iran, 'cyan', 'Iran'),
):
    plt.plot(country_rows['date'], country_rows['new_cases'], color=line_colour, linewidth=3)
    plt.title(heading, size=10)
    plt.show()

Prediction Error Graph

In [None]:
# TODO: SUBPLOT
# A titled 10x10 figure is opened for each country immediately before its
# linear-regression prediction-error plot is drawn.
lreg_figures = []
lreg_visualizers = []
for heading, estimator, x_train, y_train, x_test, y_test in (
    ("India", india_lreg, train_india_x, train_india_y, test_india_x, test_india_y),
    ("United States of America", usa_lreg, train_usa_x, train_usa_y, test_usa_x, test_usa_y),
    ("United Kingdom", uk_lreg, train_uk_x, train_uk_y, test_uk_x, test_uk_y),
    ("Russia", russia_lreg, train_russia_x, train_russia_y, test_russia_x, test_russia_y),
    ("Iran", iran_lreg, train_iran_x, train_iran_y, test_iran_x, test_iran_y),
):
    figure = plt.figure(figsize=(10, 10))
    figure.suptitle(heading)
    lreg_figures.append(figure)
    lreg_visualizers.append(prediction_error(estimator, x_train, y_train, x_test, y_test))

# Keep the per-country names available to later cells.
f, f1, f2, f3, f4 = lreg_figures
(
    viz_india_lreg,
    viz_usa_lreg,
    viz_uk_lreg,
    viz_russia_lreg,
    viz_iran_lreg,
) = lreg_visualizers



In [None]:

# A titled 10x10 figure is opened for each country immediately before its
# random-forest prediction-error plot is drawn.
rfr_figures = []
rfr_visualizers = []
for heading, forest, x_train, y_train, x_test, y_test in (
    ("India", india_rfr, train_india_x, train_india_y, test_india_x, test_india_y),
    ("United States of America", usa_rfr, train_usa_x, train_usa_y, test_usa_x, test_usa_y),
    ("United Kingdom", uk_rfr, train_uk_x, train_uk_y, test_uk_x, test_uk_y),
    ("Russia", russia_rfr, train_russia_x, train_russia_y, test_russia_x, test_russia_y),
    ("Iran", iran_rfr, train_iran_x, train_iran_y, test_iran_x, test_iran_y),
):
    figure = plt.figure(figsize=(10, 10))
    figure.suptitle(heading)
    rfr_figures.append(figure)
    rfr_visualizers.append(prediction_error(forest, x_train, y_train, x_test, y_test))

# Keep the per-country names available to later cells.
f_forest, f1_forest, f2_forest, f3_forest, f4_forest = rfr_figures
(
    viz_india_rfr,
    viz_usa_rfr,
    viz_uk_rfr,
    viz_russia_rfr,
    viz_iran_rfr,
) = rfr_visualizers

Comparing RMSE of Each Model of the 2 algorithms.

In [None]:
# TODO: ADD YTICK
fig = plt.figure()
ax = fig.add_axes([0, 0, 1, 1])

# Edge-aligned bars with opposite-signed widths sit on either side of each
# country tick: linear regression to the right, random forest to the left.
for heights, bar_colour, bar_width in (
    (rmse_for_lreg, 'b', 0.25),
    (rmse_for_rfr, 'g', -0.25),
):
    ax.bar(countries, heights, color=bar_colour, width=bar_width, align='edge')

ax.legend(['Linear Regression', 'Random Forest Regressor'], prop={'size': 15})
ax.set_xlabel('Countries')
ax.set_ylabel('Root Mean Squared Error')
ax.set_title("RMSE Scores for Linear Regression and Random Forest Regressor Models")


From the 2 algorithms, we see that Random Forest performs better because it has a lower RMSE for all selected countries.

# Part 2: Forecasting using ARIMA

Taking Countries Location Data

In [None]:
# Same Our World in Data CSV as in Part 1, restricted to the columns ARIMA needs.
csv_data = 'https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/owid-covid-data.csv'
dataframe = (
    pd.read_csv(csv_data, usecols=["location", "date", "new_cases"])
    .assign(date=lambda frame: pd.to_datetime(frame.date))  # parse date strings
    .dropna()  # discard rows missing any of the three columns
)
dataframe.tail()


Extracting Data of selected countries

In [None]:
# Rows of `dataframe` for each selected country, in the order
# India, United States, United Kingdom, Russia, Iran.
(
    data_ind_arima,
    data_usa_arima,
    data_uk_arima,
    data_russia_arima,
    data_iran_arima,
) = (
    dataframe[dataframe['location'] == place]
    for place in ('India', 'United States', 'United Kingdom', 'Russia', 'Iran')
)

Now we'll get the p, d, q values for each country's ARIMA model, using auto_arima from pmdarima.

1. p,d,q for India - (5,2,5)
2. p,d,q for USA - (5,1,4)
3. p,d,q for UK -  (5,2,3)
4. p,d,q for Russia - (5,2,5)
5. p,d,q for Iran - (5,2,5)

In [None]:
# ARIMA (p, d, q) orders per country, as listed in the text above.
# India, Russia and Iran share the same order; tuples are immutable, so one
# shared object is safe.
india_param = russia_param = iran_param = (5, 2, 5)
usa_param = (5, 1, 4)
uk_param = (5, 2, 3)

In [None]:
def evaluateModel(data, param, train_fraction=0.66, verbose=True):
    """Walk-forward validate an ARIMA model and return its RMSE.

    The ``new_cases`` values are split in their existing order: the first
    ``train_fraction`` of them seed the history and the rest form the test
    segment. For every test observation an ARIMA model of order ``param`` is
    fitted on the history so far, a one-step forecast is recorded, and the
    true observation is then appended to the history.

    Parameters
    ----------
    data : DataFrame-like with a ``new_cases`` column.
    param : tuple
        ARIMA ``(p, d, q)`` order, passed as ``order`` to ``ARIMA``.
    train_fraction : float, default 0.66
        Share of the observations used as the initial history.
    verbose : bool, default True
        When True, print each predicted/expected pair.

    Returns
    -------
    float
        Root mean squared error of the one-step forecasts over the test segment.

    Raises
    ------
    ValueError
        If ``train_fraction`` is not strictly between 0 and 1, or if the
        resulting train or test segment is empty.
    """
    if not 0 < train_fraction < 1:
        raise ValueError("train_fraction must be strictly between 0 and 1")

    series = data['new_cases'].values
    split_at = int(len(series) * train_fraction)
    train, test = series[:split_at], series[split_at:]
    if len(train) == 0 or len(test) == 0:
        raise ValueError("not enough observations to form both a train and a test segment")

    history = list(train)
    predictions = []
    # walk-forward validation: refit on everything seen so far, forecast one step
    for obs in test:
        model_fit = ARIMA(history, order=param).fit()
        yhat = model_fit.forecast()[0]
        predictions.append(yhat)
        history.append(obs)
        if verbose:
            print('predicted=%f, expected=%f' % (yhat, obs))

    return np.sqrt(mean_squared_error(test, predictions))

Not to be displayed on the site; just for internal testing.

In [None]:
# Walk-forward RMSE for India, using the order stored in india_param.
india_arima_rmse = evaluateModel(data=data_ind_arima, param=india_param)
print(india_arima_rmse)

In [None]:
# Walk-forward RMSE for the United States, using the order stored in usa_param.
usa_arima_rmse = evaluateModel(data=data_usa_arima, param=usa_param)
print(usa_arima_rmse)

In [None]:
# Walk-forward RMSE for the United Kingdom, using the order stored in uk_param.
uk_arima_rmse = evaluateModel(data=data_uk_arima, param=uk_param)
print(uk_arima_rmse)

In [None]:
# Walk-forward RMSE for Russia, using the order stored in russia_param.
russia_arima_rmse = evaluateModel(data=data_russia_arima, param=russia_param)
print(russia_arima_rmse)

In [None]:
# Walk-forward RMSE for Iran, using the order stored in iran_param.
iran_arima_rmse = evaluateModel(data=data_iran_arima, param=iran_param)
print(iran_arima_rmse)

Now comparing the RMSE of ARIMA and Other Models

In [None]:
# ARIMA RMSE per country, ordered India, USA, UK, Russia, Iran for the bar chart.
rmse_for_arima = [
    india_arima_rmse,
    usa_arima_rmse,
    uk_arima_rmse,
    russia_arima_rmse,
    iran_arima_rmse,
]

In [None]:
# Linear regression vs. random forest RMSE, drawn as paired bars per country.
fig = plt.figure()
ax = fig.add_axes([0, 0, 1, 1])

# Opposite-signed widths with edge alignment put the two bars on either side
# of each country tick.
for heights, bar_colour, bar_width in (
    (rmse_for_lreg, 'b', 0.25),
    (rmse_for_rfr, 'g', -0.25),
):
    ax.bar(countries, heights, color=bar_colour, width=bar_width, align='edge')

ax.legend(['Linear Regression', 'Random Forest Regressor'], prop={'size': 15})
ax.set_xlabel('Countries')
ax.set_ylabel('Root Mean Squared Error')
ax.set_title("RMSE Scores for Linear Regression and Random Forest Regressor Models")

In [None]:
# Random forest vs. ARIMA RMSE, drawn as paired bars per country.
fig = plt.figure()
ax = fig.add_axes([0, 0, 1, 1])

# Opposite-signed widths with edge alignment put the two bars on either side
# of each country tick.
for heights, bar_colour, bar_width in (
    (rmse_for_rfr, 'b', 0.25),
    (rmse_for_arima, 'g', -0.25),
):
    ax.bar(countries, heights, color=bar_colour, width=bar_width, align='edge')

ax.legend(['Random Forest Regressor', 'ARIMA Model'], prop={'size': 15})
ax.set_xlabel('Countries')
ax.set_ylabel('Root Mean Squared Error')
ax.set_title("RMSE Scores for Random Forest Regressor and ARIMA Models")

As you can see, ARIMA gives a lower RMSE than RFR, so we will use ARIMA to forecast 7 days ahead for these countries.

Creating actual models for forecasting.

In [None]:
# Final India model, fitted on the full new_cases series.
# india_param holds the same (5, 2, 5) order used during evaluation.
india_arima = ARIMA(endog=data_ind_arima['new_cases'], order=india_param)
india_arima_fit = india_arima.fit()

In [None]:
# Final United States model, fitted on the full new_cases series.
# usa_param holds the same (5, 1, 4) order used during evaluation.
usa_arima = ARIMA(endog=data_usa_arima['new_cases'], order=usa_param)
usa_arima_fit = usa_arima.fit()

In [None]:
# Final United Kingdom model, fitted on the full new_cases series.
# uk_param holds the same (5, 2, 3) order used during evaluation.
uk_arima = ARIMA(endog=data_uk_arima['new_cases'], order=uk_param)
uk_arima_fit = uk_arima.fit()

In [None]:
# Final Russia model, fitted on the full new_cases series.
# russia_param holds the same (5, 2, 5) order used during evaluation.
russia_arima = ARIMA(endog=data_russia_arima['new_cases'], order=russia_param)
russia_arima_fit = russia_arima.fit()

In [None]:
# Final Iran model, fitted on the full new_cases series.
# iran_param holds the same (5, 2, 5) order used during evaluation.
iran_arima = ARIMA(endog=data_iran_arima['new_cases'], order=iran_param)
iran_arima_fit = iran_arima.fit()

Forecasting the next 7 days for each country with the fitted ARIMA models

In [None]:
# Seven-step-ahead forecast from each country's fitted ARIMA model.
(
    india_forecasts,
    usa_forecasts,
    uk_forecasts,
    russia_forecasts,
    iran_forecasts,
) = (
    fitted.forecast(steps=7)
    for fitted in (
        india_arima_fit,
        usa_arima_fit,
        uk_arima_fit,
        russia_arima_fit,
        iran_arima_fit,
    )
)

Creating a dataframe with the dates and forecasts.

In [None]:
# The next seven calendar dates, starting tomorrow.
# Today's date gets its own name so that the imported `date` class is not
# overwritten by a date instance.
# NOTE(review): these dates are anchored to the day the cell runs, not to the
# last date in the downloaded data - confirm that is the intended x-axis.
today = date.today()
dates = [today + timedelta(days=offset) for offset in range(1, 8)]
print(dates)


In [None]:
# Table of forecast dates alongside India's forecast values.
forecastdict = dict(Date=dates, India=india_forecasts)
forecastdata = pd.DataFrame(data=forecastdict, columns=['Date', 'India'])

print(forecastdata)

In [None]:
# One figure per country showing its 7-day forecast against the forecast dates.
# The forecast variables are named *_forecasts (plural); the singular names
# used here previously were never defined and raised NameError.
for heading, forecast in (
    ('India', india_forecasts),
    ('United States of America', usa_forecasts),
    ('United Kingdom', uk_forecasts),
    ('Russia', russia_forecasts),
    ('Iran', iran_forecasts),
):
    plt.plot(dates, forecast)
    plt.title(heading, size=10)  # label each figure so it can be read on its own
    plt.xlabel('Date')
    plt.ylabel('Forecast new cases')
    plt.show()