## COVID-19 Forecasting

Dataset found here: https://www.kaggle.com/datasets/sudalairajkumar/novel-corona-virus-2019-dataset

The goal is to forecast the progression of the pandemic from time-series data.

In [None]:
%%capture
!pip install pycountry
!pip install pmdarima
!pip install fbprophet

In [None]:
import pandas as pd

# Load the Kaggle export and discard its serial-number column in one step;
# the cell ends with a preview of the first rows.
df = pd.read_csv('/content/covid19.csv').drop(columns=['SNo'])
df.head()

In [None]:
# Summarise the loaded frame: column dtypes, non-null counts and memory usage.
df.info()
# For per-column missing-value totals, run: df.isnull().sum()

In [None]:
# Parse the observation dates so they group, sort and compare chronologically.
df['ObservationDate'] = pd.to_datetime(df['ObservationDate'])


def _summarise_cases(frame, key):
    """Sum the case counts of `frame` per `key` and derive Active / Closed.

    Active = Confirmed - Recovered - Deaths; Closed = Recovered + Deaths.
    """
    totals = frame.groupby([key]).agg({'Confirmed': 'sum', 'Recovered': 'sum', 'Deaths': 'sum'})
    totals['Active'] = totals['Confirmed'] - totals['Recovered'] - totals['Deaths']
    totals['Closed'] = totals['Recovered'] + totals['Deaths']
    return totals


# Worldwide totals, one row per observation date.
datewise = _summarise_cases(df, 'ObservationDate')

# Per-country totals, built only from the rows of the most recent observation date.
countrywise = _summarise_cases(df[df['ObservationDate'] == df['ObservationDate'].max()], 'Country/Region')

In [None]:
def _latest_total(column):
    """Value of `column` on the last row of `datewise`, truncated to an int."""
    return int(datewise[column].iloc[-1])


def _per_day(column):
    """Last value of `column` divided by the number of rows (days) in `datewise`."""
    return int(datewise[column].iloc[-1] / datewise.shape[0])


# Headline figures taken from the most recent observation date.
print(f"Total Number of Countries with Cases: {len(df['Country/Region'].unique())}")
print(f"Total Number of Global Confirmed Cases: {_latest_total('Confirmed')}")
print(f"Total Number of Global Recovered Cases: {_latest_total('Recovered')}")
print(f"Total Number of Global Deaths: {_latest_total('Deaths')}")
print(f"Total Number of Global Active Cases: {_latest_total('Active')}")
print(f"Total Number of Global Closed Cases: {_latest_total('Closed')}\n")

# Rough daily averages: the latest cumulative total spread over every observed day.
print(f"Approximate Number of Daily Global Confirmed Cases: {_per_day('Confirmed')}")
print(f"Approximate Number of Daily Global Recovered Cases: {_per_day('Recovered')}")
print(f"Approximate Number of Daily Global Deaths: {_per_day('Deaths')}")

In [None]:
import plotly.express as px


def _daily_bar(column, title):
    """Bar chart of one `datewise` column against the observation date."""
    return px.bar(
        x=datewise.index,
        y=datewise[column],
        title=title,
        labels={'x': 'Date', 'y': 'Number of Cases'},
    )


fig = _daily_bar('Active', "Distribution of Active Cases")
fig.show()

fig2 = _daily_bar('Closed', "Distribution of Closed Cases")
fig2.show()

In [None]:
import plotly.graph_objects as go

# Bar chart of the three worldwide series, one trace per metric.
fig = go.Figure()
fig.add_trace(go.Bar(x=datewise.index, y=datewise['Confirmed'], name='Confirmed', marker_color='Blue'))
fig.add_trace(go.Bar(x=datewise.index, y=datewise['Deaths'], name='Deaths', marker_color='Red'))
fig.add_trace(go.Bar(x=datewise.index, y=datewise['Recovered'], name='Recovered', marker_color='Green'))
fig.update_layout(title="COVID-19 Confirmed, Deaths, Recovered Bar Graph", xaxis_title="Date", yaxis_title="Number of Cases")
fig.show()

# The same three series drawn as lines.
fig2 = go.Figure()
fig2.add_trace(go.Scatter(x=datewise.index, y=datewise['Confirmed'], name='Confirmed', marker_color='Blue'))
fig2.add_trace(go.Scatter(x=datewise.index, y=datewise['Deaths'], name='Deaths', marker_color='Red'))
fig2.add_trace(go.Scatter(x=datewise.index, y=datewise['Recovered'], name='Recovered', marker_color='Green'))
# The line-graph layout belongs to fig2. Applying it to `fig` would leave the
# line chart without a title or axis labels and retitle the bar chart instead.
fig2.update_layout(title="COVID-19 Confirmed, Deaths, Recovered Line Graph", xaxis_title="Date", yaxis_title="Number of Cases")
fig2.show()

In [None]:
# Express deaths and recoveries as a percentage of confirmed cases, per day.
datewise['Mortality Rate'] = datewise['Deaths'].div(datewise['Confirmed']).mul(100)
datewise['Recovery Rate'] = datewise['Recovered'].div(datewise['Confirmed']).mul(100)


def _rate_figure(column, colour):
    """Line figure of one `datewise` rate column, titled with the column name."""
    figure = go.Figure()
    figure.add_trace(go.Scatter(x=datewise.index, y=datewise[column], marker_color=colour))
    figure.update_layout(title=column, xaxis_title="Date", yaxis_title="Rate")
    return figure


fig = _rate_figure('Mortality Rate', 'Red')
fig.show()

fig2 = _rate_figure('Recovery Rate', 'Blue')
fig2.show()

Findings:
- Cases are still on the rise (as of May 2021)
- The mortality rate has decreased significantly
- The recovery rate is outpacing the mortality rate

In [None]:
# Rank countries by their totals and keep the 25 largest of each metric.
top_25_c = countrywise.sort_values(['Confirmed'], ascending=False).head(25)
top_25_d = countrywise.sort_values(['Deaths'], ascending=False).head(25)

fig = px.bar(x=top_25_c['Confirmed'], y=top_25_c.index, title="25 Countries with Most Confirmed Cases", labels={'x': 'Number of Cases', 'y': 'Country'}, color=top_25_c.index)
fig.show()

# Colour by the countries actually plotted (top_25_d). Using the confirmed-cases
# ranking here would pair each bar with the colour and legend entry of a
# different country.
fig2 = px.bar(x=top_25_d['Deaths'], y=top_25_d.index, title="25 Countries with Most Deaths", labels={'x': 'Number of Deaths', 'y': 'Country'}, color=top_25_d.index)
fig2.show()

In [None]:
def _largest_25(column):
    """The 25 rows of `countrywise` with the highest values in `column`."""
    return countrywise.sort_values([column], ascending=False).head(25)


def _country_bar(ranked, column, title):
    """Horizontal bar chart of `column` per country, one colour per country."""
    return px.bar(
        x=ranked[column],
        y=ranked.index,
        title=title,
        labels={'x': 'Number of Cases', 'y': 'Country'},
        color=ranked.index,
    )


top_25_act = _largest_25('Active')
top_25_cl = _largest_25('Closed')

fig = _country_bar(top_25_act, 'Active', "25 Countries with Most Active Cases")
fig.show()

fig2 = _country_bar(top_25_cl, 'Closed', "25 Countries with Most Closed Cases")
fig2.show()

In [None]:
# Per-country deaths and recoveries as a percentage of confirmed cases.
countrywise['Mortality Rate'] = countrywise['Deaths'].div(countrywise['Confirmed']).mul(100)
countrywise['Recovery Rate'] = countrywise['Recovered'].div(countrywise['Confirmed']).mul(100)


def _highest_25_by(rate):
    """Top 25 countries by `rate`, restricted to more than 1000 confirmed cases."""
    eligible = countrywise[countrywise['Confirmed'] > 1000]
    return eligible.sort_values([rate], ascending=False).head(25)


top_25_mr = _highest_25_by('Mortality Rate')
top_25_rr = _highest_25_by('Recovery Rate')

fig = px.bar(
    x=top_25_mr['Mortality Rate'],
    y=top_25_mr.index,
    title="25 Countries (1000+ Cases) with Highest Mortality Rate",
    labels={'x': 'Rate', 'y': 'Country'},
    color=top_25_mr.index,
)
fig.show()

fig2 = px.bar(
    x=top_25_rr['Recovery Rate'],
    y=top_25_rr.index,
    title="25 Countries (1000+ Cases) with Highest Recovery Rate",
    labels={'x': 'Rate', 'y': 'Country'},
    color=top_25_rr.index,
)
fig2.show()

In [None]:
def _lowest_25_by(rate):
    """Last 25 rows of the descending `rate` ranking (countries with 1000+ cases)."""
    eligible = countrywise[countrywise['Confirmed'] > 1000]
    return eligible.sort_values([rate], ascending=False).tail(25)


bottom_25_mr = _lowest_25_by('Mortality Rate')
bottom_25_rr = _lowest_25_by('Recovery Rate')

fig = px.bar(
    x=bottom_25_mr['Mortality Rate'],
    y=bottom_25_mr.index,
    title="25 Countries (1000+ Cases) with Lowest Mortality Rate",
    labels={'x': 'Rate', 'y': 'Country'},
    color=bottom_25_mr.index,
)
fig.show()

fig2 = px.bar(
    x=bottom_25_rr['Recovery Rate'],
    y=bottom_25_rr.index,
    title="25 Countries (1000+ Cases) with Lowest Recovery Rate",
    labels={'x': 'Rate', 'y': 'Country'},
    color=bottom_25_rr.index,
)
fig2.show()

In [None]:
import pycountry

# One two-column frame (country, metric) per metric.
c = countrywise['Confirmed'].reset_index()
r = countrywise['Recovered'].reset_index()
d = countrywise['Deaths'].reset_index()

# Dataset spellings that differ from the names used by pycountry.
name_fixes = {'Mainland China': 'China', 'UK': 'United Kingdom', 'US': 'United States'}

map_dfs = [c, r, d]
for _df in map_dfs:
  # Assign the result back instead of calling replace(..., inplace=True) on the
  # selected column: an in-place call on a column selection is a chained update
  # that pandas does not guarantee will reach the parent frame.
  _df['Country/Region'] = _df['Country/Region'].replace(name_fixes)

# Lookup table: pycountry country name -> ISO 3166-1 alpha-3 code.
countries = {country.name: country.alpha_3 for country in pycountry.countries}

# Names missing from the lookup table map to None (dict.get default).
# NOTE(review): confirm how many dataset names have no pycountry match.
c['iso_alpha'] = c['Country/Region'].map(countries.get)
r['iso_alpha'] = r['Country/Region'].map(countries.get)
d['iso_alpha'] = d['Country/Region'].map(countries.get)

c = c[['iso_alpha', 'Confirmed', 'Country/Region']]
r = r[['iso_alpha', 'Recovered']]
d = d[['iso_alpha', 'Deaths']]

# Bubble map: one marker per country, sized by its confirmed-case total.
fig = px.scatter_geo(c, locations='iso_alpha', color='Country/Region', hover_name='iso_alpha', size='Confirmed', projection='natural earth', title="Worldwide Confirmed Cases")
fig.show()

Findings:
- The US and India lead the world in total confirmed, active, and closed cases, with Brazil and Turkey close behind
- US recovery data was redacted due to misreporting
- China's data is underreported
- The recovery rate is highest and the mortality rate is lowest for many smaller countries
- Larger countries like the UK, France, and Spain are just entering the pandemic

In [None]:
from sklearn.cluster import KMeans
import numpy as np

# Clustering features: one row per country, described by its two rates.
X1 = countrywise[['Mortality Rate', 'Recovery Rate']]

# Fit one model per candidate cluster count (2 through 10) and record its inertia.
inertia = []
for i in range(2, 11):
    kmeans = KMeans(n_clusters=i, init='k-means++', random_state=14).fit(X1)
    inertia += [kmeans.inertia_]

# Plot inertia against the number of clusters to look for the "elbow".
fig = px.line(
    x=np.arange(2, 11),
    y=inertia,
    title="Elbow Method",
    labels={'x': 'Number of Clusters', 'y': 'Inertia'},
    markers=True,
)
fig.show()

In [None]:
# Final model with three clusters.
kmeans = KMeans(n_clusters=3, init='k-means++', random_state=14)
kmeans.fit(X1)

# Shift the predicted labels by one so the clusters are reported as 1, 2, 3.
countrywise['Cluster'] = kmeans.predict(X1) + 1

cluster_ids = (1, 2, 3)

# Mean rates per cluster.
for cluster_id in cluster_ids:
    members = countrywise[countrywise['Cluster'] == cluster_id]
    print(f"Average Mortality Rate of Cluster {cluster_id}: {members['Mortality Rate'].mean()}")
    print(f"Average Recovery Rate of Cluster {cluster_id}: {members['Recovery Rate'].mean()}\n")

# Up to ten example countries per cluster.
for cluster_id in cluster_ids:
    members = countrywise[countrywise['Cluster'] == cluster_id]
    # Cap the sample at the cluster size: sampling more rows than exist raises
    # a ValueError. The fixed seed keeps the listing identical across re-runs.
    examples = members.sample(min(10, len(members)), random_state=14)
    print(f"Sample of Countries in Cluster {cluster_id}: {list(examples.index)}")

In [None]:
# Cluster ids are category labels, not quantities. Passing them as strings makes
# Plotly Express draw one discrete colour and legend entry per cluster instead
# of a continuous colour bar.
fig = px.scatter(x=countrywise['Recovery Rate'], y=countrywise['Mortality Rate'], title="Country Clusters", labels={'x': 'Recovery Rate', 'y': 'Mortality Rate', 'color': 'Cluster'}, color=countrywise['Cluster'].astype(str))
fig.show()

Findings:
- At this point in the pandemic, the mortality rate is relatively controlled, so the clusters are determined by recovery rate more than by mortality rate
  - Cluster 1: High recovery --> past the worst of the pandemic, or just starting it if there are too few data points
  - Cluster 2: Low recovery --> just starting the pandemic
  - Cluster 3: Medium recovery --> in the middle of the pandemic



In [None]:
import numpy as np
from pmdarima.arima import auto_arima

# Chronological hold-out: the first 95% of the days train the model and the
# remaining days validate it. The split position is computed once.
split_at = int(datewise.shape[0] * 0.95)
train = datewise.iloc[:split_at]
# Copy the hold-out rows so that adding the prediction column below writes to
# an independent frame rather than to a slice of `datewise`.
test = datewise.iloc[split_at:].copy()

# auto_arima searches the model orders stepwise and returns the selected model
# already fitted to the training series, so no separate fit() call is needed.
# NOTE(review): `seasonal=True` is passed without a seasonal period `m`;
# confirm whether a seasonal search is actually intended.
model = auto_arima(train['Confirmed'], trace=True, error_action='ignore', suppress_warnings=True, stepwise=True, seasonal=True)

pred = model.predict(len(test))
# Assign by position. If `pred` comes back as an indexed Series whose index
# does not match the test dates, label alignment would fill the column with NaN.
test['ARIMA Model Prediction'] = np.asarray(pred)

In [None]:
import plotly.graph_objects as go

# Training history, hold-out actuals and ARIMA forecast as three line traces.
fig = go.Figure(data=[
    go.Scatter(x=train.index, y=train['Confirmed'], name="Training Data for Confirmed Cases"),
    go.Scatter(x=test.index, y=test['Confirmed'], name="Validation Data for Confirmed Cases"),
    go.Scatter(x=test.index, y=test['ARIMA Model Prediction'], name="Prediction for Confirmed Cases"),
])
fig.update_layout(
    title="Confirmed Cases ARIMA Model Prediction",
    xaxis_title="Date",
    yaxis_title="Confirmed Cases",
)
fig.show()

In [None]:
from prophet import Prophet

model2 = Prophet(interval_width=0.95, yearly_seasonality=False, daily_seasonality=False)
# Training frame with the two columns used for fitting: `ds` (date) and `y`
# (value). `to_numpy()` drops the date index so the frame gets a plain
# positional index.
prophet_df = pd.DataFrame({'ds': datewise.index, 'y': datewise['Confirmed'].to_numpy()})
model2.fit(prophet_df)

# Extend the timeline by 15 periods past the last observation and predict over
# the whole range (history plus future).
future_df = model2.make_future_dataframe(periods=15)
forecast = model2.predict(future_df)

# Keep the returned figures in variables instead of print()-ing them: printing
# only emits the objects' text repr. The figures themselves are expected to be
# rendered by the notebook's matplotlib backend.
forecast_fig = model2.plot(forecast)
components_fig = model2.plot_components(forecast)