In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import plotly.express as px
import warnings
import ipywidgets as widgets
from ipywidgets import interact, interactive, interact_manual
import plotly.graph_objects as go
import folium

# Supress scientific notation/warning
pd.set_option('display.float_format', lambda x: '%.5f' % x)
warnings.filterwarnings("ignore")

## Load Data

In [2]:
# Confirmed Global Cases
confirmed_df = pd.read_csv("https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv")
# Number of Death Cases - Global
deaths_df = pd.read_csv("https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv")
# Number of recovered cases - Global
recovered_df = pd.read_csv("https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv")
# US Confirmed Cases
us_confirmed_df = pd.read_csv("https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_US.csv")
# US Death Cases
us_death_df = pd.read_csv("https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_US.csv")
# Country-specific Data
country_df = pd.read_csv("https://raw.githubusercontent.com/CSSEGISandData/COVID-19/web-data/data/cases_country.csv")

### Rename column names to lower case

In [3]:
confirmed_df.columns = confirmed_df.columns.str.lower()
deaths_df.columns = deaths_df.columns.str.lower()
recovered_df.columns = recovered_df.columns.str.lower()
us_confirmed_df.columns = us_confirmed_df.columns.str.lower()
us_death_df.columns = us_death_df.columns.str.lower()
country_df.columns = country_df.columns.str.lower()

### Explore the aggregate numbers

In [4]:
# Number of global confirmed cases. Use last column's value as they are the latest
print(f"Number of global confirmed cases:{confirmed_df.iloc[:,-1:].sum().values[0]:,}")

Number of global confirmed cases:33,641,553


In [5]:
# Number of global deads. Use last column's value as they are the latest
print(f"Number of global deads:{deaths_df.iloc[:,-1:].sum().values[0]:,}")

Number of global deads:1,007,755


In [6]:
# Number of global recovered cases. Use last column's value as they are the latest
print(f"Number of recovered cases:{recovered_df.iloc[:,-1:].sum().values[0]:,}")

Number of recovered cases:23,387,690


### Countries Sorted by total number of confirmed cases

In [7]:
def highlight_column(s, column_dict):
    """Change the color in columns if passed on column_dict"""
    if (s.name in column_dict.keys()):
        return ['background-color: {}'.format(column_dict[s.name])] * len(s)
    return [''] * len(s)

In [8]:
@interact
def number_of_rows(row_max=len(country_df)): 
    if(row_max<0):
        row_max=0
    return country_df.sort_values(by="confirmed", ascending=False).head(row_max).style.apply(highlight_column,column_dict={'confirmed':'grey','deaths':'red','recovered':'green'})

interactive(children=(IntSlider(value=188, description='row_max', max=564, min=-188), Output()), _dom_classes=…

In [9]:
@interact
def number_of_countries(row_max=len(country_df)): 
    if(row_max<0):
        row_max=0
    countries_to_show_df = country_df.sort_values(by="confirmed", ascending=False).head(row_max)
    fig = px.scatter(countries_to_show_df, x="confirmed", y="deaths",size="deaths", color="country_region",
                 hover_name="country_region", log_x=True, size_max=60)
    fig.show()

interactive(children=(IntSlider(value=188, description='row_max', max=564, min=-188), Output()), _dom_classes=…

In [10]:
confirmed_df.rename(columns={"country/region": "country"}, inplace=True)

In [11]:
deaths_df.rename(columns={"country/region": "country"}, inplace=True)

In [12]:
# Number of confirmed cases
confirmed_df_y=confirmed_df.query("country=='US'").loc[:,'1/22/20'::].values[0]
# X values - dates
confirmed_df_x=confirmed_df.query("country=='US'").loc[:,'1/22/20'::].columns

In [13]:
# Number of deads
deaths_df_y=deaths_df.query("country=='US'").loc[:,'1/22/20'::].values[0]
# X values - dates
deaths_df_x=deaths_df.query("country=='US'").loc[:,'1/22/20'::].columns

In [14]:
confirmed_by_country = confirmed_df.groupby(['country']).sum()
deaths_by_country = deaths_df.groupby(['country']).sum()

In [15]:
@interact
def show_by_country(country=confirmed_by_country.index): 
    """Show confirmed and deaths per country"""
    # Number of confirmed cases
    confirmed_df_y=confirmed_by_country.query(f"country=='{country}'").loc[:,'1/22/20'::].values[0]
    # X values - dates
    confirmed_df_x=confirmed_by_country.query(f"country=='{country}'").loc[:,'1/22/20'::].columns
    # Number of deads
    deaths_df_y=deaths_by_country.query(f"country=='{country}'").loc[:,'1/22/20'::].values[0]
    # X values - dates
    deaths_df_x=deaths_by_country.query(f"country=='{country}'").loc[:,'1/22/20'::].columns
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=confirmed_df_x,y=confirmed_df_y,mode='lines+markers', name='confirmed'))
    fig.add_trace(go.Scatter(x=deaths_df_x,y=deaths_df_y,mode='markers',name='deaths'))
    fig.show()

interactive(children=(Dropdown(description='country', options=('Afghanistan', 'Albania', 'Algeria', 'Andorra',…

In [16]:
# Plot top 10 worst hit countries

In [16]:
#Confirmed Cases
top_10_confirmed = country_df.sort_values(by="confirmed", ascending=False).head(10)
fig2 = px.scatter(top_10_confirmed, 
                 x="confirmed", y="deaths",size="confirmed", color="country_region",
                 hover_name="country_region", log_x=True, size_max=60)
fig2.show()

In [17]:
#Dead Cases
top_10_death = country_df.sort_values(by="deaths", ascending=False).head(10)
fig3 = px.scatter(top_10_death, 
                 x="deaths", y="confirmed",size="deaths", color="country_region",
                 hover_name="country_region", log_x=True, size_max=60)
fig3.show()

In [18]:
#Active Cases
top_10_active = country_df.sort_values(by="active", ascending=False).head(10)
fig4 = px.scatter(top_10_active, 
                 x="active", y="confirmed",size="active", color="country_region",
                 hover_name="country_region", log_x=True, size_max=60)
fig4.show()

In [19]:
#Active Cases
top_10_recovered = country_df.sort_values(by="recovered", ascending=False).head(10)
fig5 = px.scatter(top_10_recovered, 
                 x="recovered", y="confirmed",size="recovered", color="country_region",
                 hover_name="country_region", log_x=True, size_max=60)
fig5.show()

In [20]:
country_df.sort_values(by="mortality_rate", ascending=False).head()

Unnamed: 0,country_region,last_update,lat,long_,confirmed,deaths,recovered,active,incident_rate,people_tested,people_hospitalized,mortality_rate,uid,iso3
185,Yemen,2020-10-01 02:23:36,15.55273,48.51639,2034.0,587.0,1286.0,161.0,6.81956,,,28.85939,887,YEM
104,MS Zaandam,2020-10-01 02:23:36,,,9.0,2.0,,7.0,,,,22.22222,8888,
85,Italy,2020-10-01 02:23:36,41.8719,12.5674,314861.0,35894.0,227704.0,51263.0,520.75997,,,11.39995,380,ITA
113,Mexico,2020-10-01 02:23:36,23.6345,-102.5528,743216.0,77646.0,628007.0,37563.0,581.58127,,,10.4473,484,MEX
184,Western Sahara,2020-10-01 02:23:36,24.2155,-12.8858,10.0,1.0,8.0,1.0,1.67412,,,10.0,732,ESH


In [21]:
#Mortality Rate
top_10_mortality_rate = country_df.sort_values(by="mortality_rate", ascending=False).head(10)
fig6 = px.scatter(top_10_mortality_rate, 
                 x="mortality_rate", y="incident_rate",size="mortality_rate", color="country_region",
                 hover_name="country_region", log_x=True, size_max=60)
fig6.show()

In [22]:
country_df.head()

Unnamed: 0,country_region,last_update,lat,long_,confirmed,deaths,recovered,active,incident_rate,people_tested,people_hospitalized,mortality_rate,uid,iso3
0,Afghanistan,2020-10-01 02:23:36,33.93911,67.70995,39268.0,1458.0,32789.0,5021.0,100.87252,,,3.71295,4,AFG
1,Albania,2020-10-01 02:23:36,41.1533,20.1683,13649.0,387.0,7847.0,5415.0,474.28591,,,2.83537,8,ALB
2,Algeria,2020-10-01 02:23:36,28.0339,1.6596,51530.0,1736.0,36174.0,13620.0,117.51146,,,3.36891,12,DZA
3,Andorra,2020-10-01 02:23:36,42.5063,1.5218,2050.0,53.0,1432.0,565.0,2653.2065,,,2.58537,20,AND
4,Angola,2020-10-01 02:23:36,-11.2027,17.8739,4972.0,183.0,1941.0,2848.0,15.12797,,,3.68061,24,AGO


In [23]:
country_df.columns

Index(['country_region', 'last_update', 'lat', 'long_', 'confirmed', 'deaths',
       'recovered', 'active', 'incident_rate', 'people_tested',
       'people_hospitalized', 'mortality_rate', 'uid', 'iso3'],
      dtype='object')

In [24]:
import folium

In [25]:
# Filter only those that have lat and long_ values
country_df = country_df[~country_df['lat'].isnull()]
country_df = country_df[~country_df['long_'].isnull()]

In [26]:
max_confirmed = country_df['confirmed'].max()
min_confirmed = country_df['confirmed'].min()
print(max_confirmed)
print(min_confirmed)
print(max_confirmed-min_confirmed)
normalized_radius = (39145-min_confirmed)/(max_confirmed-min_confirmed)
print(normalized_radius,39145-min_confirmed/(max_confirmed-min_confirmed))

7229723.0
10.0
7229713.0
0.005413077946524295 39144.99999861682


In [28]:
m = folium.Map()

for country_region, lat, long_, confirmed, deaths, recovered,mortality_rate in zip(country_df['country_region'],
                                      country_df['lat'],
                                      country_df['long_'],
                                      country_df['confirmed'],
                                      country_df['deaths'],
                                      country_df['recovered'],
                                      country_df['mortality_rate']
                                     ):

    folium.Circle(
        #Normalize and augment 
        radius=((confirmed-min_confirmed)/(max_confirmed-min_confirmed))*1000000,
        location=[lat, long_],
        popup=country_region,
        color='crimson',
        fill=True,
        fill_color = 'red',
        tooltip = f"Country:{country_region}<br/>Confirmed:{confirmed}<br/>Death:{deaths}<br/>Death Rate:{mortality_rate}"
    ).add_to(m)

m

## Forecasting

In [None]:
def get_country_data(country_name, df):
    """Given a country name, return the Series with information"""    
    results = None    
    if(country_name in df.index):  
        results = df.loc[country_name]
        # Convert index to DateTime
        results.index = pd.to_datetime(results.index)
    return results

In [None]:
# Display confirmed cases of Canada (sample)
series = get_country_data("World", world_confirmed_df)
series.plot()
pyplot.show()

In [None]:
def calculate_mse(series,train_size=65, arima_order=(10,1,0)):
    """Given a series, calculate the mse given the arima_order"""
    # Split train, test and predict
    X = series.values
    # Split train/test in rouhgly 65%/35%
    up_to = int(len(X) * 0.65)
    train, test = X[0:up_to], X[up_to:len(X)]
    # initial historical (train) data
    history = [x for x in train]
    # To keep track of predictions
    predictions = []
    for t in tqdm(range(len(test))):
        model = ARIMA(history, order=(10,1,0))
        model_fit = model.fit(disp=0)	
        yhat = model_fit.forecast()[0]
        predictions.append(yhat)
        actual = test[t] 
        # Append real value to history 
        history.append(actual)
        # Uncomment if print desired 
        #print(f'Prediction={yhat}, Expected={actual}')
    # Calculate MSE    
    error = mean_squared_error(test, predictions)
    print('Test MSE: %.3f' % error)
    # plot
    pyplot.plot(test)
    pyplot.plot(predictions, color='red')
    pyplot.show()