# **COVID-19 Outbreak Analysis**

## Data Preprocessing

In [1]:
# All Imports Required Go Here

import requests
from datetime import datetime
from datetime import date
import os
import pandas as pd
import numpy as np
import plotly
import plotly.express as px
import plotly.graph_objects as go
from plotly.offline import init_notebook_mode

In [2]:
# Making sure that the plotly graphs and choropleths can be seen inside the notebook
init_notebook_mode(connected=True)

In [3]:
# Data from the Johns Hopkins University dataset on GitHub
# https://github.com/CSSEGISandData/COVID-19/tree/master/csse_covid_19_data/csse_covid_19_time_series

# Base location of the time series files, followed by the three file names
# (confirmed, deaths, recovered - in that order)
url = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/'

filenames = ['time_series_covid19_confirmed_global.csv',
             'time_series_covid19_deaths_global.csv',
             'time_series_covid19_recovered_global.csv']

# One dataframe per time series file, read in the order the names are listed above
confirmed_global, deaths_global, recovered_global = (
    pd.read_csv(url + name) for name in filenames
)

# Latest per-country summary figures
country_cases = pd.read_csv('https://raw.githubusercontent.com/CSSEGISandData/COVID-19/web-data/data/cases_country.csv')

In [4]:
# Simple Data Cleaning - Removing and renaming the columns

# The province and coordinate columns are not used in this country-level analysis.
# The frames are reassigned instead of mutated in place, and errors = 'ignore' keeps
# the cell re-runnable: running it a second time no longer raises a KeyError for
# columns that were already dropped by the first run.
unused_columns = ['Province/State', 'Lat', 'Long']
confirmed_global = confirmed_global.drop(columns = unused_columns, errors = 'ignore')
deaths_global = deaths_global.drop(columns = unused_columns, errors = 'ignore')
recovered_global = recovered_global.drop(columns = unused_columns, errors = 'ignore')

# Renaming the columns for easier access
confirmed_global = confirmed_global.rename(columns = {"Country/Region": "country"})
deaths_global = deaths_global.rename(columns = {"Country/Region": "country"})
recovered_global = recovered_global.rename(columns = {"Country/Region": "country"})

country_cases = country_cases.rename(columns = {
    "Country_Region" : "country",
    "Last_Update": "last",
    "Confirmed": "confirmed",
    "Deaths": "deaths",
    "Recovered" : "recovered",
    "Active" : "active",
    "Mortality_Rate": "mortality"
})

In [5]:
# Collapsing rows that share the same country into a single row per country
# by summing their values for every date column
confirmed_global = confirmed_global.groupby('country', as_index = False).sum()
deaths_global = deaths_global.groupby('country', as_index = False).sum()
recovered_global = recovered_global.groupby('country', as_index = False).sum()

In [6]:
# This value is being changed as there was an error in the original dataset that had to be modified
# NOTE(review): 178 is a row label of the grouped frame, so the row it addresses depends on
# how many countries the downloaded file contains when the notebook is run - confirm which
# country is intended and consider selecting the row by country name instead.
confirmed_global.at[178, '5/20/20'] = 251667

In [7]:
# Preview the first five rows of the per-country confirmed cases time series
confirmed_global.head()

Unnamed: 0,country,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,1/28/20,1/29/20,1/30/20,...,6/4/20,6/5/20,6/6/20,6/7/20,6/8/20,6/9/20,6/10/20,6/11/20,6/12/20,6/13/20
0,Afghanistan,0,0,0,0,0,0,0,0,0,...,18054,18969,19551,20342,20917,21459,22142,22890,23546,24102
1,Albania,0,0,0,0,0,0,0,0,0,...,1197,1212,1232,1246,1263,1299,1341,1385,1416,1464
2,Algeria,0,0,0,0,0,0,0,0,0,...,9831,9935,10050,10154,10265,10382,10484,10589,10698,10810
3,Andorra,0,0,0,0,0,0,0,0,0,...,852,852,852,852,852,852,852,852,853,853
4,Angola,0,0,0,0,0,0,0,0,0,...,86,86,88,91,92,96,113,118,130,138


In [8]:
# Preview the first five rows of the per-country deaths time series
deaths_global.head()

Unnamed: 0,country,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,1/28/20,1/29/20,1/30/20,...,6/4/20,6/5/20,6/6/20,6/7/20,6/8/20,6/9/20,6/10/20,6/11/20,6/12/20,6/13/20
0,Afghanistan,0,0,0,0,0,0,0,0,0,...,300,309,327,357,369,384,405,426,446,451
1,Albania,0,0,0,0,0,0,0,0,0,...,33,33,34,34,34,34,34,35,36,36
2,Algeria,0,0,0,0,0,0,0,0,0,...,681,690,698,707,715,724,732,741,751,760
3,Andorra,0,0,0,0,0,0,0,0,0,...,51,51,51,51,51,51,51,51,51,51
4,Angola,0,0,0,0,0,0,0,0,0,...,4,4,4,4,4,4,4,5,5,6


In [9]:
# Preview the first five rows of the per-country recovered cases time series
recovered_global.head()

Unnamed: 0,country,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,1/28/20,1/29/20,1/30/20,...,6/4/20,6/5/20,6/6/20,6/7/20,6/8/20,6/9/20,6/10/20,6/11/20,6/12/20,6/13/20
0,Afghanistan,0,0,0,0,0,0,0,0,0,...,1585,1762,1830,1875,2171,2651,3013,3326,3928,4201
1,Albania,0,0,0,0,0,0,0,0,0,...,898,910,925,938,945,960,980,1001,1034,1039
2,Algeria,0,0,0,0,0,0,0,0,0,...,6297,6453,6631,6717,6799,6951,7074,7255,7322,7420
3,Andorra,0,0,0,0,0,0,0,0,0,...,738,741,741,744,751,757,759,780,781,781
4,Angola,0,0,0,0,0,0,0,0,0,...,18,21,24,24,38,38,40,41,42,61


In [10]:
# Preview the first five rows of the latest per-country summary
country_cases.head()

Unnamed: 0,country,last,Lat,Long_,confirmed,deaths,recovered,active,Incident_Rate,People_Tested,People_Hospitalized,mortality,UID,ISO3
0,Australia,2020-06-14 17:33:10,-25.0,133.0,7320.0,102.0,6838.0,380.0,28.751321,,,1.393443,36,AUS
1,Austria,2020-06-14 17:33:10,47.5162,14.5501,17109.0,677.0,16059.0,373.0,189.964914,,,3.956982,40,AUT
2,Canada,2020-06-14 17:33:10,60.001,-95.001,100196.0,8210.0,60349.0,31638.0,264.678753,,,8.19394,124,CAN
3,China,2020-06-14 17:33:10,30.5928,114.3055,84296.0,4638.0,79476.0,182.0,6.001098,,,5.50204,156,CHN
4,Denmark,2020-06-14 17:33:10,56.2639,9.5018,12393.0,597.0,11268.0,528.0,213.960042,,,4.817236,208,DNK


## Data Visualization

### 5 Worst Affected Countries

In [11]:
# Ranking the countries by confirmed cases, worst affected first.
# The index is rebuilt as 0, 1, 2, ... so that label N is the (N + 1)-th worst affected country.
country_cases_sorted = (
    country_cases
    .sort_values('confirmed', ascending = False)
    .reset_index(drop = True)
)

# Bar chart of the five worst affected countries, side by side for a direct comparison
worst_five = country_cases_sorted.head()
fig = px.bar(
    worst_five,
    x = 'country',
    y = 'confirmed',
    color = 'confirmed',
    labels = {'country': 'Country', 'confirmed': 'Confirmed Cases'},
    template = 'plotly_dark',
)
fig.show()

### Visualization of the entire situation with choropleths

In [12]:
def curr_date():
    '''
    Return today's date as a string, for use in the graph titles.
    Takes no parameters.
    The returned string is in the format dd-mm-yyyy.
    '''
    return date.today().strftime("%d-%m-%Y")

def chloropleth(based_on, title, bar_title, color_scale, data = None):
    '''
    Plot a global choropleth of one metric and display it.

    Takes in the following parameters:
    1. based_on: Column that the map is colored and scaled on
    2. title: Text inserted into the graph title
    3. bar_title: Provides the colorbar title
    4. color_scale: Color scale with which the map is drawn
    5. data: Optional dataframe holding the 'ISO3' and 'country' columns plus the
       based_on column; the global country_cases dataframe is used when omitted

    Returns nothing - the figure is displayed with its show method.
    '''
    frame = country_cases if data is None else data

    # Kept in a local called `today` so that the imported `date` class is not shadowed
    today = curr_date()

    fig = go.Figure(data = go.Choropleth(
        locations = frame['ISO3'],
        z = frame[based_on],
        text = frame['country'],
        colorscale = color_scale,
        autocolorscale = False,
        reversescale = False,
        marker_line_color = 'darkgray',
        marker_line_width = 0.5,
        colorbar_tickprefix = '#',
        colorbar_title = bar_title,
    ))

    fig.update_layout(
        title_text = f'COVID-19 - {title} AS OF {today}',
        geo = dict(
            showframe = True,
            showcoastlines = False,
            projection_type = 'equirectangular'
        ),
        # Source attribution shown on the figure (spelling of the university corrected)
        annotations = [dict(
            x = 0.55,
            y = 0.1,
            xref = 'paper',
            yref = 'paper',
            text = 'Source: Johns Hopkins University',
            showarrow = False
        )]
    )

    fig.show()

In [13]:
# Parameters for the chloropleth function, keyed by the based_on column.
# Each value is a list of [title, bar_title, color_scale].
# The incident rate label states cases per 100,000 people, which is what the
# Incident_Rate column holds (e.g. 7320 confirmed cases -> 28.75 for Australia).
chloropleths = {
    "confirmed": ["CONFIRMED CASES", "No. of confirmed cases", "oranges_r"],
    "Incident_Rate": ["INCIDENT RATE", 'INCIDENT RATE <br>No. of cases per 100,000 people', 'blues_r'],
    "deaths": ["NUMBER OF DEATHS", "No. of deaths", "delta"],
    "recovered": ["RECOVERED CASES", "No. of recovered cases", "teal"]
}

In [14]:
# Choropleth of the confirmed cases; the title, colorbar title and color scale
# are unpacked from the parameter list stored for this column
graph = 'confirmed'
chloropleth(graph, *chloropleths[graph])

In [15]:
# Choropleth of the incident rate; the title, colorbar title and color scale
# are unpacked from the parameter list stored for this column
graph = 'Incident_Rate'
chloropleth(graph, *chloropleths[graph])

In [16]:
# Choropleth of the deaths; the title, colorbar title and color scale
# are unpacked from the parameter list stored for this column
graph = 'deaths'
chloropleth(graph, *chloropleths[graph])

In [17]:
# Choropleth of the recovered cases; the title, colorbar title and color scale
# are unpacked from the parameter list stored for this column
graph = 'recovered'
chloropleth(graph, *chloropleths[graph])

### Time series analysis for the 5 worst affected countries

### 1. Confirmed Cases

In [18]:
def get_new_cases(country, data = None):
    '''
    Return the number of new confirmed cases per day for one country.

    Parameters:
    1. country: Name of the country, as it appears in the 'country' column
    2. data: Optional dataframe with a 'country' column followed by one cumulative
       count column per date; the global confirmed_global dataframe is used when omitted

    The resultant data frame is indexed 0, 1, 2, ... and contains 2 columns:
    1. date: Has the dates (the column names of the input data)
    2. cases: Has the number of new confirmed cases on that particular date

    The first date has no previous day to compare against, so its value is 0.
    A country that is not present in the data gives an empty data frame.
    '''
    source = confirmed_global if data is None else data

    # Select the country first so that only its rows are reshaped, instead of
    # melting every country and throwing most of the result away
    cumulative = (
        source[source['country'] == country]
        .melt(id_vars = ['country'], var_name = 'date', value_name = 'cases')
    )

    # Day-over-day difference of the cumulative counts
    new_per_day = cumulative['cases'].diff().fillna(0)

    return pd.DataFrame(data = {
        'date': cumulative['date'],
        'cases': new_per_day
    }).reset_index(drop = True)

In [19]:
def get_plot(time_series):
    '''
    Build a bar graph from a time series data frame.

    The data frame MUST have a 'date' column (x axis) and a 'cases' column (y axis);
    get_new_cases returns a data frame of the right format.
    Returns the plotly express figure, which can then be displayed with its show method.
    '''
    return px.bar(time_series, x = 'date', y = 'cases', template = 'plotly_dark')

def plot_timeseries(country_name, func_name, title, yaxis_title = None):
    '''
    Plot the per-day time series of one country as a bar graph and display it.

    Parameters:
    1. country_name: Name of the country to plot
    2. func_name: Function that is called with country_name and returns the time series
       to plot (must be either get_new_cases, or get_new_deaths)
    3. title: The graph title
    4. yaxis_title: Optional label for the y axis. When omitted it is derived from the
       title: 'Number of deaths' if the title mentions deaths, 'Number of new cases' otherwise

    Returns nothing - the figure is displayed with its show method.
    '''
    daily_counts = func_name(country_name)

    if yaxis_title is None:
        # Case-insensitive so that a title such as 'New Deaths ...' is labelled correctly
        metric = 'deaths' if 'deaths' in title.lower() else 'new cases'
        yaxis_title = f'Number of {metric}'

    fig = get_plot(daily_counts)
    fig.update_layout(
        title = title,
        xaxis_title = 'Date',
        yaxis_title = yaxis_title
    )
    fig.show()

In [20]:
# New confirmed cases per day for the worst affected country (label 0 of the ranking)
country_name = country_cases_sorted.loc[0, 'country']
plot_timeseries(
    country_name,
    get_new_cases,
    f'New confirmed cases per day in {country_name}',
)

In [21]:
# New confirmed cases per day for the second worst affected country (label 1 of the ranking)
country_name = country_cases_sorted.loc[1, 'country']
plot_timeseries(
    country_name,
    get_new_cases,
    f'New confirmed cases per day in {country_name}',
)

In [22]:

# New confirmed cases per day for the third worst affected country (label 2 of the ranking)
country_name = country_cases_sorted.loc[2, 'country']
plot_timeseries(
    country_name,
    get_new_cases,
    f'New confirmed cases per day in {country_name}',
)

In [23]:
# New confirmed cases per day for the fourth worst affected country (label 3 of the ranking)
country_name = country_cases_sorted.loc[3, 'country']
plot_timeseries(
    country_name,
    get_new_cases,
    f'New confirmed cases per day in {country_name}',
)

In [24]:
# New confirmed cases per day for the fifth worst affected country (label 4 of the ranking)
country_name = country_cases_sorted.loc[4, 'country']
plot_timeseries(
    country_name,
    get_new_cases,
    f'New confirmed cases per day in {country_name}',
)

### 2. Deaths

In [25]:
def get_new_deaths(country, data = None):
    '''
    Return the number of new deaths per day for one country.

    Parameters:
    1. country: Name of the country, as it appears in the 'country' column
    2. data: Optional dataframe with a 'country' column followed by one cumulative
       count column per date; the global deaths_global dataframe is used when omitted

    The resultant data frame is indexed 0, 1, 2, ... and contains 2 columns:
    1. date: Has the dates (the column names of the input data)
    2. cases: Has the number of new deaths on that particular date

    The first date has no previous day to compare against, so its value is 0.
    A country that is not present in the data gives an empty data frame.
    '''
    source = deaths_global if data is None else data

    # Select the country first so that only its rows are reshaped, instead of
    # melting every country and throwing most of the result away
    cumulative = (
        source[source['country'] == country]
        .melt(id_vars = ['country'], var_name = 'date', value_name = 'cases')
    )

    # Day-over-day difference of the cumulative counts
    new_per_day = cumulative['cases'].diff().fillna(0)

    return pd.DataFrame(data = {
        'date': cumulative['date'],
        'cases': new_per_day
    }).reset_index(drop = True)

In [26]:
# New deaths per day for the worst affected country (label 0 of the ranking)
country_name = country_cases_sorted.loc[0, 'country']
plot_timeseries(
    country_name,
    get_new_deaths,
    f'New deaths per day in {country_name}',
)

In [27]:
# New deaths per day for the second worst affected country (label 1 of the ranking)
country_name = country_cases_sorted.loc[1, 'country']
plot_timeseries(
    country_name,
    get_new_deaths,
    f'New deaths per day in {country_name}',
)

In [28]:
# New deaths per day for the third worst affected country (label 2 of the ranking)
country_name = country_cases_sorted.loc[2, 'country']
plot_timeseries(
    country_name,
    get_new_deaths,
    f'New deaths per day in {country_name}',
)

In [29]:
# New deaths per day for the fourth worst affected country (label 3 of the ranking)
country_name = country_cases_sorted.loc[3, 'country']
plot_timeseries(
    country_name,
    get_new_deaths,
    f'New deaths per day in {country_name}',
)

In [30]:
# New deaths per day for the fifth worst affected country (label 4 of the ranking)
country_name = country_cases_sorted.loc[4, 'country']
plot_timeseries(
    country_name,
    get_new_deaths,
    f'New deaths per day in {country_name}',
)

In [31]:
!pip install jovian --upgrade --quiet

In [32]:

import jovian

<IPython.core.display.Javascript object>

In [33]:
# Save and upload this notebook to the 'covid' project on Jovian
jovian.commit(project = 'covid')

<IPython.core.display.Javascript object>

[jovian] Attempting to save notebook..
[jovian] Detected Kaggle notebook...
[jovian] Please enter your API key ( from https://jovian.ml/ ):
API KEY: ········
[jovian] Uploading notebook to https://jovian.ml/anandrajaram21/covid


<IPython.core.display.Javascript object>