![CRISP_DM](../reports/figures/CRISP_DM.png)

# Business Understanding
1. We would like to track the spread of COVID-19 across different countries, using local (per-country) information.
2. We would like to gain deeper insight into local developments in each country, because general, aggregated information is not adequate.

# Goals
1. Automate the full pipeline (gathering the data, preparing it, modelling, evaluating, and deploying) in as few clicks as possible.

# Data understanding

In [7]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os
import json
import plotly.graph_objects as go
import plotly
import dash
dash.__version__
import dash_core_components as dcc
import dash_html_components as html
import dash_bootstrap_components as dbc
from dash.dependencies import Input, Output
import numpy as np
from datetime import datetime

# Web scraping sources:
# Confirmed : https://github.com/CSSEGISandData/COVID-19/blob/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv
# Recovered : https://github.com/CSSEGISandData/COVID-19/blob/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv
# Deaths : https://github.com/CSSEGISandData/COVID-19/blob/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv

In [8]:
# Get the data from the URL mentioned above
def getLatestData(info_type):
    """Scrape one Johns Hopkins CSSE global time-series table from its GitHub page.

    The page is downloaded, parsed as HTML, and the first ``<table>`` element
    is read row by row: the ``<th>`` cells of the first row become the header,
    the ``<td>`` cells of every following row become one data row.

    Parameters
    ----------
    info_type : str
        One of ``"confirmed"``, ``"deaths"`` or ``"recovered"``.

    Returns
    -------
    tuple[list[list[str]], list[str]]
        ``(jh_data_list, header_list)`` - the data rows and the header cells,
        all as stripped strings.

    Raises
    ------
    ValueError
        If ``info_type`` is not one of the supported values.
    requests.HTTPError
        If the server answers with an error status.
    RuntimeError
        If the downloaded page contains no ``<table>`` element.
    """
    source_urls = {
        "confirmed": "https://github.com/CSSEGISandData/COVID-19/blob/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv",
        "deaths": "https://github.com/CSSEGISandData/COVID-19/blob/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv",
        "recovered": "https://github.com/CSSEGISandData/COVID-19/blob/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv",
    }
    # Validate before any network access; previously an unknown value fell
    # through every branch and crashed later on the unbound `response`.
    if info_type not in source_urls:
        raise ValueError(
            "Unknown info_type %r; expected one of %s" % (info_type, sorted(source_urls))
        )

    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(source_urls[info_type], timeout=30)
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'html.parser')
    html_table = soup.find('table')
    if html_table is None:
        raise RuntimeError(
            "No <table> element found on the %s page; the page layout may have changed" % info_type
        )

    header_list = []  # stays empty if the table has no rows at all
    jh_data_list = []
    for pos, row in enumerate(html_table.find_all('tr')):
        if pos == 0:
            # th = header cells
            header_list = [cell.get_text(strip=True) for cell in row.find_all('th')]
        else:
            # td = data cells
            jh_data_list.append([cell.get_text(strip=True) for cell in row.find_all('td')])
    return jh_data_list, header_list
    

In [9]:
# Prepare the data for visualization and modelling
def prepareDataframe(jh_data_list,header_list):
    """Turn scraped rows into a wide per-country time-series table.

    Parameters
    ----------
    jh_data_list : list[list[str]]
        Data rows. Each row is expected to hold one more cell than
        ``header_list`` (a leading cell that is labelled ``'index'`` here),
        followed by four descriptive cells and then one count per date.
    header_list : list[str]
        Header cells; must contain ``'Country/Region'`` and, from the fifth
        entry on, dates formatted as ``%m/%d/%y``. The list is NOT modified.

    Returns
    -------
    pandas.DataFrame
        One row per date: a ``date`` column (datetime) followed by one integer
        column per country, in order of first appearance, holding the sum over
        all rows of that country.
    """
    # Build a new list rather than header_list.insert(0, ...): mutating the
    # caller's list made a second call with the same header fail on a
    # column-count mismatch.
    column_names = ['index'] + list(header_list)
    jh_data_df = pd.DataFrame(jh_data_list)
    jh_data_df.columns = column_names

    # Columns 0-4 are index / province / country / lat / long; the rest are dates.
    date_labels = jh_data_df.columns[5:]
    daily_counts = jh_data_df.iloc[:, 5:].astype(int)

    # One pass over the rows; sort=False keeps countries in first-seen order,
    # matching the column order the per-country loop used to produce.
    country_totals = daily_counts.groupby(jh_data_df['Country/Region'], sort=False).sum()

    jh_data_transformed_df = (
        country_totals.T
        .reset_index(drop=True)
        .rename_axis(None, axis='columns')
    )
    jh_data_transformed_df.insert(0, 'date', pd.to_datetime(date_labels, format="%m/%d/%y"))
    return jh_data_transformed_df

# Data fetch and preparation

In [10]:
# Fetch the confirmed-cases table and reshape it into one row per date with
# one column per country; the bare last line displays the resulting frame.
jh_confirmed_list,header_list = getLatestData("confirmed")
jh_confirmed_df = prepareDataframe(jh_confirmed_list,header_list)
jh_confirmed_df

Unnamed: 0,date,Afghanistan,Albania,Algeria,Andorra,Angola,Antigua and Barbuda,Argentina,Armenia,Australia,...,United Kingdom,Uruguay,Uzbekistan,Venezuela,Vietnam,West Bank and Gaza,Western Sahara,Yemen,Zambia,Zimbabwe
0,2020-01-22,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2020-01-23,0,0,0,0,0,0,0,0,0,...,0,0,0,0,2,0,0,0,0,0
2,2020-01-24,0,0,0,0,0,0,0,0,0,...,0,0,0,0,2,0,0,0,0,0
3,2020-01-25,0,0,0,0,0,0,0,0,0,...,0,0,0,0,2,0,0,0,0,0
4,2020-01-26,0,0,0,0,0,0,0,0,4,...,0,0,0,0,2,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
228,2020-09-06,38398,10255,46364,1215,2965,95,478792,44783,26321,...,349500,1679,43587,53289,1049,26127,10,1987,12776,6837
229,2020-09-07,38494,10406,46653,1261,2981,95,488007,44845,26373,...,352451,1693,43893,54350,1049,26779,10,1989,12836,7298
230,2020-09-08,38520,10553,46938,1261,3033,95,500034,44953,26465,...,354932,1712,44281,55563,1054,27363,10,1994,12952,7388
231,2020-09-09,38544,10704,47216,1301,3092,95,512293,45152,26524,...,357613,1741,44930,56751,1059,27919,10,1999,13112,7429


In [11]:
# Fetch the deaths table and reshape it into one row per date with one column
# per country; the bare last line displays the resulting frame.
jh_deaths_list,header_list = getLatestData("deaths")
jh_deaths_df = prepareDataframe(jh_deaths_list,header_list)
jh_deaths_df

Unnamed: 0,date,Afghanistan,Albania,Algeria,Andorra,Angola,Antigua and Barbuda,Argentina,Armenia,Australia,...,United Kingdom,Uruguay,Uzbekistan,Venezuela,Vietnam,West Bank and Gaza,Western Sahara,Yemen,Zambia,Zimbabwe
0,2020-01-22,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2020-01-23,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2020-01-24,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2020-01-25,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2020-01-26,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
228,2020-09-06,1412,316,1556,53,117,3,9859,897,762,...,41640,45,347,428,35,181,1,572,295,206
229,2020-09-07,1415,319,1562,53,120,3,10129,900,770,...,41643,45,352,436,35,184,1,573,295,210
230,2020-09-08,1418,321,1571,53,124,3,10405,903,781,...,41675,45,358,444,35,190,1,576,297,218
231,2020-09-09,1420,322,1581,53,126,3,10658,905,788,...,41683,45,366,452,35,192,1,576,300,222


In [12]:
# Fetch the recovered table and reshape it into one row per date with one
# column per country; the bare last line displays the resulting frame.
jh_recovered_list,header_list = getLatestData("recovered")
jh_recovered_df = prepareDataframe(jh_recovered_list,header_list)
jh_recovered_df

Unnamed: 0,date,Afghanistan,Albania,Algeria,Andorra,Angola,Antigua and Barbuda,Argentina,Armenia,Australia,...,United Kingdom,Uruguay,Uzbekistan,Venezuela,Vietnam,West Bank and Gaza,Western Sahara,Yemen,Zambia,Zimbabwe
0,2020-01-22,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2020-01-23,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2020-01-24,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2020-01-25,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2020-01-26,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
228,2020-09-06,30537,6106,32745,928,1198,91,349132,40089,22465,...,1824,1459,41225,42006,815,16843,8,1200,11674,5345
229,2020-09-07,30557,6186,32985,934,1215,91,357388,40121,22602,...,1824,1466,41531,43753,853,17270,8,1201,11748,5455
230,2020-09-08,30715,6239,33183,934,1215,91,366590,40592,22723,...,1827,1476,41594,44435,868,17779,8,1203,11787,5477
231,2020-09-09,31048,6284,33379,938,1245,91,382490,41023,22861,...,1831,1478,42212,45318,890,18466,8,1209,11839,5542


In [13]:
# Every column except 'date' is a country; Index.difference returns them sorted.
country_list = jh_confirmed_df.columns.difference(['date'])
# Dropdown entries in the {'label': ..., 'value': ...} form that Dash expects.
country_dropdown = [{'label': name, 'value': name} for name in country_list]
#country_dropdown    

In [14]:
# Create the output folder first so the export also works on a fresh checkout
# where ../data/processed does not exist yet (to_csv does not create folders).
os.makedirs('../data/processed', exist_ok=True)

# Persist the three per-country tables as ';'-separated files without the row index.
jh_confirmed_df.to_csv('../data/processed/COVID_small_flat_confirmed_table.csv',sep=';',index=False)
jh_recovered_df.to_csv('../data/processed/COVID_small_flat_recovered_table.csv',sep=';',index=False)
jh_deaths_df.to_csv('../data/processed/COVID_small_flat_deaths_table.csv',sep=';',index=False)

In [15]:
# Sanity check: read the 'India' value in the last row of the confirmed table
# and display its Python type.
x=jh_confirmed_df['India'][jh_confirmed_df.index[-1]]
type(x)

numpy.int64

# Dashboard

# Daily cases

In [1]:
# Initial bar chart shown in the dashboard before any callback fires.
# NOTE(review): the recorded table output looks cumulative, while the title
# says "Daily" - confirm which is intended.
confirmed_fig = go.Figure(
    data=[
        go.Bar(
            x=jh_confirmed_df['date'],
            y=jh_confirmed_df['Afghanistan'],
            orientation='v',
            marker_color='grey',
        )
    ]
)

# Axis titles, category x-axis and hover-label styling in a single layout update.
(
    confirmed_fig
    .update_layout(
        title="Daily confirmed cases",
        xaxis_title="Time",
        yaxis_title="No. of cases",
        xaxis_type='category',
        hoverlabel=dict(bgcolor="white", font_size=16),
    )
    .update_xaxes(nticks=15)
    .update_yaxes(showspikes=True)
)
confirmed_fig.show()

NameError: name 'go' is not defined

In [2]:
# Initial bar chart of recovered cases shown before any callback fires.
recovered_fig = go.Figure(
    data=[
        go.Bar(
            x=jh_recovered_df['date'],
            y=jh_recovered_df['Afghanistan'],
            orientation='v',
            marker_color='grey',
        )
    ]
)

# Axis titles, category x-axis and hover-label styling in a single layout update.
(
    recovered_fig
    .update_layout(
        title="Daily recovered cases",
        xaxis_title="Time",
        yaxis_title="No. of cases",
        xaxis_type='category',
        hoverlabel=dict(bgcolor="white", font_size=16),
    )
    .update_xaxes(nticks=15)
    .update_yaxes(showspikes=True)
)
recovered_fig.show()

NameError: name 'go' is not defined

In [12]:
# Initial bar chart of deaths shown before any callback fires.
deceased_fig = go.Figure(
    data=[
        go.Bar(
            x=jh_deaths_df['date'],
            y=jh_deaths_df['Afghanistan'],
            orientation='v',
            marker_color='grey',
        )
    ]
)

# Axis titles, category x-axis and hover-label styling in a single layout update.
(
    deceased_fig
    .update_layout(
        title="Daily deaths",
        xaxis_title="Time",
        yaxis_title="No. of cases",
        xaxis_type='category',
        hoverlabel=dict(bgcolor="white", font_size=16),
    )
    .update_xaxes(nticks=15)
    .update_yaxes(showspikes=True)
)
deceased_fig.show()

In [13]:
# Initial line chart comparing deaths with recoveries (one trace each).
deceased_recovered_fig = go.Figure(
    data=[
        go.Scatter(
            x=jh_deaths_df['date'],
            y=jh_deaths_df['Afghanistan'],
            mode='markers+lines',
            opacity=0.9,
            line_width=2,
            marker_size=4,
            name='Deceased',
        ),
        go.Scatter(
            x=jh_recovered_df['date'],
            y=jh_recovered_df['Afghanistan'],
            mode='markers+lines',
            opacity=0.9,
            line_width=2,
            marker_size=4,
            name='Recovered',
        ),
    ]
)

# Titles, category x-axis, hover-label styling and grey spike lines on both axes.
(
    deceased_recovered_fig
    .update_layout(
        title="Daily deaths vs recovered",
        xaxis_title="Time",
        yaxis_title="No. of cases",
        xaxis_type='category',
        hoverlabel=dict(bgcolor="white", font_size=16),
    )
    .update_xaxes(nticks=15, showspikes=True, spikecolor="grey")
    .update_yaxes(showspikes=True, spikecolor="grey")
)
deceased_recovered_fig.show()

In [14]:
# Initial line chart comparing confirmed cases with recoveries (one trace each).
confirmed_recovered_fig = go.Figure(
    data=[
        go.Scatter(
            x=jh_confirmed_df['date'],
            y=jh_confirmed_df['Afghanistan'],
            mode='markers+lines',
            opacity=0.9,
            line_width=2,
            marker_size=4,
            name='Confirmed',
        ),
        go.Scatter(
            x=jh_recovered_df['date'],
            y=jh_recovered_df['Afghanistan'],
            mode='markers+lines',
            opacity=0.9,
            line_width=2,
            marker_size=4,
            name='Recovered',
        ),
    ]
)

# Titles, category x-axis, hover-label styling and grey spike lines on both axes.
(
    confirmed_recovered_fig
    .update_layout(
        title="Daily confirmed vs recovered",
        xaxis_title="Time",
        yaxis_title="No. of cases",
        xaxis_type='category',
        hoverlabel=dict(bgcolor="white", font_size=16),
    )
    .update_xaxes(nticks=15, showspikes=True, spikecolor="grey")
    .update_yaxes(showspikes=True, spikecolor="grey")
)
confirmed_recovered_fig.show()

In [15]:
external_stylesheets = [dbc.themes.BOOTSTRAP]
app = dash.Dash(__name__, external_stylesheets=external_stylesheets)


def _stats_card(header_text, body_id):
    """Return a statistics card whose body is an empty Div with id ``body_id``."""
    return dbc.Card(
        [
            dbc.CardHeader(header_text),
            dbc.CardBody([
                html.Div(id=body_id)
            ]),
        ],
        style={"width": "18rem",'backgroundColor':'#EAF2F8','marginTop':'28%'},
    )


# Page structure: title, then a control row (country dropdown + linear/log
# switch), a row with the two statistics cards next to the confirmed chart,
# and two further rows with two charts each.
app.layout = html.Div([
    html.H1('COVID 19 cases', style={"textAlign":'center'}),
    html.Div([

        html.H2('Choose Country'),

        html.Div([
            dbc.Row([
                # clearable=False: an emptied dropdown would send None to every
                # callback, which then fails on the column lookup and on the
                # title string concatenation.
                dbc.Col(dcc.Dropdown(id='country_drop_down', options=country_dropdown,
                                     value ='Afghanistan', clearable=False)),
                dbc.Col(dbc.RadioItems(id='graph_type', options=[{'label': i, 'value': i} for i in ['Linear', 'Log']],
                                       value='Linear',style={"display":'inline-block'}))
            ]),
            dbc.Row([
                dbc.Col(
                    dbc.Row([
                        dbc.Col(_stats_card("Total statistics", "total_stats")),
                        dbc.Col(_stats_card("Today's statistics", "today_stats"))
                    ])),
                dbc.Col(dcc.Graph(figure=confirmed_fig, id='confirmed_fig_id'))
            ]),
            dbc.Row([
                dbc.Col(dcc.Graph(figure=recovered_fig, id='recovered_fig_id')),
                dbc.Col(dcc.Graph(figure=deceased_fig, id='deceased_fig_id'))
            ]),
            dbc.Row([
                dbc.Col(dcc.Graph(figure=deceased_recovered_fig, id='deceased_recovered_fig_id')),
                dbc.Col(dcc.Graph(figure=confirmed_recovered_fig, id='confirmed_recovered_fig_id'))
            ])

        ])

    ])
],style={"marginLeft":'2%',"marginRight":'2%',"marginTop":'2%'})

In [16]:
@app.callback(
    Output('total_stats', 'children'),
    [Input('country_drop_down', 'value')])
def update_metrics(country_name):
    """Fill the "Total statistics" card with the last-row values for a country.

    Returns a Div with one H4 line each for the confirmed, recovered and
    deaths tables, using the value stored at each table's final index label.
    """
    labelled_tables = (
        ("Infected     : ", jh_confirmed_df),
        ("Recovered    : ", jh_recovered_df),
        ("Deaths       : ", jh_deaths_df),
    )
    lines = []
    for prefix, table in labelled_tables:
        latest_value = table.loc[table.index[-1], country_name]
        lines.append(html.H4(prefix + str(latest_value)))
    return html.Div(lines)

@app.callback(
    Output('today_stats', 'children'),
    [Input('country_drop_down', 'value')])
def update_metrics(country_name):
    """Fill the "Today's statistics" card with the latest day-over-day change.

    For each of the confirmed, recovered and deaths tables the value in the
    second-to-last row is subtracted from the value in the last row.

    Returns a Div with one H4 line per table.
    """
    def latest_change(table):
        # Positional access: the previous row is "one position earlier", which
        # subtracting 1 from an index *label* only expresses correctly for a
        # default 0..n-1 index.
        column = table[country_name]
        return column.iloc[-1] - column.iloc[-2]

    total = latest_change(jh_confirmed_df)
    recovered = latest_change(jh_recovered_df)
    deaths = latest_change(jh_deaths_df)
    return html.Div([
        html.H4("Infected     : "+str(total)),
        html.H4("Recovered    : "+str(recovered)),
        html.H4("Deaths       : "+str(deaths))
    ])


In [17]:
def _bar_figure(table, country_name, graph_type, title_prefix):
    """Build the figure dict for one grey bar chart of ``table[country_name]``.

    ``graph_type`` selects the y-axis scale: ``'Linear'`` gives a linear axis,
    anything else a logarithmic one. The title is ``title_prefix + country_name``.
    """
    return {
        'data': [dict(x=table['date'],
                      y=table[country_name],
                      orientation='v',
                      # Raw trace dicts go to plotly.js unchanged, so the colour
                      # must be nested; the flat `marker_color` key is only
                      # understood by plotly.py constructors and was ignored.
                      marker=dict(color='grey'),
                      type='bar')],
        'layout': dict(
            xaxis={'tickangle': -45,
                   'nticks': 20,
                   'tickfont': dict(size=14, color="#7f7f7f"),
                   'title': 'Time'},
            # The former 'range' entry was the *string* '[1.1,5.5]', which is
            # not a valid axis range; it is dropped so the axis auto-ranges
            # explicitly rather than by accident.
            yaxis={'title': "No. of cases",
                   'type': 'linear' if graph_type == 'Linear' else 'log'},
            title=title_prefix + country_name
        )
    }


@app.callback(
    Output('confirmed_fig_id', 'figure'),
    [Input('country_drop_down', 'value'),
     Input('graph_type', 'value')])
def update_figure(country_name,graph_type):
    """Redraw the confirmed-cases bar chart for the selected country and scale."""
    return _bar_figure(jh_confirmed_df, country_name, graph_type,
                       "Daily confirmed cases for ")


@app.callback(
    Output('recovered_fig_id', 'figure'),
    [Input('country_drop_down', 'value'),
     Input('graph_type', 'value')])
def update_figure(country_name,graph_type):
    """Redraw the recovered-cases bar chart for the selected country and scale."""
    return _bar_figure(jh_recovered_df, country_name, graph_type,
                       "Daily recovered cases for ")


@app.callback(
    Output('deceased_fig_id', 'figure'),
    [Input('country_drop_down', 'value'),
     Input('graph_type', 'value')])
def update_figure(country_name,graph_type):
    """Redraw the deaths bar chart for the selected country and scale."""
    return _bar_figure(jh_deaths_df, country_name, graph_type,
                       "Daily deaths for ")

In [18]:
@app.callback(
    Output('deceased_recovered_fig_id', 'figure'),
    [Input('country_drop_down', 'value'),
     Input('graph_type', 'value')])
def update_figure(country_name,graph_type):
    """Redraw the deaths-vs-recoveries line chart for the selected country.

    ``graph_type`` selects the y-axis scale: ``'Linear'`` gives a linear axis,
    anything else a logarithmic one.
    """
    def line_trace(table, label):
        # Raw trace dicts go to plotly.js unchanged, so line width and marker
        # size must be nested; the flat `line_width` / `marker_size` keys are
        # only understood by plotly.py constructors and were ignored.
        return dict(x=table['date'],
                    y=table[country_name],
                    mode='markers+lines',
                    opacity=0.9,
                    line=dict(width=2),
                    marker=dict(size=4),
                    name=label)

    return {
            'data': [line_trace(jh_deaths_df, 'New deaths'),
                     line_trace(jh_recovered_df, 'New recoveries')],
            'layout': dict (
                xaxis={'tickangle':-45,
                        'nticks':20,
                        'tickfont':dict(size=14,color="#7f7f7f"),
                        'title' : 'Time'
                      },
                # The former 'range' entry was the *string* '[1.1,5.5]', which
                # is not a valid axis range; dropped so the axis auto-ranges.
                yaxis={
                       'title':"No. of cases",
                       'type':'linear' if graph_type == 'Linear' else 'log'
                      },
                title = "New deaths vs new recoveries for "+country_name
        )
    }

In [19]:
@app.callback(
    Output('confirmed_recovered_fig_id', 'figure'),
    [Input('country_drop_down', 'value'),
     Input('graph_type', 'value')])
def update_figure(country_name,graph_type):
    """Redraw the confirmed-vs-recovered line chart for the selected country.

    ``graph_type`` selects the y-axis scale: ``'Linear'`` gives a linear axis,
    anything else a logarithmic one.
    """
    def line_trace(table, label):
        # Raw trace dicts go to plotly.js unchanged, so line width and marker
        # size must be nested; the flat `line_width` / `marker_size` keys are
        # only understood by plotly.py constructors and were ignored.
        return dict(x=table['date'],
                    y=table[country_name],
                    mode='markers+lines',
                    opacity=0.9,
                    line=dict(width=2),
                    marker=dict(size=4),
                    name=label)

    return {
            'data': [line_trace(jh_confirmed_df, 'new confirmed'),
                     line_trace(jh_recovered_df, 'new recovery')],
            'layout': dict (
                xaxis={'tickangle':-45,
                        'nticks':20,
                        'tickfont':dict(size=14,color="#7f7f7f"),
                        'title' : 'Time'
                      },
                # The former 'range' entry was the *string* '[1.1,5.5]', which
                # is not a valid axis range; dropped so the axis auto-ranges.
                yaxis={
                       'title':"No. of cases",
                       'type':'linear' if graph_type == 'Linear' else 'log'
                      },
                title = "Newly confirmed vs newly recovered for "+country_name
        )
    }

In [20]:
# Start the Dash development server in debug mode; per the recorded output it
# serves on http://127.0.0.1:8050/. The auto-reloader is switched off -
# presumably because it does not work from inside a notebook kernel (confirm).
app.run_server(debug=True, use_reloader=False)

Dash is running on http://127.0.0.1:8050/

 in production, use a production WSGI server like gunicorn instead.

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
   Use a production WSGI server instead.
 * Debug mode: on


# Semi automated

In [21]:
# Read the confirmed-cases CSV from a local copy of the data instead of scraping.
# NOTE(review): the recorded output is a FileNotFoundError, so this path must be
# populated before the cell can run - presumably a checkout of the
# CSSEGISandData/COVID-19 repository under ../data/raw; confirm.
data_path='../data/raw/COVID-19/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv'
pd_raw=pd.read_csv(data_path)

FileNotFoundError: [Errno 2] No such file or directory: '../data/raw/COVID-19/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv'

In [None]:
# Columns 0-3 of the raw file are descriptive; every later column is one date.
raw_date_labels = pd_raw.columns[4:]

# Sum all rows belonging to each country, one array of per-date totals per country.
country_totals = {
    country: pd_raw.loc[pd_raw['Country/Region'] == country].iloc[:, 4:].sum(axis=0).to_numpy()
    for country in country_list
}

time_idx = [datetime.strptime(label, "%m/%d/%y") for label in raw_date_labels]  # convert to datetime
time_str = [stamp.strftime('%Y-%m-%d') for stamp in time_idx]  # convert back to date ISO norm (str)

# Final table: parsed dates first, then one column per country.
df_plot = pd.DataFrame({'date': time_idx, **country_totals})