# One run full walkthorugh
* Walkthough with full data set
* Functional calling of all notebooks
* Single click walkthourgh with all scripts

In [2]:
## Setup a basepath for where notebooks are located
## incase kernel is opened in other directory, this will set it to default notbook directory

import os
if os.path.split(os.getcwd())[-1]=='notebooks':
    os.chdir("../")
    
'Your base path is at: '+os.path.split(os.getcwd())[-1]

'Your base path is at: Enterprise_Data_Science'

In [3]:
os.curdir

'.'

## 1. Update all data

In [4]:
# %load src/data/get_data.py

import pandas as pd
import numpy as np

import subprocess
import os

from datetime import datetime

import requests
import json

def get_john_hopkins():
    ''' Get data by a git pull request, the source code has to be pulled first
        Result is stored in the predifined csv structure
    '''
    # clone git repo
    git_repo = 'https://github.com/CSSEGISandData/COVID-19.git'
    git_clone = subprocess.Popen( "git clone " + git_repo ,
                             cwd = os.path.dirname( 'data/raw/' ),
                             shell = True,
                             stdout = subprocess.PIPE,
                             stderr = subprocess.PIPE )
    (out, error) = git_clone.communicate()

    print('out:', out)
    print('error:', error)
    
    # pull if repo is already cloned
    git_pull = subprocess.Popen( "git pull" ,
                         cwd = os.path.dirname( 'data/raw/COVID-19/' ),
                         shell = True,
                         stdout = subprocess.PIPE,
                         stderr = subprocess.PIPE )
    (out, error) = git_pull.communicate()

    print("Error : " + str(error))
    print("out : " + str(out))

def get_current_data_germany():
    ''' Get current data from germany, attention API endpoint not too stable
        Result data frame is stored as pd.DataFrame

    '''
    # 16 states
    #data=requests.get('https://services7.arcgis.com/mOBPykOjAyBO2ZKk/arcgis/rest/services/Coronaf%C3%A4lle_in_den_Bundesl%C3%A4ndern/FeatureServer/0/query?where=1%3D1&outFields=*&outSR=4326&f=json')

    # 400 regions / Landkreise
    data=requests.get('https://services7.arcgis.com/mOBPykOjAyBO2ZKk/arcgis/rest/services/RKI_Landkreisdaten/FeatureServer/0/query?where=1%3D1&outFields=*&outSR=4326&f=json')

    json_object=json.loads(data.content)
    full_list=[]
    for pos,each_dict in enumerate (json_object['features'][:]):
        full_list.append(each_dict['attributes'])

    pd_full_list=pd.DataFrame(full_list)
    
    # save data into csv file
    directory = 'data/raw/NPGEO'
    if not os.path.exists(directory):
        os.mkdir(directory)
        
    pd_full_list.to_csv('data/raw/NPGEO/GER_state_data.csv',sep=';')
    print('Number of rows for regional Germany: '+str(pd_full_list.shape[0]))
    
if __name__ == '__main__':
    get_john_hopkins()
    get_current_data_germany()

out: b''
error: b"fatal: destination path 'COVID-19' already exists and is not an empty directory.\n"
Error : b'From https://github.com/CSSEGISandData/COVID-19\n   2630c05f..37b2fa02  master               -> origin/master\n * [new branch]        patch-08-18-us-daily -> origin/patch-08-18-us-daily\n   86106a13..53de97a5  web-data             -> origin/web-data\n'
out : b'Updating 2630c05f..37b2fa02\nFast-forward\n csse_covid_19_data/README.md                       |    1 +\n .../csse_covid_19_daily_reports/08-18-2020.csv     | 7896 ++++++++++----------\n .../csse_covid_19_daily_reports/08-27-2020.csv     | 3951 ++++++++++\n .../csse_covid_19_daily_reports/08-28-2020.csv     | 3951 ++++++++++\n .../csse_covid_19_daily_reports_us/08-18-2020.csv  |  118 +-\n .../csse_covid_19_daily_reports_us/08-27-2020.csv  |   59 +\n .../csse_covid_19_daily_reports_us/08-28-2020.csv  |   59 +\n .../time_series_covid19_confirmed_US.csv           | 6682 ++++++++---------\n .../time_series_covid19_confirmed

## 2. Process pipeline

In [5]:
# %load src/data/process_JH_data.py

import pandas as pd
import numpy as np

from datetime import datetime


def store_relational_JH_data():
    ''' Transformes the COVID data in a relational data set

    '''

    data_path='data/raw/COVID-19/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv'
    pd_raw=pd.read_csv(data_path)

    pd_data_base=pd_raw.rename(columns={'Country/Region':'country',
                      'Province/State':'state'})

    pd_data_base['state']=pd_data_base['state'].fillna('no')

    pd_data_base=pd_data_base.drop(['Lat','Long'],axis=1)


    pd_relational_model=pd_data_base.set_index(['state','country']) \
                                .T                              \
                                .stack(level=[0,1])             \
                                .reset_index()                  \
                                .rename(columns={'level_0':'date',
                                                   0:'confirmed'},
                                                  )

    pd_relational_model['date']=pd_relational_model.date.astype('datetime64[ns]')

    pd_relational_model.to_csv('data/processed/COVID_relational_confirmed.csv',sep=';',index=False)
    print('Number of rows stored: '+str(pd_relational_model.shape[0]))
    print('Last updated on: '+str(max(pd_relational_model.date)))
    
if __name__ == '__main__':
    store_relational_JH_data()

Number of rows stored: 58520
Last updated on: 2020-08-28 00:00:00


## 3. Data filtering and doubling rate calculation

In [6]:
# %load src/features/build_features.py

import numpy as np
from sklearn import linear_model
reg = linear_model.LinearRegression(fit_intercept=True)
import pandas as pd

from scipy import signal

## Data Filtering
def savgol_filter(df_input,column='confirmed',window=5):
    ''' Savgol Filter which can be used in groupby apply function (data structure kept)

        parameters:
        ----------
        df_input : pandas.series
        column : str
        window : int
            used data points to calculate the filter result

        Returns:
        ----------
        df_result: pd.DataFrame
            the index of the df_input has to be preserved in result
    '''

    degree=1
    df_result=df_input

    filter_in=df_input[column].fillna(0) # attention with the neutral element here

    result=signal.savgol_filter(np.array(filter_in),
                           window, # window size
                           1) # degree of polynomial
    df_result[str(column+'_filtered')]=result
    
    return df_result

## Calculating doubling rate
def get_doubling_time_via_regression(in_array):
    ''' Use a linear regression to approximate the doubling rate

        Parameters:
        ----------
        in_array : pandas.series

        Returns:
        ----------
        Doubling rate: double
    '''

    y = np.array(in_array)
    X = np.arange(-1,2).reshape(-1, 1)

    assert len(in_array)==3
    reg.fit(X,y)
    intercept=reg.intercept_
    slope=reg.coef_

    return intercept/slope


def rolling_reg(df_input,col='confirmed'):
    ''' Rolling Regression to approximate the doubling time'

        Parameters:
        ----------
        df_input: pd.DataFrame
        col: str
            defines the used column
        Returns:
        ----------
        result: pd.DataFrame
    '''
    days_back=3
    result=df_input[col].rolling(
                window=days_back,
                min_periods=days_back).apply(get_doubling_time_via_regression,raw=False)

    return result


def calc_filtered_data(df_input,filter_on='confirmed'):
    '''  Calculate savgol filter and return merged data frame

        Parameters:
        ----------
        df_input: pd.DataFrame
        filter_on: str
            defines the used column
        Returns:
        ----------
        df_output: pd.DataFrame
            the result will be joined as a new column on the input data frame
    '''

    must_contain=set(['state','country',filter_on])
    assert must_contain.issubset(set(df_input.columns)), ' Erro in calc_filtered_data not all columns in data frame'

    df_output=df_input.copy() # we need a copy here otherwise the filter_on column will be overwritten

    pd_filtered_result=df_output[['state','country',filter_on]].groupby(['state','country']).apply(savgol_filter)#.reset_index()

    #print('--+++ after group by apply')
    #print(pd_filtered_result[pd_filtered_result['country']=='Germany'].tail())

    #df_output=pd.merge(df_output,pd_filtered_result[['index',str(filter_on+'_filtered')]],on=['index'],how='left')
    df_output=pd.merge(df_output,pd_filtered_result[[str(filter_on+'_filtered')]],left_index=True,right_index=True,how='left')
    #print(df_output[df_output['country']=='Germany'].tail())
    return df_output.copy()


def calc_doubling_rate(df_input,filter_on='confirmed'):
    ''' Calculate approximated doubling rate and return merged data frame

        Parameters:
        ----------
        df_input: pd.DataFrame
        filter_on: str
            defines the used column
        Returns:
        ----------
        df_output: pd.DataFrame
            the result will be joined as a new column on the input data frame
    '''

    must_contain=set(['state','country',filter_on])
    assert must_contain.issubset(set(df_input.columns)), ' Erro in calc_filtered_data not all columns in data frame'


    pd_DR_result= df_input.groupby(['state','country']).apply(rolling_reg,filter_on).reset_index()

    pd_DR_result=pd_DR_result.rename(columns={filter_on:filter_on+'_DR',
                             'level_2':'index'})

    #we do the merge on the index of our big table and on the index column after groupby
    df_output=pd.merge(df_input,pd_DR_result[['index',str(filter_on+'_DR')]],left_index=True,right_on=['index'],how='left')
    df_output=df_output.drop(columns=['index'])


    return df_output


if __name__ == '__main__':
    test_data_reg=np.array([2,4,6])
    result=get_doubling_time_via_regression(test_data_reg)
    print('the test slope is: '+str(result))

    pd_JH_data=pd.read_csv('data/processed/COVID_relational_confirmed.csv',sep=';',parse_dates=[0])
    pd_JH_data=pd_JH_data.sort_values('date',ascending=True).copy()

    #test_structure=pd_JH_data[((pd_JH_data['country']=='US')|
    #                  (pd_JH_data['country']=='Germany'))]

    pd_result_larg=calc_filtered_data(pd_JH_data)
    pd_result_larg=calc_doubling_rate(pd_result_larg)
    pd_result_larg=calc_doubling_rate(pd_result_larg,'confirmed_filtered')


    mask=pd_result_larg['confirmed']>100
    pd_result_larg['confirmed_filtered_DR']=pd_result_larg['confirmed_filtered_DR'].where(mask, other=np.NaN)
    pd_result_larg.to_csv('data/processed/COVID_final_set.csv',sep=';',index=False)
    print(pd_result_larg[pd_result_larg['country']=='Germany'].tail())

the test slope is: [2.]
            date state  country  confirmed  confirmed_filtered  confirmed_DR  \
31895 2020-08-24    no  Germany   236122.0            236214.0    207.718414   
31896 2020-08-25    no  Germany   237583.0            237556.0    152.843207   
31897 2020-08-26    no  Germany   239010.0            239082.4    164.523315   
31898 2020-08-27    no  Germany   240571.0            240582.0    160.009817   
31899 2020-08-28    no  Germany   242126.0            242081.6    154.408858   

       confirmed_filtered_DR  
31895             191.490493  
31896             186.165472  
31897             165.679450  
31898             158.012866  
31899             160.430782  


In [7]:
pd_result_larg[pd_result_larg['country']=='Brazil'].tail()

Unnamed: 0,date,state,country,confirmed,confirmed_filtered,confirmed_DR,confirmed_filtered_DR
22875,2020-08-24,no,Brazil,3622861.0,3639631.4,177.963341,102.134829
22876,2020-08-25,no,Brazil,3669995.0,3675437.2,113.152671,100.019099
22877,2020-08-26,no,Brazil,3717156.0,3715241.2,77.840904,97.256439
22878,2020-08-27,no,Brazil,3761391.0,3760769.2,81.320422,87.122046
22879,2020-08-28,no,Brazil,3804803.0,3806297.2,85.824196,82.603435


## 4. Dashboard

In [25]:
# %load src/visualization/visualize.py

import pandas as pd
import numpy as np

import dash
dash.__version__
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output,State

import plotly.graph_objects as go

import os
print(os.getcwd())
df_input_large=pd.read_csv('data/processed/COVID_final_set.csv',sep=';')


fig = go.Figure()

external_stylesheets = ['https://codepen.io/chriddyp/pen/dZVMbK']
app = dash.Dash(__name__, external_stylesheets=external_stylesheets)

app.layout = html.Div([
    
    html.H1('Applied Data Science on COVID-19 data', style={'text-align': 'center',
                                                                        'color': '#000099',
                                                                        'padding': 10,
                                                                        'background-color': '#f0f0f5',}),
    dcc.Markdown('''Goal of the project is to analyse and learn patterns in COVID-19 dataset from different 
    open sources. It covers the full walkthrough of: automated data gathering, data transformations, filtering and machine learning to approximating the doubling time, and
    (static) deployment of responsive dashboard.''',
                 style={'color': '#000099', 
                        'text-align': 'center',
                        'padding': 1,
                        'background-color': '#f0f0f5',}),
    
    dcc.Markdown('''  ''',
                 style={'text-align': 'center',
                        'padding': 10,}),
    
    html.Div([  
        
        html.Div([
            dcc.Markdown('''__Multi-Select Country for visualization:__'''),
            dcc.Dropdown(
            id='country_drop_down',
            options=[ {'label': each,'value':each} for each in df_input_large['country'].unique()],
            value=['India', 'US','Russia'], # which are pre-selected
            multi=True,
            style={'color': '#000099'})
        ],
                style={'color': '#000099', 'width': '100%', 'display': 'inline-block', 'padding-left': 10, 'font-size':18,'background-color': '#f0f0f5' })

    ]),
    
    dcc.Markdown(''' ''', style={'text-align': 'center', 'padding': 10,}),
    
    html.Div([  
        
        html.Div([
            dcc.Markdown('''__Select Timeline or doubling time:__'''),
            dcc.Dropdown(
            id='doubling_time',
            options=[
                {'label': 'Timeline Confirmed ', 'value': 'confirmed'},
                {'label': 'Timeline Confirmed Filtered', 'value': 'confirmed_filtered'},
                {'label': 'Timeline Doubling Rate', 'value': 'confirmed_DR'},
                {'label': 'Timeline Doubling Rate Filtered', 'value': 'confirmed_filtered_DR'},
            ],
                value='confirmed',
                multi=False,
                style={'color': '#000099'}
            )],
            style={'color': '#000099', 'width': '100%', 'display': 'inline-block', 'padding-left': 10, 'font-size':18,'background-color': '#f0f0f5' })
                
    ]),
    
    dcc.Markdown(''' ''', style={'text-align': 'center', 'padding': 10,}),
    
    dcc.Graph(figure=fig, id='main_window_slope', style={'color': '#000099', 'background-color': '#f0f0f5'})
])



@app.callback(
    Output('main_window_slope', 'figure'),
    [Input('country_drop_down', 'value'),
    Input('doubling_time', 'value')])

def update_figure(country_list,show_doubling):


    if 'doubling_rate' in show_doubling:
        my_yaxis={'type':"log",
               'title':'Approximated doubling rate over 3 days (larger numbers are better #stayathome)',
                'color' : '#000099',
                'tickfont': dict(size=14)
              }
    else:
        my_yaxis={'type': 'log', 
                  'title':'<b>Confirmed infected people (source johns hopkins csse)<b>',
                  'color' : '#000099',
                  'tickfont': dict(size=14)
              }


    traces = []
    for each in country_list:

        df_plot=df_input_large[df_input_large['country']==each]

        if show_doubling=='doubling_rate_filtered':
            df_plot=df_plot[['state','country','confirmed','confirmed_filtered','confirmed_DR','confirmed_filtered_DR','date']].groupby(['country','date']).agg(np.mean).reset_index()
        else:
            df_plot=df_plot[['state','country','confirmed','confirmed_filtered','confirmed_DR','confirmed_filtered_DR','date']].groupby(['country','date']).agg(np.sum).reset_index()


        traces.append(dict(x=df_plot.date,
                                y=df_plot[show_doubling],
                                mode='markers+lines',
                                opacity=0.9,
                                name=each
                        )
                )
        

    return {
            'data': traces,
            'layout': dict (

                xaxis={'title':'<b>Timeline<b>',
                        'tickangle':-45,
                        'padding' : 10,
                        'nticks':20,
                        'tickfont':dict(size=14),
                        'color' : '#000099'
                      },

                yaxis=my_yaxis,
                legend=dict(orientation="h",
                            color="#000099",
                            yanchor="bottom",
                            y=1.02,
                            xanchor="right",
                            x=1),
                autosize=True,
                #height=768,
                #width=1360,
                plot_bgcolor = '#f0f0f5',
                paper_bgcolor = '#f0f0f5',
                font = {'color': '#000099'}
        )
    }

if __name__ == '__main__':

    app.run_server(debug=True, 
                   use_reloader=False,
                   host='127.0.0.1',
                   port=8050)


/Users/asmi/Desktop/trial/Enterprise_Data_Science
Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

 in production, use a production WSGI server like gunicorn instead.

 in production, use a production WSGI server like gunicorn instead.

 in production, use a production WSGI server like gunicorn instead.

 in production, use a production WSGI server like gunicorn instead.

 in production, use a product