### One run full walkthrough
<p>Do the full walk through on the large data set. </p>
<p>Refactor the source code and bring it to individual scripts.</p>
<p>Ensure a full run with one click.</p>

In [1]:
# all imports
import os

In [2]:
## Make sure the working directory is the project base path.
## The relative paths used by the following cells only resolve from there,
## so the notebook server should be started from the base path; when the
## notebook is opened directly, the working directory is typically
## ../ads_covid-19/notebooks and has to be moved one level up.

if os.path.basename(os.getcwd()) == 'notebooks':
    os.chdir("../")

'Your base path is at: ' + os.path.basename(os.getcwd())

'Your base path is at: EDS_Covid_19_Dashboard'

### Step 1: Update all data

In [3]:
# %load src/data/get_data.py
import subprocess
import os

import pandas as pd
import numpy as np

from datetime import datetime

import requests
import json

def get_johns_hopkins():
    '''Update the local Johns Hopkins data by running "git pull" in its clone.

    The command is executed in data/raw/COVID-19 (relative to the current
    working directory). Whatever git writes to stderr and stdout is printed
    as raw bytes.
    '''
    repo_dir = os.path.dirname('data/raw/COVID-19/')
    with subprocess.Popen("git pull",
                          cwd=repo_dir,
                          shell=True,
                          stdout=subprocess.PIPE,
                          stderr=subprocess.PIPE) as git_process:
        # communicate() waits for git to finish and collects both streams
        std_out, std_err = git_process.communicate()

    print(f"Error : {std_err}")
    print(f"out : {std_out}")
    


def get_current_data_germany():
    ''' Download the current RKI data of the German districts (Landkreise).

    The attributes of every feature returned by the ArcGIS service are
    collected into one pandas.DataFrame, which is stored as ';'-separated
    csv file at data/raw/NPGEO/GER_state_data.csv.

    Raises:- requests.exceptions.RequestException if the download fails,
             times out or the server answers with an HTTP error status.
    '''
    # 16 states
    #data=requests.get('https://services7.arcgis.com/mOBPykOjAyBO2ZKk/arcgis/rest/services/Coronaf%C3%A4lle_in_den_Bundesl%C3%A4ndern/FeatureServer/0/query?where=1%3D1&outFields=*&outSR=4326&f=json')

    # 400 regions / Landkreise
    # a timeout is set because requests waits without limit otherwise
    data=requests.get('https://services7.arcgis.com/mOBPykOjAyBO2ZKk/arcgis/rest/services/RKI_Landkreisdaten/FeatureServer/0/query?where=1%3D1&outFields=*&outSR=4326&f=json',
                      timeout=60)
    # report a failed request as HTTP error instead of an obscure parsing error below
    data.raise_for_status()

    json_object=json.loads(data.content)
    full_list=[each_dict['attributes'] for each_dict in json_object['features']]

    pd_full_list=pd.DataFrame(full_list)
    pd_full_list.to_csv('data/raw/NPGEO/GER_state_data.csv',sep=';')
    print(' Number of regions rows: '+str(pd_full_list.shape[0]))
    
    
    
if __name__ == '__main__':
    # refresh both raw data sources: the Johns Hopkins git clone and the RKI district data
    get_johns_hopkins()
    get_current_data_germany()

Error : b''
out : b'Already up to date.\n'
 Number of regions rows: 411


### Step 2. Process pipeline

In [4]:
# %load src/data/process_JH_data.py
import pandas as pd
import numpy as np

from datetime import datetime

def store_relational_JH_dataset():
    ''' Reshape the wide Johns Hopkins time series into a relational (long) table.

    Reads the confirmed-cases csv of the local COVID-19 clone, turns every
    (date, state, country) combination into its own row and stores the
    result as ';'-separated csv file under data/processed.
    '''
    source_csv='data/raw/COVID-19/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv'
    wide_frame=pd.read_csv(source_csv).rename(columns={'Country/Region':'country','Province/State':'state'})

    # entries without a province/state get the placeholder 'no'
    wide_frame['state']=wide_frame['state'].fillna('no')
    wide_frame=wide_frame.drop(['Lat','Long'],axis=1)

    # the dates become the row index, then both column levels are folded into rows
    date_indexed=wide_frame.set_index(['state','country']).T
    stacked=date_indexed.stack(level=[0,1])
    relational_frame=stacked.reset_index().rename(columns={'level_0':'date',0:'confirmed'})

    relational_frame['date']=relational_frame['date'].astype('datetime64[ns]')
    print(relational_frame.columns)
    print(relational_frame.head(5))
    relational_frame.to_csv('data/processed/COVID_relational_confirmed.csv',sep=';',index=False)
    print(f' Number of rows stored: {relational_frame.shape[0]!s}')
    print(f' Latest date is: {max(relational_frame.date)!s}')
    
    
    
if __name__ == '__main__':

    # build and store the relational data set when this code is executed directly
    store_relational_JH_dataset()
    
    

Index(['date', 'state', 'country', 'confirmed'], dtype='object')
        date                         state         country  confirmed
0 2020-01-22                       Alberta          Canada        0.0
1 2020-01-22                      Anguilla  United Kingdom        0.0
2 2020-01-22                         Anhui           China        1.0
3 2020-01-22                         Aruba     Netherlands        0.0
4 2020-01-22  Australian Capital Territory       Australia        0.0
 Number of rows stored: 261630
 Latest date is: 2022-07-27 00:00:00


### Step 3. Filter and Doubling Rate Calculation

In [None]:
# %load src/features/build_features.py
import numpy as np
from sklearn import linear_model
reg = linear_model.LinearRegression(fit_intercept=True)
import pandas as pd

from scipy import signal

def get_doubling_time_via_regression(in_array):
    ''' Use a linear regression to approximate the doubling rate

        The three values are fitted against the x positions -1, 0 and 1 and
        the ratio intercept/slope of the fitted line is returned.

        Input Parameters:- in_array : pandas.series holding exactly three values
        Outputs:- Doubling rate: double, after how many days number of infected people count will double.
        Raises:- AssertionError if in_array does not hold exactly three values
    '''
    # check the window size first, so a wrong size is reported by this check
    # and not by a shape error raised inside the regression
    assert len(in_array)==3

    y = np.array(in_array)
    X = np.arange(-1,2).reshape(-1, 1)

    reg.fit(X,y)
    intercept=reg.intercept_
    # coef_ is an array with one entry per feature; take the single slope so that
    # a scalar is returned, which is what rolling(...).apply expects from its function
    slope=reg.coef_[0]

    return intercept/slope


def savgol_filter(df_input,column='confirmed',window=5):
    ''' Savgol Filter which can be used in groupby apply function (data structure kept)

        Smooths the data of one column; missing values are treated as 0.
        Input:-
        df_input : pandas.DataFrame - data of one individual country and state
        column : name of the column which undergoes filtering
        window : used data points to calculate the filter result
        Output:-
        df_result: copy of df_input with the additional column '<column>_filtered';
                   the index of df_input is preserved and df_input itself is not modified
    '''
    degree=1
    # work on a copy: adding the column to df_input itself would modify the data of
    # the caller (inside groupby.apply that is the group handed in by pandas)
    df_result=df_input.copy()

    filter_in=df_input[column].fillna(0) # attention with the neutral element here

    result=signal.savgol_filter(np.array(filter_in),
                           window, # window size used for filtering
                           degree)
    df_result[str(column+'_filtered')]=result
    return df_result


def calc_filtered_data(df_input,filter_on='confirmed'):
    '''  Calculate savgol filter and return merged data frame.
        Input:-
        df_input: input data frame, has to contain the columns 'state', 'country' and filter_on
        filter_on: defines the used column
        Output:-
        df_output: copy of df_input with the filter result joined (via the row index)
                   as new column '<filter_on>_filtered'
        Raises:- AssertionError if one of the required columns is missing
    '''

    must_contain=set(['state','country',filter_on])
    assert must_contain.issubset(set(df_input.columns)), ' Error in calc_filtered_data not all columns in data frame'

    df_output=df_input.copy() # copy otherwise the filter_on column will be overwritten
    # group_keys=False keeps the original row index on the result (no group labels are
    # prepended), which the index based merge below relies on.
    # filter_on is handed through to savgol_filter; without it the filter always looks
    # for its default column 'confirmed'.
    pd_filtered_result=(df_output[['state','country',filter_on]]
                            .groupby(['state','country'],group_keys=False)
                            .apply(savgol_filter,column=filter_on))
    df_output=pd.merge(df_output,pd_filtered_result[[str(filter_on+'_filtered')]],left_index=True,right_index=True,how='left')

    return df_output.copy()


def rolling_reg(df_input,col='confirmed'):
    ''' Approximate the doubling time with a regression over a sliding window.

        Every window of three consecutive rows of df_input[col] is handed
        (as pandas.Series) to get_doubling_time_via_regression; a window has
        to be complete (three rows) before a value is calculated.
    '''
    window_size=3
    sliding_windows=df_input[col].rolling(window=window_size,min_periods=window_size)
    return sliding_windows.apply(get_doubling_time_via_regression,raw=False)


def calc_doubling_rate(df_input,filter_on='confirmed'):
    ''' Append the approximated doubling rate of one column to the data frame.
        Input:-
        df_input: input data frame with the columns 'state', 'country' and filter_on
        filter_on: name of the column the doubling rate is calculated for
        Output:-
        df_output: df_input joined with the new column '<filter_on>_DR'
    '''
    required_columns={'state','country',filter_on}
    assert required_columns.issubset(set(df_input.columns)), ' Error in calc_doubling_rate not all columns in data frame'

    dr_column=filter_on+'_DR'

    # one rolling regression per (state, country) group; after reset_index the former
    # row index shows up as column 'level_2' and is renamed to 'index'
    per_group=df_input.groupby(['state','country']).apply(rolling_reg,filter_on)
    dr_frame=per_group.reset_index().rename(columns={filter_on:dr_column,'level_2':'index'})

    # join on the row index of the big table and on the 'index' column of the groupby result
    merged=pd.merge(df_input,dr_frame[['index',dr_column]],left_index=True,right_on=['index'],how='left')
    return merged.drop(columns=['index'])



if __name__ == '__main__':
    print('Start: Feature building.')
    pd_JH_data=pd.read_csv('data/processed/COVID_relational_confirmed.csv',sep=';',parse_dates=[0])

    # sorting - we assume the sliding window; going from top to bottom step by step.
    # The row index is not reset here: the filter and doubling rate results are
    # merged back onto the table via that index.
    pd_JH_data=pd_JH_data.sort_values('date',ascending=True).copy()

    # savgol filtered version of the confirmed cases.
    pd_result_larg=calc_filtered_data(pd_JH_data)

    # doubling rate for the non filtered data.
    pd_result_larg=calc_doubling_rate(pd_result_larg)

    # doubling rate for filtered data.
    pd_result_larg=calc_doubling_rate(pd_result_larg,'confirmed_filtered')

    # Keep the filtered doubling rate only where more than 100 cases are confirmed,
    # everything else is set to NaN (better visual rendering).
    # np.nan is used because the alias np.NaN was removed in NumPy 2.0.
    mask=pd_result_larg['confirmed']>100
    pd_result_larg['confirmed_filtered_DR']=pd_result_larg['confirmed_filtered_DR'].where(mask, other=np.nan)
    pd_result_larg.to_csv('data/processed/COVID_final_set.csv',sep=';',index=False)
    print('Complete: Feature Building.')

Start: Feature building.


### Step 4. Visual Board

In [None]:
# %load src/visualization/visualize.py
import pandas as pd
import numpy as np

import dash
dash.__version__
from dash import dcc as dcc
from dash import html as html
#import dash_html_components as html
from dash.dependencies import Input, Output,State

import plotly.graph_objects as go

import os
# ';'-separated data set from data/processed that feeds the whole dashboard
df_input_large=pd.read_csv('data/processed/COVID_final_set.csv',sep=';')
print(df_input_large.head(5))


# empty figure used as initial content of the graph component below
fig = go.Figure()

app = dash.Dash()
# page layout: headline texts, the two dropdowns and the graph, from top to bottom
app.layout = html.Div([
    dcc.Markdown('''
    #  Applied Data Science on COVID-19 data
    Goal of the project is to teach data science by applying a cross industry standard process,
    it covers the full walkthrough of: automated data gathering, data transformations,
    filtering and machine learning to approximating the doubling time, and
    (static) deployment of responsive dashboard.
    '''),
    
    dcc.Markdown('''
    ## Multi-Select Country for visualization
    '''),

    # one option per distinct value of the 'country' column
    dcc.Dropdown(
        id='country_drop_down',
        options=[ {'label': country,'value':country} for country in df_input_large['country'].unique()],
        value=['US', 'Germany','India'], # pre-selected
        multi=True
    ),
    
    dcc.Markdown('''
        ## Select Timeline of confirmed COVID-19 cases or the approximated doubling time
    '''),
    
    # every value is the name of the df_input_large column which the callback plots
    dcc.Dropdown(
    id='doubling_time',
    options=[
        {'label': 'Timeline Confirmed ', 'value': 'confirmed'},
        {'label': 'Timeline Confirmed Filtered', 'value': 'confirmed_filtered'},
        {'label': 'Timeline Doubling Rate', 'value': 'confirmed_DR'},
        {'label': 'Timeline Doubling Rate Filtered', 'value': 'confirmed_filtered_DR'},
    ],
    value='confirmed',
    multi=False
    ),
    
    # target of the update_figure callback (Output 'main_window_slope')
    dcc.Graph(figure=fig, id='main_window_slope')
])


@app.callback(
    Output('main_window_slope', 'figure'),
    [Input('country_drop_down', 'value'),
    Input('doubling_time', 'value')])
def update_figure(country_list,show_doubling):
    ''' Build the figure for the selected countries and the selected data column.

        Input:-
        country_list: selected countries (value of the 'country_drop_down' dropdown)
        show_doubling: name of the column to plot (value of the 'doubling_time' dropdown)
        Output:-
        dict with the entries 'data' (one trace per country) and 'layout'
    '''
    # The dropdown delivers column names; the doubling rate columns are the ones
    # ending with '_DR' ('confirmed_DR', 'confirmed_filtered_DR').
    if show_doubling.endswith('_DR'):
        my_yaxis={
            'type':"log",
            'title':'Approximated doubling rate over 3 days (larger numbers are better #stayathome)'
              }
        # doubling rates of the states of one country are averaged, adding them up
        # would not give a doubling rate
        aggregation='mean'
    else:
        my_yaxis={
            'type':"log",
            'title':'Confirmed infected people (source johns hopkins csse, log-scale)'
            }
        # case numbers of the states of one country are added up
        aggregation='sum'

    traces = []
    # 'or []' guards against a selection delivered as None
    for country in (country_list or []):
        df_plot=df_input_large[df_input_large['country']==country]
        # only the plotted column is aggregated; the text column 'state' is left out
        df_plot=df_plot[['country','date',show_doubling]].groupby(['country','date']).agg(aggregation).reset_index()

        traces.append(
            dict(
                    x=df_plot.date,
                    y=df_plot[show_doubling],
                    mode='markers+lines',
                    opacity=0.9,
                    name=country
                )
        )

    return {
            'data': traces,
            'layout': dict (
                            width=1280,
                            height=720,
                            xaxis={'title':'Timeline',
                                'tickangle':-45,
                                'nticks':20,
                                'tickfont':dict(size=14,color="#7f7f7f"),
                              },
                            yaxis=my_yaxis
                    )
            }


if __name__ == '__main__':
    # start the Dash server in debug mode; the automatic reloader is switched off
    app.run_server(debug=True, use_reloader=False)