# Defining some necessary functions
### Defining some necessary functions for modifying the data into a shape that's suitable for the analysis
### 1-Update the data; 2- Process Pipeline

In [19]:
import os
import os.path
from os import path
import subprocess
import pandas as pd
import numpy as np
import datetime
from scipy import signal
from sklearn import linear_model
reg = linear_model.LinearRegression(fit_intercept=True)

#***Defining some necessary functions for modifying the data into a shape that's suitable for the analysis***

#defining function to check if the current directory/folder is correct, to avoid runtime-errors

def check_path():
    """Make sure the process runs from the project root.

    When the current working directory is a folder called ``notebooks``,
    step one level up. The resulting working directory is printed in
    either case.
    """
    started_in_notebooks = os.path.basename(os.getcwd()) == 'notebooks'
    if started_in_notebooks:
        # the data/ folder lives next to notebooks/, i.e. in the parent folder
        os.chdir("..")

    report = "> Current directory is : " + os.getcwd() + ", correct !"
    print(report)

#*** 1-Update the data ***
    
#defining function to communicate with GitHub and update the Johns Hopkins COVID-19 data.

def git_comm():
    """Clone or update the Johns Hopkins COVID-19 repository below ``data/raw``.

    If ``data/raw/COVID-19`` already exists, ``git pull`` is run inside it;
    otherwise the repository is cloned into ``data/raw`` (which is created
    when missing). Git's stdout and stderr are printed as decoded text.

    The command is passed as an argument list (no shell), so it behaves the
    same on every platform. A missing ``git`` executable is reported instead
    of raising.
    """
    raw_dir = os.path.join('data', 'raw')
    repo_dir = os.path.join(raw_dir, 'COVID-19')

    if os.path.exists(repo_dir):
        print("> Communicating to GITHUB...")
        command = ['git', 'pull']
        workdir = repo_dir
    else:
        print("> Downloading data from GITHUB...")
        command = ['git', 'clone', 'https://github.com/cssegisanddata/COVID-19.git']
        workdir = raw_dir
        os.makedirs(workdir, exist_ok=True)   # cloning needs an existing target folder

    try:
        completed = subprocess.run(command, cwd=workdir,
                                   stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    except FileNotFoundError:
        # without a shell a missing executable raises; keep the function best-effort
        print("> GITHUB: ,git executable not found")
        return

    out = completed.stdout.decode(errors='replace')
    error = completed.stderr.decode(errors='replace')
    print("> GITHUB: "+str(out)+","+str(error))

#*** 2-Process pipeline ***

#defining function to create the relational model "relational_model" through making some operations on the data like;
#removing the "Lat", "Long" columns and modifying the "date" data type

def create_relational_model():
    """Build the long-format ("relational") table of confirmed cases.

    Reads the global confirmed-cases time series from the cloned repository
    under ``data/raw/COVID-19``, renames the region columns to ``state`` and
    ``country``, fills missing states with ``'no'``, drops ``Lat``/``Long``
    and reshapes the wide table (one column per date) into one row per
    (date, state, country) with the value in ``confirmed``.

    The table is written to ``data/processed/COVID_relational_confirmed.csv``
    (``;`` separated) and returned with the row number exposed as an extra
    ``index`` column.

    Returns:
        pandas.DataFrame with columns ``index``, ``date``, ``state``,
        ``country``, ``confirmed``.
    """
    print("> Creating Relational model..")
    # build the path from components so it works on every OS
    # (the former back-slash literal only resolved on Windows)
    data_path = os.path.join('data', 'raw', 'COVID-19', 'csse_covid_19_data',
                             'csse_covid_19_time_series',
                             'time_series_covid19_confirmed_global.csv')
    data_raw = pd.read_csv(data_path)

    data_base = data_raw.rename(columns={'Country/Region': 'country',
                                         'Province/State': 'state'})
    data_base['state'] = data_base['state'].fillna('no')

    # wide -> long: dates become rows, (state, country) move from columns to rows
    relational_model = (
        data_base.drop(['Lat', 'Long'], axis=1)
        .set_index(['state', 'country'])
        .T
        .stack(level=[0, 1])
        .reset_index()
        .rename(columns={'level_0': 'date', 0: 'confirmed'})
    )

    relational_model['date'] = pd.to_datetime(relational_model['date'])  # changing to date type

    output_dir = os.path.join('data', 'processed')
    os.makedirs(output_dir, exist_ok=True)   # do not fail on a fresh checkout
    relational_model.to_csv(os.path.join(output_dir, 'COVID_relational_confirmed.csv'),
                            sep=';', index=False)  # saving the data

    print("> Relational model created !  Last updated on: "+str(relational_model["date"].iloc[-1]))

    # expose the row number as an 'index' column; add_features merges on it
    relational_model = relational_model.reset_index()

    return relational_model
#defining function to calculate the doubling time using linear regression and "reg.fit()", necessary for approximating the doubling rate

def get_doubling_time_via_regression(in_array):
    """Approximate the doubling time of three consecutive values.

    Fits a straight line through the three values at x = -1, 0, 1 with the
    module-level ``reg`` (scikit-learn ``LinearRegression``) and returns
    ``intercept / slope`` as a scalar.

    Args:
        in_array: sequence of exactly three numeric values.

    Returns:
        A NumPy float scalar. With a slope of zero the division yields
        ``inf`` (or ``nan`` for 0/0) together with a NumPy warning.

    Raises:
        ValueError: if ``in_array`` does not contain exactly three values.
    """
    # validate before fitting; unlike assert this is not stripped under -O
    if len(in_array) != 3:
        raise ValueError("in_array must contain exactly 3 values")

    y = np.array(in_array)
    X = np.arange(-1, 2).reshape(-1, 1)

    reg.fit(X, y)
    intercept = reg.intercept_
    # coef_ is a length-1 array for a single feature; take the scalar so the
    # rolling apply receives a number rather than an array
    slope = reg.coef_[0]

    return intercept/slope

def rolling_reg(df_input,col='confirmed'):
    """Rolling doubling-time estimate for one column of a data frame.

    Args:
        df_input: data frame (has to be a data frame) containing ``col``.
        col: name of the column to evaluate.

    Returns:
        A single series (mandatory for group-by apply): the value of
        ``get_doubling_time_via_regression`` for every full window of three
        consecutive rows, NaN where fewer than three rows are available.
    """
    window_length = 3
    windows = df_input[col].rolling(window=window_length,
                                    min_periods=window_length)
    return windows.apply(get_doubling_time_via_regression, raw=False)

#defining function used to filter the data using groupby apply function

def savgol_filter(df_input,column='confirmed',window=5):
    """Add a Savitzky-Golay smoothed copy of ``column`` to a data frame.

    Args:
        df_input: data frame containing ``column``.
        column: name of the column to smooth.
        window: filter window length handed to ``scipy.signal.savgol_filter``
            (previously ignored in favour of a hard-coded 5).

    Returns:
        A copy of ``df_input`` (same index) with the additional column
        ``<column>_filtered``. The input frame itself is left untouched.
    """
    degree = 1                      # polynomial order of the filter
    df_result = df_input.copy()     # work on a copy so the caller's frame/index stay intact

    filter_in = df_input[column].fillna(0)  # missing values are treated as 0

    result = signal.savgol_filter(np.array(filter_in),
                                  window,  # window size used for filtering
                                  degree)
    df_result[column+'_filtered'] = result
    return df_result

#defining function to implement the features to the data like; doubling rate, doubling rate filtered, confirmed filtered/

def _concat_group_results(frame, func, *args):
    """Call ``func(group, *args)`` for every (state, country) group and concatenate the results."""
    pieces = [func(group, *args) for _, group in frame.groupby(['state', 'country'])]
    return pd.concat(pieces)


def add_features():
    """Create the final data set with doubling rates and filtered values.

    Builds the relational model via ``create_relational_model`` and adds,
    per (state, country) group:

    * ``confirmed_DR`` - ``rolling_reg`` on ``confirmed``
    * ``confirmed_filtered`` - ``savgol_filter`` on ``confirmed``
    * ``confirmed_filtered_DR`` - ``rolling_reg`` on ``confirmed_filtered``,
      kept only where ``confirmed`` exceeds 300

    The groups are processed with an explicit loop and joined back on the
    ``index`` column, so the result does not depend on how a particular
    pandas version labels the output of ``groupby.apply``.

    The table is written to ``data/processed/COVID_final_set.csv``
    (``;`` separated) and returned.
    """
    relational_model = create_relational_model()      # returns the previously defined relational model
    print("> Adding some features to the data....This might take some time..")
    base_columns = ['state', 'country', 'confirmed']

    # calculate the doubling rate for the data set; the row labels of the
    # per-group results are the row numbers stored in the 'index' column
    doubling = _concat_group_results(relational_model[base_columns], rolling_reg, 'confirmed')
    pd_DR_result = doubling.rename('confirmed_DR').rename_axis('index').reset_index()
    pd_result_larg = pd.merge(relational_model, pd_DR_result[['index', 'confirmed_DR']],
                              on=['index'], how='left')

    # calculate confirmed_filtered for the data set
    filtered = _concat_group_results(relational_model[base_columns], savgol_filter)
    pd_filtered_result = filtered.rename_axis('index').reset_index()
    pd_result_larg = pd.merge(pd_result_larg, pd_filtered_result[['index', 'confirmed_filtered']],
                              on=['index'], how='left')
    print("> Almost done...")

    # calculate the filtered doubling rate for the data set
    filtered_doubling = _concat_group_results(
        pd_result_larg[['state', 'country', 'confirmed_filtered']],
        rolling_reg, 'confirmed_filtered')
    pd_filtered_doubling = (filtered_doubling.rename('confirmed_filtered_DR')
                            .rename_axis('index').reset_index())

    # merging the results together to get the final table
    pd_result_larg = pd.merge(pd_result_larg, pd_filtered_doubling[['index', 'confirmed_filtered_DR']],
                              on=['index'], how='left')
    mask = pd_result_larg['confirmed'] > 300  # better for the doubling rates calculation
    # np.nan instead of np.NaN: the upper-case alias was removed in NumPy 2.0
    pd_result_larg['confirmed_filtered_DR'] = pd_result_larg['confirmed_filtered_DR'].where(mask, other=np.nan)

    # saving the final data to "data/processed/COVID_final_set.csv"
    pd_result_larg.to_csv('data/processed/COVID_final_set.csv', sep=';', index=False)
    # os.path.join avoids the invalid "\d", "\p", "\C" escapes of the old message
    saved_to = os.path.join(os.getcwd(), 'data', 'processed', 'COVID_final_set.csv')
    print("> Final data has been created  !  saved in: " + saved_to)
    print("> Results down vvvvv")
    return pd_result_larg

# 3-Filter and doubling Rate calculation
### calling the functions to see the results (might take some time to run; large data set)

In [20]:

#*** 3-Filter and doubling Rate calculation ***

#calling the functions to see the results (might takes some time to implement; large data set)

check_path()             # first, check the current path (steps out of "notebooks" if needed)

git_comm()               # then, communicate with GitHub to clone/update the data set (you can skip it if you want to use the old data)

add_features().tail()    # third, modify and add the features to the data set and show the last rows;
                         # note: add_features() already creates the relational model itself.
    
#show the full data table with all the features

> Current directory is : C:\Users\Anwar\Desktop\COVID-19 Project, correct !
> Downloading data from GITHUB...
> GITHUB: b'',b"Cloning into 'COVID-19'...\nUpdating files:  82% (448/546)\rUpdating files:  83% (454/546)\rUpdating files:  84% (459/546)\rUpdating files:  85% (465/546)\rUpdating files:  86% (470/546)\rUpdating files:  87% (476/546)\rUpdating files:  88% (481/546)\rUpdating files:  89% (486/546)\rUpdating files:  90% (492/546)\rUpdating files:  91% (497/546)\rUpdating files:  92% (503/546)\rUpdating files:  93% (508/546)\rUpdating files:  94% (514/546)\rUpdating files:  95% (519/546)\rUpdating files:  96% (525/546)\rUpdating files:  97% (530/546)\rUpdating files:  98% (536/546)\rUpdating files:  99% (541/546)\rUpdating files: 100% (546/546)\rUpdating files: 100% (546/546), done.\n"
> Creating Relational model..
> Relational model created !  Last updated on: 2020-08-07 00:00:00
> Adding some features to the data....This might take some time..
> Almost done...
> Final data has 

Unnamed: 0,index,date,state,country,confirmed,confirmed_DR,confirmed_filtered,confirmed_filtered_DR
52929,52929,2020-08-07,no,West Bank and Gaza,13722.0,40.776256,13697.2,44.810033
52930,52930,2020-08-07,no,Western Sahara,10.0,inf,10.0,
52931,52931,2020-08-07,no,Yemen,1796.0,107.616162,1790.6,134.651515
52932,52932,2020-08-07,no,Zambia,7486.0,31.137931,7445.6,33.107192
52933,52933,2020-08-07,no,Zimbabwe,4451.0,37.713043,4435.4,49.981609


# 4-SIR Model calculation
### you can run this part separately from the part above, using the data stored in the project under "\data\processed\COVID_final_set.csv"


In [21]:
#*** 4-SIR Model calculation ***

# you can run this part separetly from the part above with the data stored in the project "\data\processed\COVID_final_set.csv"

from scipy import optimize
from scipy import integrate

N0=1000000  # population size used by the SIR equations (S0 is derived from it as N0 - I0)
R0=0        # initial number of recovered people (third component of the SIR start vector) - NOT the basic reproduction number
gamma=0.1   # recovery rate
beta=0.4    # infection rate (infection spread dynamics)
# the SIR model functions
def SIR_model(SIR,beta,gamma):
    """Return the SIR changes ``[dS, dI, dR]`` for one step.

    Args:
        SIR: sequence ``(S, I, R)`` with the current compartment sizes.
        beta: infection rate.
        gamma: recovery rate.

    Uses the module-level population size ``N0``.
    """
    susceptible, infected, _recovered = SIR
    infection_flow = beta*susceptible*infected/N0   # people moving from S to I
    recovery_flow = gamma*infected                  # people moving from I to R
    return [-infection_flow, infection_flow-recovery_flow, recovery_flow]
    
def SIR_model_t(SIR,t,beta,gamma):
    """SIR right-hand side in the ``func(y, t, ...)`` form used with ``odeint``.

    Args:
        SIR: sequence ``(S, I, R)`` with the current compartment sizes.
        t: time argument; not used by the equations.
        beta: infection rate.
        gamma: recovery rate.

    Returns:
        Tuple ``(dS_dt, dI_dt, dR_dt)``; uses the module-level ``N0``.
    """
    susceptible, infected, _recovered = SIR
    infection_flow = beta*susceptible*infected/N0   # people moving from S to I
    recovery_flow = gamma*infected                  # people moving from I to R
    return -infection_flow, infection_flow-recovery_flow, recovery_flow

#function to calculate the propagation rates with a beta that changes between 0.11 and 0.4

def dynamic_beta_calculate(t_initial,t_intro_measures,t_hold,t_relax,S0,I0):
    """Simulate the SIR model step by step with a time-dependent beta.

    The infection rate follows four consecutive phases:

    1. ``t_initial`` steps at ``beta_max`` (0.4)
    2. ``t_intro_measures`` steps falling linearly to ``beta_min`` (0.11)
    3. ``t_hold`` steps at ``beta_min``
    4. ``t_relax`` steps rising linearly back to ``beta_max``

    Each step adds the result of ``SIR_model`` to the current state, using
    the module-level ``gamma`` and starting from ``(S0, I0, R0)``.

    Args:
        t_initial, t_intro_measures, t_hold, t_relax: phase lengths in steps
            (non-negative integers).
        S0: initial number of susceptible people.
        I0: initial number of infected people.

    Returns:
        pandas.DataFrame with one row per step and the columns
        ``susceptible``, ``infected`` and ``recovered``.
    """
    beta_max = 0.4
    beta_min = 0.11
    pd_beta = np.concatenate((
        np.full(t_initial, beta_max),
        np.linspace(beta_max, beta_min, t_intro_measures),
        np.full(t_hold, beta_min),
        np.linspace(beta_min, beta_max, t_relax),
    ))

    SIR = np.array([S0, I0, R0], dtype=float)

    # collect the states in a list and build the frame once:
    # DataFrame.append was removed in pandas 2.0 and copied the frame on every step
    states = []
    for each_beta in pd_beta:
        SIR = SIR + np.asarray(SIR_model(SIR, each_beta, gamma))
        states.append(SIR)

    return pd.DataFrame(states, columns=['susceptible', 'infected', 'recovered'])


#function to calculate best fit
  
def fit_SIR_optimize(I0,S0,ydata,t):
    """Fit beta and gamma of the SIR model to observed infection numbers.

    Integrates ``SIR_model_t`` with ``scipy.integrate.odeint`` from the start
    vector ``(S0, I0, R0)`` over ``t`` and lets ``scipy.optimize.curve_fit``
    choose beta and gamma so that the infected curve matches ``ydata``.

    Args:
        I0: initial number of infected people.
        S0: initial number of susceptible people.
        ydata: observed infected counts, one value per entry of ``t``.
        t: time points of the observations.

    Returns:
        ``[fitted, popt]`` - the fitted infected curve evaluated at ``t`` and
        the optimal parameters ``(beta, gamma)``.
    """
    # NOTE: the step-wise simulation that used to run here was removed; its
    # result was never used and it relied on DataFrame.append (gone in pandas 2.0).

    def fit_odeint(x, beta, gamma):
        # x is required by curve_fit's signature; the time grid is taken from t.
        # Column 1 of the solution is the infected compartment I(t).
        return integrate.odeint(SIR_model_t, (S0, I0, R0), t, args=(beta, gamma))[:, 1]

    popt, pcov = optimize.curve_fit(fit_odeint, t, ydata, maxfev=10000)
    fitted = fit_odeint(t, *popt)
    return [fitted, popt]

    


# 5-Visual Board
### Visualize all the data on the dash board

In [22]:
#*** 5-Visual Board ***

import pandas as pd
import numpy as np
import os
import plotly.graph_objects as go
import dash
dash.__version__
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output,State



# Make sure we run from the project root: if the working directory is the
# "notebooks" folder, step one level up so the relative data path resolves.
foldername = os.path.basename(os.getcwd())   #check the path again and modify it if necessary
if foldername=='notebooks':
    os.chdir("..")
    
# Final data set (";" separated) written by add_features(); read once at start-up.
df_input_large=pd.read_csv('data/processed/COVID_final_set.csv',sep=';') #assign the final data to df_input_large variable

# Empty figure shown in the graph until the callback delivers the first one.
fig = go.Figure()

app = dash.Dash()
    
# Page layout: heading, country/feature selectors, SIR selectors,
# four sliders for the dynamic-beta phases and the graph itself.
app.layout = html.Div([

    dcc.Markdown('''
    ##  Applied Data Science on COVID-19 data

    Automated data gathering, data transformations,
    filtering and machine learning to approximating the doubling time, and
    (static) deployment of responsive dashboard.

    '''),

    dcc.Markdown('''
    #### Multi-Select Country for visualization
    '''),


    # countries plotted in the timeline view (multi-select)
    dcc.Dropdown(
        id='country_drop_down',
        options=[ {'label': each2,'value':each2} for each2 in df_input_large['country'].unique()],
        value=['US', 'Germany','Italy'], # countries that are pre-selected
        multi=True
    ),

    dcc.Markdown('''
        #### Select Timeline of confirmed COVID-19 cases or the approximated doubling time
        '''),


    # column of the final data set shown in the timeline view
    dcc.Dropdown(
    id='features_list',
    options=[
        {'label': 'Timeline Confirmed/Actual ', 'value': 'confirmed'},
        {'label': 'Timeline Confirmed Filtered', 'value': 'confirmed_filtered'},
        {'label': 'Timeline Doubling Rate', 'value': 'confirmed_DR'},
        {'label': 'Timeline Doubling Rate Filtered', 'value': 'confirmed_filtered_DR'},

    ],
    value='confirmed',
    multi=False
    ),
    
        dcc.Markdown('''
    ### SIR Model
    #### Single-Select Country for visualization
    ###### (Recommended countries for testing: Germany, US, Japan, Russia, Singapore, Iran, Brazil, Algeria, Lebanon, India, Spain, Italy, Austria, Belgium, Finland, Greece, Norway, Bahrain, Cambodia, Chile, Croatia, Diamond Princess, Georgia, Iraq, South Korea, Kuwait, Malaysia, Nepal, North Macedonia, Oman, Romania, Sweden, Switzerland, Taiwan, Thailand, United Aram Emirates, Vietnam, Phillipines, Sri Lanka)  
    '''),


    # single country used by the two SIR views
    dcc.Dropdown(
        id='country_drop_down2',
        options=[ {'label': each3,'value':each3} for each3 in df_input_large['country'].unique()],
        value='Germany', # country that is pre-selected
        multi=False
    ),
    
    dcc.Markdown('''
    
    #### Fit for SIR Model & Dynamic beta in SIR-infection rate (might take some time to update in graph)
    '''),
    
        # SIR mode: 'actual' keeps the timeline view, 'simulated' fits the SIR
        # model, 'dynamic' simulates with the slider-controlled beta phases
        dcc.Dropdown(
    id='features_list2',
    options=[
        {'label': 'Deactivated', 'value': 'actual'},
        {'label': 'Fit for SIR Model', 'value': 'simulated'},
        {'label': 'Dynamic beta in SIR', 'value': 'dynamic'},

    ],
    value='actual',
    multi=False
    ),
        dcc.Markdown('''
    
    ##### Dynamic beta in SIR Parameters (Adjustable)
    '''),
# defining 4 sliders for the variables t_initial, t_intro_measure, t_hold, t_relax of the dynamic beta;
# their values are passed to dynamic_beta_calculate() by the callback
     dcc.Markdown('''
    
    ###### t_initial
    '''),
    dcc.Slider(
        id='slider_1',
        min=0,
        max=100,
        step=1,
        value=21,     #default value t_initial, chosen based on Germany
        tooltip={'always_visible': True},

    ),
         dcc.Markdown('''
    
    ###### t_intro_measure
    '''),
        dcc.Slider(
        id='slider_2',
        min=0,
        max=100,
        step=1,
        value=20, 
        tooltip={'always_visible': True},  

    ),
         dcc.Markdown('''
    
    ###### t_hold
    '''),
        dcc.Slider(
        id='slider_3',
        min=0,
        max=100,
        step=1,
        value=32,
        tooltip={'always_visible': True},
        
    ),
         dcc.Markdown('''
    
    ###### t_relax
    '''),
        dcc.Slider(
        id='slider_4',
        min=0,
        max=140,
        step=1,
        value=82,
        tooltip={'always_visible': True},
            
     ),
            

    # target of the update_figure callback
    dcc.Graph(figure=fig, id='main_window_slope')

])


def _line_trace(x, y, name):
    """Build one 'markers+lines' trace dictionary for the figure."""
    return dict(x=x, y=y, mode='markers+lines', opacity=0.9, name=name)


@app.callback(
    Output('main_window_slope', 'figure'),
    [Input('country_drop_down', 'value'),
     Input('features_list', 'value'),
     Input('country_drop_down2', 'value'),
     Input('features_list2', 'value'),
     Input('slider_1', 'value'),
     Input('slider_2', 'value'),
     Input('slider_3', 'value'),
     Input('slider_4', 'value')])
def update_figure(country_list,show_features,country_list2,show_features2,s1,s2,s3,s4):
    """Dash callback: rebuild the main figure from the current selections.

    Args:
        country_list: countries selected for the timeline view.
        show_features: column of the final data set to plot in the timeline view.
        country_list2: single country used by the SIR views.
        show_features2: ``'dynamic'`` (SIR with slider-controlled beta),
            ``'simulated'`` (SIR fit) or anything else for the timeline view.
        s1, s2, s3, s4: slider values (t_initial, t_intro_measure, t_hold,
            t_relax) handed to ``dynamic_beta_calculate``.

    Returns:
        Figure dictionary with the keys ``data`` (list of traces) and ``layout``.

    Changes compared with the previous version:
        * both doubling-rate features get the doubling-rate axis title (the
          old substring test missed ``confirmed_filtered_DR``);
        * the SIR input is only prepared in the SIR modes, is summed per date
          (countries with several states) and an empty selection yields an
          empty figure instead of an IndexError;
        * the SIR fit runs once instead of twice;
        * the text column ``state`` is no longer fed into mean/sum.
    """
    if show_features in ('confirmed_DR', 'confirmed_filtered_DR'):
        my_yaxis = {'type': "log",
                    'title': 'Approximated doubling rate over 5 days'
                    }
    else:
        my_yaxis = {'type': "log",
                    'title': 'Confirmed infected people'
                    }

    my_title = ' '
    traces = []

    if show_features2 in ('dynamic', 'simulated'):
        country_name = country_list2
        df_analyse = df_input_large[df_input_large['country'] == country_name]
        # one value per date, then ignore the first 35 days
        daily_confirmed = df_analyse.groupby('date')['confirmed'].sum()
        ydata = np.array(daily_confirmed)[35:]

        if ydata.size == 0:
            # nothing selected, or not enough history for the SIR views
            my_title = 'No data available for the selected country'
        else:
            t = np.arange(len(ydata))
            I0 = ydata[0]
            S0 = N0-I0

            if show_features2 == 'dynamic':
                my_title = "beta_max=0.4, beta_min=0.11, gamma=0.1"
                propagation_rates = dynamic_beta_calculate(s1, s2, s3, s4, S0, I0)  # calculate the propagation rates
                traces.append(_line_trace(propagation_rates.index,
                                          propagation_rates.infected,
                                          country_name+" (Simulated dynamic)"))
            else:
                # run the (expensive) fit only once and unpack both results
                fitted, popt = fit_SIR_optimize(I0, S0, ydata, t)
                # show calculated optimal beta, gamma and Basic Reproduction Number R0
                my_title = "Optimal parameters: beta ="+str(popt[0])+ ", and gamma = "+str(popt[1])+ "\n Basic Reproduction Number R0 "+str(popt[0]/ popt[1])
                traces.append(_line_trace(t, fitted, country_name+" (Simulated Fit)"))

            traces.append(_line_trace(t, ydata, country_name+" (Actual Infected)"))

    else:
        plot_columns = ['country', 'date', 'confirmed', 'confirmed_filtered',
                        'confirmed_DR', 'confirmed_filtered_DR']
        for each2 in country_list or []:
            per_date = df_input_large.loc[df_input_large['country'] == each2,
                                          plot_columns].groupby(['country', 'date'])

            if show_features == 'confirmed_filtered_DR':
                df_plot = per_date.mean().reset_index()
            else:
                # NOTE(review): 'confirmed_DR' is summed over the states of a
                # country as before; a mean may be intended - confirm
                df_plot = per_date.sum().reset_index()

            traces.append(_line_trace(df_plot.date, df_plot[show_features], each2))

    return {
            'data': traces,
            'layout': dict (
                width=1280,
                height=720,
                title=my_title,

                xaxis={'title':'Timeline',
                        'tickangle':-45,
                        'nticks':20,
                        'tickfont':dict(size=14,color="#7f7f7f"),
                      },

                yaxis=my_yaxis

        )
    }



# Start the Dash development server only when this file/cell is run as the main program.
if __name__ == '__main__':

    # debug mode on, auto-reloader off
    # (presumably because the reloader does not work from a notebook kernel - TODO confirm)
    app.run_server(debug=True, use_reloader=False)

    


Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

 in production, use a production WSGI server like gunicorn instead.

 in production, use a production WSGI server like gunicorn instead.

 in production, use a production WSGI server like gunicorn instead.

 in production, use a production WSGI server like gunicorn instead.

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
   Use a production WSGI server instead.
 * Debug mode: on
