# Data Load


In [52]:
import json
import os
import subprocess
from datetime import datetime

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.graph_objs as go
import requests
from scipy import signal
from sklearn import linear_model

%matplotlib inline

In [53]:
# Load the flat per-country table; the first CSV column is parsed as datetime.
df_analyse = pd.read_csv(
    '../data/processed/COVID_small_flat_table.csv',
    parse_dates=[0],
)
# Show the latest rows as a quick sanity check of the load.
df_analyse.sort_values('date').tail()

Unnamed: 0,date,Australia,China,Denmark,Canada,France,United Kingdom,India
229,2020-09-07,26373,90078,18540,134295,367174,352451,4280422
230,2020-09-08,26465,90087,18784,135757,373718,354932,4370128
231,2020-09-09,26524,90100,19036,136135,383292,357613,4465863
232,2020-09-10,26564,90127,19353,136956,392243,360544,4562414
233,2020-09-11,26607,90145,19646,137676,401890,364088,4659984


In [54]:
# Every column after the first one ('date' in the output above) is treated as a country.
country_list=df_analyse.columns[1:]

# Helper Functions

In [55]:
def quick_plot(x_in, df_input, y_scale='log', slider=True):
    '''Draw every column of df_input against x_in in a single plotly figure.

    Parameters
    ----------
    x_in : array-like
        Values for the x axis, shared by all traces.
    df_input : pandas.DataFrame
        One trace is added per column; the column name is used as trace name.
    y_scale : str
        Axis type handed to ``fig.update_yaxes`` (the notebook uses 'log' and 'linear').
    slider : bool
        When true, a range slider is enabled on the x axis.

    Returns
    -------
    None. The figure is displayed with ``fig.show()``.
    '''
    fig = go.Figure()
    for idx, each in enumerate(df_input.columns):
        trace_args = dict(x=x_in, y=df_input[each], name=each)
        # The first three columns keep the default opacity, all later ones are
        # slightly transparent. The former slices [0:3] and [4:] left out the
        # column at index 3 entirely.
        if idx >= 3:
            trace_args['opacity'] = 0.8
        fig.add_trace(go.Scatter(**trace_args))

    fig.update_layout(autosize=True,
                      width=1024,
                      height=768,
                      font=dict(
                          family="PT Sans, monospace",
                          size=18,
                          color="#7f7f7f"
                      )
                      )
    fig.update_yaxes(type=y_scale)  # range=[0.1,2]
    fig.update_xaxes(tickangle=-45,
                     nticks=20,
                     tickfont=dict(size=14, color="#7f7f7f")
                     )
    if slider:
        fig.update_layout(xaxis_rangeslider_visible=True)
    # Display the figure whether or not the slider is enabled; previously
    # show() was only reached when slider was True.
    fig.show()
        

In [56]:
# All country columns (everything except the leading 'date' column) on a log axis.
quick_plot(
    df_analyse.date,
    df_analyse.iloc[:, 1:],
    y_scale='log',
    slider=True,
)

In [57]:
# A country's values are only kept for the synchronized timeline once they exceed this number.
threshold=100

In [58]:
# For every country keep only the values above `threshold`, so each series
# starts at its own "day 0".
compare_list = []
for country in df_analyse.columns[1:]:
    country_values = df_analyse[country]
    compare_list.append(np.array(country_values[country_values > threshold]))

In [59]:
# One row per country, then transposed: each country becomes a column.
# Series shorter than the longest one end in NaN (see the printed table below).
pd_sync_timelines= pd.DataFrame(compare_list,index=df_analyse.columns[1:]).T

In [60]:
# 'date' is a running counter 0, 1, 2, ... here (rows since the threshold was passed), not a calendar date.
pd_sync_timelines['date']=np.arange(pd_sync_timelines.shape[0])

In [61]:
# Plain-text view of the synchronized table, including the NaN padding of shorter series.
print(pd_sync_timelines)

     Australia    China  Denmark  Canada  France  United Kingdom  India  date
0        107.0    548.0    264.0   108.0   130.0           134.0  102.0     0
1        128.0    643.0    444.0   117.0   191.0           189.0  113.0     1
2        128.0    920.0    617.0   193.0   204.0           246.0  119.0     2
3        200.0   1406.0    804.0   198.0   288.0           295.0  142.0     3
4        250.0   2075.0    836.0   252.0   380.0           374.0  156.0     4
..         ...      ...      ...     ...     ...             ...    ...   ...
229        NaN  90078.0      NaN     NaN     NaN             NaN    NaN   229
230        NaN  90087.0      NaN     NaN     NaN             NaN    NaN   230
231        NaN  90100.0      NaN     NaN     NaN             NaN    NaN   231
232        NaN  90127.0      NaN     NaN     NaN             NaN    NaN   232
233        NaN  90145.0      NaN     NaN     NaN             NaN    NaN   233

[234 rows x 8 columns]


In [62]:
def quick_plot1(x_in, df_input, y_scale='log', slider=False):
    '''Variant of quick_plot that draws the first three columns as markers.

    Parameters
    ----------
    x_in : array-like
        Values for the x axis, shared by all traces.
    df_input : pandas.DataFrame
        One trace is added per column; the column name is used as trace name.
        The first three columns are drawn as half-transparent markers, all
        remaining columns with plotly's default trace mode at opacity 0.8.
    y_scale : str
        Axis type handed to ``fig.update_yaxes`` (the notebook uses 'log' and 'linear').
    slider : bool
        When true, a range slider is enabled on the x axis.

    Returns
    -------
    None. The figure is displayed with ``fig.show()``.
    '''
    fig = go.Figure()
    for idx, each in enumerate(df_input.columns):
        if idx < 3:
            trace = go.Scatter(
                x=x_in,
                y=df_input[each],
                name=each,
                marker=dict(color=['red', 'blue']),
                mode='markers',
                opacity=0.5,
            )
        else:
            # The former slice [4:] left out the column at index 3 entirely.
            trace = go.Scatter(
                x=x_in,
                y=df_input[each],
                name=each,
                opacity=0.8,
            )
        fig.add_trace(trace)

    fig.update_layout(autosize=True,
                      width=1024,
                      height=768,
                      font=dict(
                          family="PT Sans, monospace",
                          size=18,
                          color="#7f7f7f"
                      )
                      )
    fig.update_yaxes(type=y_scale)  # range=[0.1,2]
    fig.update_xaxes(tickangle=-45,
                     nticks=20,
                     tickfont=dict(size=14, color="#7f7f7f")
                     )
    if slider:
        fig.update_layout(xaxis_rangeslider_visible=True)
    # Display the figure whether or not the slider is enabled; previously the
    # default slider=False meant show() was never reached.
    fig.show()
        

In [63]:
# Plot every country series against the day counter. The 'date' column is the
# x axis, so it is removed from the plotted frame by name; the previous
# iloc[:, 1:] dropped the first country instead and drew 'date' as a series.
quick_plot1(pd_sync_timelines.date,
            pd_sync_timelines.drop(columns='date'),
            y_scale='log', slider=True)

Exponential growth with doubling time $T$: $N(t) = N_0 \cdot 2^{t/T}$

In [64]:
def doubling_rate(N_0, t, T_d):
    '''Evaluate the exponential growth curve N_0 * 2**(t / T_d).

    N_0 : value at t = 0
    t   : elapsed time, scalar or numpy array
    T_d : doubling time, in the same unit as t
    '''
    exponent = t / T_d
    growth_factor = np.power(2, exponent)
    return N_0 * growth_factor

In [65]:
max_days = 100
# Reference curves that all start at 100 and double at a fixed interval (in days).
day_axis = np.arange(max_days)
reference_periods = {
    'doubling every day': 1,
    'doubling every 2 days': 2,
    'doubling every 4 days': 4,
    'doubling every 10 days': 10,
}
norm_slopes = {
    label: doubling_rate(100, day_axis, period)
    for label, period in reference_periods.items()
}

In [66]:
# Inspect the reference curves (one array of max_days values per label).
norm_slopes

{'doubling every day': array([1.00000000e+02, 2.00000000e+02, 4.00000000e+02, 8.00000000e+02,
        1.60000000e+03, 3.20000000e+03, 6.40000000e+03, 1.28000000e+04,
        2.56000000e+04, 5.12000000e+04, 1.02400000e+05, 2.04800000e+05,
        4.09600000e+05, 8.19200000e+05, 1.63840000e+06, 3.27680000e+06,
        6.55360000e+06, 1.31072000e+07, 2.62144000e+07, 5.24288000e+07,
        1.04857600e+08, 2.09715200e+08, 4.19430400e+08, 8.38860800e+08,
        1.67772160e+09, 3.35544320e+09, 6.71088640e+09, 1.34217728e+10,
        2.68435456e+10, 5.36870912e+10, 1.07374182e+11, 2.14748365e+11,
        4.29496730e+11, 8.58993459e+11, 1.71798692e+12, 3.43597384e+12,
        6.87194767e+12, 1.37438953e+13, 2.74877907e+13, 5.49755814e+13,
        1.09951163e+14, 2.19902326e+14, 4.39804651e+14, 8.79609302e+14,
        1.75921860e+15, 3.51843721e+15, 7.03687442e+15, 1.40737488e+16,
        2.81474977e+16, 5.62949953e+16, 1.12589991e+17, 2.25179981e+17,
        4.50359963e+17, 9.00719925e+17, 1.

In [67]:
# Put the reference curves side by side with the country series (column-wise concat).
# The reference curves only span max_days rows, so they are empty beyond that (see output below).
pd_sync_timelines_w_slope= pd.concat([pd.DataFrame(norm_slopes),pd_sync_timelines],axis=1)

In [68]:
# Inspect the combined table: reference curves first, then the countries, then the day counter.
pd_sync_timelines_w_slope

Unnamed: 0,doubling every day,doubling every 2 days,doubling every 4 days,doubling every 10 days,Australia,China,Denmark,Canada,France,United Kingdom,India,date
0,100.0,100.000000,100.000000,100.000000,107.0,548.0,264.0,108.0,130.0,134.0,102.0,0
1,200.0,141.421356,118.920712,107.177346,128.0,643.0,444.0,117.0,191.0,189.0,113.0,1
2,400.0,200.000000,141.421356,114.869835,128.0,920.0,617.0,193.0,204.0,246.0,119.0,2
3,800.0,282.842712,168.179283,123.114441,200.0,1406.0,804.0,198.0,288.0,295.0,142.0,3
4,1600.0,400.000000,200.000000,131.950791,250.0,2075.0,836.0,252.0,380.0,374.0,156.0,4
...,...,...,...,...,...,...,...,...,...,...,...,...
229,,,,,,90078.0,,,,,,229
230,,,,,,90087.0,,,,,,230
231,,,,,,90100.0,,,,,,231
232,,,,,,90127.0,,,,,,232


In [69]:
# First five columns of the combined table: the four reference curves plus Australia.
quick_plot(
    pd_sync_timelines_w_slope.date,
    pd_sync_timelines_w_slope.iloc[:, 0:5],
    y_scale='log',
    slider=True,
)


In [70]:
# Persist the combined table; ';' is the field separator and the row index is not written.
pd_sync_timelines_w_slope.to_csv(
    '../data/processed/COVID_small_sync_timeline_table.csv',
    sep=';',
    index=False,
)

# Understanding Linear Regression

In [71]:
# linear_model and signal are imported in the notebook's import cell at the top.
# No intercept term: the fitted line is forced through the origin.
reg = linear_model.LinearRegression(fit_intercept=False)

In [72]:
# Regression inputs: the log of France's values from row 50 onwards,
# against a day index that starts again at 0.
france_values = df_analyse['France']
l_vec = len(france_values)
x = np.arange(l_vec - 50).reshape(-1, 1)
y = np.log(np.array(france_values[50:]))


In [73]:
# Fit the log-values against the day index (this estimator has no intercept term).
reg.fit(x,y)

LinearRegression(copy_X=True, fit_intercept=False, n_jobs=None, normalize=False)

In [74]:
# Evaluate the fitted line for l_vec consecutive day indices starting at 0.
# NOTE(review): the fit used x = 0 for row 50 of the table, while the
# prediction is later attached row by row starting at row 0, so the curve
# looks shifted by 50 rows relative to the data. Confirm this is intended.
X_hat= np.arange(l_vec).reshape(-1,1)
Y_hat=reg.predict(X_hat)

In [75]:
# Work on a copy so adding the prediction column does not touch df_analyse.
LR_inspect= df_analyse [['date','France']].copy()

In [76]:
# The regression ran on log-values, so exp() brings the prediction back to the original scale.
LR_inspect['prediction']=np.exp(Y_hat)

In [77]:
# Compare France with the exponential prediction on a linear y axis.
quick_plot(LR_inspect.date,
          LR_inspect.iloc[:,1:],
          y_scale='linear',
          slider=True)

# Doubling Rate - Piecewise Linear Regression

In [78]:
# linear_model and signal are imported in the notebook's import cell at the top.
# Same setup as in the previous section, but this estimator fits an intercept.
# In top-to-bottom execution order this rebinding of `reg` is the estimator
# that get_doubling_rate_via_regression picks up further down.
reg = linear_model.LinearRegression(fit_intercept=True)

l_vec=len(df_analyse['France'])
x=np.arange(l_vec-50).reshape(-1,1)
y=np.log(np.array(df_analyse['France'][50:]))

In [79]:
# Fit the log-values against the day index, this time including an intercept.
reg.fit(x,y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [80]:
# Fitted intercept (on the log scale, because y holds log-values).
reg.intercept_

10.711066249045231

In [81]:
# Fitted slope per day index (on the log scale); a one-element array.
reg.coef_

array([0.01294768])

In [82]:
# NOTE(review): this is slope / intercept of a log-scale fit, whereas
# get_doubling_rate_via_regression returns intercept / slope of a fit on the
# values themselves. Confirm which ratio is meant here.
reg.coef_/reg.intercept_

array([0.00120881])

In [83]:

# Reload the table from disk and rebuild the list of country columns.
df_analyse = pd.read_csv(
    '../data/processed/COVID_small_flat_table.csv',
    parse_dates=[0],
)
country_list = df_analyse.columns[1:]

In [84]:
# Add a smoothed '<country>_filter' column for every country using a
# Savitzky-Golay filter (window of 5 samples, polynomial order 1).
for each in country_list:
    df_analyse[each + '_filter'] = signal.savgol_filter(
        df_analyse[each],
        window_length=5,
        polyorder=1,
    )

In [85]:
# Subset of the smoothed columns used for the plot and the doubling-rate cells below.
filter_cols=['France_filter','United Kingdom_filter', 'China_filter', 'Canada_filter']

In [86]:
# Skip the first start_pos rows on both the x values and the plotted columns,
# so dates and values stay paired row by row.
start_pos=5
quick_plot(df_analyse.date[start_pos:],
           df_analyse[filter_cols].iloc[start_pos:,:], #['China','China_filter']
           y_scale='log',
           slider=True)

In [87]:
# First rows, now including the *_filter columns added above.
df_analyse.head()

Unnamed: 0,date,Australia,China,Denmark,Canada,France,United Kingdom,India,Australia_filter,China_filter,Denmark_filter,Canada_filter,France_filter,United Kingdom_filter,India_filter
0,2020-01-22,0,548,0,0,0,0,0,-0.8,355.0,0.0,-0.2,-0.2,0.0,0.0
1,2020-01-23,0,643,0,0,0,0,0,8.881784e-16,736.7,0.0,2.220446e-16,0.7,0.0,0.0
2,2020-01-24,0,920,0,0,2,0,0,0.8,1118.4,0.0,0.2,1.6,0.0,0.0
3,2020-01-25,0,1406,0,0,3,0,0,1.8,1584.2,0.0,0.4,2.2,0.0,0.0
4,2020-01-26,4,2075,0,1,3,0,0,2.8,2557.4,0.0,0.8,3.0,0.0,0.0


In [88]:
def get_doubling_rate_via_regression(in_array):
    '''Approximate a doubling time from three consecutive values via linear regression.

    The three values are regressed on x = -1, 0, 1 with the notebook-level
    estimator ``reg``, and the fitted intercept divided by the fitted slope
    is returned.

    Parameters
    ----------
    in_array : sequence of exactly three numbers
        For example one window of a ``rolling(window=3)`` call.

    Returns
    -------
    ``reg.intercept_ / reg.coef_`` - with a single feature ``reg.coef_`` is a
    one-element array, and so is the result.

    Raises
    ------
    ValueError
        If ``in_array`` does not hold exactly three values.
    '''
    y = np.array(in_array)

    # The x axis below has exactly three points, so the window must match.
    # The former `assert len((in_array)==3)` took the length of an element-wise
    # comparison and therefore could never fail.
    if len(y) != 3:
        raise ValueError(
            'get_doubling_rate_via_regression expects exactly 3 values, got %d' % len(y)
        )

    x = np.arange(-1, 2).reshape(-1, 1)

    # NOTE: uses and refits the shared notebook-level estimator `reg`; the
    # result depends on how `reg` was configured by the cell that created it.
    reg.fit(x, y)
    intercept = reg.intercept_
    slope = reg.coef_
    return intercept / slope

In [89]:
#df_analyse[country+'_DR']=df_analyse[country].rolling(window=3,min_periods=3).apply(get_rate_via_regression)

In [90]:
#quick_plot(df_analyse.date, df_analyse.iloc[40:,[6,7,8]], y_scale='linear')

In [91]:
#country_list=df_analyse.columns[1:]
#for each in country_list:
    
 #   df_analyse[each+'_DR']=df_analyse[each].rolling(window=3,min_periods=3).apply(get_rate_via_regression)

In [92]:
def doubling_time(in_array):
    '''Doubling time from the first and last value of a window (classical formula).

    T_d = elapsed_steps * ln(2) / ln(last / first)

    A window of n equally spaced samples spans n - 1 steps, so n - 1 (not n)
    is the elapsed time; e.g. [1, 2, 4] doubles every single step.

    Parameters
    ----------
    in_array : sequence of numbers, at least two, equally spaced in time

    Returns
    -------
    float
        Doubling time in units of the sample spacing; ``nan`` when it is
        undefined (fewer than two samples, or a non-positive first or last
        value) and ``inf`` when first and last value are equal (no growth).
    '''
    y = np.asarray(in_array, dtype=float)
    # A ratio is only meaningful between two strictly positive values.
    if len(y) < 2 or y[0] <= 0 or y[-1] <= 0:
        return np.nan
    log_growth = np.log(y[-1] / y[0])
    if log_growth == 0:
        return np.inf
    return (len(y) - 1) * np.log(2) / log_growth
    
    

In [93]:
# Estimate the doubling rate for every raw country column from a short rolling window.
# Only a few days are used at a time because the exponential assumption holds just locally.
days_back = 3  # window length; also gives a smoothing effect
for country in country_list:
    recent_windows = df_analyse[country].rolling(window=days_back, min_periods=days_back)
    df_analyse[country + '_DR'] = recent_windows.apply(get_doubling_rate_via_regression, raw=False)

In [94]:
# Same rolling estimate, now on the smoothed columns listed in filter_cols.
days_back = 3  # window length; also gives a smoothing effect
for country in filter_cols:
    recent_windows = df_analyse[country].rolling(window=days_back, min_periods=days_back)
    df_analyse[country + '_DR'] = recent_windows.apply(get_doubling_rate_via_regression, raw=False)

In [95]:
# Cross check: the same rolling window for Canada, evaluated with doubling_time
# instead of the regression helper.
canada_windows = df_analyse['Canada'].rolling(window=days_back, min_periods=days_back)
df_analyse['Canada_DR_math'] = canada_windows.apply(doubling_time, raw=False)

In [96]:
# run on all filtered data
# NOTE(review): this cell is identical to cell In [94] above and recomputes
# the same '<name>_filter_DR' columns; it looks redundant.
days_back = 3 # this gives a smoothing effect
for pos,country in enumerate(filter_cols):
    df_analyse[country+'_DR']=df_analyse[country].rolling(
                                window=days_back,
                                min_periods=days_back).apply(get_doubling_rate_via_regression, raw=False)

In [97]:
# Column overview; the positions listed here are what the iloc-based plot cells below index into.
df_analyse.columns

Index(['date', 'Australia', 'China', 'Denmark', 'Canada', 'France',
       'United Kingdom', 'India', 'Australia_filter', 'China_filter',
       'Denmark_filter', 'Canada_filter', 'France_filter',
       'United Kingdom_filter', 'India_filter', 'Australia_DR', 'China_DR',
       'Denmark_DR', 'Canada_DR', 'France_DR', 'United Kingdom_DR', 'India_DR',
       'France_filter_DR', 'United Kingdom_filter_DR', 'China_filter_DR',
       'Canada_filter_DR', 'Canada_DR_math'],
      dtype='object')

In [98]:
# Skip the first rows on BOTH axes so every value is drawn at its own date,
# as the filtered-data plot further up already does. Before, the full date
# column was paired with values that started at row 50.
# NOTE(review): positions 8-14 are the *_filter columns, not the *_DR columns;
# confirm this is the selection intended for this section.
start_pos = 50
quick_plot(df_analyse.date[start_pos:],
           df_analyse.iloc[start_pos:, [8, 9, 10, 11, 12, 13, 14]],
           y_scale='linear', slider=True)

In [99]:
# Slice the dates the same way as the values (from row 10) so they stay paired row by row.
# NOTE(review): positions 7-10 are India plus three *_filter columns; confirm the selection.
quick_plot(df_analyse.date[10:],
           df_analyse.iloc[10:, [7, 8, 9, 10]],
           y_scale='linear', slider=True)

In [100]:
# Slice the dates the same way as the values (from row 50) so they stay paired row by row.
quick_plot(df_analyse.date[50:],
           df_analyse.iloc[50:, 8:15],
           y_scale='linear', slider=True)

In [101]:
# Discard every column from position 8 onwards (the *_filter, *_DR and
# Canada_DR_math columns), leaving 'date' and the seven country columns.
df_analyse=df_analyse.drop(df_analyse.columns[8:], axis=1)

In [102]:
# Confirm the table is back to its original column layout.
df_analyse.sort_values('date', ascending= True).tail()

Unnamed: 0,date,Australia,China,Denmark,Canada,France,United Kingdom,India
229,2020-09-07,26373,90078,18540,134295,367174,352451,4280422
230,2020-09-08,26465,90087,18784,135757,373718,354932,4370128
231,2020-09-09,26524,90100,19036,136135,383292,357613,4465863
232,2020-09-10,26564,90127,19353,136956,392243,360544,4562414
233,2020-09-11,26607,90145,19646,137676,401890,364088,4659984
