In [1]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import os
import pandas as pd
import numpy as np

%matplotlib inline

mpl.rcParams['figure.figsize'] = (16, 10)
pd.set_option('display.max_rows', 500)
import plotly
import plotly.graph_objects as go

In [2]:
plotly.__version__

'4.6.0'

In [3]:
df_analyse=pd.read_csv('../data/processed/COVID_small_flat_table.csv',sep=';',
                       parse_dates=[0])  

df_analyse.sort_values('date',ascending=True).tail()

Unnamed: 0,date,Italy,US,Spain,Germany,Nepal,India
216,2020-08-25,261174,5777710,412553,237583,33533,3224547
217,2020-08-26,262540,5821819,419849,239010,34418,3310234
218,2020-08-27,263949,5867785,429507,240571,35529,3387500
219,2020-08-28,265409,5913941,439286,242126,36456,3463972
220,2020-08-29,266853,5961094,439286,242835,37340,3542733


In [4]:
def quick_plot(x_in, df_input,y_scale='log',slider=False):
    """ Quick basic plot for quick static evaluation of a time series
    
        you can push selective columns of your data frame by .iloc[:,[0,6,7,8]]
        
        Parameters:
        ----------
        x_in : array 
            array of date time object, or array of numbers
        df_input : pandas dataframe 
            the plotting matrix where each column is plotted
            the name of the column will be used for the legend
        scale: str
            y-axis scale as 'log' or 'linear'
        slider: bool
            True or False for x-axis slider
    
        
        Returns:
        ----------
        
    """
    fig = go.Figure()

    for each in df_input.columns:
        fig.add_trace(go.Scatter(
                        x=x_in,
                        y=df_input[each],
                        name=each,
                        opacity=0.8))
    
    fig.update_layout(autosize=True,
        width=1024,
        height=768,
        font=dict(
            family="PT Sans, monospace",
            size=18,
            color="#7f7f7f"
            )
        )
    fig.update_yaxes(type=y_scale),
    fig.update_xaxes(tickangle=-45,
                 nticks=20,
                 tickfont=dict(size=14,color="#7f7f7f")
                )
    if slider==True:
        fig.update_layout(xaxis_rangeslider_visible=True)
    fig.show(renderer="firefox")
#     fig.show()

In [5]:
quick_plot(df_analyse.date,
           df_analyse.iloc[:,1:],
           y_scale='linear',
           slider=True)

In [6]:
threshold=100

compare_list=[]
for pos,country in enumerate(df_analyse.columns[1:]):
    compare_list.append(np.array(df_analyse[country][df_analyse[country]>threshold]))

pd_sync_timelines=pd.DataFrame(compare_list,index=df_analyse.columns[1:]).T

pd_sync_timelines['date']=np.arange(pd_sync_timelines.shape[0])

pd_sync_timelines.head()



Unnamed: 0,Italy,US,Spain,Germany,Nepal,India,date
0,155.0,104.0,120.0,130.0,101.0,102.0,0
1,229.0,174.0,165.0,159.0,102.0,113.0,1
2,322.0,222.0,222.0,196.0,110.0,119.0,2
3,453.0,337.0,259.0,262.0,110.0,142.0,3
4,655.0,451.0,400.0,482.0,134.0,156.0,4


In [7]:
quick_plot(pd_sync_timelines.date,
           pd_sync_timelines.iloc[:,:-1],
           y_scale='log',
           slider=True)

## Doubling rate

In [8]:
def doubling_rate(N_0,t,T_d):
    return N_0*np.power(2,t/T_d)

In [9]:
max_days=180

norm_slopes={
    #'doubling every day':doubling_rate(100,np.arange(10),1),
    'doubling every two days':doubling_rate(100,np.arange(max_days),2),
    'doubling every 4 days':doubling_rate(100,np.arange(max_days),4),
    'doubling every 10 days':doubling_rate(100,np.arange(max_days),10),
}

In [10]:
pd_sync_timelines_w_slope=pd.concat([pd.DataFrame(norm_slopes),pd_sync_timelines], axis=1)

In [11]:
quick_plot(pd_sync_timelines_w_slope.date,
           pd_sync_timelines_w_slope.iloc[:,0:5],
           y_scale='log',
           slider=True)

In [12]:
pd_sync_timelines_w_slope.to_csv('../data/processed/COVID_small_sync_timeline_table.csv',sep=';',index=False)

## Linear regression

In [13]:
from sklearn import linear_model
reg = linear_model.LinearRegression(fit_intercept=False)


In [14]:
l_vec=len(df_analyse['Nepal'])
X=np.arange(l_vec-5).reshape(-1, 1)
y=np.log(np.array(df_analyse['Nepal'][5:]))

# do a simple regression on all data

reg.fit(X,y)

X_hat=np.arange(l_vec).reshape(-1, 1)
Y_hat=reg.predict(X_hat)

LR_inspect=df_analyse[['date','Nepal']].copy()

LR_inspect['prediction']=np.exp(Y_hat)

In [15]:
quick_plot(LR_inspect.date,
           LR_inspect.iloc[:,1:],
           y_scale='log',
           slider=True)

## Piecewise Linear Regression Doubling rate

In [16]:
from sklearn import linear_model
from scipy import signal

reg = linear_model.LinearRegression(fit_intercept=True)

df_analyse=pd.read_csv('../data/processed/COVID_small_flat_table.csv',sep=';',
                       parse_dates=[0])  

country_list=df_analyse.columns[1:]




In [17]:
country_list

Index(['Italy', 'US', 'Spain', 'Germany', 'Nepal', 'India'], dtype='object')

In [18]:
## filter data

for each in country_list:
    df_analyse[each+'_filter']=signal.savgol_filter(df_analyse[each],
                           5, # window size used for filtering
                           1) # order of fitted polynomial

In [19]:
# filter_cols=['Italy_filter','US_filter', 'Spain_filter', 'Germany_filter']

filter_cols=df_analyse.columns[len(country_list)+1:]  #after date and country list, remaining are filters


In [20]:
filter_cols

Index(['Italy_filter', 'US_filter', 'Spain_filter', 'Germany_filter',
       'Nepal_filter', 'India_filter'],
      dtype='object')

In [21]:
df_analyse.head()

Unnamed: 0,date,Italy,US,Spain,Germany,Nepal,India,Italy_filter,US_filter,Spain_filter,Germany_filter,Nepal_filter,India_filter
0,2020-01-22,0,1,0,0,0,0,0.0,0.4,0.0,0.0,-0.2,0.0
1,2020-01-23,0,1,0,0,0,0,0.0,1.3,0.0,0.0,0.1,0.0
2,2020-01-24,0,2,0,0,0,0,0.0,2.2,0.0,0.0,0.4,0.0
3,2020-01-25,0,2,0,0,1,0,0.0,3.0,0.0,0.2,0.6,0.0
4,2020-01-26,0,5,0,0,1,0,0.0,3.8,0.0,1.0,0.8,0.0


In [22]:
df_analyse[filter_cols].columns

Index(['Italy_filter', 'US_filter', 'Spain_filter', 'Germany_filter',
       'Nepal_filter', 'India_filter'],
      dtype='object')

In [23]:
df_analyse[filter_cols].iloc[3:5,3:5]

Unnamed: 0,Germany_filter,Nepal_filter
3,0.2,0.6
4,1.0,0.8


In [24]:
start_pos=5
quick_plot(df_analyse.date[start_pos:],
           df_analyse[filter_cols].iloc[start_pos:,:], #['US','US_filter']
           y_scale='log',
           slider=True)

In [25]:
def get_doubling_time_via_regression(in_array):
    ''' Use a linear regression to approximate the doubling rate'''

    y = np.array(in_array)
    X = np.arange(-1,2).reshape(-1, 1)

    assert len(in_array)==3
    reg.fit(X,y)
    intercept=reg.intercept_
    slope=reg.coef_
    return intercept/slope


In [26]:
def doubling_time(in_array):
    ''' Use a classical doubling time formular, 
     see https://en.wikipedia.org/wiki/Doubling_time '''
    y = np.array(in_array)
    return len(y)*np.log(2)/np.log(y[-1]/y[0])

In [27]:
# calculate slope of regression of last x days
# use always a limited number of days to approximate the triangle, attention exponential base assumption
days_back = 3 # this gives a smoothing effect
for pos,country in enumerate(country_list):
    df_analyse[country+'_DR']=df_analyse[country].rolling(
                                window=days_back,
                                min_periods=days_back).apply(get_doubling_time_via_regression, raw=False)

In [28]:
# run on all filtered data
days_back = 3 # this gives a smoothing effect
for pos,country in enumerate(filter_cols):
    df_analyse[country+'_DR']=df_analyse[country].rolling(
                                window=days_back,
                                min_periods=days_back).apply(get_doubling_time_via_regression, raw=False)

In [29]:
# cross check the matematical 
df_analyse['Germany_DR_math']=df_analyse['Germany'].rolling(
                                window=days_back,
                                min_periods=days_back).apply(doubling_time, raw=False)

In [30]:
# run on all filtered data
days_back = 3 # this gives a smoothing effect
for pos,country in enumerate(filter_cols):
    df_analyse[country+'_DR']=df_analyse[country].rolling(
                                window=days_back,
                                min_periods=days_back).apply(get_doubling_time_via_regression, raw=False)

In [31]:
df_analyse

Unnamed: 0,date,Italy,US,Spain,Germany,Nepal,India,Italy_filter,US_filter,Spain_filter,...,Germany_DR,Nepal_DR,India_DR,Italy_filter_DR,US_filter_DR,Spain_filter_DR,Germany_filter_DR,Nepal_filter_DR,India_filter_DR,Germany_DR_math
0,2020-01-22,0,1,0,0,0,0,0.0,0.4,0.0,...,,,,,,,,,,
1,2020-01-23,0,1,0,0,0,0,0.0,1.3,0.0,...,,,,,,,,,,
2,2020-01-24,0,2,0,0,0,0,0.0,2.2,0.0,...,,,,,1.444444,,,0.3333333,,
3,2020-01-25,0,2,0,0,1,0,0.0,3.0,0.0,...,,0.666667,,,2.54902,,0.6666667,1.466667,,
4,2020-01-26,0,5,0,0,1,0,0.0,3.8,0.0,...,,1.333333,,,3.75,,0.8,3.0,,
5,2020-01-27,0,5,0,1,1,0,0.0,4.4,0.0,...,0.666667,inf,,,5.333333,,1.25,4.0,,0.0
6,2020-01-28,0,5,0,4,1,0,0.0,5.0,0.0,...,0.833333,inf,,,7.333333,,2.25,9.333333,0.666667,0.0
7,2020-01-29,0,5,0,4,1,0,0.4,5.4,0.0,...,2.0,inf,,0.6666667,9.866667,,2.962963,3.8388330000000004e+31,1.0,1.5
8,2020-01-30,0,5,0,4,1,1,0.8,6.0,0.2,...,inf,inf,0.666667,1.0,10.93333,0.6666667,3.111111,3.8388330000000004e+31,2.0,inf
9,2020-01-31,2,7,0,5,1,1,1.2,6.6,0.4,...,8.666667,inf,1.333333,2.0,10.0,1.0,3.794872,3.8388330000000004e+31,2.222222,9.318851


In [39]:
start_pos=70
quick_plot(df_analyse.date[start_pos:],
           df_analyse.iloc[start_pos:,[13,14,15,16,17,18]], #
           y_scale='linear',
           slider=True)

In [40]:
start_pos=70
quick_plot(df_analyse.date[start_pos:],
           df_analyse.iloc[start_pos:,[19,20,21,22,23,24]], #
           y_scale='linear',
           slider=True)