# Build lag features for Covid-19 incidence regression

_Akin Kazakci, MINES ParisTech, PSL University_

Input:
- incidences_merged (Notebook 6_merge_csvs)

Output:
- data.csv

To predict daily Covid-19 numbers per department, we will later use movement variables observed in the past 2 to 3 weeks. Here we will build a dataframe where each line corresponds to daily numbers and the corresponding history (minus 15 to minus 21 days, for movement related information and last weeks Covid-19 numbers). 

In [1]:
import os
import json
import numpy as np
import pandas as pd
%matplotlib inline

from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error
from sklearn import linear_model

## Read Data

In [2]:
data = pd.read_csv('incidences_merged.csv', index_col = 0)
data.head(3)

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,code_insee,date,incid_hosp,incid_rea,incid_dc,incid_rad,c_incidence,c_reanim,c_deces,c_rad,...,degree_centrality-16_0,eigenvector_centrality-16_0,betweenness_centrality-0_8,closeness_centrality-0_8,degree_centrality-0_8,eigenvector_centrality-0_8,betweenness_centrality-8_16,closeness_centrality-8_16,degree_centrality-8_16,eigenvector_centrality-8_16
0,1,2020-03-19,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.051282,6e-06,0.075756,0.212346,0.05814,0.009571,0.043078,0.165428,0.044944,9.2e-05
1,1,2020-03-19,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.051282,6e-06,0.075756,0.212346,0.05814,0.009571,0.043078,0.165428,0.044944,9.2e-05
2,1,2020-03-19,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.051282,6e-06,0.075756,0.212346,0.05814,0.009571,0.043078,0.165428,0.044944,9.2e-05


In [3]:
data.columns

Index(['code_insee', 'date', 'incid_hosp', 'incid_rea', 'incid_dc',
       'incid_rad', 'c_incidence', 'c_reanim', 'c_deces', 'c_rad',
       'intra_mean_indegree-16_0', 'intra_node_count-16_0', 'intra_move-16_0',
       'intra_move_base-16_0', 'intra_move_diff-16_0',
       'intra_end_density-16_0', 'intra_end_dens_base-16_0',
       'intra_start_density-16_0', 'intra_start_dens_base-16_0',
       'intra_length_km-16_0', 'intra_density_weighted_movement-16_0',
       'intra_mean_indegree-0_8', 'intra_node_count-0_8', 'intra_move-0_8',
       'intra_move_base-0_8', 'intra_move_diff-0_8', 'intra_end_density-0_8',
       'intra_end_dens_base-0_8', 'intra_start_density-0_8',
       'intra_start_dens_base-0_8', 'intra_length_km-0_8',
       'intra_density_weighted_movement-0_8', 'intra_mean_indegree-8_16',
       'intra_node_count-8_16', 'intra_move-8_16', 'intra_move_base-8_16',
       'intra_move_diff-8_16', 'intra_end_density-8_16',
       'intra_end_dens_base-8_16', 'intra_start_densit

# Build lag features

We shall assume that anything we observe today is the result of -15 to -21 movements' result (incubation + apparition of symptoms that require going to hospital)

In [54]:
# Variables I built using FB data (intra, inter and graph features)
lags = range(15,21)
cols = ['intra_mean_indegree-16_0', 'intra_node_count-16_0', 'intra_move-16_0',
       'intra_move_base-16_0', 'intra_move_diff-16_0',
       'intra_end_density-16_0', 'intra_end_dens_base-16_0',
       'intra_start_density-16_0', 'intra_start_dens_base-16_0',
       'intra_length_km-16_0', 'intra_density_weighted_movement-16_0',
       'intra_mean_indegree-0_8', 'intra_node_count-0_8', 'intra_move-0_8',
       'intra_move_base-0_8', 'intra_move_diff-0_8', 'intra_end_density-0_8',
       'intra_end_dens_base-0_8', 'intra_start_density-0_8',
       'intra_start_dens_base-0_8', 'intra_length_km-0_8',
       'intra_density_weighted_movement-0_8', 'intra_mean_indegree-8_16',
       'intra_node_count-8_16', 'intra_move-8_16', 'intra_move_base-8_16',
       'intra_move_diff-8_16', 'intra_end_density-8_16',
       'intra_end_dens_base-8_16', 'intra_start_density-8_16',
       'intra_start_dens_base-8_16', 'intra_length_km-8_16',
       'intra_density_weighted_movement-8_16', 'length_km',
       'movement', 'movement_baseline', 'movement_difference',
       'movement_percent_change', 'density_weighted_movement', 
       'betweenness_centrality-16_0', 'closeness_centrality-16_0',
       'degree_centrality-16_0', 'eigenvector_centrality-16_0',
       'betweenness_centrality-0_8', 'closeness_centrality-0_8',
       'degree_centrality-0_8', 'eigenvector_centrality-0_8',
       'betweenness_centrality-8_16', 'closeness_centrality-8_16',
       'degree_centrality-8_16', 'eigenvector_centrality-8_16']
        

       
dro = data.copy()
dro = dro.assign(**{
    '{} (t-{})'.format(col, t): dro[col].shift(t)
    for t in lags
    for col in cols
})


In [55]:
dro.head(3)

Unnamed: 0,code_insee,date,incid_hosp,incid_rea,incid_dc,incid_rad,c_incidence,c_reanim,c_deces,c_rad,...,degree_centrality-16_0 (t-20),eigenvector_centrality-16_0 (t-20),betweenness_centrality-0_8 (t-20),closeness_centrality-0_8 (t-20),degree_centrality-0_8 (t-20),eigenvector_centrality-0_8 (t-20),betweenness_centrality-8_16 (t-20),closeness_centrality-8_16 (t-20),degree_centrality-8_16 (t-20),eigenvector_centrality-8_16 (t-20)
0,1,2020-03-19,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,,,,,,,,,,
1,1,2020-03-19,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,,,,,,,,,,
2,1,2020-03-19,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,,,,,,,,,,


We shall augment this lagged dataset with lags from the prediction target. We take the last 6 days observation, which will also serve us to build a baseline.

In [56]:
lags = range(1,7)
cols = ['incid_hosp']

dro2 = data.copy()
dro2 = dro.assign(**{
    '{} (t-{})'.format(col, t): dro[col].shift(t)
    for t in lags
    for col in cols
})

In [57]:
dro2.columns

Index(['code_insee', 'date', 'incid_hosp', 'incid_rea', 'incid_dc',
       'incid_rad', 'c_incidence', 'c_reanim', 'c_deces', 'c_rad',
       ...
       'betweenness_centrality-8_16 (t-20)',
       'closeness_centrality-8_16 (t-20)', 'degree_centrality-8_16 (t-20)',
       'eigenvector_centrality-8_16 (t-20)', 'incid_hosp (t-1)',
       'incid_hosp (t-2)', 'incid_hosp (t-3)', 'incid_hosp (t-4)',
       'incid_hosp (t-5)', 'incid_hosp (t-6)'],
      dtype='object', length=387)

We shall drop today's movement related features as they do not affect today's observed number of patients. We shall also drop some Covid-19 incidence data (keeping only our main prediction target 'incid_hosp'

In [58]:
#dr = dro2.iloc[:,63:]
#dr.columns

In [59]:
#dro.dropna(axis = 0, inplace = True)
exclude = ['incid_rea',
 'incid_dc',
 'incid_rad',
 'c_incidence',
 'c_reanim',
 'c_deces',
 'c_rad',
 'intra_mean_indegree-16_0',
 'intra_node_count-16_0',
 'intra_move-16_0',
 'intra_move_base-16_0',
 'intra_move_diff-16_0',
 'intra_end_density-16_0',
 'intra_end_dens_base-16_0',
 'intra_start_density-16_0',
 'intra_start_dens_base-16_0',
 'intra_length_km-16_0',
 'intra_density_weighted_movement-16_0',
 'intra_mean_indegree-0_8',
 'intra_node_count-0_8',
 'intra_move-0_8',
 'intra_move_base-0_8',
 'intra_move_diff-0_8',
 'intra_end_density-0_8',
 'intra_end_dens_base-0_8',
 'intra_start_density-0_8',
 'intra_start_dens_base-0_8',
 'intra_length_km-0_8',
 'intra_density_weighted_movement-0_8',
 'intra_mean_indegree-8_16',
 'intra_node_count-8_16',
 'intra_move-8_16',
 'intra_move_base-8_16',
 'intra_move_diff-8_16',
 'intra_end_density-8_16',
 'intra_end_dens_base-8_16',
 'intra_start_density-8_16',
 'intra_start_dens_base-8_16',
 'intra_length_km-8_16',
 'intra_density_weighted_movement-8_16',
 'length_km-16_0',
 'movement-16_0',
 'movement_baseline-16_0',
 'movement_difference-16_0',
 'movement_percent_change-16_0',
 'density_weighted_movement-16_0',
 'length_km-0_8',
 'movement-0_8',
 'movement_baseline-0_8',
 'movement_difference-0_8',
 'movement_percent_change-0_8',
 'density_weighted_movement-0_8',
 'length_km-8_16',
 'movement-8_16',
 'movement_baseline-8_16',
 'movement_difference-8_16',
 'movement_percent_change-8_16',
 'density_weighted_movement-8_16',
 'betweenness_centrality-16_0',
 'closeness_centrality-16_0',
 'degree_centrality-16_0',
 'eigenvector_centrality-16_0',
 'betweenness_centrality-0_8',
 'closeness_centrality-0_8',
 'degree_centrality-0_8',
 'eigenvector_centrality-0_8',
 'betweenness_centrality-8_16',
 'closeness_centrality-8_16',
 'degree_centrality-8_16',
 'eigenvector_centrality-8_16']
print(dro2.shape)



(15302, 387)


In [60]:
mask = ~dro2.columns.isin(exclude)
dro2 = dro2.loc[:,mask].dropna(axis = 0)

In [65]:
dro2.sort_values('date', inplace=True)
dro2.head(3)

Unnamed: 0,code_insee,date,incid_hosp,date_time,length_km,movement,movement_baseline,movement_difference,movement_percent_change,density_weighted_movement,...,betweenness_centrality-8_16 (t-20),closeness_centrality-8_16 (t-20),degree_centrality-8_16 (t-20),eigenvector_centrality-8_16 (t-20),incid_hosp (t-1),incid_hosp (t-2),incid_hosp (t-3),incid_hosp (t-4),incid_hosp (t-5),incid_hosp (t-6)
11408,85,2020-03-04,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.279643,0.530751,0.306452,0.33674,0.0,0.0,0.0,0.0,0.0,0.0
11335,54,2020-03-04,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11405,82,2020-03-04,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.232697,0.582695,0.379121,0.332802,0.0,0.0,0.0,0.0,0.0,0.0


In [66]:
dro2.to_csv('data.csv')