# Regression


_Akin Kazakci, MINES ParisTech, PSL University_

In [3]:
import pandas as pd
import numpy as np
from numpy import loadtxt
from numpy import sort

import seaborn as sns
import matplotlib.pyplot as plt
import os

from sklearn.datasets import load_boston
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LassoCV
from sklearn.model_selection import TimeSeriesSplit
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error
import warnings
warnings.filterwarnings('ignore')
import pickle

sns.set(style="white")

**Following are utility functions (mainly data preparation & feature generation). Scroll down to models & experiments for regression.**

In [4]:
def data_prep(columns_to_keep, cols_to_lag = None, incid_lag = False, incid_lag_start = 7, incid_lag_end = 1,
              col_lag_start = 21, col_lag_end = 14):
    # get influx data, regroup by department and date
    influx = pd.read_csv('influx_to_dept_by_date.csv', index_col = 0)
    influx = influx.groupby(['code_insee','date'], as_index = False).agg(
        {
            'movement':'sum',
            'length_km':'sum',
            'movement_baseline':'sum', 
            'movement_difference':'sum',
            'movement_percent_change':'sum', 
            'density_weighted_movement':'sum'

        }
    )
    #get incidence data
    incid = pd.read_csv('incidence.csv', index_col = 0)
    
    #get graph stats data
    stat = pd.read_csv('graph_stats_dept_by_date.csv',index_col = 0)
    
    #merge
    incid = pd.merge(incid, stat, how='left', on = ['code_insee','date'])
    data = pd.merge(incid, influx, how='left', on = ['code_insee','date'])
    
    #print ('cols after merge',data.columns)
    #drop non target columns
    data = data.drop(['incid_rea', 'incid_dc', 'incid_rad',
           'c_incidence', 'c_reanim', 'c_deces', 'c_rad'], axis = 1)
    #print(data.columns)
    #sort by date
    data = data.sort_values('date')
    
    # only keep columns specified for this experiment 
    #(assumes we have 'date','code_insee', 'date_time', 'time' in columns)
    # those are droppd by split_for_test later on
    data = data[columns_to_keep]
    
    # add or not, autoregressive variables on target
    if incid_lag:
        data = prep_lags(data, 'code_insee', ['incid_hosp'],incid_lag_start, incid_lag_end)
        #lags = range(incid_lag_end,incid_lag_start)
        #co = ['incid_hosp']
        #data.assign(**{
        #    '{} (t-{})'.format(col, t): data.groupby('code_insee')[col].shift(t)
        #    for t in lags
        #    for col in co
        #})
        #print('after incid', data.columns)
        # drop rows with NaNs
        data = data.dropna()
      
    else:
        pass
    
    if cols_to_lag is None:
        pass
    else:
        data = prep_lags(data,'code_insee', cols_to_lag, col_lag_start, col_lag_end )
        
    return data

In [5]:
def prep_lags(data, key, cols, lag_start, lag_end):
    lags = range(lag_end,lag_start+1)
    
    data = data.sort_values('date')
    #print('prep_lags sort',data.columns)
    # OLD CODE, DO NOT USE
    #data = data.assign(**{
    #    '{} (t-{})'.format(col, t): modified_shift(data, 'code_insee', col, t)
    #    for t in lags
    #    for col in cols
    #})
    for c in cols:
        for lag in lags:
            #print('c',c,'lag',lag)
            data = modified_shift(data, key, c, lag)
            #print(c, lag, data.columns)
    
    
    # drop rows with NaNs
    data = data.dropna()
    # drop original columns that are now lagged
    if cols != ['incid_hosp']:
        data = data.drop(cols, axis =1)
    #for c in data.columns:
        #print(c)
    
    return data

In [6]:
def modified_shift(data, key, c, lag):

   # data.assign(**{
   #     '{} (t-{})'.format(col, lag): data.groupby('key')[col].shift(lag)
   # })
    duc = data.groupby(key)[c].shift(lag)
    duc.rename(str(c)+ '(t-'+str(lag)+')', inplace = True)
   
    #print('duc',duc.head(2))
    data = pd.concat([
        data,
        duc
    ], axis=1)
    return data

In [7]:
#OLD, DO _NOT_ USE
def data_shift(dat, cols, lag):
    
    #plain old shift, for specified columns and lag
    #this needs to be changed if I want to have several shifts for same variables
    dat[cols] = dat[cols].shift(lag)
    dat = dat.dropna()    
    return dat

In [8]:
def split_for_test(data, split_date):
   
    dataset = data.sort_values('date')
    #print('split_for_test sort',dataset.columns)
    train, val = dataset[dataset.date <= split_date], dataset[dataset.date > split_date]
    train_with_codes, val_with_codes = train, val 
    # once the split for the split_date has been made, I can drop date
    # but also all the other columns the predictor won't take into account
    train.drop(['date','code_insee'], inplace = True, axis = 1)
    val.drop(['date','code_insee'], inplace  = True, axis =1)
    
    return train, val, train_with_codes, val_with_codes

In [18]:
def expe(model_name, columns_to_keep, cols_to_lag, incid_lag = False, incid_lag_start = 7, incid_lag_end = 1,
              col_lag_start = 21, col_lag_end = 14, depth = 3):
    
    data = data_prep(columns_to_keep, cols_to_lag, incid_lag , incid_lag_start, incid_lag_end ,
              col_lag_start, col_lag_end)
    
    #print(data.columns)
    #if you want to change features do here  
    #data['incid_hosp'] = np.exp(data['incid_hosp'])
   
    
    train, val, train_w_c, val_w_c = split_for_test(data,split_date = '2020-04-20' )
    #print(train.columns)

    #split data into X and y
    X_train, y_train = train.drop('incid_hosp', axis = 1), train['incid_hosp']
    X_test, y_test = val.drop('incid_hosp', axis = 1), val['incid_hosp']

    # split data into train and test sets
    tscv = TimeSeriesSplit(10)

    # fit model on all training data
    model =  XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, eval_metric='mae',
             gamma=0.5, importance_type='gain', learning_rate=0.03,
             max_delta_step=0, max_depth=3, min_child_weight=1, 
             n_estimators=1000, n_jobs=-1, nthread=None, objective='count:poisson',
             random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
             seed=None, silent=True, subsample=1.0, verbosity=1, cv = tscv)
    model.fit(X_train, y_train)

    # make predictions for test data and evaluate
    predictions = model.predict(X_test)
    MAE = mean_absolute_error(y_test, predictions)
    
    # print score and other params
    if cols_to_lag is None:
        print("MAE: %.2f" % (MAE), 'train shape: ',train.shape)
    else:
        print("MAE: %.2f" % (MAE), 'train shape: ',train.shape, 'mov window: ',col_lag_start, col_lag_end )

    # get important features
    importances = (model.feature_importances_)
    selection = SelectFromModel(model, prefit=True)
    if model_name is not None:
        path = 'models/'
        data_path = 'data/expe_data'
        pickle.dump(model, open(os.path.join(path,model_name), "wb"))
        data.to_csv(os.path.join(data_path,'data_for_'+model_name+'.csv'))
        val_w_c.to_csv(os.path.join(data_path,'test_data_for_'+model_name+'.csv'))
      
    params = selection.get_params()

    support = selection.get_support()
    features = X_train.columns[support]

    #alternative dico for score charting

    dico = {'columns_to_keep':columns_to_keep,
            'cols_to_lag': cols_to_lag,
            'incid_lag': incid_lag,
            'incid_lag_start':incid_lag_start,
            'incid_lag_end':incid_lag_end ,
            'col_lag_start':col_lag_start , 
            'col_lag_end':col_lag_end , 
            'depth':depth , 
            'MAE':MAE,
            'features':features,
            'model_input_cols':X_train.columns,
            'model_parameters': params,
            'model_name':model_name}
    return dico
    

# Models & Experiments

In [22]:
#will keep results of experiments here:
results = [] 

## Baseline

The baseline only takes autoregressive varibles into account. Other specified parameters are not taken into account. 

In [20]:
cols = ['incid_hosp', 'date','code_insee'] # just the target column but with incid_past = True on expe(), so we have 6 days of history
cols_to_lag = None

dico = expe('exp_diff_baseline_auto_incid_7_mob14-21.pickle', cols, cols_to_lag ,incid_lag = True, incid_lag_start = 7,
            incid_lag_end = 1, col_lag_start = 21, col_lag_end = 14)
#print(dico)
results.append(dico)

MAE: 5.29 train shape:  (2626, 8)


Without much optimisation (simple trees, max_dept = 5, no grid search, etc), we have MAE between around 5 to 6. To give some perspective, this means ~500-600 errors per day, as we will predict on ~100 departments.

In [31]:
# change depth for auto-regressive features; score improves with more history, even though samples diminish
cols = ['incid_hosp', 'date','code_insee'] # just the target column but with incid_past = True on expe(), so we have 6 days of history
cols_to_lag = None

for i in range(2,25):
    dico = expe(None, cols, cols_to_lag ,incid_lag = True, incid_lag_start = i,
                incid_lag_end = 1, col_lag_start = 21, col_lag_end = 14)
#print(dico)
    results.append(dico)

MAE: 5.74 train shape:  (3131, 3)
MAE: 5.69 train shape:  (3030, 4)
MAE: 5.59 train shape:  (2929, 5)
MAE: 5.54 train shape:  (2828, 6)
MAE: 5.48 train shape:  (2727, 7)
MAE: 5.27 train shape:  (2626, 8)
MAE: 5.19 train shape:  (2525, 9)
MAE: 5.19 train shape:  (2424, 10)
MAE: 5.09 train shape:  (2323, 11)
MAE: 5.14 train shape:  (2222, 12)
MAE: 5.12 train shape:  (2121, 13)
MAE: 5.05 train shape:  (2020, 14)
MAE: 4.99 train shape:  (1919, 15)
MAE: 4.92 train shape:  (1818, 16)
MAE: 5.05 train shape:  (1717, 17)
MAE: 5.00 train shape:  (1616, 18)
MAE: 5.02 train shape:  (1515, 19)
MAE: 5.01 train shape:  (1414, 20)
MAE: 4.89 train shape:  (1313, 21)
MAE: 4.97 train shape:  (1212, 22)
MAE: 4.94 train shape:  (1111, 23)
MAE: 5.21 train shape:  (1010, 24)
MAE: 4.91 train shape:  (909, 25)


# Auto-regressive features with movement features

A critical point is the selection of the time window where we shall get the movement features from. Given the 5-6 days incubation time for Covid-19, and 10-14 days before patients go to hospitals (all of these statistics should be checked), a window of -21 to -14 days seems reasonable to include in the model. 

However, as the size of the windows increases the number of (lag) features multiply quickly, hence, the performance of the estimator decrease.

10-9 and 12-10 seems to be good windows for movement features. But the choice is tricky because of the window size issue. For ex, for 21-14 the MAE is 7.56 with train shape (313, 62). For 10-9, MAE is 6.4 with only 26 columns. (In all these models we have xgboost trees of depth 3, 1000 estimators and col_sample=0.1)

Still, the 10-9 window seems like a workable solution given the error rate and the objective of having a 'good enoughsimulator using movement data'

In [32]:
cols = ['incid_hosp', 
       'length_km', 'movement', 'movement_baseline', 'movement_difference',
       'movement_percent_change', 'density_weighted_movement','date','code_insee']

cols_to_lag = ['length_km', 'movement', 'movement_baseline', 'movement_difference',
       'movement_percent_change', 'density_weighted_movement']

for i in range(10,22):
    for j in range(1, 8):
        #print(i,j)
        dico = expe('auto_mov_incid_7_mob{}-{}.pickle'.format(i-j,i),cols, cols_to_lag ,incid_lag = True, incid_lag_start = 7,
            incid_lag_end = 1, col_lag_start = i, col_lag_end = i-j)
    
results.append(dico)

MAE: 6.05 train shape:  (1242, 20) mov window:  10 9
MAE: 6.21 train shape:  (1242, 26) mov window:  10 8
MAE: 6.32 train shape:  (1242, 32) mov window:  10 7
MAE: 6.35 train shape:  (1242, 38) mov window:  10 6
MAE: 6.27 train shape:  (1242, 44) mov window:  10 5
MAE: 6.30 train shape:  (1242, 50) mov window:  10 4
MAE: 6.72 train shape:  (1242, 56) mov window:  10 3
MAE: 6.15 train shape:  (1154, 20) mov window:  11 10
MAE: 6.19 train shape:  (1154, 26) mov window:  11 9
MAE: 6.17 train shape:  (1154, 32) mov window:  11 8
MAE: 6.19 train shape:  (1154, 38) mov window:  11 7
MAE: 6.11 train shape:  (1154, 44) mov window:  11 6
MAE: 6.11 train shape:  (1154, 50) mov window:  11 5
MAE: 6.23 train shape:  (1154, 56) mov window:  11 4
MAE: 6.17 train shape:  (1067, 20) mov window:  12 11
MAE: 6.27 train shape:  (1067, 26) mov window:  12 10
MAE: 6.24 train shape:  (1067, 32) mov window:  12 9
MAE: 6.34 train shape:  (1067, 38) mov window:  12 8
MAE: 6.38 train shape:  (1067, 44) mov wind

# Graph, movement and auto-regressive features

Would the graph features we built would improve the scores? Note that again, 12-10 gives the best score. Also, 19-17 is a close by. This might be due to a "weekly cycle" effect.

In [25]:
cols = ['incid_hosp', 
       'betweenness_centrality-16_0', 'closeness_centrality-16_0',
       'degree_centrality-16_0', 'eigenvector_centrality-16_0',
       'betweenness_centrality-0_8', 'closeness_centrality-0_8',
       'degree_centrality-0_8', 'eigenvector_centrality-0_8',
       'betweenness_centrality-8_16', 'closeness_centrality-8_16',
       'degree_centrality-8_16', 'eigenvector_centrality-8_16', 
       'length_km', 'movement', 'movement_baseline', 'movement_difference',
       'movement_percent_change', 'density_weighted_movement','date','code_insee']

cols_to_lag = ['betweenness_centrality-16_0', 'closeness_centrality-16_0',
               'degree_centrality-16_0', 'eigenvector_centrality-16_0',
               'betweenness_centrality-0_8', 'closeness_centrality-0_8',
               'degree_centrality-0_8', 'eigenvector_centrality-0_8',
               'betweenness_centrality-8_16', 'closeness_centrality-8_16',
               'degree_centrality-8_16', 'eigenvector_centrality-8_16','length_km', 'movement', 'movement_baseline', 'movement_difference',
       'movement_percent_change', 'density_weighted_movement',]

for i in range(10,22):
    for j in range(1, 8):
        dico = expe('obj_count_auto_mov_gr_incid_7_mob{}-{}.pickle'.format(i-j,i),cols, cols_to_lag ,incid_lag = True, incid_lag_start = 7,
            incid_lag_end = 1, col_lag_start = i, col_lag_end = i-j)
    
        results.append(dico)

MAE: 6.70 train shape:  (1026, 44) mov window:  10 9
MAE: 6.68 train shape:  (1026, 62) mov window:  10 8
MAE: 8.14 train shape:  (1026, 80) mov window:  10 7
MAE: 8.13 train shape:  (1026, 98) mov window:  10 6
MAE: 8.01 train shape:  (1026, 116) mov window:  10 5
MAE: 8.09 train shape:  (1026, 134) mov window:  10 4
MAE: 8.06 train shape:  (1026, 152) mov window:  10 3
MAE: 6.75 train shape:  (944, 44) mov window:  11 10
MAE: 6.99 train shape:  (944, 62) mov window:  11 9
MAE: 7.20 train shape:  (944, 80) mov window:  11 8
MAE: 8.51 train shape:  (944, 98) mov window:  11 7
MAE: 10.93 train shape:  (944, 116) mov window:  11 6
MAE: 10.87 train shape:  (944, 134) mov window:  11 5
MAE: 11.08 train shape:  (944, 152) mov window:  11 4
MAE: 6.72 train shape:  (864, 44) mov window:  12 11
MAE: 6.72 train shape:  (864, 62) mov window:  12 10
MAE: 6.59 train shape:  (864, 80) mov window:  12 9
MAE: 8.12 train shape:  (864, 98) mov window:  12 8
MAE: 8.30 train shape:  (864, 116) mov window

## Autoregressive features with graph features 

This time without original movement features, but only graph based features we built. On average, the results are slightly better than when we also have movement features. This might be due to the 'number of columns' issue mentioned. Nevertheless, graph features seems to be useful.

In [34]:
cols = ['incid_hosp','betweenness_centrality-16_0', 'closeness_centrality-16_0',
               'degree_centrality-16_0', 'eigenvector_centrality-16_0',
               'betweenness_centrality-0_8', 'closeness_centrality-0_8',
               'degree_centrality-0_8', 'eigenvector_centrality-0_8',
               'betweenness_centrality-8_16', 'closeness_centrality-8_16',
               'degree_centrality-8_16', 'eigenvector_centrality-8_16', 'date','code_insee'
       ]



cols_to_lag = ['betweenness_centrality-16_0', 'closeness_centrality-16_0',
               'degree_centrality-16_0', 'eigenvector_centrality-16_0',
               'betweenness_centrality-0_8', 'closeness_centrality-0_8',
               'degree_centrality-0_8', 'eigenvector_centrality-0_8',
               'betweenness_centrality-8_16', 'closeness_centrality-8_16',
               'degree_centrality-8_16', 'eigenvector_centrality-8_16']

for i in range(10,22):
    for j in range(1, 8):
        dico = expe('auto_gr_incid_7_mob{}-{}.pickle'.format(i-j,i),cols, cols_to_lag ,incid_lag = True, incid_lag_start = 7,
            incid_lag_end = 1, col_lag_start = i, col_lag_end = i-j)
        results.append(dico)
#results

MAE: 6.38 train shape:  (1026, 32) mov window:  10 9
MAE: 6.74 train shape:  (1026, 44) mov window:  10 8
MAE: 7.14 train shape:  (1026, 56) mov window:  10 7
MAE: 7.10 train shape:  (1026, 68) mov window:  10 6
MAE: 7.32 train shape:  (1026, 80) mov window:  10 5
MAE: 7.69 train shape:  (1026, 92) mov window:  10 4
MAE: 7.84 train shape:  (1026, 104) mov window:  10 3
MAE: 6.30 train shape:  (944, 32) mov window:  11 10
MAE: 6.29 train shape:  (944, 44) mov window:  11 9
MAE: 6.86 train shape:  (944, 56) mov window:  11 8
MAE: 7.04 train shape:  (944, 68) mov window:  11 7
MAE: 6.96 train shape:  (944, 80) mov window:  11 6
MAE: 6.99 train shape:  (944, 92) mov window:  11 5
MAE: 7.64 train shape:  (944, 104) mov window:  11 4
MAE: 6.51 train shape:  (864, 32) mov window:  12 11
MAE: 6.54 train shape:  (864, 44) mov window:  12 10
MAE: 6.60 train shape:  (864, 56) mov window:  12 9
MAE: 6.88 train shape:  (864, 68) mov window:  12 8
MAE: 7.07 train shape:  (864, 80) mov window:  12 7


In [89]:
results

[{'columns_to_keep': ['incid_hosp', 'date', 'code_insee'],
  'cols_to_lag': None,
  'incid_lag': True,
  'incid_lag_start': 7,
  'incid_lag_end': 1,
  'col_lag_start': 21,
  'col_lag_end': 14,
  'depth': 3,
  'MAE': 5.272442127734718,
  'features': Index(['incid_hosp(t-1)', 'incid_hosp(t-2)'], dtype='object'),
  'model_input_cols': Index(['incid_hosp(t-1)', 'incid_hosp(t-2)', 'incid_hosp(t-3)',
         'incid_hosp(t-4)', 'incid_hosp(t-5)', 'incid_hosp(t-6)',
         'incid_hosp(t-7)'],
        dtype='object'),
  'model_parameters': {'estimator__base_score': 0.5,
   'estimator__booster': 'gbtree',
   'estimator__colsample_bylevel': 1,
   'estimator__colsample_bynode': 1,
   'estimator__colsample_bytree': 1,
   'estimator__gamma': 0.5,
   'estimator__importance_type': 'gain',
   'estimator__learning_rate': 0.03,
   'estimator__max_delta_step': 0,
   'estimator__max_depth': 3,
   'estimator__min_child_weight': 1,
   'estimator__missing': None,
   'estimator__n_estimators': 1000,
   'est

With maximum 13 days past, the result does not change.

# Graph features without auto-regressive features

Can we predict without the auto-regressive features? Apparently not.

In [90]:
cols = ['incid_hosp','betweenness_centrality-16_0', 'closeness_centrality-16_0',
               'degree_centrality-16_0', 'eigenvector_centrality-16_0',
               'betweenness_centrality-0_8', 'closeness_centrality-0_8',
               'degree_centrality-0_8', 'eigenvector_centrality-0_8',
               'betweenness_centrality-8_16', 'closeness_centrality-8_16',
               'degree_centrality-8_16', 'eigenvector_centrality-8_16', 'date','code_insee']

cols_to_lag = ['betweenness_centrality-16_0', 'closeness_centrality-16_0',
               'degree_centrality-16_0', 'eigenvector_centrality-16_0',
               'betweenness_centrality-0_8', 'closeness_centrality-0_8',
               'degree_centrality-0_8', 'eigenvector_centrality-0_8',
               'betweenness_centrality-8_16', 'closeness_centrality-8_16',
               'degree_centrality-8_16', 'eigenvector_centrality-8_16']

for i in range(10,22):
    for j in range(1, 8):
        dico = expe('auto_gr_incid_7_mob{}-{}.pickle'.format(i-j,i),cols, cols_to_lag ,incid_lag = False, incid_lag_start = 7,
            incid_lag_end = 1, col_lag_start = i, col_lag_end = i-j)
        results.append(dico)
    
        results.append(dico)

MAE: 19.13 train shape:  (1308, 37) mov window:  10 9
MAE: 19.84 train shape:  (1233, 49) mov window:  10 8
MAE: 20.69 train shape:  (1185, 61) mov window:  10 7
MAE: 21.14 train shape:  (1110, 73) mov window:  10 6
MAE: 20.33 train shape:  (1031, 85) mov window:  10 5
MAE: 18.96 train shape:  (974, 97) mov window:  10 4
MAE: 19.45 train shape:  (957, 109) mov window:  10 3
MAE: 17.64 train shape:  (1254, 37) mov window:  11 10
MAE: 19.12 train shape:  (1158, 49) mov window:  11 9
MAE: 19.43 train shape:  (1094, 61) mov window:  11 8
MAE: 20.54 train shape:  (1057, 73) mov window:  11 7
MAE: 20.12 train shape:  (987, 85) mov window:  11 6
MAE: 19.73 train shape:  (931, 97) mov window:  11 5
MAE: 19.15 train shape:  (914, 109) mov window:  11 4
MAE: 18.56 train shape:  (1183, 37) mov window:  12 11
MAE: 18.72 train shape:  (1087, 49) mov window:  12 10
MAE: 18.99 train shape:  (1016, 61) mov window:  12 9
MAE: 19.02 train shape:  (970, 73) mov window:  12 8
MAE: 20.33 train shape:  (934

# Graph, movement and autoregressive, more depth to xgboost

In [799]:
# depth 5
cols = ['incid_hosp', 
       'betweenness_centrality-16_0', 'closeness_centrality-16_0',
       'degree_centrality-16_0', 'eigenvector_centrality-16_0',
       'betweenness_centrality-0_8', 'closeness_centrality-0_8',
       'degree_centrality-0_8', 'eigenvector_centrality-0_8',
       'betweenness_centrality-8_16', 'closeness_centrality-8_16',
       'degree_centrality-8_16', 'eigenvector_centrality-8_16', 
       'length_km', 'movement', 'movement_baseline', 'movement_difference',
       'movement_percent_change', 'density_weighted_movement','date','code_insee']

cols_to_lag = ['betweenness_centrality-16_0', 'closeness_centrality-16_0',
               'degree_centrality-16_0', 'eigenvector_centrality-16_0',
               'betweenness_centrality-0_8', 'closeness_centrality-0_8',
               'degree_centrality-0_8', 'eigenvector_centrality-0_8',
               'betweenness_centrality-8_16', 'closeness_centrality-8_16',
               'degree_centrality-8_16', 'eigenvector_centrality-8_16', 'length_km', 'movement', 'movement_baseline', 'movement_difference',
       'movement_percent_change', 'density_weighted_movement']

for i in range(10,22):
    for j in range(2, 7):
        dico = expe(cols, cols_to_lag ,incid_lag = True, incid_lag_start = 7,
            incid_lag_end = 1, col_lag_start = i+j, col_lag_end = i, depth = 5)
    
        results.append(dico)

MAE: 7.06 (864, 80)
MAE: 7.38 (787, 98)
MAE: 7.55 (712, 116)
MAE: 7.58 (637, 134)
MAE: 7.55 (563, 152)
MAE: 7.45 (787, 80)
MAE: 7.50 (712, 98)
MAE: 7.49 (637, 116)
MAE: 7.52 (563, 134)
MAE: 8.18 (489, 152)
MAE: 7.59 (712, 80)
MAE: 7.45 (637, 98)
MAE: 7.49 (563, 116)
MAE: 8.27 (489, 134)
MAE: 8.11 (418, 152)
MAE: 7.46 (637, 80)
MAE: 7.69 (563, 98)
MAE: 8.19 (489, 116)
MAE: 8.39 (418, 134)
MAE: 8.65 (351, 152)
MAE: 7.52 (563, 80)
MAE: 8.04 (489, 98)
MAE: 8.41 (418, 116)
MAE: 8.51 (351, 134)
MAE: 8.36 (288, 152)
MAE: 8.22 (489, 80)
MAE: 8.47 (418, 98)
MAE: 8.44 (351, 116)
MAE: 8.42 (288, 134)
MAE: 8.70 (231, 152)
MAE: 8.36 (418, 80)
MAE: 8.66 (351, 98)
MAE: 8.66 (288, 116)
MAE: 8.74 (231, 134)
MAE: 8.24 (176, 152)
MAE: 8.07 (351, 80)
MAE: 8.72 (288, 98)
MAE: 8.42 (231, 116)
MAE: 8.28 (176, 134)
MAE: 7.55 (126, 152)
MAE: 8.61 (288, 80)
MAE: 8.26 (231, 98)
MAE: 8.20 (176, 116)
MAE: 7.44 (126, 134)
MAE: 7.91 (77, 152)
MAE: 8.20 (231, 80)
MAE: 7.80 (176, 98)
MAE: 7.51 (126, 116)
MAE: 7.34 (77

XGBoostError: [00:41:45] src/objective/regression_obj.cu:64: Check failed: info.labels_.Size() != 0U (0 vs. 0) : label set cannot be empty
Stack trace:
  [bt] (0) 1   libxgboost.dylib                    0x0000001a38ec5469 dmlc::LogMessageFatal::~LogMessageFatal() + 57
  [bt] (1) 2   libxgboost.dylib                    0x0000001a38f49b29 xgboost::obj::RegLossObj<xgboost::obj::LinearSquareLoss>::GetGradient(xgboost::HostDeviceVector<float> const&, xgboost::MetaInfo const&, int, xgboost::HostDeviceVector<xgboost::detail::GradientPairInternal<float> >*) + 281
  [bt] (2) 3   libxgboost.dylib                    0x0000001a38ec10ee xgboost::LearnerImpl::UpdateOneIter(int, xgboost::DMatrix*) + 1278
  [bt] (3) 4   libxgboost.dylib                    0x0000001a38ee1f9c XGBoosterUpdateOneIter + 172
  [bt] (4) 5   libffi.6.dylib                      0x000000010fec3884 ffi_call_unix64 + 76
  [bt] (5) 6   ???                                 0x00007ffee14aea70 0x0 + 140732678204016



In [800]:
# depth 10, without looping over mobility windows
cols = ['incid_hosp', 
       'betweenness_centrality-16_0', 'closeness_centrality-16_0',
       'degree_centrality-16_0', 'eigenvector_centrality-16_0',
       'betweenness_centrality-0_8', 'closeness_centrality-0_8',
       'degree_centrality-0_8', 'eigenvector_centrality-0_8',
       'betweenness_centrality-8_16', 'closeness_centrality-8_16',
       'degree_centrality-8_16', 'eigenvector_centrality-8_16', 
       'length_km', 'movement', 'movement_baseline', 'movement_difference',
       'movement_percent_change', 'density_weighted_movement','date','code_insee']

cols_to_lag = ['betweenness_centrality-16_0', 'closeness_centrality-16_0',
               'degree_centrality-16_0', 'eigenvector_centrality-16_0',
               'betweenness_centrality-0_8', 'closeness_centrality-0_8',
               'degree_centrality-0_8', 'eigenvector_centrality-0_8',
               'betweenness_centrality-8_16', 'closeness_centrality-8_16',
               'degree_centrality-8_16', 'eigenvector_centrality-8_16', 'length_km', 'movement', 'movement_baseline', 'movement_difference',
       'movement_percent_change', 'density_weighted_movement']

dico = expe(cols, cols_to_lag ,incid_lag = True, incid_lag_start = 7,
    incid_lag_end = 1, col_lag_start = 21, col_lag_end = 14, depth = 10)
    
results.append(dico)

MAE: 8.65 (231, 170)


In [801]:
# depth vary, without looping over mobility windows
scores =  []
cols = ['incid_hosp', 
       'betweenness_centrality-16_0', 'closeness_centrality-16_0',
       'degree_centrality-16_0', 'eigenvector_centrality-16_0',
       'betweenness_centrality-0_8', 'closeness_centrality-0_8',
       'degree_centrality-0_8', 'eigenvector_centrality-0_8',
       'betweenness_centrality-8_16', 'closeness_centrality-8_16',
       'degree_centrality-8_16', 'eigenvector_centrality-8_16', 
       'length_km', 'movement', 'movement_baseline', 'movement_difference',
       'movement_percent_change', 'density_weighted_movement','date','code_insee']

cols_to_lag = ['betweenness_centrality-16_0', 'closeness_centrality-16_0',
               'degree_centrality-16_0', 'eigenvector_centrality-16_0',
               'betweenness_centrality-0_8', 'closeness_centrality-0_8',
               'degree_centrality-0_8', 'eigenvector_centrality-0_8',
               'betweenness_centrality-8_16', 'closeness_centrality-8_16',
               'degree_centrality-8_16', 'eigenvector_centrality-8_16', 'length_km', 'movement', 'movement_baseline', 'movement_difference',
       'movement_percent_change', 'density_weighted_movement']
for i in range(3,30):
    dico = expe(cols, cols_to_lag ,incid_lag = True, incid_lag_start = 7,
        incid_lag_end = 1, col_lag_start = 21, col_lag_end = 18, depth = i)

    scores.append(dico)

MAE: 9.08 (231, 170)
MAE: 8.60 (231, 170)
MAE: 8.56 (231, 170)
MAE: 8.67 (231, 170)
MAE: 8.49 (231, 170)
MAE: 8.59 (231, 170)
MAE: 8.61 (231, 170)
MAE: 8.65 (231, 170)
MAE: 8.69 (231, 170)
MAE: 8.67 (231, 170)
MAE: 8.88 (231, 170)
MAE: 8.90 (231, 170)
MAE: 8.89 (231, 170)
MAE: 8.87 (231, 170)
MAE: 8.87 (231, 170)
MAE: 8.87 (231, 170)
MAE: 8.87 (231, 170)
MAE: 8.87 (231, 170)
MAE: 8.87 (231, 170)
MAE: 8.87 (231, 170)
MAE: 8.87 (231, 170)
MAE: 8.87 (231, 170)
MAE: 8.87 (231, 170)
MAE: 8.87 (231, 170)
MAE: 8.87 (231, 170)
MAE: 8.87 (231, 170)
MAE: 8.87 (231, 170)


# Generate report

In [26]:
results

[{'columns_to_keep': ['incid_hosp',
   'betweenness_centrality-16_0',
   'closeness_centrality-16_0',
   'degree_centrality-16_0',
   'eigenvector_centrality-16_0',
   'betweenness_centrality-0_8',
   'closeness_centrality-0_8',
   'degree_centrality-0_8',
   'eigenvector_centrality-0_8',
   'betweenness_centrality-8_16',
   'closeness_centrality-8_16',
   'degree_centrality-8_16',
   'eigenvector_centrality-8_16',
   'length_km',
   'movement',
   'movement_baseline',
   'movement_difference',
   'movement_percent_change',
   'density_weighted_movement',
   'date',
   'code_insee'],
  'cols_to_lag': ['betweenness_centrality-16_0',
   'closeness_centrality-16_0',
   'degree_centrality-16_0',
   'eigenvector_centrality-16_0',
   'betweenness_centrality-0_8',
   'closeness_centrality-0_8',
   'degree_centrality-0_8',
   'eigenvector_centrality-0_8',
   'betweenness_centrality-8_16',
   'closeness_centrality-8_16',
   'degree_centrality-8_16',
   'eigenvector_centrality-8_16',
   'length_

In [27]:
results =pd.DataFrame(results)

In [28]:
results

Unnamed: 0,MAE,col_lag_end,col_lag_start,cols_to_lag,columns_to_keep,depth,features,incid_lag,incid_lag_end,incid_lag_start,model_input_cols,model_name,model_parameters
0,6.697733,9,10,"[betweenness_centrality-16_0, closeness_centra...","[incid_hosp, betweenness_centrality-16_0, clos...",3,"Index(['incid_hosp(t-1)', 'incid_hosp(t-6)', '...",True,1,7,"Index(['incid_hosp(t-1)', 'incid_hosp(t-2)', '...",obj_count_auto_mov_gr_incid_7_mob9-10.pickle,"{'estimator__base_score': 0.5, 'estimator__boo..."
1,6.675655,8,10,"[betweenness_centrality-16_0, closeness_centra...","[incid_hosp, betweenness_centrality-16_0, clos...",3,"Index(['incid_hosp(t-1)', 'incid_hosp(t-6)', '...",True,1,7,"Index(['incid_hosp(t-1)', 'incid_hosp(t-2)', '...",obj_count_auto_mov_gr_incid_7_mob8-10.pickle,"{'estimator__base_score': 0.5, 'estimator__boo..."
2,8.135531,7,10,"[betweenness_centrality-16_0, closeness_centra...","[incid_hosp, betweenness_centrality-16_0, clos...",3,"Index(['incid_hosp(t-6)', 'incid_hosp(t-7)',  ...",True,1,7,"Index(['incid_hosp(t-1)', 'incid_hosp(t-2)', '...",obj_count_auto_mov_gr_incid_7_mob7-10.pickle,"{'estimator__base_score': 0.5, 'estimator__boo..."
3,8.126969,6,10,"[betweenness_centrality-16_0, closeness_centra...","[incid_hosp, betweenness_centrality-16_0, clos...",3,"Index(['incid_hosp(t-5)', 'incid_hosp(t-6)', '...",True,1,7,"Index(['incid_hosp(t-1)', 'incid_hosp(t-2)', '...",obj_count_auto_mov_gr_incid_7_mob6-10.pickle,"{'estimator__base_score': 0.5, 'estimator__boo..."
4,8.013447,5,10,"[betweenness_centrality-16_0, closeness_centra...","[incid_hosp, betweenness_centrality-16_0, clos...",3,"Index(['incid_hosp(t-5)', 'incid_hosp(t-6)', '...",True,1,7,"Index(['incid_hosp(t-1)', 'incid_hosp(t-2)', '...",obj_count_auto_mov_gr_incid_7_mob5-10.pickle,"{'estimator__base_score': 0.5, 'estimator__boo..."
5,8.092843,4,10,"[betweenness_centrality-16_0, closeness_centra...","[incid_hosp, betweenness_centrality-16_0, clos...",3,"Index(['incid_hosp(t-5)', 'incid_hosp(t-6)', '...",True,1,7,"Index(['incid_hosp(t-1)', 'incid_hosp(t-2)', '...",obj_count_auto_mov_gr_incid_7_mob4-10.pickle,"{'estimator__base_score': 0.5, 'estimator__boo..."
6,8.063771,3,10,"[betweenness_centrality-16_0, closeness_centra...","[incid_hosp, betweenness_centrality-16_0, clos...",3,"Index(['incid_hosp(t-5)', 'incid_hosp(t-6)', '...",True,1,7,"Index(['incid_hosp(t-1)', 'incid_hosp(t-2)', '...",obj_count_auto_mov_gr_incid_7_mob3-10.pickle,"{'estimator__base_score': 0.5, 'estimator__boo..."
7,6.745925,10,11,"[betweenness_centrality-16_0, closeness_centra...","[incid_hosp, betweenness_centrality-16_0, clos...",3,"Index(['incid_hosp(t-1)', 'incid_hosp(t-6)', '...",True,1,7,"Index(['incid_hosp(t-1)', 'incid_hosp(t-2)', '...",obj_count_auto_mov_gr_incid_7_mob10-11.pickle,"{'estimator__base_score': 0.5, 'estimator__boo..."
8,6.990921,9,11,"[betweenness_centrality-16_0, closeness_centra...","[incid_hosp, betweenness_centrality-16_0, clos...",3,"Index(['incid_hosp(t-1)', 'incid_hosp(t-6)', '...",True,1,7,"Index(['incid_hosp(t-1)', 'incid_hosp(t-2)', '...",obj_count_auto_mov_gr_incid_7_mob9-11.pickle,"{'estimator__base_score': 0.5, 'estimator__boo..."
9,7.198216,8,11,"[betweenness_centrality-16_0, closeness_centra...","[incid_hosp, betweenness_centrality-16_0, clos...",3,"Index(['incid_hosp(t-1)', 'incid_hosp(t-6)', '...",True,1,7,"Index(['incid_hosp(t-1)', 'incid_hosp(t-2)', '...",obj_count_auto_mov_gr_incid_7_mob8-11.pickle,"{'estimator__base_score': 0.5, 'estimator__boo..."


In [29]:
results.to_csv('data/reports/report4.csv')

In [803]:
# depth vary, without looping over mobility windows
scores =  []
cols = ['incid_hosp', 
       'betweenness_centrality-16_0', 'closeness_centrality-16_0',
       'degree_centrality-16_0', 'eigenvector_centrality-16_0',
       'betweenness_centrality-0_8', 'closeness_centrality-0_8',
       'degree_centrality-0_8', 'eigenvector_centrality-0_8',
       'betweenness_centrality-8_16', 'closeness_centrality-8_16',
       'degree_centrality-8_16', 'eigenvector_centrality-8_16', 
       'length_km', 'movement', 'movement_baseline', 'movement_difference',
       'movement_percent_change', 'density_weighted_movement','date','code_insee']

cols_to_lag = ['betweenness_centrality-16_0', 'closeness_centrality-16_0',
               'degree_centrality-16_0', 'eigenvector_centrality-16_0',
               'betweenness_centrality-0_8', 'closeness_centrality-0_8',
               'degree_centrality-0_8', 'eigenvector_centrality-0_8',
               'betweenness_centrality-8_16', 'closeness_centrality-8_16',
               'degree_centrality-8_16', 'eigenvector_centrality-8_16', 'length_km', 'movement', 'movement_baseline', 'movement_difference',
       'movement_percent_change', 'density_weighted_movement']
for i in range(3,30):
    dico = expe(cols, cols_to_lag ,incid_lag = True, incid_lag_start = 7,
        incid_lag_end = 1, col_lag_start = 21, col_lag_end = 18, depth = i)

    scores.append(dico)

MAE: 8.11 (231, 98)
MAE: 8.23 (231, 98)
MAE: 8.26 (231, 98)
MAE: 8.44 (231, 98)
MAE: 9.07 (231, 98)
MAE: 8.73 (231, 98)
MAE: 8.81 (231, 98)
MAE: 8.78 (231, 98)
MAE: 8.84 (231, 98)
MAE: 8.77 (231, 98)
MAE: 8.74 (231, 98)
MAE: 8.78 (231, 98)
MAE: 8.78 (231, 98)
MAE: 8.78 (231, 98)
MAE: 8.78 (231, 98)


KeyboardInterrupt: 