# Building graph features from movement data


_Akin Kazakci, Mines ParisTech - PSL University_

Input: 
- movement_with_density_code_insee_(date).csv (from notebook 2)

Output:
- raw_graph_stats_dataframe.csv
- centrality_stats.csv

In [1]:
import pandas as pd
import matplotlib.pyplot as plt 
%matplotlib inline
import matplotlib
import numpy as np

In [2]:
# movement_with_density_code_insee corresponds to the merge of movement and population density data.
moves = pd.read_csv('movement_with_density_code_insee_07-05-2020.csv', index_col = 0)

In [3]:
moves.shape

(51027, 31)

In [5]:
#groups = moves.groupby(['date','end_code_insee', 'start_code_insee'],as_index=False)
#groups.head(10)

In [6]:
moves.columns

Index(['geometry', 'date_time', 'start_polygon_id', 'start_polygon_name',
       'end_polygon_id', 'end_polygon_name', 'length_km', 'tile_size',
       'country', 'level', 'n_crisis', 'n_baseline', 'n_difference',
       'percent_change', 'is_statistically_significant', 'z_score',
       'start_lat', 'start_lon', 'end_lat', 'end_lon', 'date',
       'end_code_insee', 'end_point', 'start_code_insee', 'start_point',
       'end_density_crisis', 'end_density_baseline',
       'end_density_percent_change', 'start_density_crisis',
       'start_density_baseline', 'start_density_percent_change'],
      dtype='object')

In [7]:
moves = moves[[ 'date_time', 
    'start_polygon_id', 'start_polygon_name', # keep those cause incoming traffic / graph 
    'end_polygon_id', 'end_polygon_name', 
    'length_km',  # size of polygon is an interesting feature a priori, or is this length of movement? even better
    'n_crisis', 'n_baseline', 'n_difference',
    'percent_change', 
    'date', #key
    'end_code_insee',  'start_code_insee',  #keys
    'end_density_crisis', 'end_density_baseline',
    'start_density_crisis',
    'start_density_baseline' ]]

In [8]:
# I will change the variable names 'n_crisis', 'n_baseline', 'n_difference','percent_change' to 'movement',
# 'movement_baseline','movement', 'movement_baseline','movement_difference', 'movement_percent_change'
       
moves.columns =  ['date_time', 'start_polygon_id', 'start_polygon_name', 'end_polygon_id',
       'end_polygon_name', 'length_km', 'movement', 'movement_baseline',
       'movement_difference', 'movement_percent_change', 'date', 'end_code_insee',
       'start_code_insee', 'end_density_crisis', 'end_density_baseline',
       'start_density_crisis', 'start_density_baseline']

In [9]:
#moves.sort_values('date_time')

In [10]:
moves['date_time'] = pd.to_datetime(moves['date_time'])

In [11]:
moves['time']=moves['date_time'].apply(lambda x: x.time())

In [12]:
# A feature that would be useful is to weight the movement by density of the origin
# this is assuming that the denser a point is, the higher the chances it will bring infection to a destination
# alternative / complementary: use the number of patients from starting department
moves['density_weighted_movement'] = moves['movement'] * moves['start_density_crisis']

# Build graph features 

In [13]:
import networkx as nx

In [14]:
#prepare the new columns
moves['mean_degree_centrality'] =''
moves['mean_betweenness_centrality'] =''
moves['mean_closeness_centrality'] =''
moves['mean_node_eigenvector_centrality'] =''
moves['max_degree_centrality'] =''
moves['max_betweenness_centrality'] =''
moves['max_closeness_centrality'] =''
moves['max_eigenvector_centrality'] =''

In [15]:
# create graphs per 8hour time slices

#first regroup rows per 8hours slices (datetime)
groups = moves.groupby('date_time')

#then loop over time slices
graph_stats = pd.DataFrame(columns = ['degree_centrality', 'polygon_id'])
for name, group in groups:
    print (name)
    #create a graph for that time slice
    g = nx.from_pandas_edgelist(group, source='start_polygon_id', target='end_polygon_id', 
                                 edge_attr='density_weighted_movement') 
    
    #compute stats for each graph
    #a = pd.DataFrame.from_dict(dict(nx.betweenness_centrality(g)), columns = ['betweenness_centrality'],orient='index')
    #b = pd.DataFrame.from_dict(dict(nx.closeness_centrality(g)), columns = ['closeness_centrality'],orient='index')
    #c = pd.DataFrame.from_dict(dict(nx.eigenvector_centrality(g, max_iter=1000)), columns = ['eigenvector_centrality'],orient='index')
    a = pd.DataFrame.from_dict(dict(nx.degree_centrality(g)), columns = ['degree_centrality'],orient='index')
    a['date_time'] = name
    a['polygon_id'] = a.index
    #a.set_index(a.date_time, inplace=True)
    #a.drop('date_time', axis=1, inplace=True)
    
    d = pd.DataFrame.from_dict(dict(nx.betweenness_centrality(g)), columns = ['betweenness_centrality'],orient='index')
    d['date_time'] = name
    d['polygon_id'] = d.index
    #d.set_index(d.date_time, inplace=True)
    #d.drop('date_time', axis=1, inplace=True)
    
    b = pd.DataFrame.from_dict(dict(nx.closeness_centrality(g)), columns = ['closeness_centrality'],orient='index')
    b['date_time'] = name
    b['polygon_id'] = b.index
    #b.set_index(b.date_time, inplace=True)
    #b.drop('date_time', axis=1, inplace=True)
    
    c = pd.DataFrame.from_dict(dict(nx.eigenvector_centrality(g, max_iter=1000)), columns = ['eigenvector_centrality'],orient='index')
    c['date_time'] = name
    c['polygon_id'] = c.index
    #c.set_index(c.date_time, inplace=True)
    #c.drop('date_time', axis=1, inplace=True)
  
    #merge stats
    a = pd.merge(a,b,how='left',on=['date_time','polygon_id']) 
    a = pd.merge(a,c,how='left',on=['date_time','polygon_id']) 
    a = pd.merge(a,d,how='left',on=['date_time','polygon_id']) 
    print((a.head(3)))
    
    graph_stats  = graph_stats.append([a,b,c,d])

2020-03-05 00:00:00
   degree_centrality  date_time  polygon_id  closeness_centrality  \
0           0.055556 2020-03-05      162809              0.237467   
1           0.044444 2020-03-05      143604              0.284810   
2           0.055556 2020-03-05      138788              0.276074   

   eigenvector_centrality  betweenness_centrality  
0                0.008461                0.068870  
1                0.046841                0.119072  
2                0.063036                0.118739  
2020-03-05 08:00:00


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


   degree_centrality           date_time  polygon_id  closeness_centrality  \
0           0.097826 2020-03-05 08:00:00      155291              0.423963   
1           0.065217 2020-03-05 08:00:00      155290              0.321678   
2           0.043478 2020-03-05 08:00:00      145782              0.313993   

   eigenvector_centrality  betweenness_centrality  
0                0.079600                0.136771  
1                0.010654                0.037962  
2                0.009700                0.007267  
2020-03-05 16:00:00
   degree_centrality           date_time  polygon_id  closeness_centrality  \
0           0.053763 2020-03-05 16:00:00      162818              0.389121   
1           0.096774 2020-03-05 16:00:00      160251              0.394068   
2           0.043011 2020-03-05 16:00:00      151062              0.317406   

   eigenvector_centrality  betweenness_centrality  
0                0.064382                0.013204  
1                0.165428                0

   degree_centrality           date_time  polygon_id  closeness_centrality  \
0           0.043011 2020-03-10 16:00:00      141790              0.315254   
1           0.032258 2020-03-10 16:00:00      155288              0.306931   
2           0.118280 2020-03-10 16:00:00      166634              0.389121   

   eigenvector_centrality  betweenness_centrality  
0                0.012159                0.001593  
1                0.009168                0.000610  
2                0.193359                0.011918  
2020-03-11 00:00:00
   degree_centrality  date_time  polygon_id  closeness_centrality  \
0           0.044944 2020-03-11      162808              0.182752   
1           0.044944 2020-03-11      162809              0.206497   
2           0.056180 2020-03-11      155294              0.257225   

   eigenvector_centrality  betweenness_centrality  
0                0.000580                0.034661  
1                0.003215                0.024945  
2                0.013004 

   degree_centrality           date_time  polygon_id  closeness_centrality  \
0           0.043011 2020-03-16 16:00:00      162807              0.303922   
1           0.043011 2020-03-16 16:00:00      145779              0.302932   
2           0.064516 2020-03-16 16:00:00      153027              0.372000   

   eigenvector_centrality  betweenness_centrality  
0                0.013096                0.000728  
1                0.012889                0.000950  
2                0.044342                0.037842  
2020-03-17 00:00:00
   degree_centrality  date_time  polygon_id  closeness_centrality  \
0           0.034884 2020-03-17      145779              0.143438   
1           0.023256 2020-03-17      145778              0.141949   
2           0.023256 2020-03-17      151060              0.189922   

   eigenvector_centrality  betweenness_centrality  
0                0.000301                0.007601  
1                0.000208                0.005223  
2                0.002536 

   degree_centrality           date_time  polygon_id  closeness_centrality  \
0           0.034483 2020-03-22 16:00:00      162808              0.038793   
1           0.034483 2020-03-22 16:00:00      162813              0.038793   
2           0.034483 2020-03-22 16:00:00      132360              0.092563   

   eigenvector_centrality  betweenness_centrality  
0            2.697415e-09                0.001210  
1            2.697415e-09                0.001210  
2            7.221250e-06                0.021779  
2020-03-23 00:00:00
   degree_centrality  date_time  polygon_id  closeness_centrality  \
0           0.014493 2020-03-23      151058              0.019324   
1           0.028986 2020-03-23      151057              0.028986   
2           0.014493 2020-03-23      162816              0.021739   

   eigenvector_centrality  betweenness_centrality  
0            8.259940e-12                0.000000  
1            1.168132e-11                0.000426  
2            3.541936e-11 

   degree_centrality           date_time  polygon_id  closeness_centrality  \
0           0.034483 2020-03-29 16:00:00      162813              0.034483   
1           0.017241 2020-03-29 16:00:00      162816              0.022989   
2           0.017241 2020-03-29 16:00:00      153025              0.022989   

   eigenvector_centrality  betweenness_centrality  
0            1.323341e-09                0.000605  
1            9.357436e-10                0.000000  
2            9.357436e-10                0.000000  
2020-03-30 00:00:00
   degree_centrality  date_time  polygon_id  closeness_centrality  \
0           0.012346 2020-03-30      132357              0.053467   
1           0.024691 2020-03-30      132358              0.064646   
2           0.024691 2020-03-30      157442              0.081737   

   eigenvector_centrality  betweenness_centrality  
0            8.460514e-08                0.000000  
1            3.050060e-07                0.007099  
2            3.805505e-06 

   degree_centrality  date_time  polygon_id  closeness_centrality  \
0           0.028986 2020-04-04      132360              0.058172   
1           0.014493 2020-04-04      132356              0.047596   
2           0.014493 2020-04-04      162816              0.019324   

   eigenvector_centrality  betweenness_centrality  
0            4.435742e-07                 0.00682  
1            1.339939e-07                 0.00000  
2            7.468204e-11                 0.00000  
2020-04-04 08:00:00
   degree_centrality           date_time  polygon_id  closeness_centrality  \
0           0.025974 2020-04-04 08:00:00      151057              0.025974   
1           0.012987 2020-04-04 08:00:00      151058              0.017316   
2           0.038961 2020-04-04 08:00:00      162808              0.078929   

   eigenvector_centrality  betweenness_centrality  
0            1.582944e-11                0.000342  
1            1.119310e-11                0.000000  
2            5.414419e-05 

2020-04-09 08:00:00
   degree_centrality           date_time  polygon_id  closeness_centrality  \
0           0.022472 2020-04-09 08:00:00      160253              0.131075   
1           0.056180 2020-04-09 08:00:00      162813              0.136086   
2           0.044944 2020-04-09 08:00:00      132359              0.101598   

   eigenvector_centrality  betweenness_centrality  
0                0.002581                0.027955  
1                0.000935                0.051621  
2                0.000004                0.087845  
2020-04-09 16:00:00
   degree_centrality           date_time  polygon_id  closeness_centrality  \
0           0.043956 2020-04-09 16:00:00      155291              0.154237   
1           0.021978 2020-04-09 16:00:00      155290              0.134815   
2           0.043956 2020-04-09 16:00:00      151225              0.161348   

   eigenvector_centrality  betweenness_centrality  
0                0.000012                0.208913  
1                0.000

2020-04-14 16:00:00
   degree_centrality           date_time  polygon_id  closeness_centrality  \
0           0.043956 2020-04-14 16:00:00      132359              0.137462   
1           0.021978 2020-04-14 16:00:00      132360              0.121333   
2           0.065934 2020-04-14 16:00:00      157440              0.165455   

   eigenvector_centrality  betweenness_centrality  
0            5.654573e-06                0.085958  
1            9.146966e-07                0.021978  
2            3.170255e-04                0.083957  
2020-04-15 00:00:00
   degree_centrality  date_time  polygon_id  closeness_centrality  \
0           0.024096 2020-04-15      157441              0.073814   
1           0.060241 2020-04-15      157440              0.097189   
2           0.036145 2020-04-15      162808              0.059503   

   eigenvector_centrality  betweenness_centrality  
0            2.775464e-06                0.000950  
1            7.714594e-06                0.023450  
2     

   degree_centrality           date_time  polygon_id  closeness_centrality  \
0           0.044444 2020-04-20 08:00:00      132359              0.121294   
1           0.033333 2020-04-20 08:00:00      132358              0.108827   
2           0.066667 2020-04-20 08:00:00      157440              0.140187   

   eigenvector_centrality  betweenness_centrality  
0            1.123118e-06                0.085893  
1            3.706582e-07                0.022222  
2            1.108425e-05                0.055818  
2020-04-20 16:00:00
   degree_centrality           date_time  polygon_id  closeness_centrality  \
0           0.043956 2020-04-20 16:00:00      155291              0.153716   
1           0.021978 2020-04-20 16:00:00      155290              0.135015   
2           0.065934 2020-04-20 16:00:00      157440              0.165154   

   eigenvector_centrality  betweenness_centrality  
0                0.000015                0.235230  
1                0.000002                0

   degree_centrality           date_time  polygon_id  closeness_centrality  \
0           0.035714 2020-04-25 16:00:00      138785              0.088353   
1           0.023810 2020-04-25 16:00:00      141765              0.096559   
2           0.035714 2020-04-25 16:00:00      157442              0.146339   

   eigenvector_centrality  betweenness_centrality  
0            1.268264e-08                0.046185  
1            8.286260e-08                0.067986  
2            5.266490e-05                0.023236  
2020-04-26 00:00:00
   degree_centrality  date_time  polygon_id  closeness_centrality  \
0           0.033898 2020-04-26      162813              0.033898   
1           0.033898 2020-04-26      162816              0.033898   
2           0.033898 2020-04-26      162814              0.033898   

   eigenvector_centrality  betweenness_centrality  
0            3.338152e-09                     0.0  
1            3.338152e-09                     0.0  
2            3.338152e-09 

   degree_centrality           date_time  polygon_id  closeness_centrality  \
0           0.013333 2020-05-01 16:00:00      141782              0.017778   
1           0.026667 2020-05-01 16:00:00      155289              0.026667   
2           0.026667 2020-05-01 16:00:00      157442              0.081073   

   eigenvector_centrality  betweenness_centrality  
0            2.057196e-12                0.000000  
1            2.909315e-12                0.000360  
2            4.285058e-06                0.002793  
2020-05-02 00:00:00
   degree_centrality  date_time  polygon_id  closeness_centrality  \
0           0.042857 2020-05-02      162813              0.057143   
1           0.042857 2020-05-02      162808              0.051429   
2           0.028571 2020-05-02      138785              0.032653   

   eigenvector_centrality  betweenness_centrality  
0            1.256821e-08                0.003727  
1            8.168013e-09                0.003727  
2            1.435698e-10 

In [16]:

graph_stats.head()

Unnamed: 0,betweenness_centrality,closeness_centrality,date_time,degree_centrality,eigenvector_centrality,polygon_id
0,0.06887,0.237467,2020-03-05,0.055556,0.008461,162809
1,0.119072,0.28481,2020-03-05,0.044444,0.046841,143604
2,0.118739,0.276074,2020-03-05,0.055556,0.063036,138788
3,0.073655,0.226131,2020-03-05,0.022222,0.007008,138791
4,0.087847,0.190275,2020-03-05,0.055556,0.000682,132359


In [17]:
graph_stats.to_csv('raw_graph_stats_dataframe.csv')

In [18]:
graph_stats = pd.read_csv('raw_graph_stats_dataframe.csv', index_col = 0)
graph_stats['date_time'] = pd.to_datetime(graph_stats['date_time'])
graph_stats['time']=graph_stats['date_time'].apply(lambda x: x.time())
graph_stats['date']=graph_stats['date_time'].apply(lambda x: x.date())

In [19]:
graph_stats.shape

(62364, 8)

In [20]:
graph_stats.columns

Index(['betweenness_centrality', 'closeness_centrality', 'date_time',
       'degree_centrality', 'eigenvector_centrality', 'polygon_id', 'time',
       'date'],
      dtype='object')

In [21]:
grab_codes = pd.read_csv('cities.csv', sep=';')
grab_codes


Unnamed: 0,lieu,polygon_id,feature_id,level,geometry,code_insee,nom,nuts3,wikipedia,surf_km2
0,Nord-Pas-de-Calais-Picardie // Nord // La Sent...,163070,12500000033147,LEVEL4,POINT (3.4756241481364 50.348637183764),59,Nord,FR301,fr:Nord (département),5750.0
1,Nord-Pas-de-Calais-Picardie // Nord // Bermerain,163311,12500000033389,LEVEL4,POINT (3.539229862293201 50.257497316369),59,Nord,FR301,fr:Nord (département),5750.0
2,Nord-Pas-de-Calais-Picardie // Nord // Quiévre...,163365,12500000033443,LEVEL4,POINT (3.6595577566949 50.392605960358),59,Nord,FR301,fr:Nord (département),5750.0
3,Nord-Pas-de-Calais-Picardie // Nord // Liessies,163610,12500000033692,LEVEL4,POINT (4.0773572675176 50.10443159667599),59,Nord,FR301,fr:Nord (département),5750.0
4,Nord-Pas-de-Calais-Picardie // Nord // Hargnies,163698,12500000033781,LEVEL4,POINT (3.8413200463043 50.257770646938),59,Nord,FR301,fr:Nord (département),5750.0
5,Nord-Pas-de-Calais-Picardie // Nord // Oxelaëre,165282,12500000035382,LEVEL4,POINT (2.469967845049799 50.7802962491),59,Nord,FR301,fr:Nord (département),5750.0
6,Nord-Pas-de-Calais-Picardie // Nord // Montign...,166100,12500000036204,LEVEL4,POINT (3.4142862750207 50.091306551707),59,Nord,FR301,fr:Nord (département),5750.0
7,Nord-Pas-de-Calais-Picardie // Nord // Râches,164027,12500000034112,LEVEL4,POINT (3.1369228226729 50.420186056564),59,Nord,FR301,fr:Nord (département),5750.0
8,Nord-Pas-de-Calais-Picardie // Nord // Cobrieux,164883,12500000034976,LEVEL4,POINT (3.2398593168987 50.539156116974),59,Nord,FR301,fr:Nord (département),5750.0
9,Nord-Pas-de-Calais-Picardie // Nord // Bantouz...,165609,12500000035713,LEVEL4,POINT (3.2246416553447 50.05878960819),59,Nord,FR301,fr:Nord (département),5750.0


In [22]:
first = grab_codes[['polygon_id','code_insee']]
graph_stats = pd.merge(graph_stats,first, how = 'left',on = 'polygon_id')

We now have a data structure where we have centrality measures for each polygon and time slice. And for each polygon, we know to which department it belongs to.

In [23]:
#graph_stats.drop(['code_insee_x','code_insee_y'], axis=1, inplace = True)
graph_stats.head()

Unnamed: 0,betweenness_centrality,closeness_centrality,date_time,degree_centrality,eigenvector_centrality,polygon_id,time,date,code_insee
0,0.06887,0.237467,2020-03-05,0.055556,0.008461,162809,00:00:00,2020-03-05,49
1,0.119072,0.28481,2020-03-05,0.044444,0.046841,143604,00:00:00,2020-03-05,37
2,0.118739,0.276074,2020-03-05,0.055556,0.063036,138788,00:00:00,2020-03-05,51
3,0.073655,0.226131,2020-03-05,0.022222,0.007008,138791,00:00:00,2020-03-05,52
4,0.087847,0.190275,2020-03-05,0.055556,0.000682,132359,00:00:00,2020-03-05,13


We will regroup these statistics to have mean values per depatment and date time. These values will become features for our prediction pipeline.

In [24]:
graph_stats = graph_stats.groupby(['date_time','code_insee'], as_index = False).agg({
    'betweenness_centrality':'mean',
    'closeness_centrality':'mean',
    'degree_centrality':'mean',
    'eigenvector_centrality':'mean'
})

In [25]:
graph_stats.head()

Unnamed: 0,date_time,code_insee,betweenness_centrality,closeness_centrality,degree_centrality,eigenvector_centrality
0,2020-03-05,1,0.063311,0.25641,0.066667,0.017231
1,2020-03-05,2,0.005806,0.273556,0.066667,0.130749
2,2020-03-05,3,0.000653,0.206422,0.022222,0.002991
3,2020-03-05,4,0.022222,0.16129,0.033333,8.5e-05
4,2020-03-05,5,0.0,0.139104,0.011111,9e-06


In [26]:
graph_stats.to_csv('centrality_stats.csv')