<a id='summary'></a>
# Creating transition matrices from unanonymized data

## Summary

* [Loading GIS](#gis)
* [Loading Orange OD-matrices](#odm)


* [Snapping ODs to the census GIS](#snap_gis)
* [Creating transition matrix](#trans_matrix)
* [Exporting](#export)


* [Getting vol by timestep for a typical day](#vol_by_timestep)
    * [Exporting vol by timestep](#export_vol)


* [Sanity checks](#sanity)

In [3]:
import datetime
import geopandas as gpd
from tqdm import tqdm
import matplotlib.pyplot as plt
import json
import numpy as np
import os
import pandas as pd

from utils.chrono import Chrono

with open('config.json', 'r') as config_path:
    config = json.load(config_path)
    
timesteps = config['timesteps']


<a id='gis'></a>
# Loading GIS
* [Back to summary](#summary)

In [4]:
gis_map_path = os.path.join(config['outdata_dir']['path'], 
                         config['outdata_dir']['gis_map_filename'])

census_gis = gpd.read_file(gis_map_path)
census_gis = census_gis.set_crs('epsg:3035').drop(columns=['wkt'])
census_gis.head()

Unnamed: 0,iris_or_commune,frac_area,is_iris,commune,x,y,nearest_neighbor_0,nearest_neighbor_1,nearest_neighbor_2,nearest_neighbor_3,...,nearest_neighbor_11,nearest_neighbor_12,nearest_neighbor_13,nearest_neighbor_14,nearest_neighbor_15,nearest_neighbor_16,nearest_neighbor_17,nearest_neighbor_18,nearest_neighbor_19,geometry
0,1043,1.0,False,1043,3932172.925186967,2538222.734103135,692820201,692820103,692560101,692860101,...,692790101,692820601,1249,1276,1424,1262,1418,1049,1376,"POLYGON ((3931567.736 2535603.294, 3931455.673..."
1,1049,0.903,False,1049,3934407.726084904,2538725.650855867,692820103,692820201,1275,692750111,...,1249,692820601,692790101,1424,1376,1418,1276,1043,1262,"POLYGON ((3931819.419 2541777.411, 3931843.795..."
2,1249,0.901,False,1249,3927976.919936828,2538778.845483301,692860102,69292,692860501,692860201,...,692860502,692860302,1043,692860303,692860304,1424,692860101,1376,1275,"POLYGON ((3930627.132 2534580.113, 3930588.941..."
3,1262,0.161,False,1262,3934403.885727907,2540983.4562672246,692820201,692820204,692820104,692860101,...,692820601,692790101,1249,1418,1276,1376,1424,1043,1049,"POLYGON ((3931728.804 2541776.577, 3931683.836..."
4,1275,1.0,False,1275,3926871.854005055,2536284.03097829,692560202,1376,692860501,692560302,...,692860502,1249,692860401,692860301,692560101,692860302,692860101,692860304,692860303,"POLYGON ((3926169.735 2534938.133, 3926094.500..."


<a id='odm'></a>
# Loading Orange OD-matrices
* [Back to summary](#summary)

In [6]:
# 00:20:00
c = Chrono('Loading...')
odm_path = os.path.join(config['data_dir']['path'], config['data_dir']['odm_filename'])

odm = pd.read_csv(odm_path)

c.done()
odm.head()


17:08:56	Loading...
00:00:54	Preprocess...
00:13:32	Cut...
00:13:36	Work complete !


Unnamed: 0,date,hour,ori,dest,w,ox,oy,dx,dy
0,2019-03-17,0,"(3904000, 2512400)","(3904000, 2512400)",77.629852,3904000,2512400,3904000,2512400
1,2019-03-17,0,"(3904000, 2512400)","(3904000, 2512600)",1.735704,3904000,2512400,3904000,2512600
2,2019-03-17,0,"(3904000, 2512400)","(3904200, 2512400)",100.382446,3904000,2512400,3904200,2512400
3,2019-03-17,0,"(3904000, 2512400)","(3904200, 2512600)",12.998931,3904000,2512400,3904200,2512600
4,2019-03-17,0,"(3904000, 2512400)","(3904200, 2512800)",5.980422,3904000,2512400,3904200,2512800


In [5]:
odm = pd.read_csv(odm_path, compression='gzip')

In [6]:
print(len(odm))

49681883


In [7]:
odm.to_csv('../../data/OD_matrix_lyon_unanon/OD_matrix_resident_2.csv', compression='gzip')

In [10]:
# For sanity check purposes
vol_by_dayhour_complete = odm.groupby(['date', 'hour'])['w'].sum().reset_index()

vol_by_dayhour_complete_path = os.path.join(config['outdata_dir']['path'], 
                                            config['outdata_dir']['vol_by_dayhour_complete_filename'])
vol_by_dayhour_complete.to_csv(vol_by_dayhour_complete_path, index=False)
vol_by_dayhour_complete

Unnamed: 0,date,hour,w
0,2019-03-17,0,199278.156593
1,2019-03-17,1,61703.520178
2,2019-03-17,2,46596.259498
3,2019-03-17,3,32395.210443
4,2019-03-17,4,8768.772318
...,...,...,...
2202,2019-06-16,19,2106.766692
2203,2019-06-16,20,1206.525516
2204,2019-06-16,21,872.526052
2205,2019-06-16,22,288.607800


<a id='snap_gis'></a>
# Snapping OD to the census GIS
* [Back to summary](#summary)

In [11]:
c=Chrono('creating geom o...')
geometry_o = odm.groupby(['ox','oy']).size().reset_index()
geometry_o['geometry'] = gpd.points_from_xy(geometry_o['ox'], geometry_o['oy'])
geometry_o = gpd.GeoDataFrame(geometry_o).set_crs('epsg:3035')
c.tprint('{} rows'.format(len(geometry_o)))

c.tprint('Finding iris o...')

geometry_o = (geometry_o
       .sjoin(census_gis, how="inner")
       .rename(columns={'iris_or_commune':'o'})
      )
c.tprint('{} rows'.format(len(geometry_o)))

c.tprint('Setting iris o...')

odm = odm.merge(geometry_o[['ox', 'oy', 'o']], on=['ox','oy'])

c.done()
odm.head()

17:39:04	creating geom o...
00:00:07	33088 rows
00:00:07	Finding iris o...
00:00:07	16512 rows
00:00:07	Setting iris o...
00:01:02	Work complete !


Unnamed: 0,date,hour,ori,dest,w,ox,oy,dx,dy,o
0,2019-03-17,0,"(3910400, 2533600)","(3912400, 2536000)",3.702318,3910400,2533600,3912400,2536000,69127
1,2019-03-17,8,"(3910400, 2533600)","(3911800, 2534600)",2.020423,3910400,2533600,3911800,2534600,69127
2,2019-03-17,9,"(3910400, 2533600)","(3910400, 2533600)",5.279274,3910400,2533600,3910400,2533600,69127
3,2019-03-17,9,"(3910400, 2533600)","(3911200, 2532800)",1.906925,3910400,2533600,3911200,2532800,69127
4,2019-03-17,9,"(3910400, 2533600)","(3916000, 2533400)",1.868867,3910400,2533600,3916000,2533400,69127


In [12]:
c=Chrono('creating geom d...')
geometry_d = odm.groupby(['dx','dy']).size().reset_index()
geometry_d['geometry'] = gpd.points_from_xy(geometry_d['dx'], geometry_d['dy'])
geometry_d = gpd.GeoDataFrame(geometry_d).set_crs('epsg:3035')

c.tprint('Finding iris d...')
geometry_d = (geometry_d
       .sjoin(census_gis, how="inner")
       .rename(columns={'iris_or_commune':'d'})
      )
c.write('{} rows'.format(len(geometry_o)))

c.tprint('Setting iris d...')

odm = odm.merge(geometry_d[['dx', 'dy', 'd']], on=['dx','dy'])

c.done()

17:40:06	creating geom d...
00:00:04	Finding iris d...
00:00:04	16512 rows
00:00:04	Setting iris d...
00:01:25	Work complete !


In [13]:
odm['w'].sum()

141093355.1187346

In [14]:
# For sanity check purposes
vol_by_dayhour_filtered = odm.groupby(['date', 'hour'])['w'].sum().reset_index()

vol_by_dayhour_filtered_path = os.path.join(config['outdata_dir']['path'], 
                                            config['outdata_dir']['vol_by_dayhour_filtered_filename'])

vol_by_dayhour_filtered.to_csv(vol_by_dayhour_filtered_path, index=False)


<a id='trans_matrix'></a>
# Creating transition matrix
* [Back to summary](#summary)

In [15]:
# 00:01:12
holidays = ['2019-04-22', # lundi de pâques
            '2019-05-01', # fête du travail
            '2019-05-08', # armistice 1945
            '2019-05-30', # ascension
            '2019-05-31', # pont de l'ascension
            '2019-06-10'  # lundi de pentecôte
           ]

c=Chrono('Grouping...')
transition_matrix = odm.groupby(['o','d', 'date', 'hour'])['w'].sum().reset_index()

c.tprint('Parsing to date...')
transition_matrix['date'] = pd.to_datetime(transition_matrix['date'])

c.tprint('Filtering out weekends and holidays...')
transition_matrix = transition_matrix[(transition_matrix['date'].dt.weekday<5)&
                                      (~transition_matrix['date'].isin(holidays))]

nb_days = len(transition_matrix['date'].unique())


c.tprint('Re-grouping...')
transition_matrix = transition_matrix.groupby(['o', 'd', 'hour'])['w'].sum().reset_index()
transition_matrix['w'] = transition_matrix['w']/nb_days  # vol for a typical day

c.tprint('Fitting hour to timesteps...')
transition_matrix['t'] = timesteps[np.searchsorted(timesteps, transition_matrix['hour'], side='right')-1]
transition_matrix = transition_matrix.groupby(['o', 'd', 't'])['w'].sum().reset_index()

c.tprint('Renaming...')
transition_matrix = transition_matrix.rename(columns={'w':'vol'})

c.tprint('Getting P(d | o, t)...')
transition_matrix['potential_d'] = transition_matrix.groupby(['o', 't'])['vol'].transform(sum)
transition_matrix['proba_d'] = transition_matrix['vol']/transition_matrix['potential_d']
transition_matrix = transition_matrix.drop(columns=['potential_d'])

c.tprint('Getting P(o | d, t)...')
transition_matrix['potential_o'] = transition_matrix.groupby(['d', 't'])['vol'].transform(sum)
transition_matrix['proba_o'] = transition_matrix['vol']/transition_matrix['potential_o']
transition_matrix = transition_matrix.drop(columns=['potential_o'])

c.done()

17:41:39	Grouping...
00:00:41	Parsing to date...
00:00:45	Filtering out weekends and holidays...
00:00:48	Re-grouping...
00:00:55	Fitting hour to timesteps...
00:00:56	Renaming...
00:00:56	Getting P(d | o, t)...
00:00:56	Getting P(o | d, t)...
00:00:57	Work complete !


In [16]:
print(nb_days)

59


In [15]:
transition_matrix['vol'].sum()

1787219.0412896478

<a id='export'></a>
# Exporting
* [Back to summary](#summary)

In [22]:
outpath = os.path.join(config['data_dir']['path'], config['data_dir']['transition_matrix_filename'])
print(datetime.datetime.now())
print('Exported to:')
print(out_path)
transition_matrix.to_csv(outpath, index=False)



In [3]:
# Reading example
import pandas as pd
transition_matrix = pd.read_csv('/Users/benoit/Desktop/Pro/210526-fusion/outdata/iris_Mod_unanon.csv')
transition_matrix.head()
transition_matrix.groupby('t').size()/len(census_gis)**2

t
0     0.282663
2     0.201714
5     0.455297
7     0.560053
8     0.565450
9     0.517302
10    0.623882
12    0.645123
14    0.652130
16    0.590460
17    0.603146
18    0.589427
19    0.546807
20    0.541272
22    0.484443
dtype: float64

<a id='vol_by_timestep'></a>
# Getting volumes by timestep for a typical day
* [Back to summary](#summary)

In [18]:
trips_by_hour = transition_matrix.groupby('t')['vol'].sum().reset_index()
trips_by_hour['vol'].sum()

<a id='export_vol'></a>
## Exporting `vol_by_timestep`
* [Back to summary](#summary)

In [20]:
trips_by_hour_path = os.path.join(config['outdata_dir']['path'], 
                                  config['outdata_dir']['trips_by_hour_filename'])


trips_by_hour.to_csv(trips_by_hour_path, index=False)


Unnamed: 0,t,vol
0,0,25930.507394
1,2,14636.049557
2,5,67479.22141
3,7,122980.319828
4,8,144446.695012
5,9,93518.942559
6,10,180189.779558
7,12,225904.491855
8,14,199975.603515
9,16,127340.717484


<a id='sanity'></a>
# Sanity checks
* [Back to summary](#summary)

## Re-computing trips_by_hour from raw data

In [21]:
trips_by_hour_verif = (odm.rename(columns={'w':'vol'})
                     .groupby(['date', 'hour'])['vol'].sum()
                     .reset_index()
                    )

trips_by_hour_verif['date'] = pd.to_datetime(trips_by_hour_verif['date'])
trips_by_hour_verif = (trips_by_hour_verif[trips_by_hour_verif['date'].dt.dayofweek<=5]
                     .groupby('hour').sum()
                     .reset_index()
                    )
trips_by_hour_verif['vol'] = trips_by_hour['vol']/nb_days  # not equivalent to using mean as agg function because 0-flows don't appear in the data

trips_by_hour_verif['t'] = timesteps[np.searchsorted(timesteps, trips_by_hour_verif['hour'], side='right')-1]
trips_by_hour_verif = trips_by_hour_verif.groupby('t')['vol'].sum().reset_index()
trips_by_hour_verif

Unnamed: 0,t,vol
0,0,687.568762
1,2,5676.376886
2,5,4639.130883
3,7,3828.889692
4,8,3389.417009
5,9,2158.317245
6,10,5065.853303
7,12,3622.130406
8,14,1224.163971
9,16,0.0
