# Config


## Imports


In [3]:
import pandas as pd
import numpy as np
import networkx as nx

from pyvis.network import Network

## Constants


In [4]:
# Directory prefixes, relative to the notebook's working directory. Both end
# with "/" because later cells build file paths by plain string concatenation.
DATASETS_PATH = "../data/"  # prefix for the input files read below
GRAPHS_PATH = "../graphs/"  # prefix for the HTML graphs written by pyvis

# Data visualisation


In [5]:
# Load the tab-separated timetable file; each row is one trip with its
# `trajet` label and `duree` value.
timetables = pd.read_csv(f"{DATASETS_PATH}timetables.csv", delimiter="\t")
timetables

Unnamed: 0,trip_id,trajet,duree
0,OCESN003100F140147152,Gare de Le Havre - Gare de Paris-St-Lazare,138
1,OCESN003190F040047309,Gare de Dieppe - Gare de Paris-St-Lazare,145
2,OCESN003198F030037315,Gare de Paris-St-Lazare - Gare de Rouen-Rive-D...,97
3,OCESN003300F030037323,Gare de Cherbourg - Gare de Paris-St-Lazare,194
4,OCESN003313F380387526,Gare de Caen - Gare de Paris-St-Lazare,149
...,...,...,...
1570,OCESN895822F0500552575,Gare de Belfort-Ville - Gare de Lyon-Perrache,244
1571,OCESN895830F0200252600,Gare de Lons-le-Saunier - Gare de Lyon-Perrache,103
1572,OCESN895880F0500552634,Gare de Belfort-Ville - Gare de Lons-le-Saunier,144
1573,OCESN895940F0200252654,Gare de Besançon-Viotte - Gare de Lons-le-Saunier,89


## Basic graph


In [6]:
# Split each "A - B" trajet label once, then take the first piece as the
# source and the second piece as the target.
# NOTE: a station name that itself contains " - " is cut at its first
# occurrence, so only the first two pieces are ever used.
trajet_parts = timetables['trajet'].map(lambda trajet: trajet.split(" - "))
timetables['source'] = trajet_parts.map(lambda parts: parts[0])
timetables['target'] = trajet_parts.map(lambda parts: parts[1])

In [7]:
# Undirected graph with one edge per (source, target) pair of the timetable,
# carrying the trip's `duree` as an edge attribute.
graph = nx.from_pandas_edgelist(
    timetables,
    source="source",
    target="target",
    edge_attr="duree",
)

In [8]:
# Render the basic graph to a standalone HTML page with pyvis.
net = Network(height="100vh")
net.from_nx(graph)
basic_graph_file = GRAPHS_PATH + "basic_graph.html"
net.show(basic_graph_file)

## Merge datasets


In [9]:
# Load the stop catalogue and the per-trip stop times (comma-separated files).
stops_df = pd.read_csv(f"{DATASETS_PATH}stops.txt")
stop_times_df = pd.read_csv(f"{DATASETS_PATH}stop_times.txt")

In [10]:
# Preview the first rows of the stop catalogue (stop_id -> stop_name, coordinates).
stops_df.head()

Unnamed: 0,stop_id,stop_name,stop_desc,stop_lat,stop_lon,zone_id,stop_url,location_type,parent_station
0,StopArea:OCE87381509,Gare de Mantes-la-Jolie,,48.989687,1.703294,,,1,
1,StopArea:OCE87415604,Gare de Vernon-Giverny,,49.091286,1.478363,,,1,
2,StopArea:OCE87415620,Gare de Gaillon-Aubevoye,,49.174632,1.352518,,,1,
3,StopArea:OCE87415877,Gare de Val-de-Reuil,,49.275399,1.224609,,,1,
4,StopArea:OCE87411207,Gare de Oissel,,49.343042,1.101821,,,1,


In [11]:
# Preview the first rows of the stop times (one row per stop of each trip).
stop_times_df.head()

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,stop_headsign,pickup_type,drop_off_type,shape_dist_traveled
0,OCESN037071R0100119847,23:05:00,23:05:00,StopPoint:OCECar TER-87381509,0,,0,0,
1,OCESN037071R0100119847,23:35:00,23:35:00,StopPoint:OCECar TER-87415604,1,,0,0,
2,OCESN037071R0100119847,23:55:00,23:55:00,StopPoint:OCECar TER-87415620,2,,0,0,
3,OCESN037071R0100119847,24:25:00,24:25:00,StopPoint:OCECar TER-87415877,3,,0,0,
4,OCESN037071R0100119847,24:45:00,24:45:00,StopPoint:OCECar TER-87411207,4,,0,0,


In [12]:
# Keep the trip id, departure time and stop id of every stop time, attach the
# stop's name by looking `stop_id` up in stops_df, then order the rows by stop
# name and departure time.
train_station_df = (
    stop_times_df[['trip_id', 'departure_time', 'stop_id']]
    .join(stops_df.set_index('stop_id')['stop_name'], on='stop_id')
    .sort_values(['stop_name', 'departure_time'])
)
train_station_df

Unnamed: 0,trip_id,departure_time,stop_id,stop_name
43888,OCESN035302R0200219380,05:55:00,StopPoint:OCECar TER-87191403,Aboncourt
43910,OCESN035310R0200219391,06:44:00,StopPoint:OCECar TER-87191403,Aboncourt
43687,OCESN035277R0200219357,07:20:00,StopPoint:OCECar TER-87191403,Aboncourt
43987,OCESN035304R0200219383,07:58:00,StopPoint:OCECar TER-87191403,Aboncourt
44049,OCESN035304R0300319382,07:58:00,StopPoint:OCECar TER-87191403,Aboncourt
...,...,...,...,...
36956,OCESN031254R0300318525,13:06:00,StopPoint:OCECar TER-87110197,vely
55074,OCESN052632R0200220780,07:25:00,StopPoint:OCECar TER-00,
88440,OCESN406404R0100133525,13:08:00,StopPoint:OCECar TER-87407528,
88429,OCESN406401R0100133522,13:25:00,StopPoint:OCECar TER-87407528,


In [101]:
# Expand every timetable row into one row per stop of that trip (left join on
# trip_id), order the stops of each trip by departure time, and renumber the
# rows from 0.
extended_timetables = (
    timetables
    .join(train_station_df.set_index('trip_id'), on='trip_id')
    .sort_values(["trip_id", "departure_time"])
    .reset_index(drop=True)
)
extended_timetables

Unnamed: 0,trip_id,trajet,duree,source,target,departure_time,stop_id,stop_name
0,OCESN003100F140147152,Gare de Le Havre - Gare de Paris-St-Lazare,138,Gare de Le Havre,Gare de Paris-St-Lazare,05:20:00,StopPoint:OCETrain TER-87413013,Gare de Le Havre
1,OCESN003100F140147152,Gare de Le Havre - Gare de Paris-St-Lazare,138,Gare de Le Havre,Gare de Paris-St-Lazare,05:36:00,StopPoint:OCETrain TER-87413344,Gare de Bréauté-Beuzeville
2,OCESN003100F140147152,Gare de Le Havre - Gare de Paris-St-Lazare,138,Gare de Le Havre,Gare de Paris-St-Lazare,05:50:00,StopPoint:OCETrain TER-87413385,Gare de Yvetot
3,OCESN003100F140147152,Gare de Le Havre - Gare de Paris-St-Lazare,138,Gare de Le Havre,Gare de Paris-St-Lazare,06:15:00,StopPoint:OCETrain TER-87411017,Gare de Rouen-Rive-Droite
4,OCESN003100F140147152,Gare de Le Havre - Gare de Paris-St-Lazare,138,Gare de Le Havre,Gare de Paris-St-Lazare,07:38:00,StopPoint:OCETrain TER-87384008,Gare de Paris-St-Lazare
...,...,...,...,...,...,...,...,...
13045,OCESN895942F0900952655,Gare de Bourg-en-Bresse - Gare de Mouchard,81,Gare de Bourg-en-Bresse,Gare de Mouchard,13:55:00,StopPoint:OCETrain TER-87718197,Gare de St-Lothain
13046,OCESN895942F0900952655,Gare de Bourg-en-Bresse - Gare de Mouchard,81,Gare de Bourg-en-Bresse,Gare de Mouchard,14:03:00,StopPoint:OCETrain TER-87718213,Gare de Domblans-Voiteur
13047,OCESN895942F0900952655,Gare de Bourg-en-Bresse - Gare de Mouchard,81,Gare de Bourg-en-Bresse,Gare de Mouchard,14:13:00,StopPoint:OCETrain TER-87718239,Gare de Lons-le-Saunier
13048,OCESN895942F0900952655,Gare de Bourg-en-Bresse - Gare de Mouchard,81,Gare de Bourg-en-Bresse,Gare de Mouchard,14:34:00,StopPoint:OCETrain TER-87718296,Gare de St-Amour


# Preprocess extended timetables

## Remove NaN values

In [102]:
# Count rows per missing-value pattern: each distinct combination of
# NaN / non-NaN flags across the columns gets one line in the result.
extended_timetables.isna().value_counts()

trip_id  trajet  duree  source  target  departure_time  stop_id  stop_name
False    False   False  False   False   False           False    False        13048
                                                                 True             2
dtype: int64

In [103]:
# Trip ids of the rows whose stop name is missing (the Series keeps the
# original row labels), followed by a display of exactly those rows.
missing_stop_name = extended_timetables['stop_name'].isna()
nan_trip_ids = extended_timetables.loc[missing_stop_name, 'trip_id']
extended_timetables.loc[nan_trip_ids.index]

Unnamed: 0,trip_id,trajet,duree,source,target,departure_time,stop_id,stop_name
4965,OCESN052632R0200220780,Bourges-Gare-Routière - nan,65,Bourges-Gare-Routière,,07:25:00,StopPoint:OCECar TER-00,
6702,OCESN406400R0200233521,Gare de Clermont-Ferrand - Gare de Moulins-sur...,165,Gare de Clermont-Ferrand,Gare de Moulins-sur-Allier,15:08:00,StopPoint:OCECar TER-87407528,


In [104]:
# Show every stop of the trips that have at least one missing stop name, to
# see where in the trip the unnamed stop falls.
extended_timetables.loc[extended_timetables['trip_id'].isin(nan_trip_ids)]

Unnamed: 0,trip_id,trajet,duree,source,target,departure_time,stop_id,stop_name
4965,OCESN052632R0200220780,Bourges-Gare-Routière - nan,65,Bourges-Gare-Routière,,07:25:00,StopPoint:OCECar TER-00,
4966,OCESN052632R0200220780,Bourges-Gare-Routière - nan,65,Bourges-Gare-Routière,,07:45:00,StopPoint:OCECar TER-87576298,Gare de Châteauneuf-sur-Cher
4967,OCESN052632R0200220780,Bourges-Gare-Routière - nan,65,Bourges-Gare-Routière,,08:25:00,StopPoint:OCECar TER-87576207,Gare de Bourges
4968,OCESN052632R0200220780,Bourges-Gare-Routière - nan,65,Bourges-Gare-Routière,,08:30:00,StopPoint:OCECar TER-87454249,Bourges-Gare-Routière
6697,OCESN406400R0200233521,Gare de Clermont-Ferrand - Gare de Moulins-sur...,165,Gare de Clermont-Ferrand,Gare de Moulins-sur-Allier,12:42:00,StopPoint:OCECar TER-87734004,Gare de Clermont-Ferrand
6698,OCESN406400R0200233521,Gare de Clermont-Ferrand - Gare de Moulins-sur...,165,Gare de Clermont-Ferrand,Gare de Moulins-sur-Allier,13:07:00,StopPoint:OCECar TER-87734053,Gare de Riom-Châtel-Guyon
6699,OCESN406400R0200233521,Gare de Clermont-Ferrand - Gare de Moulins-sur...,165,Gare de Clermont-Ferrand,Gare de Moulins-sur-Allier,14:07:00,StopPoint:OCECar TER-87732008,Gare de Vichy
6700,OCESN406400R0200233521,Gare de Clermont-Ferrand - Gare de Moulins-sur...,165,Gare de Clermont-Ferrand,Gare de Moulins-sur-Allier,14:32:00,StopPoint:OCECar TER-87732206,Gare de St-Germain-des-Fossés
6701,OCESN406400R0200233521,Gare de Clermont-Ferrand - Gare de Moulins-sur...,165,Gare de Clermont-Ferrand,Gare de Moulins-sur-Allier,14:52:00,StopPoint:OCECar TER-87696351,Gare de Varennes-sur-Allier
6702,OCESN406400R0200233521,Gare de Clermont-Ferrand - Gare de Moulins-sur...,165,Gare de Clermont-Ferrand,Gare de Moulins-sur-Allier,15:08:00,StopPoint:OCECar TER-87407528,


In [105]:
# Remove, in place, every row that has a missing value in any column. Only the
# affected rows are dropped; the other stops of the same trip are kept.
extended_timetables.dropna(axis="index", how="any", inplace=True)

## Adjust time delta, sources and targets

In [111]:
# Distinct trip ids, in order of first appearance in the table.
trip_ids = pd.unique(extended_timetables['trip_id'])

In [121]:
# Turn the per-stop table into a per-leg table: each remaining row describes
# the hop from the previous stop of the same trip (`source`) to the current
# stop (`target`), with the previous stop's departure time and the number of
# minutes between the two departures (`duree`).
#
# Everything is computed with index-aligned groupby operations, so the result
# does not depend on the rows of a trip being contiguous, nor on a `trip_ids`
# list built in another cell.
cp = extended_timetables.copy()

# Departure time expressed as minutes since 00:00 ("HH:MM:SS" -> float).
cp['duree'] = pd.to_timedelta(cp['departure_time']).dt.total_seconds() / 60

# Compute every derived column before overwriting anything they read from.
by_trip = cp.groupby('trip_id', sort=False)
leg_minutes = by_trip['duree'].diff()                 # NaN on a trip's first stop
previous_departure = by_trip['departure_time'].shift()
previous_stop = by_trip['stop_name'].shift()
is_first_stop = by_trip.cumcount() == 0

cp['duree'] = leg_minutes
cp['departure_time'] = previous_departure
cp['source'] = previous_stop
# The first stop of a trip is nobody's target: blank it so the row is dropped.
cp['target'] = cp['stop_name'].mask(is_first_stop)

# Drop the first-stop rows (they have no leg), discard the per-stop columns
# and store the leg duration as whole minutes.
cp = (
    cp.dropna()
    .drop(columns=['stop_id', 'stop_name'])
    .assign(duree=lambda legs: legs['duree'].astype(int))
)
cp

Unnamed: 0,trip_id,trajet,duree,source,target,departure_time
1,OCESN003100F140147152,Gare de Le Havre - Gare de Paris-St-Lazare,16,Gare de Le Havre,Gare de Bréauté-Beuzeville,05:20:00
2,OCESN003100F140147152,Gare de Le Havre - Gare de Paris-St-Lazare,14,Gare de Bréauté-Beuzeville,Gare de Yvetot,05:36:00
3,OCESN003100F140147152,Gare de Le Havre - Gare de Paris-St-Lazare,25,Gare de Yvetot,Gare de Rouen-Rive-Droite,05:50:00
4,OCESN003100F140147152,Gare de Le Havre - Gare de Paris-St-Lazare,83,Gare de Rouen-Rive-Droite,Gare de Paris-St-Lazare,06:15:00
6,OCESN003190F040047309,Gare de Dieppe - Gare de Paris-St-Lazare,19,Gare de Dieppe,Gare de Auffay,18:25:00
...,...,...,...,...,...,...
13045,OCESN895942F0900952655,Gare de Bourg-en-Bresse - Gare de Mouchard,5,Gare de Poligny,Gare de St-Lothain,13:50:00
13046,OCESN895942F0900952655,Gare de Bourg-en-Bresse - Gare de Mouchard,8,Gare de St-Lothain,Gare de Domblans-Voiteur,13:55:00
13047,OCESN895942F0900952655,Gare de Bourg-en-Bresse - Gare de Mouchard,10,Gare de Domblans-Voiteur,Gare de Lons-le-Saunier,14:03:00
13048,OCESN895942F0900952655,Gare de Bourg-en-Bresse - Gare de Mouchard,21,Gare de Lons-le-Saunier,Gare de St-Amour,14:13:00


## Plot advanced graph

In [126]:
# Directed graph of stop-to-stop legs, rendered to HTML with pyvis.
# A DiGraph holds a single edge per (source, target) pair, so when several
# trips share a leg only one `duree` value is kept for it.
graph = nx.from_pandas_edgelist(
    cp,
    source="source",
    target="target",
    edge_attr="duree",
    create_using=nx.DiGraph,
)
net = Network(height="100vh")
net.from_nx(graph)
advanced_graph_file = GRAPHS_PATH + "advanced_graph.html"
net.show(advanced_graph_file)

In [153]:
# List every directed edge as (source, target, duree).
graph.edges.data('duree')

OutEdgeDataView([('Gare de Le Havre', 'Gare de Bréauté-Beuzeville', 16), ('Gare de Le Havre', 'Gare de Le Havre-Graville', 5), ('Gare de Le Havre', 'Gare de Etainhus-St-Romain', 10), ('Gare de Bréauté-Beuzeville', 'Gare de Yvetot', 14), ('Gare de Bréauté-Beuzeville', 'Gare de Etainhus-St-Romain', 6), ('Gare de Bréauté-Beuzeville', 'Gare de Foucart-Alvimare', 8), ('Gare de Yvetot', 'Gare de Rouen-Rive-Droite', 25), ('Gare de Yvetot', 'Gare de Pavilly', 12), ('Gare de Yvetot', 'Gare de Motteville', 6), ('Gare de Rouen-Rive-Droite', 'Gare de Paris-St-Lazare', 95), ('Gare de Rouen-Rive-Droite', 'Gare de Oissel', 10), ('Gare de Rouen-Rive-Droite', 'Gare de Morgny', 39), ('Gare de Rouen-Rive-Droite', 'Gare de Sotteville', 6), ('Gare de Paris-St-Lazare', 'Gare de Bernay', 106), ('Gare de Paris-St-Lazare', 'Gare de Evreux - Normandie', 68), ('Gare de Paris-St-Lazare', 'Gare de Mantes-la-Jolie', 38), ('Gare de Dieppe', 'Gare de Auffay', 19), ('Gare de Dieppe', 'Gare de St-Aubin-sur-Scie', 6), (

In [149]:
# Legs that depart from a stop whose name contains "Sewen".
cp[cp['source'].str.contains('Sewen')]

Unnamed: 0,trip_id,trajet,duree,source,target,departure_time
1655,OCESN030283R0300318235,Gare de Cernay (Haut-Rhin) - Sewen-(Eglise),1,Sewen-(Eglise),Sewen-(11-Gd-Rue),07:04:00
1656,OCESN030283R0300318235,Gare de Cernay (Haut-Rhin) - Sewen-(Eglise),3,Sewen-(11-Gd-Rue),Dolleren-(Mairie),07:05:00
1680,OCESN030283R0400418236,Cernay-(Lycée) - Sewen-(Eglise),1,Sewen-(Eglise),Sewen-(11-Gd-Rue),07:04:00
1681,OCESN030283R0400418236,Cernay-(Lycée) - Sewen-(Eglise),3,Sewen-(11-Gd-Rue),Dolleren-(Mairie),07:05:00


In [150]:
# Legs that arrive at a stop whose name contains "Sewen".
cp[cp['target'].str.contains('Sewen')]

Unnamed: 0,trip_id,trajet,duree,source,target,departure_time
1655,OCESN030283R0300318235,Gare de Cernay (Haut-Rhin) - Sewen-(Eglise),1,Sewen-(Eglise),Sewen-(11-Gd-Rue),07:04:00
1680,OCESN030283R0400418236,Cernay-(Lycée) - Sewen-(Eglise),1,Sewen-(Eglise),Sewen-(11-Gd-Rue),07:04:00
