In [1]:
import networkx as nx
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta, time
#from DynTM import DynTM

## TimeTable preprocessing

In [2]:
timetable = pd.read_csv('data/timetable.csv')

In [3]:
timetable.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29909 entries, 0 to 29908
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Unnamed: 0      29909 non-null  int64  
 1   line_label      29909 non-null  object 
 2   arrival_time    29909 non-null  object 
 3   tag             29909 non-null  float64
 4   line_id         29909 non-null  int64  
 5   order           29909 non-null  int64  
 6   travel_time     28672 non-null  float64
 7   stop_time       29909 non-null  float64
 8   station_id      29909 non-null  int64  
 9   latitude        29909 non-null  float64
 10  longitude       29909 non-null  float64
 11  TripNb          29909 non-null  int64  
 12  departure_time  29909 non-null  object 
 13  vehicle_type    29909 non-null  int64  
dtypes: float64(5), int64(6), object(3)
memory usage: 3.2+ MB


In [4]:
timetable.drop('Unnamed: 0', axis=1, inplace=True)
timetable.tag = timetable.tag.astype(int)
timetable.head()

Unnamed: 0,line_label,arrival_time,tag,line_id,order,travel_time,stop_time,station_id,latitude,longitude,TripNb,departure_time,vehicle_type
0,TGM,03:30:00,0,10,1,1046.0,361.0,72,36.800742,10.192478,1,03:36:01,1
1,TGM,03:53:27,0,10,2,114.0,15.0,73,36.814178,10.29245,1,03:53:42,1
2,TGM,03:55:36,0,10,3,68.0,16.0,74,36.818051,10.30194,1,03:55:52,1
3,TGM,03:57:00,0,10,4,85.0,17.0,75,36.819845,10.305601,1,03:57:17,1
4,TGM,03:58:42,0,10,5,54.0,16.0,76,36.824201,10.308773,1,03:58:58,1


In [5]:
timetable.drop(['tag','latitude','longitude','line_label','stop_time','travel_time','order'], axis=1, inplace=True)
timetable.head()

Unnamed: 0,arrival_time,line_id,station_id,TripNb,departure_time,vehicle_type
0,03:30:00,10,72,1,03:36:01,1
1,03:53:27,10,73,1,03:53:42,1
2,03:55:36,10,74,1,03:55:52,1
3,03:57:00,10,75,1,03:57:17,1
4,03:58:42,10,76,1,03:58:58,1


In [6]:
timetable['nextStop'] = timetable.groupby(['line_id','TripNb']).station_id.shift(-1)
timetable['nextStop_arrival_time']   = timetable.groupby(['line_id','TripNb']).arrival_time.shift(-1)
timetable['nextStop_deaprture_time'] = timetable.groupby(['line_id','TripNb']).departure_time.shift(-1)

In [7]:
timetable.rename({'station_id':'currentStop'}, axis=1, inplace=True)

In [8]:
timetable = timetable[['TripNb', 'line_id', 'vehicle_type', 'currentStop', 'nextStop', 'departure_time', 'nextStop_arrival_time', 'nextStop_deaprture_time']]
timetable.head()

Unnamed: 0,TripNb,line_id,vehicle_type,currentStop,nextStop,departure_time,nextStop_arrival_time,nextStop_deaprture_time
0,1,10,1,72,73.0,03:36:01,03:53:27,03:53:42
1,1,10,1,73,74.0,03:53:42,03:55:36,03:55:52
2,1,10,1,74,75.0,03:55:52,03:57:00,03:57:17
3,1,10,1,75,76.0,03:57:17,03:58:42,03:58:58
4,1,10,1,76,77.0,03:58:58,03:59:52,04:00:06


In [9]:
timetable = timetable.dropna()

In [10]:
timetable.nextStop = timetable.nextStop.astype(int)

In [11]:
timetable.isnull().sum()

TripNb                     0
line_id                    0
vehicle_type               0
currentStop                0
nextStop                   0
departure_time             0
nextStop_arrival_time      0
nextStop_deaprture_time    0
dtype: int64

In [12]:
timetable.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 27967 entries, 0 to 29907
Data columns (total 8 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   TripNb                   27967 non-null  int64 
 1   line_id                  27967 non-null  int64 
 2   vehicle_type             27967 non-null  int64 
 3   currentStop              27967 non-null  int64 
 4   nextStop                 27967 non-null  int32 
 5   departure_time           27967 non-null  object
 6   nextStop_arrival_time    27967 non-null  object
 7   nextStop_deaprture_time  27967 non-null  object
dtypes: int32(1), int64(4), object(3)
memory usage: 1.8+ MB


In [13]:
timetable.to_csv('data/preprocessed_timetable.csv', index = False)

### Distance Matrix preprocessing 

In [14]:
# Your method to calculate distance between two samples
def haversine_distance(x,y):
    R = 6378137
    #convert to raduis
    lat1  = x[0] * np.pi/180
    long1 = x[1] * np.pi/180
    lat2  = y[0] * np.pi/180
    long2 = y[1] * np.pi/180
    #calculate haversine distance
    delta_longitude = long1 - long2
    delta_latitude = lat1 - lat2
    a = (np.sin(delta_latitude/2)**2) + np.cos(lat1)*np.cos(lat2)*(np.sin(delta_longitude/2)**2)
    c = 2*np.arctan2(np.sqrt(a),np.sqrt(1-a))
    distance = R*c
    return distance

In [15]:
station_paths = pd.read_csv('data/stations_paths.csv')
stations_coord = station_paths[['station_id', 'latitude', 'longitude']].drop_duplicates().reset_index(drop=True).set_index('station_id')
stations_coord.to_csv('data/station_coord.csv')

In [16]:
stations_coord.shape[0]

297

In [17]:
dim = stations_coord.shape[0]
Dmatrix = np.ones((dim, dim))*np.inf
i,j = 0,0

for coord_station1 in stations_coord.values :
    j=0
    for coord_station2 in stations_coord.values :
        if j >= i :
            continue
        else:
            Dmatrix[i,j] = haversine_distance(coord_station1,coord_station2)
        j+=1
    i+=1

In [18]:
labels = stations_coord.index
labels

Int64Index([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,
            ...
            288, 289, 290, 291, 292, 293, 294, 295, 296, 297],
           dtype='int64', name='station_id', length=297)

In [19]:
labels = stations_coord.index
Dmatrix = pd.DataFrame (Dmatrix, columns = labels, index = labels)
Dmatrix.head()

station_id,1,2,3,4,5,6,7,8,9,10,...,288,289,290,291,292,293,294,295,296,297
station_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,inf,inf,inf,inf,inf,inf,inf,inf,inf,inf,...,inf,inf,inf,inf,inf,inf,inf,inf,inf,inf
2,345.76377,inf,inf,inf,inf,inf,inf,inf,inf,inf,...,inf,inf,inf,inf,inf,inf,inf,inf,inf,inf
3,1114.787363,818.126512,inf,inf,inf,inf,inf,inf,inf,inf,...,inf,inf,inf,inf,inf,inf,inf,inf,inf,inf
4,1494.572218,1231.048492,439.803597,inf,inf,inf,inf,inf,inf,inf,...,inf,inf,inf,inf,inf,inf,inf,inf,inf,inf
5,2128.106798,1873.553926,1074.754223,642.811691,inf,inf,inf,inf,inf,inf,...,inf,inf,inf,inf,inf,inf,inf,inf,inf,inf


In [20]:
Dmatrix.to_csv('data/Dmatrix.csv', index = True)