In [1]:
import pandas as pd
import re
import numpy as np
from keplergl import KeplerGl

In [2]:
# Load the GTFS stops, reduce every stop_id to its numeric part and keep one row per stop.
stops = pd.read_csv('./data/gtfs3Sept/stops.txt')
stops = stops.drop(columns=['stop_code', 'stop_desc', 'zone_id', 'stop_url', 'location_type', 'parent_station'])
# Strip every non-digit character, then cast to int (which also discards leading zeros).
# astype(str) first so the cleaning still works if pandas parsed the column as integers.
stops['stop_id'] = stops['stop_id'].astype(str).str.replace('[^0-9]', '', regex=True).astype(int)
# Several raw ids can collapse onto the same number; keep the first row for each.
# drop_duplicates returns a new frame, so the result has to be assigned back to `stops`.
stops = stops.drop_duplicates(subset=['stop_id'], keep='first')
stops

Unnamed: 0,stop_id,stop_name,stop_lat,stop_lon
0,89,MONTGOMERY,50.838006,4.408970
1,470,SIMONIS,50.863666,4.329612
2,471,SIMONIS,50.863732,4.329236
3,472,SIMONIS,50.863543,4.329023
4,473,SIMONIS,50.863418,4.330031
...,...,...,...,...
2993,34,TOMBERG,50.844316,4.425819
2994,44,TRONE,50.840813,4.366354
2995,32,VANDERVELDE,50.847387,4.446705
2996,6,VEEWEYDE,50.829391,4.300578


- how should the columns be named?
- problem with the double terminus
- remove terminus stop?
- should we add the order?

In [3]:
# Load the delay records (without the CSV's leftover 'Unnamed: 0' column) and derive a
# 'Time' column from 'Hour': the part before the first ':' followed by ':00:00'.
delays = pd.read_csv('./final_filtered_delays.csv').drop(columns=['Unnamed: 0'])
delays['Time'] = delays['Hour'].map(lambda hour: hour.split(':')[0] + ':00:00')

In [4]:
# Read the stops of each line from CSV and preview the first five rows.
line_stops = pd.read_csv('./line_stops.csv')
line_stops.head(n=5)

Unnamed: 0,lineId,direction,stop_id,stop_id_int,name,name_ascii,lat,long,lambert_x,lambert_y,order
0,001m,1,8733,8733,Gare de l'Ouest,GARE DE L'OUEST,50.848999,4.320948,146633.5,170956.4,1
1,001m,1,8742,8742,Beekkant,BEEKKANT,50.853386,4.322974,146776.5,171444.3,2
2,001m,1,8292,8292,Étangs Noirs,ETANGS NOIRS,50.857125,4.333143,147492.7,171859.9,3
3,001m,1,8282,8282,Comte de Flandre,COMTE DE FLANDRE,50.854705,4.340542,148013.6,171590.4,4
4,001m,1,8272,8272,Sainte-Catherine,SAINTE-CATHERINE,50.8519,4.348012,148539.5,171278.2,5


In [5]:
# Keep only the columns needed downstream. .copy() makes this an independent frame, so the
# column assignments below do not raise SettingWithCopyWarning on a slice of the raw data.
line_stops = line_stops[['lineId', 'direction', 'name_ascii', 'stop_id_int', 'order', 'lat', 'long']].copy()
# lineId values look like '001m': the trailing letter is the type (m/b/t) ...
line_stops['type'] = line_stops['lineId'].str[-1]
# ... and everything before it is the numeric line id.
line_stops['lineId'] = line_stops['lineId'].str[:-1].astype(int)
# Drop the night buses (line ids of 100 and above).
line_stops = line_stops[line_stops['lineId'] < 100]

In [6]:
# Join on both the line id and the stop id, because one station can be served by several lines.
# Afterwards: drop the helper/unused columns, capitalise the column names that came from
# line_stops, and fix the dtypes.
visual_data = (
    delays
    .merge(
        line_stops[['lineId', 'direction', 'stop_id_int', 'order', 'type', 'name_ascii']],
        how='inner',
        left_on=['LineId', 'CurrentStop'],
        right_on=['lineId', 'stop_id_int'],
    )
    .drop(columns=['lineId', 'stop_id_int', 'Timestamp', 'TerminusStop'])
    .rename(columns={'direction': 'Direction', 'order': 'Order', 'type': 'Type'})
    .astype({'LineId': str, 'Delay': 'int', 'Direction': int, 'Order': int})
)

In [7]:
# Attach the stop name and coordinates to every delay record.
# Deduplicate on stop_id rather than on whole rows: rows of `stops` that share a stop_id but
# differ in another column would each match, duplicating the delay records of that stop.
# validate='many_to_one' makes pandas raise if the right-hand key is ever not unique.
visual_data = visual_data.merge(
    stops.drop_duplicates(subset=['stop_id'], keep='first'),
    how='inner',
    left_on='CurrentStop',
    right_on='stop_id',
    validate='many_to_one',
)

In [8]:
# TODO: should the 'Order' column be part of the grouping keys?
# Mean delay for each line / type / stop / direction / time slot / day combination.
grouped_visual_data = (
    visual_data
    .groupby(['LineId', 'Type', 'CurrentStop', 'Direction', 'Time', 'Day'], as_index=False)
    .agg({'Delay': 'mean'})
)

In [9]:
# Store the mean delay as an integer (the cast truncates the fractional part toward zero).
grouped_visual_data = grouped_visual_data.astype({'Delay': int})

In [10]:
# Attach the stop name and coordinates to the aggregated delays. The groupby result holds only
# the grouping keys and 'Delay', so the stop columns have to be joined again at this point.
# `stops` is reduced to one row per stop_id first; otherwise a stop_id that occurs on several
# rows would produce several output rows for the same group, one per matching row.
grouped_visual_data = (
    grouped_visual_data
    .merge(
        stops.drop_duplicates(subset=['stop_id'], keep='first'),
        how='inner',
        left_on='CurrentStop',
        right_on='stop_id',
        validate='many_to_one',
    )
    .drop(columns=['stop_id'])
)

In [11]:
# Remove exact duplicate rows, then store the identifier-like columns as strings.
# (Author's note: the cast is useless, kepler converts them.)
grouped_visual_data = (
    grouped_visual_data
    .drop_duplicates()
    .astype({'LineId': str, 'Direction': str, 'Day': str})
)



In [14]:
# Prefix LineId with 'line' so that the visualization treats it as a string, not a number.
# astype(str) makes the cell independent of an earlier cast, and an already present 'line'
# prefix is stripped first, so running the cell twice does not produce 'lineline<id>'.
grouped_visual_data['LineId'] = 'line' + (
    grouped_visual_data['LineId']
    .astype(str)
    .str.replace('^line', '', regex=True)
)
grouped_visual_data

Unnamed: 0,LineId,Type,CurrentStop,Direction,Time,Day,Delay,stop_name,stop_lat,stop_lon
0,line1,m,8011,2,05:00:00,0,-321,DE BROUCKERE,50.850095,4.352165
1,line1,m,8011,2,05:00:00,1,-411,DE BROUCKERE,50.850095,4.352165
2,line1,m,8011,2,06:00:00,0,-76,DE BROUCKERE,50.850095,4.352165
3,line1,m,8011,2,06:00:00,1,-237,DE BROUCKERE,50.850095,4.352165
4,line1,m,8011,2,07:00:00,0,-67,DE BROUCKERE,50.850095,4.352165
...,...,...,...,...,...,...,...,...,...,...
148911,line98,b,9686,2,16:00:00,1,198,DORENT,50.794407,4.302507
148912,line98,b,9686,2,17:00:00,0,-291,DORENT,50.794407,4.302507
148913,line98,b,9686,2,17:00:00,1,143,DORENT,50.794407,4.302507
148914,line98,b,9686,2,18:00:00,0,41,DORENT,50.794407,4.302507


In [None]:
# Write the aggregated delays to CSV. The index is written as the first, unnamed column
# (pandas' default, spelled out here).
grouped_visual_data.to_csv('./grouped_visual_data_ass2.csv', index=True)

In [None]:
# Diagnostic only: shows which (lineId, direction, stop_id_int, order) rows are repeats
# and would therefore be removed by a drop_duplicates on these columns.
stop_keys = line_stops[['lineId', 'direction', 'stop_id_int', 'order']]
mask = stop_keys.duplicated()   # True for every repeat of an earlier row
df_keep = stop_keys.loc[~mask]  # first occurrences, i.e. the rows that are kept
# Select the repeats directly instead of appending them to an empty frame:
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0.
df_droplog = stop_keys.loc[mask]

In [16]:
# Build a 600px-high kepler.gl widget with the delay records registered as dataset "data",
# then display it as the cell output.
# NOTE(review): this passes the raw `delays` frame rather than `grouped_visual_data`, which is
# the frame that carries stop_lat/stop_lon — confirm which one the map is meant to show.
map_1 = KeplerGl(data={"data": delays}, height=600)
map_1

User Guide: https://docs.kepler.gl/docs/keplergl-jupyter


KeplerGl(data={'data':              Timestamp  LineId  TerminusStop  CurrentStop      Date      Hour  \
0     …