In [2]:
import pandas as pd
import re
import numpy as np
from keplergl import KeplerGl

# Data prep

In [2]:
# Cleaning the stops from letters and zeros
stops = pd.read_csv('../data/raw/gtfs/stops.txt')
stops = stops.drop(columns=['stop_code', 'stop_desc','zone_id','stop_url', 'location_type', 'parent_station'])
stops = stops # remove the .copy() if you want the change to be made in stops, or do it directly there
stops['stop_id'] = stops['stop_id'].map(lambda x: int(re.sub('[^0-9]', '', x))) # clean the letters and zeros
stops = stops.drop_duplicates(subset=['stop_id'], keep='first') # remove the duplicates
len(stops)

2842

In [3]:
line_stops = pd.read_csv('../data/processed/assignment1/line_stops.csv')
# Extracting the information we need from line_stops
line_stops = line_stops[['lineId','direction', 'name_ascii', 'stop_id_int','order', 'lat', 'long']]
line_stops['type'] = line_stops['lineId'].apply(lambda x: str(x[-1])) # getting the type m/b/t 
line_stops['lineId'] = line_stops['lineId'].apply(lambda x: int(x[:-1])) # lineId number
line_stops = line_stops.drop(line_stops[line_stops.lineId >= 100].index) # dropping the night buses
line_stops = line_stops.drop_duplicates(subset=['lineId', 'direction', 'stop_id_int','order'])
len(line_stops)

3596

# Speed - Assignment 1

#TODO what to do with inf speed?

In [4]:
vehicle_speed_path = '../data/processed/assignment1/vehicleSpeedReformatted.csv'
vehicle_speed = pd.read_csv(vehicle_speed_path)
vehicle_speed.head(1)

Unnamed: 0,LineId,FromStop,ToStop,Day,LineIdFormatted,LineId_GeoMerge,Type,Direction,fromIndex,toIndex,FromStop_lat,FromStop_lon,ToStop_lat,ToStop_lon,geojson,hour,speed
0,37,2957,5810,,line37,037b-1,b,1.0,49.0,57.0,50.821413,4.341859,50.818572,4.340952,"{'type': 'FeatureCollection', 'features': [{'t...",00:00,14.936818


In [5]:
# vehicle_speed = vehicle_speed[['LineId', 'Type', 'FromStop', 'ToStop', 'Direction', 'hour', 'speed', 'geojson']]
vehicle_speed = vehicle_speed[['LineId', 'FromStop', 'ToStop', 'Direction', 'hour', 'speed', 'geojson','FromStop_lat','FromStop_lon','ToStop_lat','ToStop_lon']]

# Remove the inf values - we drop like 20k values
vehicle_speed = vehicle_speed.replace([np.inf], np.nan)
vehicle_speed = vehicle_speed.dropna() 

vehicle_speed = vehicle_speed.astype({'Direction': int})
vehicle_speed.head(2)

Unnamed: 0,LineId,FromStop,ToStop,Direction,hour,speed,geojson,FromStop_lat,FromStop_lon,ToStop_lat,ToStop_lon
0,37,2957,5810,1,00:00,14.936818,"{'type': 'FeatureCollection', 'features': [{'t...",50.821413,4.341859,50.818572,4.340952
5,37,2957,5810,1,05:00,2.299439,"{'type': 'FeatureCollection', 'features': [{'t...",50.821413,4.341859,50.818572,4.340952


# Delays - Assignment 2

- how should the names column names be?
- problem with the double terminus
- remove terminus stop?
- should we add the order?

In [6]:
# Using the delays and creating the Time coulmn from the Hour column: 7 -> '07:00:00'
delays_path = '../data/processed/assignment2/final_filtered_delays.csv'

vehicle_delays = pd.read_csv(delays_path)
vehicle_delays.drop(columns=['Unnamed: 0'], inplace=True)
vehicle_delays['Time'] = vehicle_delays['Hour'].apply(lambda x: str(x.split(':')[0]+':00'))

# vehicle_delays = vehicle_delays[['LineId', 'CurrentStop', 'Date', 'Time', 'Delay', 'Day', 'Hour']]
vehicle_delays.head(3)

Unnamed: 0,Timestamp,LineId,TerminusStop,CurrentStop,Date,Hour,Day,Delay,ExpectedArrivalTime,HourInterval,Time
0,1630914886924,12,1780,9600,20210906,07:54:46,0,286,07:50:00,7,07:00
1,1630914886924,12,1780,2250,20210906,07:54:46,0,-74,07:56:00,7,07:00
2,1630914978881,12,9600,1780,20210906,07:56:18,0,78,07:55:00,7,07:00


In [7]:
# joining on lineId and stopid because we can have one station on different lines 
vehicle_delays = vehicle_delays.merge(line_stops, how='inner', left_on=['LineId','CurrentStop'], right_on=['lineId','stop_id_int'] )

In [10]:
vehicle_delays = vehicle_delays[['LineId', 'CurrentStop', 'Date','Hour', 'Time', 'Delay','direction', 'Day', 'type', 'name_ascii']]

In [11]:
# cleaninug up the visual_data
vehicle_delays = vehicle_delays.drop_duplicates(subset=['LineId', 'CurrentStop', 'direction', 'Time','Day', 'type'])
# visual_data.rename(columns={'direction': 'Direction', 'order': 'Order', 'type': 'Type'}, inplace=True)
# visual_data = visual_data.astype({'LineId': str, 'Delay': 'int', 'Direction': int, 'Order': int})

In [12]:
#TODO Should we add order column?
#grouping by to get the result
grouped_delays = vehicle_delays.groupby(['LineId', 'type', 'CurrentStop', 'direction',  'Day', 'Time'], as_index=False)['Delay'].mean()
grouped_delays.head()

Unnamed: 0,LineId,type,CurrentStop,direction,Day,Time,Delay
0,1,m,8011,2,0,05:00,-527.0
1,1,m,8011,2,0,06:00,66.0
2,1,m,8011,2,0,07:00,-47.0
3,1,m,8011,2,0,08:00,-96.0
4,1,m,8011,2,0,09:00,-22.0


# The merge

In [13]:
# Delays
grouped_delays.head(1)
print(len(grouped_delays))

133839


In [14]:
# Speed
vehicle_speed.head(1)
print(len(vehicle_speed))

54366


In [15]:
# Merging on FromStop

In [16]:
visualization_data = grouped_delays.merge(vehicle_speed, how='inner', left_on=['LineId', 'CurrentStop', 'direction', 'Time'], right_on=['LineId','FromStop','Direction','hour'])
visualization_data = visualization_data.drop(columns=['direction'])

In [17]:
visualization_data.head(1)
# print(len(visualization_data))

Unnamed: 0,LineId,type,CurrentStop,Day,Time,Delay,FromStop,ToStop,Direction,hour,speed,geojson,FromStop_lat,FromStop_lon,ToStop_lat,ToStop_lon
0,1,m,8011,0,05:00,-527.0,8011,8271,2,05:00,11.219475,"{'type': 'FeatureCollection', 'features': [{'t...",50.850095,4.352165,50.852347,4.347749


In [18]:
# TODO add the Stop Name

Unnamed: 0,name_ascii,stop_id_int
0,GARE DE L'OUEST,8733
1,BEEKKANT,8742
2,ETANGS NOIRS,8292
3,COMTE DE FLANDRE,8282
4,SAINTE-CATHERINE,8272
...,...,...
3591,DORENT,9686
3592,PATHE,5964
3593,MOZART,3815
3594,BOLLINCKX,3851


In [19]:
# adding the 'line'+id to the LineId column so that we use it as string in the visualization
visualization_data['LineId'] = visualization_data['LineId'].apply(lambda x: 'line'+str(x)) 

In [32]:
visualization_data_stops = visualization_data.merge(line_stops[['name_ascii','stop_id_int']], how='inner', left_on='CurrentStop', right_on='stop_id_int').drop(columns=['stop_id_int'])

In [37]:
visualization_data = visualization_data_stops.drop_duplicates()

In [40]:
visualization_data.head(3)

Unnamed: 0,LineId,type,CurrentStop,Day,Time,Delay,FromStop,ToStop,Direction,hour,speed,geojson,FromStop_lat,FromStop_lon,ToStop_lat,ToStop_lon,name_ascii
0,line1,m,8011,0,05:00,-527.0,8011,8271,2,05:00,11.219475,"{'type': 'FeatureCollection', 'features': [{'t...",50.850095,4.352165,50.852347,4.347749,DE BROUCKERE
2,line1,m,8011,1,05:00,-424.0,8011,8271,2,05:00,11.219475,"{'type': 'FeatureCollection', 'features': [{'t...",50.850095,4.352165,50.852347,4.347749,DE BROUCKERE
4,line1,m,8011,0,06:00,66.0,8011,8271,2,06:00,11.55402,"{'type': 'FeatureCollection', 'features': [{'t...",50.850095,4.352165,50.852347,4.347749,DE BROUCKERE


In [3]:
visualization_data.to_csv('../data/processed/assignment2/visualization_data.csv')


NameError: name 'visualization_data' is not defined

# Visuals

In [4]:
vd = pd.read_csv('../data/processed/assignment2/visualization_data.csv')

In [6]:
vd1 = vd.head(1000)

In [None]:
# code for checking duplicates, just used for checking what we are dropping.
df_droplog = pd.DataFrame()
mask = line_stops[['lineId','direction', 'stop_id_int','order']].duplicated()
df_keep = line_stops[['lineId','direction', 'stop_id_int','order']].loc[~mask]
df_droplog = df_droplog.append(line_stops[['lineId','direction', 'stop_id_int','order']].loc[mask])

In [18]:
vd1['geojson'][0]

"{'type': 'FeatureCollection', 'features': [{'type': 'Feature', 'properties': {}, 'geometry': {'type': 'LineString', 'coordinates': [[50.84187285032085, 4.4645410055363985], [50.843270588077445, 4.463606785475749], [50.84351703479166, 4.463442585074531], [50.84375538724015, 4.463282626095395], [50.844054901499106, 4.463081613381056], [50.84409897425753, 4.463051885483516], [50.84411876383088, 4.463036307046374], [50.84418982464806, 4.462982495971582], [50.844271686632965, 4.462911668194964], [50.84432656627108, 4.462857824128111], [50.844349059555086, 4.4628337318032685], [50.84439764535225, 4.462781294741169], [50.8444606315313, 4.462707588564265], [50.844541616144696, 4.462609780616862], [50.84463880008582, 4.462489286921104], [50.84525879322191, 4.461722371197469], [50.84532538132157, 4.461640150064926], [50.84545045660804, 4.461488470075966], [50.84551434090701, 4.4614147623012075], [50.84555842904107, 4.461365153347819], [50.84559351861357, 4.461326885660318], [50.84563670437308, 

In [7]:
map_1 = KeplerGl(height=600, data={"data": vd1})
# map_1

User Guide: https://docs.kepler.gl/docs/keplergl-jupyter


In [8]:
map_1

KeplerGl(data={'data':      Unnamed: 0 LineId type  CurrentStop  Day   Time  Delay  FromStop  ToStop  \
0     …