In [2]:
import pandas as pd
import gtfs_kit
import folium
import seaborn

In [3]:
# load gtfs feed into pandas dataframes
feed = gtfs_kit.read_feed("gtfs_fp2023_2023-07-12_04-15.zip", dist_units="km")

In [13]:
# only consider parent stations when searching for destinations, because the platforms dont matter
# this makes it more or less useless for last kilometer transportation (which is fine, for now)
every_station = feed .stops.loc[feed.stops.stop_id.str.contains("Parent")]
every_station.loc[every_station.stop_name.str.contains("singen", case=False)]

Unnamed: 0,stop_id,stop_name,stop_lat,stop_lon,location_type,parent_station,stop_code
38360,Parent8014407,Norsingen,47.939249,7.726194,1.0,,
38403,Parent8014533,Geisingen,47.921343,8.653947,1.0,,
38408,Parent8014558,Singen (Hohentwiel),47.758437,8.840384,1.0,,
38461,Parent8029338,Mössingen,48.404309,9.046529,1.0,,
38463,Parent8029343,Bisingen,48.312769,8.921034,1.0,,
38471,Parent8029434,Trossingen Bahnhof,48.087662,8.585154,1.0,,
38485,Parent8063365,Singen (Htw)-Landesgartenschau,47.760738,8.827044,1.0,,
38496,Parent8099985,Singen (Htw) Industriegebiet,47.759451,8.87213,1.0,,
38557,Parent8500212,Oensingen,47.285036,7.709872,1.0,,
38783,Parent8502105,Othmarsingen,47.407433,8.214806,1.0,,


In [38]:
# this should be done over an selector from data above
# station = every_station.loc[every_station.stop_id.str.contains("Rotkreuz", case=False)]
station = every_station.loc[every_station.stop_id.str.contains("Parent8500212", case=False)]

manual_exclusions = ["IC21"]

In [39]:
station

Unnamed: 0,stop_id,stop_name,stop_lat,stop_lon,location_type,parent_station,stop_code
38557,Parent8500212,Oensingen,47.285036,7.709872,1.0,,


In [40]:
# get all platforms
platforms = feed.stops.loc[feed.stops["parent_station"] == station["stop_id"].values[0]]

In [41]:
all_stop_times = feed.stop_times.loc[feed.stop_times["stop_id"].isin(platforms["stop_id"].values)]

In [42]:
feed.trips.loc[feed.trips["trip_id"].isin(all_stop_times["trip_id"].unique())]

Unnamed: 0,route_id,service_id,trip_id,trip_headsign,trip_short_name,direction_id,block_id
105515,91-20-A-j23-1,TA+ak940,1007.TA.91-20-A-j23-1.104.R,Langendorf,7708,1,
105516,91-20-A-j23-1,TA+fu170,1008.TA.91-20-A-j23-1.105.R,Langendorf,7708,1,
105517,91-20-A-j23-1,TA+wg400,1009.TA.91-20-A-j23-1.105.R,Langendorf,7706,1,
105519,91-20-A-j23-1,TA+5f780,1010.TA.91-20-A-j23-1.106.R,Langendorf,7706,1,
105520,91-20-A-j23-1,TA+gop50,1011.TA.91-20-A-j23-1.332.R,Langendorf,7706,1,
...,...,...,...,...,...,...,...
361871,91-J-Y-j23-1,TA+d5,83.TA.91-J-Y-j23-1.5.R,Oensingen,419,1,1404
361872,91-J-Y-j23-1,TA+d5,84.TA.91-J-Y-j23-1.5.R,Oensingen,463,1,1387
361873,91-J-Y-j23-1,TA+d5,85.TA.91-J-Y-j23-1.5.R,Oensingen,467,1,1392
361874,91-J-Y-j23-1,TA+c9,86.TA.91-J-Y-j23-1.5.R,Oensingen,457,1,1383


In [43]:
route_ids = feed.trips.loc[feed.trips["trip_id"].isin(all_stop_times["trip_id"].unique())]["route_id"].unique()

# -Y are artificial routes
# excludes too many...
# route_ids = list(filter(lambda id: "-Y" not in id, all_routes))
# routes = feed.routes.loc[feed.routes["route_id"].isin(route_ids)]


all_routes = feed.routes.loc[feed.routes["route_id"].isin(route_ids)]

routes = all_routes.loc[all_routes["route_short_name"] != "EXT"]
routes
# there are broken trips and routes which claim to stop but actually dont

Unnamed: 0,route_id,agency_id,route_short_name,route_long_name,route_desc,route_type
134,91-20-A-j23-1,11,S20,,S,109
150,91-22-j23-1,11,S22,,S,109
388,91-5-A-j23-1,11,IC5,,IC,102
908,91-F-Y-j23-1,81,R,,R,106
913,91-J-Y-j23-1,81,R,,R,106


In [44]:
with_wrong_route = feed.trips.loc[feed.trips["trip_id"].isin(all_stop_times["trip_id"].unique())]
wrong_trips = with_wrong_route.loc[with_wrong_route["route_id"] == "91-21-D-j23-1"]
wrong_stops = feed.stop_times.loc[feed.stop_times["trip_id"].isin(wrong_trips["trip_id"].values)]
wrong_stops.loc[wrong_stops["stop_id"].isin(platforms["stop_id"].values)]
len(wrong_trips)
wrong_stops.loc[wrong_stops["stop_id"].isin(platforms["stop_id"].values)]

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,pickup_type,drop_off_type


In [45]:
route_trips = feed.trips.loc[feed.trips["route_id"].isin(route_ids)]

# all stops of the trips
relevant_stops = feed.stop_times.loc[
    feed.stop_times["trip_id"].isin(route_trips["trip_id"].values)
]

# parse arrival and departure to timedeltas
relevant_stops.loc[:, "arrival_time_parsed"] = pd.to_timedelta(
    relevant_stops["arrival_time"]
)
relevant_stops.loc[:, "departure_time_parsed"] = pd.to_timedelta(
    relevant_stops["departure_time"]
)

# limit arrival and departure to "normal" times 06-22
# pickup, dropoff type not 0 means no normal passenger transfer
filtered_stops = relevant_stops.loc[
    ((relevant_stops["pickup_type"] == 0) & (relevant_stops["drop_off_type"] == 0))
    & (
        (
            (relevant_stops["arrival_time_parsed"] > pd.Timedelta(6, unit="h"))
            & (relevant_stops["arrival_time_parsed"] < pd.Timedelta(22, unit="h"))
        )
        | (
            (relevant_stops["departure_time_parsed"] > pd.Timedelta(6, unit="h"))
            & (relevant_stops["departure_time_parsed"] < pd.Timedelta(22, unit="h"))
        )
    )
]

merged = pd.merge(route_trips, filtered_stops, on="trip_id")

# find longest trip, so the path generation does not get confused by direction or shorter trips
# might be problematic if the longest is a rare trip and not representative of the usual stops
longest_trips = merged.loc[merged.groupby(["route_id"])["stop_sequence"].idxmax()]
longest_trips_stop_times = feed.stop_times.loc[
    feed.stop_times["trip_id"].isin(longest_trips["trip_id"].values)
]

# find stops to display
longest_trips_parent_stops = feed.stops.loc[
    feed.stops.loc[:, "stop_id"].isin(longest_trips_stop_times["stop_id"].values)
]

# parent stops and stops with no parent
all_stops = pd.concat(
    [
        longest_trips_parent_stops.drop_duplicates(["parent_station"]),
        longest_trips_parent_stops[longest_trips_parent_stops["parent_station"].isna()],
    ]
)


# join all the data into one dataframe
stop_data = pd.merge(longest_trips_stop_times, route_trips, on="trip_id")
stop_data = pd.merge(stop_data, routes, on="route_id")
stop_data = pd.merge(stop_data, longest_trips_parent_stops, on="stop_id")
longest_trips_stop_times.loc[longest_trips_stop_times["trip_id"] == "258.TA.91-21-D-j23-1.76.H"]
relevant_stops.loc[relevant_stops["trip_id"] == "258.TA.91-21-D-j23-1.76.H"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  relevant_stops.loc[:, "arrival_time_parsed"] = pd.to_timedelta(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  relevant_stops.loc[:, "departure_time_parsed"] = pd.to_timedelta(


Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,pickup_type,drop_off_type,arrival_time_parsed,departure_time_parsed


In [46]:
stop_data.drop_duplicates(subset=["parent_station"], keep='first')

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,pickup_type,drop_off_type,route_id,service_id,trip_headsign,...,route_short_name,route_long_name,route_desc,route_type,stop_name,stop_lat,stop_lon,location_type,parent_station,stop_code
0,1173.TA.91-20-A-j23-1.128.R,05:35:00,05:35:00,8500218:0:1,1,0,0,91-20-A-j23-1,TA+0e770,Biel/Bienne,...,S20,,S,109,Olten,47.351934,7.907699,,Parent8500218,
1,1173.TA.91-20-A-j23-1.128.R,05:37:00,05:37:00,8500217:0:2,2,0,0,91-20-A-j23-1,TA+0e770,Biel/Bienne,...,S20,,S,109,Olten Hammer,47.348556,7.897727,,Parent8500217,
2,1173.TA.91-20-A-j23-1.128.R,05:40:00,05:40:00,8500216:0:4,3,0,0,91-20-A-j23-1,TA+0e770,Biel/Bienne,...,S20,,S,109,Wangen bei Olten,47.341404,7.868362,,Parent8500216,
3,1173.TA.91-20-A-j23-1.128.R,05:42:00,05:42:00,8500215:0:4,4,0,0,91-20-A-j23-1,TA+0e770,Biel/Bienne,...,S20,,S,109,Hägendorf,47.330269,7.84382,,Parent8500215,
4,1173.TA.91-20-A-j23-1.128.R,05:45:00,05:46:00,8500214:0:2,5,0,0,91-20-A-j23-1,TA+0e770,Biel/Bienne,...,S20,,S,109,Egerkingen,47.314863,7.798967,,Parent8500214,
5,1173.TA.91-20-A-j23-1.128.R,05:48:00,05:48:00,8500213:0:4,6,0,0,91-20-A-j23-1,TA+0e770,Biel/Bienne,...,S20,,S,109,Oberbuchsiten,47.305245,7.771721,,Parent8500213,
6,1173.TA.91-20-A-j23-1.128.R,05:52:00,05:52:00,8500212:0:4,7,0,0,91-20-A-j23-1,TA+0e770,Biel/Bienne,...,S20,,S,109,Oensingen,47.285036,7.709872,,Parent8500212,
7,1173.TA.91-20-A-j23-1.128.R,05:54:00,05:54:00,8500211:0:4,8,0,0,91-20-A-j23-1,TA+0e770,Biel/Bienne,...,S20,,S,109,Niederbipp,47.270403,7.694816,,Parent8500211,
8,1173.TA.91-20-A-j23-1.128.R,06:00:00,06:00:00,8500210:0:3,9,0,0,91-20-A-j23-1,TA+0e770,Biel/Bienne,...,S20,,S,109,Wangen an der Aare,47.231958,7.656233,,Parent8500210,
9,1173.TA.91-20-A-j23-1.128.R,06:02:00,06:02:00,8500209:0:1,10,0,0,91-20-A-j23-1,TA+0e770,Biel/Bienne,...,S20,,S,109,Deitingen,47.218225,7.618908,,Parent8500209,


In [47]:
map = folium.Map(tiles="cartodbpositron")

# own rough path generation, because there is no shapes data
grouped = stop_data.sort_values(["trip_id", "stop_sequence"]).groupby("trip_id")
colors = seaborn.color_palette("viridis", n_colors=grouped.ngroups).as_hex()
idx = 0
skipped_routes = []
path_layer = folium.FeatureGroup(name="Paths")
for name, group in grouped:
    destination_stop_in_route = False
    stop_coords = []
    tooltip_content = f"{group.iloc[0]['route_short_name']}"

    for row_index, row in group.iterrows():
        stop_coords.append(row[["stop_lat", "stop_lon"]].values)
        tooltip_content += f"<br/>{row['stop_name']}"

        # remove leftover markers or pre condition it
        if (row["parent_station"] == station["stop_id"]).bool():
            destination_stop_in_route = True

    # hide a few more routes
    if destination_stop_in_route:
        folium.PolyLine(stop_coords, tooltip=tooltip_content, color=colors[idx]).add_to(path_layer)
        idx += 1
    else:
        skipped_routes.append(name)

path_layer.add_to(map)

marker_layer = folium.FeatureGroup(name="Stops")
# drop duplicates kinda weird, why do some routes get all stops removed
shown_stops = stop_data.drop_duplicates(subset=["parent_station"], keep='first')
# remove routes from map if they arent present here??
shown_stops = shown_stops.loc[~shown_stops["trip_id"].isin(skipped_routes)]

for row in shown_stops.to_dict(orient="records"):
    folium.CircleMarker(
        location=[row["stop_lat"], row["stop_lon"]],
        popup=row["stop_name"],
        radius=3,
        fill=True,
        color=colors[0],
        weight=0.5,
        fillOpacity= 0.75,
    ).add_to(marker_layer)

marker_layer.add_to(map)


bounds = [
    (stop_data.stop_lat.min(), stop_data.stop_lon.min()),
    (stop_data.stop_lat.max(), stop_data.stop_lon.max()),
]
map.fit_bounds(bounds, padding=[1, 1])
map