In [1]:
import requests
import json
import os
from dotenv import load_dotenv
import osmnx as ox

In [2]:
import asyncio
import aiohttp
    

In [3]:
load_dotenv()  # take environment variables

True

In [4]:
MTA_API_KEY = os.getenv("MTA_API_KEY")

In [5]:
# getting data given a route id
async def get_bus_route(route_id):
    '''
    Request the stops-for-route record of one route from the MTA BusTime API.

    `route_id` is the short id (e.g. "B52"); the "MTA NYCT_" agency prefix is
    added to the URL here. Returns the whole decoded JSON payload when its
    'code' field is 200 and raises Exception otherwise.
    '''
    endpoint = f"https://bustime.mta.info/api/where/stops-for-route/MTA%20NYCT_{route_id}.json?key={MTA_API_KEY}&includePolylines=false&version=2"
    async with aiohttp.ClientSession() as http:
        async with http.get(endpoint) as reply:
            payload = await reply.json()

    # the API reports its own status inside the JSON body
    if payload['code'] != 200:
        raise Exception(f"Error: bus route {route_id} returned response code {payload['code']}")
    return payload

In [6]:
route_id = "B52" # Example route
route_data = await get_bus_route(route_id)
print(json.dumps(route_data, indent=4))



{
    "code": 200,
    "currentTime": 1747094727403,
    "data": {
        "entry": {
            "polylines": [],
            "routeId": "MTA NYCT_B52",
            "stopGroupings": [
                {
                    "ordered": true,
                    "stopGroups": [
                        {
                            "id": "1",
                            "name": {
                                "name": "DOWNTOWN BKLYN TILLARY ST via GATES",
                                "names": [
                                    "DOWNTOWN BKLYN TILLARY ST via GATES"
                                ],
                                "type": "destination"
                            },
                            "polylines": [],
                            "stopIds": [
                                "MTA_504965",
                                "MTA_504119",
                                "MTA_304196",
                                "MTA_304197",
                                "MT

In [None]:
# getting data given a route id
async def get_route_schedule(route_id):
    '''
    Retrieve the trip schedules for a route from the MTA BusTime API.

    Parameters
    ----------
    route_id : str
        Short route id (e.g. "B52"); the "MTA NYCT_" agency prefix is added
        to the URL here.

    Returns
    -------
    list
        The 'list' entry of the payload's 'data' object.

    Raises
    ------
    Exception
        If the 'code' field of the JSON payload is not 200.
    '''
    url = f"https://bustime.mta.info/api/where/trips-for-route/MTA%20NYCT_{route_id}.json?key={MTA_API_KEY}&includeSchedule=true"
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            payload = await response.json()
    if payload['code'] == 200:
        return payload['data']['list']
    # this request is keyed by route, so report the failure as a route (not a stop)
    raise Exception(f"Error: bus route {route_id} returned response code {payload['code']}")

In [None]:
# testing getting a bus schedule
stop_data = await get_route_schedule(route_id)
print(json.dumps(stop_data, indent=4))

[
    {
        "frequency": null,
        "schedule": {
            "frequency": null,
            "nextTripId": "",
            "previousTripId": "MTA NYCT_FP_B5-Weekday-SDon-108700_B26_218",
            "stopTimes": [
                {
                    "arrivalTime": 69360,
                    "departureTime": 69360,
                    "distanceAlongTrip": 0.08656897032969031,
                    "stopHeadsign": "",
                    "stopId": "MTA_307175"
                },
                {
                    "arrivalTime": 69466,
                    "departureTime": 69466,
                    "distanceAlongTrip": 285.61860445152655,
                    "stopHeadsign": "",
                    "stopId": "MTA_307699"
                },
                {
                    "arrivalTime": 69513,
                    "departureTime": 69513,
                    "distanceAlongTrip": 409.6188950017041,
                    "stopHeadsign": "",
                    "stopId": "MTA_30742

In [None]:
def shorten_id(route_id):
    '''
    Shorten a fully qualified id by removing its agency prefix.

    Everything up to and including the first underscore is dropped, e.g.
    "MTA NYCT_B52" -> "B52". Only the first underscore is treated as the
    separator, so an id whose own part contains underscores is kept whole
    instead of being cut off at the second underscore.

    Raises IndexError if `route_id` contains no underscore.
    '''
    return route_id.split('_', 1)[1]

In [None]:
def get_stop_groups(route_data):
    '''
    Get the stop groups (every direction and its stop order) from a route payload.

    Parameters
    ----------
    route_data : dict
        Full stops-for-route payload; the route entry is read from
        route_data['data']['entry'].

    Returns
    -------
    dict
        Maps "<short route id> - <stop group name>" to that group's list of
        stop ids. Empty (after printing a notice) when the first stop grouping
        is not flagged as ordered.
    '''
    entry = route_data['data']['entry']
    grouping = entry['stopGroupings'][0]  # only the first grouping is used

    if not grouping['ordered']:
        # routeId lives under data -> entry, not at the top level of the payload
        print(f"{entry['routeId']} not ordered")
        return {}

    route_id = shorten_id(entry['routeId'])
    route_lists = {}
    for group in grouping['stopGroups']:
        # different quote styles inside the f-string keep it valid before Python 3.12
        route_lists[f"{route_id} - {group['name']['name']}"] = group['stopIds']

    return route_lists

    

In [None]:
# testing get_stop_groups
test_route_lists = get_stop_groups(route_data)

In [None]:
# load bus info dict from json file
json_path='./graph_data/bus_info_dict.json'
with open(json_path) as json_file:
    bus_info = json.load(json_file)

In [None]:
print(bus_info['MTA_901601'])

In [None]:
# opening all routes file
json_path='./graph_data_archive/all_routes.json'
with open(json_path) as json_file:
    all_routes = json.load(json_file)

In [None]:
# how many routes were successfully retrieved
print(len(all_routes))

267


In [None]:
# testing shorten_id
shorten_id(all_routes[0]['id'])

'M34+'

In [None]:
# testing getting route data without awaiting (returned a coroutine)
route_data = get_bus_route('B52')
print(route_data)

<coroutine object get_bus_route at 0x1794b1240>


In [18]:
from tqdm import tqdm
from collections import defaultdict
#loading bar module

In [None]:
# routes whose schedule could not be matched: route id -> set of route names
unknown_routes = defaultdict(set)

# stop order found in the matched schedule for each route name - might be different from the routes I already have
updated_routes = {}

def get_edge_times(first_stop, route_schedule, route_id, route_name, group):
    '''
    Given the first stop of a route and the candidate trip schedules, find the
    scheduled time between each pair of consecutive stops.

    The first schedule whose first stop time is at `first_stop` is used.

    Parameters
    ----------
    first_stop : str
        Stop id expected at the start of the route.
    route_schedule : list
        Trip entries; each is read as entry['schedule']['stopTimes'].
    route_id : str
        Key under which an unmatched route is recorded in `unknown_routes`.
    route_name : str
        Name of the directed route; used as key in `updated_routes` and as the
        value stored in `unknown_routes`.
    group : list
        Stored stop ids of the route. Not used by this function; kept so
        existing calls keep working.

    Returns
    -------
    dict
        {(prev_stop_id, cur_stop_id): cur arrivalTime - prev departureTime}
        (seconds, per the notebook's own comments). Empty when no schedule
        matches.

    Side effects
    ------------
    On a match, updated_routes[route_name] is set to the stop ids of the
    matched schedule, in order. Otherwise route_name is added to
    unknown_routes[route_id].
    '''
    # the stopIds don't always match up with the stored ids, so look for a
    # schedule that starts at the stored first stop
    group_stopTimes = None
    for schedule in route_schedule:
        stop_times = schedule['schedule']['stopTimes']
        # a trip without stop times cannot match (indexing it would raise IndexError)
        if stop_times and stop_times[0]['stopId'] == first_stop:
            group_stopTimes = stop_times
            break

    if group_stopTimes is None:
        # no schedule starts at the stored first stop: remember the route for later estimation
        unknown_routes[route_id].add(route_name)
        return {}

    updated_routes[route_name] = [stop['stopId'] for stop in group_stopTimes]

    # travel time of an edge = arrival at the current stop - departure from the previous one
    edge_times = {}
    for prev_stop, cur_stop in zip(group_stopTimes, group_stopTimes[1:]):
        edge = (prev_stop['stopId'], cur_stop['stopId'])
        edge_times[edge] = cur_stop['arrivalTime'] - prev_stop['departureTime']

    return edge_times
    

In [20]:
print(updated_routes)

{}


In [None]:
async def get_stop_group_times(stop_groups, route_id):
    '''
    Get the scheduled travel times for every stop group of a route.

    Parameters
    ----------
    stop_groups : dict
        Route name -> ordered list of stop ids.
    route_id : str
        Short route id used to request the route schedule.

    Returns
    -------
    dict
        edge_data format = {
            (stopId, stopId2) : est travel time from stopId to stopId2 in seconds
        }
        Always a dict, so the result can be passed straight to dict.update:
        it is empty when there are no stop groups or when the schedule
        request fails (the error is printed).
    '''
    edge_data = {}

    # nothing to look up -> skip the network request entirely
    if not stop_groups:
        return edge_data

    try:
        route_schedule = await get_route_schedule(route_id)
    except Exception as e:
        print(f"Error: {e}")
        return edge_data

    for route_name, group in stop_groups.items():
        if not group:
            # a group without stops has no first stop to match against
            continue
        # get travel time for the schedule whose first stop matches group[0]
        edge_data.update(get_edge_times(group[0], route_schedule, route_id, route_name, group))

    return edge_data


In [None]:
#getting a dict of all routes 
all_route_lists = {}
est_stop_times = {}
fails = []

# going through all routes and determining their stop order + schedule
for i in tqdm(range(len(all_routes)), desc="fetching routes"):
    route_id = shorten_id(all_routes[i]['id'])

    try:
        route_data = await get_bus_route(route_id)
        
    except Exception as e:
        fails.append(route_id)
        continue
    
    #extracting routes (stop groups) from data
    stop_groups = get_stop_groups(route_data)
    #adding route to dictionary
    all_route_lists.update(stop_groups)

    #getting a list of route times for each edge in the routes
    route_times = await get_stop_group_times(stop_groups, route_id)
    #print(route_times)
    est_stop_times.update(route_times)

    
print(f"failed to fetch {len(fails)} routes: {fails}")
    

  route_data = await get_bus_route(route_id)
fetching routes: 100%|██████████| 267/267 [04:55<00:00,  1.11s/it]

failed to fetch 12 routes: ['D90', 'B101', 'L90', 'B90', 'B94', 'B96', 'B98', 'Q90', 'BX92', 'Q107', 'Q108', 'M90']





In [23]:
all_route_lists.update(updated_routes)
print(len(all_route_lists))
print(unknown_routes)
print(len(est_stop_times))



504
defaultdict(<class 'set'>, {'S61': {'S61 - ST GEORGE FERRY'}, 'SIM15': {'SIM15 - DOWNTOWN LOOP via CHURCH ST via WATER ST'}, 'SIM10': {'SIM10 - MIDTOWN via 23 ST via 6 AV'}, 'SIM11': {'SIM11 - MIDTOWN via 23 ST via MADISON AV'}, 'SIM9': {'SIM9 - GREENWICH VILL via WEST ST via 6 AV'}, 'SIM8': {'SIM8 - MIDTOWN via 42 ST via MADISON AV'}, 'SIM7': {'SIM7 - GREENWICH VILLAGE via WEST ST via 6 AV'}, 'SIM6': {'SIM6 - MIDTOWN via 23 ST via MADISON AV'}, 'SIM5': {'SIM5 - DOWNTOWN FRANKFORT ST via WATER ST', 'SIM5 - ELTVLLE TRANS CTR via F CAP BL via GFRDS'}, 'SIM4': {'SIM4 - DOWNTOWN WORTH ST via CHURCH ST'}, 'SIM3': {'SIM3 - MIDTOWN via 23 ST via 6 AV'}, 'SIM2': {'SIM2 - DOWNTOWN WORTH ST via CHURCH ST'}, 'SIM1': {'SIM1 - SOHO HOUSTON ST via CHURCH ST'}, 'SIM24': {'SIM24 - MIDTOWN via 34 ST via MADISON AV', 'SIM24 - PRINCES BAY via HUGUENOT AV'}, 'SIM25': {'SIM25 - MIDTOWN via 42 ST via MADISON AV'}, 'S74': {'S74 - ST GEORGE FERRY'}, 'SIM26': {'SIM26 - MIDTOWN via 42 ST via MADISON AV'}, '

### Need to get all missing stops into stations.graphml

Not all of the stop IDs in the schedules match the stop IDs I already have, so I'll go through each stop ID in each route and add it to the bus info dict + graph if it's missing.

In [None]:
async def get_stop_info(stop_id):
    '''
    Get the info for a single stop, so a missing stop can be added to the bus_info_dict.

    Returns the decoded JSON payload of the stop request.

    Raises Exception when the payload carries a 'code' other than 200, in line
    with the other fetch helpers in this notebook, so a failed lookup is
    reported here instead of surfacing later as a missing key.
    '''
    url = f"https://bustime.mta.info/api/where/stop/{stop_id}.json?key={MTA_API_KEY}"
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            payload = await response.json()

    # NOTE(review): assumes this endpoint reports its status in a top-level
    # 'code' field like the other endpoints; a payload without it is passed through
    if payload.get('code', 200) != 200:
        raise Exception(f"Error: bus stop {stop_id} returned response code {payload['code']}")
    return payload

In [25]:
def findNearestNode2Placemark(G, placemark_info):
    '''
    Return the ID of the graph node nearest to a placemark.

    `placemark_info` must provide 'longitude' and 'latitude' entries; they are
    handed to osmnx as the X and Y coordinates respectively.
    '''
    lon = placemark_info['longitude']
    lat = placemark_info['latitude']

    # with return_dist=True osmnx gives back (node, distance); only the node is needed
    node_id, _dist = ox.distance.nearest_nodes(G, lon, lat, return_dist=True)
    return node_id

In [None]:
async def add_info(stop_id, bus_info_dict, G):
    '''
    Fetch the info of one bus stop and register it in the bus info dict and the graph.

    The stop id is appended to the 'bus_stops' list of its nearest graph node
    (the list is created when the node has none). Both `bus_info_dict` and `G`
    are modified in place; the same `bus_info_dict` object is returned.
    '''
    #data retrieved online about the stop id
    stop_data = (await get_stop_info(stop_id))['data']
    info = {
        'name': stop_data['name'],
        'direction': stop_data['direction'],
        'id': stop_id,
        'longitude': stop_data['lon'],
        'latitude': stop_data['lat'],
    }
    # the nearest-node lookup reads the longitude/latitude stored just above
    info['osmid'] = findNearestNode2Placemark(G, info)

    node_attrs = G.nodes[info['osmid']]
    # NOTE(review): if 'bus_stops' was read back from GraphML it may be a str
    # rather than a list - confirm before relying on append here
    node_attrs.setdefault('bus_stops', []).append(stop_id)

    bus_info_dict[stop_id] = info
    print(bus_info_dict[stop_id])
    print(G.nodes[info['osmid']])

    return bus_info_dict
    
    
    




In [27]:
# example info entry
# "MTA_308248": {
#         "name": "4 AV/SENATOR ST",
#         "routes": [
#             "B9"
#         ],
#         "direction": "S",
#         "link": "https://bustime.mta.info/m/index?q=308248",
#         "id": "MTA_308248",
#         "latitude": 40.636048,
#         "longitude": -74.022991,
#         "osmid": 42521235

In [None]:

G = ox.io.load_graphml('./graph_data/stations.graphml')

In [None]:
# go through each route, and if a route ID isn't in bus_info_dict, add it to dict and graph
for route_list in all_route_lists.values():
    for stop in route_list:
        if stop not in bus_info:
            bus_info = await add_info(stop, bus_info, G)

filename = "./graph_data/bus_info_dict.json"
with open(filename, 'w') as file:
    json.dump(bus_info, file, indent=4)
ox.io.save_graphml(G, filepath='./graph_data/stations.graphml')

In [30]:
# getting the stops around a coordinate
async def get_closest_stops(lon, lat, radius):
    '''
    Query the stops-for-location endpoint of the MTA BusTime API.

    `lon`, `lat` and `radius` are inserted into the request URL unchanged.
    Returns the whole decoded JSON payload when its 'code' field is 200 and
    raises Exception otherwise.
    '''
    endpoint = f"https://bustime.mta.info/api/where/stops-for-location.json?key={MTA_API_KEY}&lat={lat}&lon={lon}&radius={radius}"
    async with aiohttp.ClientSession() as http:
        async with http.get(endpoint) as reply:
            payload = await reply.json()

    # the API reports its own status inside the JSON body
    if payload['code'] != 200:
        raise Exception(f"Error: {payload['code']}")
    return payload

In [32]:
# get schedule info and convert it to edge travel time
# https://api.pugetsound.onebusaway.org/api/where/schedule-for-route/1_100223.json?key=TEST

In [33]:
filename = "./graph_data/all_route_lists.json"
with open(filename, 'w') as file:
    json.dump(all_route_lists, file, indent=4)

### I ended up with a lot of missing route schedules, so I'll need to do some error correcting

In [34]:
# reload the bus info dict; a relative path (as in the other cells of this
# notebook) keeps the notebook from being tied to one machine's home directory
file_path = './graph_data/bus_info_dict.json'
with open(file_path) as json_file:
    bus_info_dict = json.load(json_file)

In [47]:
G = ox.io.load_graphml('./graph_data/stations.graphml')

In [None]:
from geopy.distance import geodesic
# for getting euclidean distance between two long/lat coordinates

In [49]:
def estimate_route_time(G, diroute, all_route_lists, bus_info_dict, avg_bus_speed_kph=13.14834):
    '''
    Estimate the travel time between every pair of consecutive stops of one directed route.

    For each pair, the graph nodes of the two stops are joined with
    ox.routing.shortest_path (called with its default weight) and the
    'travel_time' of the key-0 edge between each consecutive pair of path nodes
    is summed. When no path is found, the time is estimated from the geodesic
    distance between the two nodes divided by `avg_bus_speed_kph`.

    Parameters
    ----------
    G : graph
        Graph whose nodes carry 'x' / 'y' and whose edges carry 'travel_time'.
        The fallback assumes an unprojected graph (x = longitude, y = latitude)
        - TODO confirm.
    diroute : str
        Key of the directed route in `all_route_lists`.
    all_route_lists : dict
        Route name -> ordered list of stop ids.
    bus_info_dict : dict
        Stop id -> info dict; its 'osmid' entry is used as the stop's graph node.
    avg_bus_speed_kph : float, optional
        Speed used for the straight-line fallback estimate.

    Returns
    -------
    dict
        {(prev_stop_id, cur_stop_id): estimated travel time}. The fallback value
        is in seconds (km / kph * 3600).
    '''
    #getting the node associated with each bus stop
    stops = all_route_lists[diroute]
    nodes = [bus_info_dict[stop_id]['osmid'] for stop_id in stops]

    edge_times = {}
    for pos in range(len(nodes) - 1):
        prevnode, curnode = nodes[pos], nodes[pos + 1]
        prevstop, curstop = stops[pos], stops[pos + 1]

        # find shortest path between prevnode and curnode
        shortest_path = ox.routing.shortest_path(G, prevnode, curnode)

        #if a shortest path can't be found
        if shortest_path is None:
            # geopy expects (latitude, longitude) points, i.e. (y, x) - passing
            # (x, y) would silently measure between the wrong locations
            prevnode_loc = (G.nodes[prevnode]['y'], G.nodes[prevnode]['x'])
            curnode_loc = (G.nodes[curnode]['y'], G.nodes[curnode]['x'])
            dist_km = geodesic(prevnode_loc, curnode_loc).km

            #divide distance by average bus speed; hours -> seconds
            edge_times[(prevstop, curstop)] = dist_km / avg_bus_speed_kph * 3600
            continue

        # sum the travel time over each consecutive node pair of the path
        edge_times[(prevstop, curstop)] = sum(
            G.edges[(u, v, 0)]['travel_time']
            for u, v in zip(shortest_path, shortest_path[1:])
        )
    return edge_times

        
            



def estimate_missing_edge_times(G, unknown_routes, est_stop_times, all_route_lists, bus_info_dict):
    '''
    Fill in travel-time estimates for every route listed in `unknown_routes`.

    `unknown_routes` maps a route to its collection of directed route names.
    Each directed route is handed to estimate_route_time and the entries it
    returns are merged into `est_stop_times` in place. Nothing is returned.
    '''
    for diroutes in unknown_routes.values():
        for diroute in diroutes:
            estimated = estimate_route_time(G, diroute, all_route_lists, bus_info_dict)
            est_stop_times.update(estimated)


In [52]:
#takes ab 6 minutes to find shortest paths between all nodes
estimate_missing_edge_times(G, unknown_routes, est_stop_times, all_route_lists, bus_info_dict)

In [None]:
# how many stop edges do we have?
print(len(est_stop_times))

12601


In [None]:
# saving est travel time to file
def save_dict_with_tuple_keys(data, filename):
    """Write `data` to `filename` as JSON, turning every key into str(key) first.

    JSON object keys have to be strings, so tuple keys are stored as their
    string form; the values are written unchanged.
    """
    serializable = {}
    for key, value in data.items():
        serializable[str(key)] = value

    with open(filename, 'w') as out_file:
        json.dump(serializable, out_file)

# loads est stop times from a json file
def load_dict_with_tuple_keys(filename):
    """Loads a dictionary with tuple keys from a JSON file.

    Each key in the file is expected to be the str() form of a tuple. Keys are
    parsed with ast.literal_eval, which only accepts Python literals, so the
    content of the file is never executed as code (eval() would run it).

    Raises ValueError or SyntaxError if a key is not a valid Python literal.
    """
    import ast  # local import so this cell does not depend on another cell's imports

    with open(filename, 'r') as f:
        stringified_keys_data = json.load(f)

    return {ast.literal_eval(key): value for key, value in stringified_keys_data.items()}



In [55]:
filename = './graph_data/est_travel_times.json'
save_dict_with_tuple_keys(est_stop_times, filename)

### adding station edges to graph

In [56]:
#loop through each edge in est_stop_times, and add it to the graph
# every (stop, next stop) pair becomes an edge between the graph nodes of the two stops
# NOTE(review): key=0 is passed explicitly, so where a (u, v, 0) edge already
# exists this presumably updates its travel_time instead of adding a parallel
# edge - confirm that is intended
for stop_pair, seconds in est_stop_times.items():
    G.add_edge(
        bus_info_dict[stop_pair[0]]['osmid'],
        bus_info_dict[stop_pair[1]]['osmid'],
        key=0,
        travel_time=seconds,
    )

#add speeds and lengths to the edges
ox.routing.add_edge_speeds(G)
ox.distance.add_edge_lengths(G)

<networkx.classes.multidigraph.MultiDiGraph at 0x36b7d1eb0>

In [57]:
#updating graphml
ox.io.save_graphml(G, filepath='./graph_data/stations.graphml')