In [68]:
import requests
import json
import os
from dotenv import load_dotenv
import osmnx as ox

In [69]:
import asyncio
import aiohttp
    

In [70]:
load_dotenv()  # take environment variables

True

In [71]:
MTA_API_KEY = os.getenv("MTA_API_KEY")

In [72]:
# getting data given a route id
async def get_bus_route(route_id):
    url = f"https://bustime.mta.info/api/where/stops-for-route/MTA%20NYCT_{route_id}.json?key={MTA_API_KEY}&includePolylines=false&version=2"
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            response = await response.json()
    if response['code'] == 200:
        return response
    else:
        raise Exception(f"Error: bus route {route_id} returned response code {response['code']}")

In [73]:
route_id = "B52" # Example route
route_data = await get_bus_route(route_id)
print(json.dumps(route_data, indent=4))



{
    "code": 200,
    "currentTime": 1747088374707,
    "data": {
        "entry": {
            "polylines": [],
            "routeId": "MTA NYCT_B52",
            "stopGroupings": [
                {
                    "ordered": true,
                    "stopGroups": [
                        {
                            "id": "1",
                            "name": {
                                "name": "DOWNTOWN BKLYN TILLARY ST via GATES",
                                "names": [
                                    "DOWNTOWN BKLYN TILLARY ST via GATES"
                                ],
                                "type": "destination"
                            },
                            "polylines": [],
                            "stopIds": [
                                "MTA_504965",
                                "MTA_504119",
                                "MTA_304196",
                                "MTA_304197",
                                "MT

In [74]:
# getting data given a route id
async def get_route_schedule(route_id):
    url = f"https://bustime.mta.info/api/where/trips-for-route/MTA%20NYCT_{route_id}.json?key={MTA_API_KEY}&includeSchedule=true"
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            response = await response.json()
    if response['code'] == 200:
        return response['data']['list']
    else:
        raise Exception(f"Error: bus stop {route_id} returned response code {response['code']}")

In [75]:
stop_data = await get_route_schedule(route_id)
print(json.dumps(stop_data, indent=4))

[
    {
        "frequency": null,
        "schedule": {
            "frequency": null,
            "nextTripId": "MTA NYCT_FP_B5-Weekday-SDon-113100_B26_232",
            "previousTripId": "MTA NYCT_FP_B5-Weekday-SDon-098400_B26_232",
            "stopTimes": [
                {
                    "arrivalTime": 63600,
                    "departureTime": 63600,
                    "distanceAlongTrip": 0.08656897032969031,
                    "stopHeadsign": "",
                    "stopId": "MTA_307175"
                },
                {
                    "arrivalTime": 63720,
                    "departureTime": 63720,
                    "distanceAlongTrip": 285.61860445152655,
                    "stopHeadsign": "",
                    "stopId": "MTA_307699"
                },
                {
                    "arrivalTime": 63772,
                    "departureTime": 63772,
                    "distanceAlongTrip": 409.6188950017041,
                    "stopHeadsign": ""

In [76]:
def shorten_id(route_id):
    shortened_id = route_id.split('_')[1]
    return shortened_id

In [77]:
def get_stop_groups(route_data):
    if not route_data['data']['entry']['stopGroupings'][0]['ordered']:
        print(f"{route_data['routeId']} not ordered")
        return {}
    
    route_lists = {}
    stopGroups = route_data['data']['entry']['stopGroupings'][0]['stopGroups']
    route_id = shorten_id(route_data['data']['entry']['routeId'])
    for group in stopGroups:

        route_lists[f'{route_id} - {group['name']['name']}'] = group['stopIds']
        #print(group['stopIds'][0])
    
    return route_lists

    

In [78]:
test_route_lists = get_stop_groups(route_data)

In [79]:
json_path='./graph_data/bus_info_dict.json'
with open(json_path) as json_file:
    bus_info = json.load(json_file)

In [80]:
print(bus_info['MTA_901601'])

{'name': 'Brooklyn Bridge Park /PIER 6', 'routes': ['B63'], 'direction': 'W', 'link': 'https://bustime.mta.info/m/index?q=901601', 'id': 'MTA_901601', 'latitude': 40.693078, 'longitude': -74.000877, 'osmid': 1567286111, 'diroutes': ['BAY RIDGE SHORE RD via 5 AV']}


In [81]:
json_path='./graph_data_archive/all_routes.json'
with open(json_path) as json_file:
    all_routes = json.load(json_file)

In [82]:
print(len(all_routes))

267


In [83]:
shorten_id(all_routes[0]['id'])

'M34+'

In [84]:
route_data = get_bus_route('B52')
print(route_data)

<coroutine object get_bus_route at 0x329cb3740>


In [85]:
from tqdm import tqdm
from collections import defaultdict
#loading bar module

In [86]:
unknown_routes = defaultdict(set)
updated_routes = {}

def get_edge_times(first_stop, route_schedule, route_id, route_name, group):
    '''
    give first stop of a route and possible route schedules, find the time it takes to get between each stop
    '''
    edge_times = {}
    schedule_found = False
    group_stopTimes = None

    #the stopIds don't always match up with the ids I have stored, so I'll store a list of updated routes
    updated_routes[route_name] = []
    # if route_name == 'RIDGEWOOD TERM via MYRTLE':
    #         print('hi')

    for schedule in route_schedule:
        #going through each schedule and seeing if one of the first stop matches the first stop in the stored route
        if first_stop == schedule['schedule']['stopTimes'][0]['stopId']:
            schedule_found = True
            group_stopTimes= schedule['schedule']['stopTimes']
            break

    if not schedule_found:
        #print(f'schedule for {first_stop} not found - Route: {route_id}')
        # a bunch of routes didn't have the first stop matching the first stop in my route list, so 
        
        unknown_routes[route_id].add(route_name)
        
        return {}
    
    #looping through each stop (except for the first one) and recording num seconds between the current stop and the previous stop
    updated_routes[route_name].append(group_stopTimes[0]['stopId'])

    for idx, stop in enumerate(group_stopTimes[1:]):

        prev_depart_time =group_stopTimes[idx]['departureTime']
        prev_stopId = group_stopTimes[idx]['stopId']

        cur_arrive_time = stop['arrivalTime']
        cur_stopId = stop['stopId']
        updated_routes[route_name].append(cur_stopId)
        # if cur_stopId == 'MTA_901771':
        #     print((route_name, route_id))
        #     print(updated_routes[route_name])
        

        
        
        edge_times[(prev_stopId, cur_stopId)] = cur_arrive_time - prev_depart_time
    
    return edge_times
    

In [87]:
async def get_stop_group_times(stop_groups, route_id):
    '''
    getting route schedule for each stop group
    edge_data format = {
        (stopId, stopId2) : {est travel time from stopId to stopId2 in seconds}
    }
    '''
    edge_data = {}

    try:
        route_schedule = await get_route_schedule(route_id)
    except Exception as e:
        print(f"Error: {e}")
        return

    for route_name, group in list(stop_groups.items()):
        edge_data.update(get_edge_times(group[0], route_schedule, route_id, route_name, group)) #get travel time for route where first stop matches group[0]
    
    return edge_data


In [88]:
#getting a dict of all routes 
all_route_lists = {}
est_stop_times = {}
fails = []
for i in tqdm(range(len(all_routes)), desc="fetching routes"):
    route_id = shorten_id(all_routes[i]['id'])

    try:
        route_data = await get_bus_route(route_id)
        
    except Exception as e:
        fails.append(route_id)
        continue
    
    #extracting routes (stop groups) from data
    stop_groups = get_stop_groups(route_data)
    #adding route to dictionary
    all_route_lists.update(stop_groups)

    #getting a list of route times for each edge in the routes
    route_times = await get_stop_group_times(stop_groups, route_id)
    #print(route_times)
    est_stop_times.update(route_times)

    
print(f"failed to fetch {len(fails)} routes: {fails}")
    

  route_data = await get_bus_route(route_id)
fetching routes: 100%|██████████| 267/267 [05:01<00:00,  1.13s/it]

failed to fetch 12 routes: ['D90', 'B101', 'L90', 'B90', 'B94', 'B96', 'B98', 'Q90', 'BX92', 'Q107', 'Q108', 'M90']





In [89]:
all_route_lists.update(updated_routes)
print(len(all_route_lists))
print(unknown_routes)
print(len(est_stop_times))



504
defaultdict(<class 'set'>, {'S61': {'S61 - ST GEORGE FERRY'}, 'SIM15': {'SIM15 - DOWNTOWN LOOP via CHURCH ST via WATER ST'}, 'SIM11': {'SIM11 - MIDTOWN via 23 ST via MADISON AV'}, 'SIM9': {'SIM9 - GREENWICH VILL via WEST ST via 6 AV'}, 'SIM8': {'SIM8 - MIDTOWN via 42 ST via MADISON AV'}, 'SIM7': {'SIM7 - GREENWICH VILLAGE via WEST ST via 6 AV'}, 'SIM6': {'SIM6 - MIDTOWN via 23 ST via MADISON AV'}, 'SIM5': {'SIM5 - DOWNTOWN FRANKFORT ST via WATER ST'}, 'SIM4': {'SIM4 - DOWNTOWN WORTH ST via CHURCH ST'}, 'SIM2': {'SIM2 - DOWNTOWN WORTH ST via CHURCH ST'}, 'SIM1': {'SIM1 - SOHO HOUSTON ST via CHURCH ST'}, 'SIM24': {'SIM24 - MIDTOWN via 34 ST via MADISON AV'}, 'SIM25': {'SIM25 - MIDTOWN via 42 ST via MADISON AV'}, 'S74': {'S74 - ST GEORGE FERRY'}, 'SIM26': {'SIM26 - MIDTOWN via 42 ST via MADISON AV'}, 'L91': {'L91 - L TRAIN TO CANARSIE', 'L91 - L TRAIN TO BROADWAY JCT'}, 'SIM22': {'SIM22 - MIDTOWN via 42 ST via MADISON AV'}, 'SIM23': {'SIM23 - MIDTOWN via 34 ST via MADISON AV'}, 'L92':

### Need to get all missing stops into stations.graphml

In [90]:
async def get_stop_info(stop_id):
        url = f"https://bustime.mta.info/api/where/stop/{stop_id}.json?key={MTA_API_KEY}"
        async with aiohttp.ClientSession() as session:
            async with session.get(url) as response:
                return await response.json()

In [91]:
def findNearestNode2Placemark(G, placemark_info):
    '''
    get the ID of the node nearest to a placemark
    '''
    placemark_longitude, placemark_latitude = placemark_info['longitude'], placemark_info['latitude']

    nearest_node = ox.distance.nearest_nodes(G,
                                         placemark_longitude, placemark_latitude,
                                         return_dist=True)
    
    return nearest_node[0]

In [92]:
async def add_info(stop_id, bus_info_dict, G):
    info = {}
    #add data retrieved online ab a stop id
    response = await get_stop_info(stop_id)
    info['name'] = response['data']['name']
    info['direction'] = response['data']['direction']
    info['id'] = stop_id
    info['longitude'] = response['data']['lon']
    info['latitude'] = response['data']['lat']
    info['osmid'] = findNearestNode2Placemark(G, info)
    if 'bus_stops' not in G.nodes[info['osmid']]:
        G.nodes[info['osmid']]['bus_stops'] = []
    G.nodes[info['osmid']]['bus_stops'].append(stop_id)
    bus_info_dict[stop_id] = info
    print(bus_info_dict[stop_id])
    print(G.nodes[info['osmid']])

    return bus_info_dict
    
    
    




In [93]:
# example info entry
# "MTA_308248": {
#         "name": "4 AV/SENATOR ST",
#         "routes": [
#             "B9"
#         ],
#         "direction": "S",
#         "link": "https://bustime.mta.info/m/index?q=308248",
#         "id": "MTA_308248",
#         "latitude": 40.636048,
#         "longitude": -74.022991,
#         "osmid": 42521235

In [94]:
G = ox.io.load_graphml('./graph_data/stations.graphml')

In [95]:

for route_list in all_route_lists.values():
    for stop in route_list:
        if stop not in bus_info:
            bus_info = await add_info(stop, bus_info, G)

filename = "./graph_data/bus_info_dict.json"
with open(filename, 'w') as file:
    json.dump(bus_info, file, indent=4)
ox.io.save_graphml(G, filepath='./graph_data/stations.graphml')

In [96]:
# getting data given a route id
async def get_closest_stops(lon, lat, radius):
    url = f"https://bustime.mta.info/api/where/stops-for-location.json?key={MTA_API_KEY}&lat={lat}&lon={lon}&radius={radius}"
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            response = await response.json()
    if response['code'] == 200:
        return response
    else:
        raise Exception(f"Error: {response['code']}")

In [97]:
async def hi():
    closest_stops = await get_closest_stops(-73.946764,40.6319761, 200)
    print(closest_stops)

await hi()

{'code': 200, 'currentTime': 1747088687454, 'data': {'limitExceeded': False, 'stops': [{'code': '303232', 'direction': 'NW', 'id': 'MTA_303232', 'lat': 40.632805, 'locationType': 0, 'lon': -73.947493, 'name': 'FLATBUSH AV/NOSTRAND AV', 'routes': [{'agency': {'disclaimer': '', 'email': '', 'fareUrl': '', 'id': 'MTA NYCT', 'lang': 'en', 'name': 'MTA New York City Transit', 'phone': '718-330-1234', 'privateService': False, 'timezone': 'America/New_York', 'url': 'http://www.mta.info'}, 'color': '006CB7', 'description': 'via Flatbush Av / Livingston St', 'id': 'MTA NYCT_B41', 'longName': 'Kings Plaza - Downtown Brooklyn', 'shortName': 'B41', 'textColor': 'FFFFFF', 'type': 3, 'url': ''}], 'wheelchairBoarding': 'UNKNOWN'}, {'code': '303317', 'direction': 'SE', 'id': 'MTA_303317', 'lat': 40.631995, 'locationType': 0, 'lon': -73.946898, 'name': 'FLATBUSH AV/NOSTRAND AV', 'routes': [{'agency': {'disclaimer': '', 'email': '', 'fareUrl': '', 'id': 'MTA NYCT', 'lang': 'en', 'name': 'MTA New York Ci

In [98]:
# get schedule info and convert it to edge travel time
# https://api.pugetsound.onebusaway.org/api/where/schedule-for-route/1_100223.json?key=TEST

In [99]:
filename = "./graph_data/all_route_lists.json"
with open(filename, 'w') as file:
    json.dump(all_route_lists, file, indent=4)

### I ended up with with a lot of missing route schedules, so I'll need to do some error correcting

In [100]:
file_path = '/Users/charlierothschild/Desktop/DSA/PublicTransitOptimizer/map2graph/graph_data/bus_info_dict.json'
with open(file_path) as json_file:
    bus_info_dict = json.load(json_file)

In [101]:
G = ox.io.load_graphml('./graph_data/stations.graphml')

In [102]:
from geopy.distance import geodesic

In [103]:
def estimate_route_time(G, diroute, all_route_lists, bus_info_dict):
    ''' 
    Make estimation for all nodes in a particular directed route
    '''

    #getting all nodes associated with each bus stop
    stops = all_route_lists[diroute]
    nodes = list(map(lambda x: bus_info_dict[x]['osmid'], stops))
    edge_times = {}
    for idx in range(len(nodes[1:])):
        prevnode = nodes[idx]
        curnode = nodes[idx + 1]

        prevstop = stops[idx]
        curstop = stops[idx + 1]
        # find shortest path between prevnode and curnode
        shortest_path = ox.routing.shortest_path(G, prevnode, curnode)
        #get the travelt time of each path
        

        #if a shortest path can't be found
        if shortest_path == None:
            # calculate euclidean distance between prevnode and curnode
            prevnode_loc = (G.nodes[prevnode]['x'], G.nodes[prevnode]['y'])
            curnode_loc = (G.nodes[curnode]['x'], G.nodes[curnode]['y'])
            
            dist = geodesic(prevnode_loc, curnode_loc).km
            
            #divide distance by average bus speed
            avg_bus_speed_kph = 13.14834

            travel_time = dist/avg_bus_speed_kph * 3600

            #add to dict
            edge_times[(prevstop, curstop)] = travel_time

            continue

        total_travel_time = 0

        for idx, curnode in enumerate(shortest_path[1:]):
            edge = (shortest_path[idx], curnode, 0)
            total_travel_time  += G.edges[edge]['travel_time']

        edge_times[(prevstop, curstop)] = total_travel_time
    return edge_times

        
            



def estimate_missing_edge_times(G, unknown_routes, est_stop_times, all_route_lists, bus_info_dict):
    '''
    Go through each missing route, go through each pair of nodes in that route and estimate how long it would take to get a bus to get between those two points
    make estimation by first attempting to find a route between the pair of nodes, sum the total length of the path, and dividing that length by the average travel speed of an MTA bus
    If no route can be found, make estimation based on euclidean distance
    '''
    ox.routing.add_edge_speeds(G)
    ox.routing.add_edge_travel_times(G)


    #loop through each missing route
    for route in unknown_routes.values():
        # loop through each directed route:
        for diroute in route:
            est_stop_times.update(estimate_route_time(G, diroute, all_route_lists, bus_info_dict))


In [104]:
#takes ab 6 minutes to find shortest paths between all nodes
estimate_missing_edge_times(G, unknown_routes, est_stop_times, all_route_lists, bus_info_dict)

In [105]:
print(len(est_stop_times))

11738


In [106]:
#saving est travel time to file
def save_dict_with_tuple_keys(data, filename):
    """Saves a dictionary with tuple keys to a JSON file."""
    
    stringified_keys_data = {str(key): value for key, value in data.items()}
    with open(filename, 'w') as f:
        json.dump(stringified_keys_data, f)

#loads est stop times from a json file
def load_dict_with_tuple_keys(filename):
    """Loads a dictionary with tuple keys from a JSON file."""

    with open(filename, 'r') as f:
        stringified_keys_data = json.load(f)
    
    original_keys_data = {eval(key): value for key, value in stringified_keys_data.items()}
    return original_keys_data



In [107]:
filename = './graph_data/est_travel_times.json'
save_dict_with_tuple_keys(est_stop_times, filename)

### adding station edges to graph

In [108]:
for stops, time in est_stop_times.items():
    prevosmid = bus_info_dict[stops[0]]['osmid']
    curosmid = bus_info_dict[stops[1]]['osmid']
    print((prevosmid, curosmid))


(42437678, 42430633)
(42430633, 42437670)
(42437670, 42437663)
(42437663, 42435644)
(42435644, 42437654)
(42437654, 42430304)
(42430304, 42437644)
(42437644, 3786901743)
(3786901743, 42445382)
(42445382, 5131026388)
(5131026388, 42442898)
(42442898, 42442889)
(42442889, 42448707)
(42448707, 42448714)
(42448714, 370913758)
(42422509, 42457925)
(42457925, 42445390)
(42445390, 5131026388)
(5131026388, 42445378)
(42445378, 561042190)
(561042190, 42437644)
(42437644, 42430304)
(42430304, 42437654)
(42437654, 42435644)
(42435644, 42435650)
(42435650, 42435660)
(42435660, 5849918504)
(5849918504, 42445011)
(42870690, 3325564444)
(3325564444, 42511980)
(42511980, 42536224)
(42536224, 42496742)
(42496742, 42486774)
(42486774, 42499843)
(42499843, 42513780)
(42513780, 42505527)
(42505527, 42493905)
(42493905, 42485016)
(42485016, 42499494)
(42499494, 42473259)
(42473259, 42466967)
(42466967, 42516267)
(42516267, 42499192)
(42499192, 42514378)
(42514378, 42482354)
(42482354, 42481580)
(42481580, 

In [109]:
print(updated_routes)

{'M34+ - SELECT BUS EAST SIDE FDR DR CROSSTOWN': ['MTA_405286', 'MTA_405587', 'MTA_403192', 'MTA_401818', 'MTA_401819', 'MTA_404280', 'MTA_401821', 'MTA_401822', 'MTA_401824', 'MTA_401826', 'MTA_401827', 'MTA_405336', 'MTA_401832', 'MTA_401833', 'MTA_402052', 'MTA_903027'], 'M34+ - SELECT BUS JAVITS CTR 12 AV CROSSTOWN': ['MTA_401857', 'MTA_401858', 'MTA_403359', 'MTA_403216', 'MTA_401840', 'MTA_401842', 'MTA_405375', 'MTA_400716', 'MTA_401847', 'MTA_401231', 'MTA_401232', 'MTA_401850', 'MTA_404116', 'MTA_405412'], 'B52 - DOWNTOWN BKLYN TILLARY ST via GATES': ['MTA_504965', 'MTA_504119', 'MTA_304196', 'MTA_304197', 'MTA_304198', 'MTA_304199', 'MTA_304200', 'MTA_304201', 'MTA_308046', 'MTA_304203', 'MTA_304204', 'MTA_304205', 'MTA_304206', 'MTA_304207', 'MTA_306530', 'MTA_304209', 'MTA_304210', 'MTA_304211', 'MTA_304212', 'MTA_304213', 'MTA_304214', 'MTA_307948', 'MTA_307572', 'MTA_307573', 'MTA_307596', 'MTA_307575', 'MTA_307576', 'MTA_307577', 'MTA_302427', 'MTA_302428', 'MTA_302430',

In [110]:
json_path='./graph_data/all_route_lists.json'
with open(json_path) as json_file:
    all_route_lists = json.load(json_file)