# Adding New York bus routes to OSMnx map

This notebook extracts bus stop locations from the MTA bus map (KML) and attaches them to the nearest nodes of an OSMnx street graph.

## Section 1: Extracting bus stops from KML
KML source: 
<a href="https://www.google.com/maps/d/u/0/viewer?mid=1Y-euNeFcsu06Zxfdl6u6-sca3Yp-KYY&ll=40.75502933824687%2C-74.00066380083778&z=18">
MTA Bus Map
</a>

### Opening KML file and extracting placemarks

In [None]:
from fastkml import KML
from fastkml import Placemark, Point, StyleUrl, Style
from fastkml.utils import find, find_all
import os

In [None]:
# Read the .kml file into a fastkml KML object.
k = KML.parse("../data/doc.kml")

In [None]:
# Collect every Placemark found in the parsed KML object into a list.
placemarks = list(find_all(k, of_type=Placemark))

In [None]:
# Print how many placemarks were extracted.
print(len(placemarks))

In [None]:
# Inspect the raw description string of the first placemark.
print(placemarks[0].description)

### Parsing Placemark Description

In [None]:
# extracting info from a placemark description

def parse_placemark_info(desc_str):
    """
    Parse a placemark description into a dict with proper types.

    The description is a sequence of "key: value" fragments separated by
    "<br>", for example:
      "name: HYLAN BLVD/MIDLAND AV<br>"
      "routes: S51, S78, S79+, S81<br>"
      "direction: SW<br>"
      "link: https://...<br>"
      "id: MTA_200237<br>"
      "Latitude,Longitude: 40.577699, -74.102611"

    Args:
        desc_str: the description text; None or an empty string yields {}.

    Returns:
        A dict of the parsed fields. 'routes' becomes a list of route codes
        and 'Latitude,Longitude' is replaced by float 'latitude' and
        'longitude' entries. All other values stay stripped strings.

    Raises:
        ValueError: if the 'Latitude,Longitude' value is not two
            comma-separated numbers.
    """
    data = {}
    if not desc_str:
        return data

    # 1) split into "<br>"-separated fragments
    for part in desc_str.split('<br>'):
        if not part.strip():
            continue
        # Split on the first ": ". Fall back to a bare ":" so a fragment with
        # an empty value (e.g. "direction:") is still parsed.
        key, sep, val = part.partition(': ')
        if not sep:
            key, sep, val = part.partition(':')
        if not sep:
            # Not a key/value fragment at all; skip it instead of crashing.
            continue
        data[key.strip()] = val.strip()

    # 2) post-process some fields:
    #   routes -> list of route codes (blank entries are dropped)
    if 'routes' in data:
        data['routes'] = [r.strip() for r in data['routes'].split(',') if r.strip()]

    #   Latitude,Longitude -> two floats; the combined key is removed
    if 'Latitude,Longitude' in data:
        lat_str, lon_str = data.pop('Latitude,Longitude').split(',', 1)
        data['latitude'] = float(lat_str)
        data['longitude'] = float(lon_str)

    return data

In [None]:
# Sanity check: parse the first placemark's description and display the resulting dict.
test_placemark_info = parse_placemark_info(placemarks[0].description)
test_placemark_info

`placemark.description` seems to contain all the data we need to add to the OSMnx graph.

## Section 2: Adding placemark data to OSMNX

In [None]:
import multiprocessing as mp
import numpy as np
import osmnx as ox
from matplotlib import pyplot as plt
import networkx as nx

ox.__version__

### Converting the New York MultiDiGraph to a GeoDataFrame

In [None]:
# Build the street graph for the place query, restricted to the "drive" network type.
place = "New York, New York"
G = ox.graph.graph_from_place(place, network_type="drive")
# Projected version of G. NOTE(review): Gp is not referenced again in the cells visible here.
Gp = ox.projection.project_graph(G)

Getting bus stops

Should I use OpenStreetMap's data, or attach my own stop info to each node using a nearest-neighbor (k-d tree) lookup? <br>
I'll use a k-d tree because this is a data structures and algorithms (DSA) project. <br>

In [None]:
# Convert the drive MultiDiGraph's nodes (with node geometry) to a GeoDataFrame;
# edges are not requested here.
gdf_nodes = ox.convert.graph_to_gdfs(
    G, nodes=True, edges=False, node_geometry=True,
    fill_edge_geometry=False)

In [None]:
# # display it on map
# gdf_nodes.explore()

The drive network loads much faster than the entire OpenStreetMap network.

### Locating the node closest to a placemark

In [None]:
def findNearestNode2Placemark(G, placemark_info):
    """
    Look up the graph node closest to a placemark.

    Reads 'longitude' and 'latitude' from placemark_info and passes them as
    the X and Y arguments of ox.distance.nearest_nodes with return_dist=True.
    The result of that call is returned unchanged; callers in this notebook
    take element [0] as the node ID.
    """
    lon = placemark_info['longitude']
    lat = placemark_info['latitude']
    return ox.distance.nearest_nodes(G, lon, lat, return_dist=True)

In [None]:
# Find the node nearest to the first placemark (placemarks[0]), using its parsed info.
nearest_node = findNearestNode2Placemark(G, test_placemark_info)

In [None]:
# The first element of the lookup result is the node ID; use it to display
# that node's row (including geometry) from the nodes GeoDataFrame.
nearest_node_id = nearest_node[0]
gdf_nodes.loc[nearest_node_id]

In [None]:
# Visualize the node nearest to the placemark.
import geopandas as gpd
# NOTE: this rebinds the name `Point`, which was imported from fastkml earlier in the notebook.
from shapely import (
    Point, LineString)

# Build a two-row table: the bus stop as a Point, and a LineString running
# from the nearest node's (x, y) to the bus stop (labelled 'Nearest Node').
nearest_node_dict = {'col1': ['Bus Station', 'Nearest Node'],
                     'geometry': [Point(test_placemark_info['longitude'], test_placemark_info['latitude']),
                                        LineString([
                                      Point(gdf_nodes.loc[nearest_node_id].x,
                                            gdf_nodes.loc[nearest_node_id].y),
                                      Point(test_placemark_info['longitude'], test_placemark_info['latitude'])])]}
# Convert the dictionary to a GeoDataFrame in EPSG:4326.
nearest_node_gdf = gpd.GeoDataFrame(nearest_node_dict, crs="EPSG:4326")
# Keep a reference to the interactive map so other layers can be drawn onto it.
nearest_node_map = nearest_node_gdf.explore(color="red")
# # combine nearest node with existing node map
# gdf_nodes.explore(m=nearest_node_map)

### Adding placemark info to a graph node

In [None]:
def addPlacemark2Graph(G, node_id, placemark_info):
    """
    Attach a placemark's parsed fields to one node of the multidigraph.

    Prints placemark_info, sets its key/value pairs as attributes of node_id
    via nx.set_node_attributes, and returns that node's attribute dict.
    """
    print(placemark_info)
    nx.set_node_attributes(G, {node_id: placemark_info})
    node_data = G.nodes[node_id]
    return node_data

In [None]:
# Attach the test placemark's fields to its nearest node and print that node's attributes.
info = addPlacemark2Graph(G, nearest_node_id, test_placemark_info)
print(info)


In [None]:
import time
from tqdm import tqdm

In [None]:

# add descriptions to closest neighbor (warning: some bus stations may appear on the same node. Be sure to handle properly)
def get_attribute_list(G, placemarks):
    """
    Pair every placemark with the ID of its nearest graph node.

    Args:
        G: the street graph to search.
        placemarks: iterable of objects whose .description is parsed with
            parse_placemark_info.

    Returns:
        A list of (nearest_node_id, description_dict) tuples, one per
        placemark and in placemark order. Several placemarks may share the
        same node ID.
    """
    # extract the description dict of every placemark
    allDescriptions = [parse_placemark_info(pm.description) for pm in placemarks]
    print(f'number of placemarks: {len(allDescriptions)}')

    nearest_node_ids = []
    if allDescriptions:
        longitudes = [desc['longitude'] for desc in allDescriptions]
        latitudes = [desc['latitude'] for desc in allDescriptions]
        # One batched query for all stops instead of one nearest_nodes call
        # per placemark.
        nearest = ox.distance.nearest_nodes(G, longitudes, latitudes)
        # Coerce to plain ints so the IDs stay JSON-serializable downstream.
        nearest_node_ids = [int(node_id) for node_id in nearest]

    print(f'number of nodes: {len(set(nearest_node_ids))}')

    return list(zip(nearest_node_ids, allDescriptions))


In [None]:
# Author's note: this code takes 20+ minutes to run.
attribute_list = get_attribute_list(G, placemarks)
# Author's note: ended up with one fewer node than descriptions... I think we're fine.

In [None]:
# adding bus stations to each node
def add_path_attributes(G, attribute_list):
    """
    Record bus stops on the graph nodes, in place.

    Every node gets a 'bus_stops' list (empty if no stop maps to it). For
    each (node_id, info) pair in attribute_list, info['id'] is appended to
    that node's 'bus_stops', and the info dict is stored on the node under
    the stop ID as a one-element list.

    Args:
        G: graph whose G.nodes[node_id] yields a mutable attribute dict.
        attribute_list: iterable of (node_id, info_dict) pairs.

    Returns:
        None; G is modified in place.
    """
    # Give every node its OWN empty list. Passing a single [] to
    # nx.set_node_attributes would store the same list object on all nodes.
    for node_id in G.nodes:
        G.nodes[node_id]['bus_stops'] = []

    for node_id, info in attribute_list:
        stop_id = info['id']
        node_data = G.nodes[node_id]
        # add the bus id to the node's list of bus stops
        node_data['bus_stops'].append(stop_id)
        # store the attribute dictionary under the bus id
        node_data[stop_id] = [info]

In [None]:
# Add bus stop attributes to each node that has a bus stop (G is modified in place).
add_path_attributes(G, attribute_list)

In [None]:
# Display the attributes of the node paired with the first entry of attribute_list.
G.nodes[list(attribute_list)[0][0]]

In [None]:
# Rebuild the nodes GeoDataFrame from G now that bus stop attributes have been added.
gdf_nodes = ox.convert.graph_to_gdfs(
    G, nodes=True, edges=False, node_geometry=True,
    fill_edge_geometry=False)

In [None]:
# gdf_nodes.explore(m=nearest_node_map)

In [None]:
# Save the graph as GraphML, then reload it from the same file (G is replaced by the reloaded graph).
filepath = "./graph_data/bus_stops.graphml"
ox.io.save_graphml(G, filepath)
G = ox.io.load_graphml(filepath)

In [None]:
from pathlib import Path

In [None]:
# Create the "data" directory (relative to the working directory) if it does not exist yet.
Path("data").mkdir(parents=True, exist_ok=True)

In [None]:
# Get all features carrying an "amenity" tag for the place and save them as a GeoPackage via geopandas.
gdf = ox.features.features_from_place(place, tags={"amenity": True})
# Cast every column except "geometry" to str before writing.
gdf = gdf.apply(lambda c: c.astype(str) if c.name != "geometry" else c, axis=0)
gdf.to_file("./data/pois.gpkg", driver="GPKG")

In [None]:
# Build a fresh drive graph for New York, replacing the G that was modified and reloaded above.
place = "New York, New York"
G = ox.graph.graph_from_place(place, network_type="drive")
# Projected version of G. NOTE(review): Gp is not referenced again in the cells visible here.
Gp = ox.projection.project_graph(G)

In [None]:
# Convert the graph to node and edge GeoDataFrames.
NY_gdf_nodes, NY_gdf_edges = ox.convert.graph_to_gdfs(G)
# Display the nodes without the geometry column; the result is not assigned,
# so NY_gdf_nodes itself keeps its geometry.
NY_gdf_nodes.drop('geometry', axis=1)

In [None]:
# Preview the first rows of the edges GeoDataFrame.
NY_gdf_edges.head()
    

In [None]:
from collections import defaultdict
import pandas as pd
import geopandas as gpd

In [None]:
# grouping bus stations by node
def add_attributes_to_gdf(gdf, attribute_list):
    """
    Group bus stop IDs by node and index the stop info by stop ID.

    Args:
        gdf: accepted for interface compatibility; not used by the function.
        attribute_list: iterable of (node_id, info_dict) entries, where
            info_dict['id'] is the bus stop ID.

    Returns:
        A (df_dict, bus_info_dict) tuple. df_dict has parallel lists under
        'osmid' (node IDs in first-seen order) and 'stops' (the stop IDs
        mapped to each node). bus_info_dict maps stop ID to its info dict.
    """
    stops_by_node = {}
    info_by_stop = {}

    for entry in attribute_list:
        node_id, stop_info = entry[0], entry[1]
        stop_id = stop_info['id']
        info_by_stop[stop_id] = stop_info
        # start a new list the first time a node is seen, then append
        stops_by_node.setdefault(node_id, []).append(stop_id)

    table = {
        'osmid': list(stops_by_node),
        'stops': list(stops_by_node.values()),
    }
    return table, info_by_stop

In [None]:
# attr[0] holds the osmid/stops columns and attr[1] the stop-ID -> info mapping.
attr = add_attributes_to_gdf(NY_gdf_nodes, attribute_list)
# Tabulate the node-to-stops columns and display the table.
df = pd.DataFrame(attr[0])
df

In [None]:
# Add bus IDs to all nodes associated with bus stations. This is a left join
# on 'osmid', so rows of NY_gdf_nodes without a match in df are kept.
bus_gdf = NY_gdf_nodes.merge(df, on='osmid',how='left')

In [None]:
# saving nodes to file
# Create the output directory first: nothing earlier in the notebook creates
# ./graph_data/gpkg, and writing into a missing directory fails.
Path('./graph_data/gpkg').mkdir(parents=True, exist_ok=True)
bus_gdf.to_file('./graph_data/gpkg/NY_gdf_nodes.gpkg')

In [None]:
import json

In [None]:
# Save the edges GeoDataFrame to a GeoPackage file.
NY_gdf_edges.to_file('./graph_data/gpkg/NY_gdf_edges.gpkg')

In [None]:
# Save the stop-ID -> info dict (attr[1], returned by add_attributes_to_gdf) as JSON.
filename = "graph_data/gpkg/bus_info_dict.json"
with open(filename, 'w') as file:
    json.dump(attr[1], file, indent=4)

print(f"Dictionary saved to {filename}")

In [None]:
# Reload the node and edge GeoPackages and the bus info JSON from disk.
loaded_nodes = gpd.read_file('./graph_data/gpkg/NY_gdf_nodes.gpkg')
loaded_edges = gpd.read_file('./graph_data/gpkg/NY_gdf_edges.gpkg')
json_path='./graph_data/gpkg/bus_info_dict.json'
with open(json_path) as json_file:
    loaded_dict = json.load(json_file)



### Saving the graph with bus stops to GraphML

In [None]:
# Build a fresh drive graph for New York for the GraphML export.
place = "New York, New York"
G = ox.graph.graph_from_place(place, network_type="drive")
# Projected version of G. NOTE(review): Gp is not referenced again in the cells visible here.
Gp = ox.projection.project_graph(G)

In [None]:
# attaching bus stop IDs to graph nodes
def add_attributes_to_multigraph(G, attribute_list):
    """
    Set a 'bus_stops' list on every node that has stops, and index stop info by ID.

    Each info dict in attribute_list is modified in place: its 'osmid' key is
    set to the node ID it is paired with. Nodes that appear in no pair are
    left untouched.

    Args:
        G: graph whose node attributes are updated via nx.set_node_attributes.
        attribute_list: iterable of (node_id, info_dict) entries, where
            info_dict['id'] is the bus stop ID.

    Returns:
        dict mapping each bus stop ID to its (now osmid-tagged) info dict.
    """
    node_attrs = {}
    stops_by_id = {}

    for entry in attribute_list:
        node_id, stop_info = entry[0], entry[1]
        # tag the stop with its node before indexing it
        stop_info['osmid'] = node_id
        stop_id = stop_info['id']
        stops_by_id[stop_id] = stop_info
        node_attrs.setdefault(node_id, {'bus_stops': []})['bus_stops'].append(stop_id)

    nx.set_node_attributes(G, node_attrs)

    return stops_by_id

In [None]:
# Tag nodes in G with their bus stop IDs and keep the stop-ID -> info mapping.
bus_info_dict = add_attributes_to_multigraph(G, attribute_list)

In [None]:
# save graph with bus stations to graphml file
ox.io.save_graphml(G, filepath='./graph_data/stations.graphml')
# Save the stop info returned by add_attributes_to_multigraph, rather than
# attr[1] left over from the GeoPackage section, so this cell does not depend
# on that section having been run.
filename = "./graph_data/bus_info_dict.json"
with open(filename, 'w') as file:
    json.dump(bus_info_dict, file, indent=4)


In [None]:
# loading my graph from a file - seeing if all the data is saved
import ast

# GraphML stores attribute values as text, so without a converter the
# 'bus_stops' lists come back as strings such as "['MTA_200237']".
# ast.literal_eval turns them back into Python lists.
G = ox.io.load_graphml(filepath='./graph_data/stations.graphml',
                       node_dtypes={'bus_stops': ast.literal_eval})

In [None]:
# Show the type of the reloaded graph object.
print(type(G))

In [None]:
# Make sure the bus stop data was saved: list (node, bus_stops) pairs for every node.
list(G.nodes(data='bus_stops'))

In [None]:
# Keep only the nodes that have a 'bus_stops' attribute; G.nodes(data='bus_stops')
# yields (node, value) pairs with None for nodes where the attribute is missing.
bus_nodes = {
    node_id: stops
    for node_id, stops in G.nodes(data='bus_stops')
    if stops is not None
}

In [None]:
# Display the stop-ID -> info mapping.
bus_info_dict

In [None]:
def generate_route_dict(bus_info):
    """
    Map each route to the bus stops that serve it.

    Args:
        bus_info: dict of stop ID -> info dict with a 'routes' iterable.

    Returns:
        defaultdict(list) of route -> stop IDs. Stops appear in the iteration
        order of bus_info, not in the order a bus visits them.
    """
    stops_by_route = defaultdict(list)
    for stop_id, stop_info in bus_info.items():
        for route_name in stop_info['routes']:
            stops_by_route[route_name].append(stop_id)
    return stops_by_route


In [None]:
# The route dict doesn't really matter at this point because its stop lists are unordered.
# The ordered route dict is retrieved in MTAapi.ipynb.
route_dict = generate_route_dict(bus_info_dict)
# NOTE: route_dict is a defaultdict(list), so this lookup inserts an empty
# 'M79+' entry if that route is absent.
print(route_dict['M79+'])

In [None]:
# Save the route -> stop IDs mapping to a JSON file.
filename = "./graph_data/bus_routes.json"
with open(filename, 'w') as file:
    json.dump(route_dict, file, indent=4)