# OSM Parsing

In [1]:
# imports 
import lxml.etree as ET # xml parser
import pandas as pd # convert to parquet

## Using osmium for filtering down dataset

In [2]:
# prints the file's header metadata: format, compression, size in bytes, bounding box, and header options
# (object counts are not part of this output; osmium reports them only with `--extended` / `-e`, which reads the whole file)
!osmium fileinfo data/north-america-latest.osm.pbf

File:
  Name: data/north-america-latest.osm.pbf
  Format: PBF
  Compression: none
  Size: 16799084065
Header:
  Bounding boxes:
    (-180,5.57228,180,85.04177)
  With history: no
  Options:
    generator=osmium/1.14.0
    osmosis_replication_base_url=https://download.geofabrik.de/north-america-updates
    osmosis_replication_sequence_number=4339
    osmosis_replication_timestamp=2025-02-19T21:20:47Z
    pbf_dense_nodes=true
    pbf_optional_feature_0=Sort.Type_then_ID
    sorting=Type_then_ID
    timestamp=2025-02-19T21:20:47Z


The output above tells us:

- File size: ~16.8 GB
- The bounding box is the geographic extent of the data (min and max longitude and latitude of the area covered by the file)
    - Longitude: -180° (west) to 180° (east)
    - Latitude: ~5.57° (south) to ~85.04° (north)

In [3]:
# filters `north-america-latest.osm.pbf` file for: (SLO specific)

# entirety of SLO (bounded box around SLO) (LONG1,LAT1,LONG2,LAT2)
!osmium extract -b "-120.7,35.2,-120.5,35.4" data/north-america-latest.osm.pbf -o data/slo_places.osm.pbf

# dining places in SLO (restaurant, cafe, fast food, bar/pub, food court)
!osmium tags-filter data/slo_places.osm.pbf n/amenity=bar n/amenity=biergarten n/amenity=cafe n/amenity=fast_food n/amenity=food_court n/amenity=ice_cream n/amenity=pub n/amenity=restaurant n/amenity=bbq n/amenity=vending_machine -o data/slo_dining.osm.pbf

# food places in SLO (dining and food shops)
!osmium tags-filter data/slo_places.osm.pbf n/amenity=bar n/amenity=biergarten n/amenity=cafe n/amenity=fast_food n/amenity=food_court n/amenity=ice_cream n/amenity=pub n/amenity=restaurant n/amenity=bbq n/amenity=vending_machine n/shop=alcohol n/shop=bakery n/beverages n/shop=brewing_supplies n/shop=butcher n/shop=cheese n/shop=confectionery n/shop=convenience n/shop=dairy n/shop=farm n/shop=food n/shop=frozen_food n/shop=greengrocer n/shop=health_food n/shop=health_food n/shop=ice_cream n/shop=nuts n/shop=pasta n/shop=pastry n/shop=seafood n/shop=spices n/shop=tea n/shop=tortilla n/shop=water n/shop=wine -o data/slo_food.osm.pbf



In [4]:
# filters `north-america-latest.osm.pbf` file for: (Davis specific)

# entirety of Davis (bounded box around Davis)
!osmium extract -b "-121.85,38.45,-121.65,38.65" data/north-america-latest.osm.pbf -o data/davis_places.osm.pbf

# dining places in Davis (restaurant, cafe, fast food, bar/pub, food court)
!osmium tags-filter data/davis_places.osm.pbf n/amenity=bar n/amenity=biergarten n/amenity=cafe n/amenity=fast_food n/amenity=food_court n/amenity=ice_cream n/amenity=pub n/amenity=restaurant n/amenity=bbq n/amenity=vending_machine -o data/davis_dining.osm.pbf

# food places in Davis (dining and food shops)
!osmium tags-filter data/davis_places.osm.pbf n/amenity=bar n/amenity=biergarten n/amenity=cafe n/amenity=fast_food n/amenity=food_court n/amenity=ice_cream n/amenity=pub n/amenity=restaurant n/amenity=bbq n/amenity=vending_machine n/shop=alcohol n/shop=bakery n/beverages n/shop=brewing_supplies n/shop=butcher n/shop=cheese n/shop=confectionery n/shop=convenience n/shop=dairy n/shop=farm n/shop=food n/shop=frozen_food n/shop=greengrocer n/shop=health_food n/shop=health_food n/shop=ice_cream n/shop=nuts n/shop=pasta n/shop=pastry n/shop=seafood n/shop=spices n/shop=tea n/shop=tortilla n/shop=water n/shop=wine -o data/davis_food.osm.pbf



In [5]:
# filters `north-america-latest.osm.pbf` file for: (Athens, GA specific if have time UGA)

# entirety of Athens (bounded box around Athens)
!osmium extract -b "-83.5,33.88,-83.3,34.05" data/north-america-latest.osm.pbf -o data/athens_places.osm.pbf

# dining places in Athens (restaurant, cafe, fast food, bar/pub, food court)
!osmium tags-filter data/athens_places.osm.pbf n/amenity=bar n/amenity=biergarten n/amenity=cafe n/amenity=fast_food n/amenity=food_court n/amenity=ice_cream n/amenity=pub n/amenity=restaurant n/amenity=bbq n/amenity=vending_machine -o data/athens_dining.osm.pbf

# food places in Athens (dining and food shops)
!osmium tags-filter data/athens_places.osm.pbf n/amenity=bar n/amenity=biergarten n/amenity=cafe n/amenity=fast_food n/amenity=food_court n/amenity=ice_cream n/amenity=pub n/amenity=restaurant n/amenity=bbq n/amenity=vending_machine n/shop=alcohol n/shop=bakery n/beverages n/shop=brewing_supplies n/shop=butcher n/shop=cheese n/shop=confectionery n/shop=convenience n/shop=dairy n/shop=farm n/shop=food n/shop=frozen_food n/shop=greengrocer n/shop=health_food n/shop=health_food n/shop=ice_cream n/shop=nuts n/shop=pasta n/shop=pastry n/shop=seafood n/shop=spices n/shop=tea n/shop=tortilla n/shop=water n/shop=wine -o data/athens_food.osm.pbf



In [6]:
# convert pbf to osm

!osmium cat data/slo_places.osm.pbf -o data/slo_places.osm
!osmium cat data/slo_dining.osm.pbf -o data/slo_dining.osm
!osmium cat data/slo_food.osm.pbf -o data/slo_food.osm

!osmium cat data/davis_places.osm.pbf -o data/davis_places.osm
!osmium cat data/davis_dining.osm.pbf -o data/davis_dining.osm
!osmium cat data/davis_food.osm.pbf -o data/davis_food.osm

!osmium cat data/athens_places.osm.pbf -o data/athens_places.osm
!osmium cat data/athens_dining.osm.pbf -o data/athens_dining.osm
!osmium cat data/athens_food.osm.pbf -o data/athens_food.osm



## Using XML parser to parse dataset into Parquet file

- use xml parser
- script to parse 'xml formatted' file
    - extract only nodes
        - extract timestamp (?)
        - extract lat & lon
    - save tags (just option to have these, can be null if not)
        - have to have `name`
        - have to have food-related `amenity` or food-related `shop` or food-related `highway`
            - `amenity`: bar, biergarten, cafe, fast_food, food_court, ice_cream, pub, restaurant
            - `amenity`: bbq (built for grilling in public)
            - `amenity`: vending_machine
            - `shop`: alcohol, bakery, beverages, brewing_supplies, butcher, cheese, chocolate, coffee, confectionery, convenience, dairy, farm, food, frozen_food, greengrocer, health_food, ice_cream, nuts, pasta, pastry, seafood, spices, tea, tortilla, water, wine
            - `highway`: services

        - doesn't have to have `cuisine` (but save in parquet if there is)
        - doesn't have to have `brand` (but save to parquet if there is)

In [9]:
# script to parse xml

def extract_osm_nodes(osm_file, tags_extract):
    """Stream an OSM XML file and return one dict per ``<node>`` element.

    Parameters
    ----------
    osm_file : str
        Path to an OSM XML (``.osm``) file.
    tags_extract : collection of str
        Tag keys to copy from a node's ``<tag k=... v=...>`` children into its
        dict. Keys the node does not carry are simply absent from its dict.

    Returns
    -------
    list of dict
        One dict per node, in file order, holding ``id`` (str), ``timestamp``
        (result of ``pd.to_datetime``), ``lat`` and ``lon`` (float), followed
        by the requested tags the node has.

    Notes
    -----
    ``<tag>`` elements that belong to anything other than a node (ways,
    relations) are ignored.
    """
    nodes = []  # store all nodes with tags

    # Listen to 'end' events only: a node's <tag> children are guaranteed to be
    # fully parsed by the time the node itself ends, and an element can be
    # cleared safely once it has ended.
    for _, elem in ET.iterparse(osm_file, events=('end',)):

        if elem.tag == 'tag':
            # Leave <tag> elements intact; they are read through their parent
            # node below and released when that parent is cleared.
            continue

        if elem.tag == 'node':
            node_data = {
                'id': elem.get('id'),
                'timestamp': pd.to_datetime(elem.get('timestamp')),  # convert to datetime object
                'lat': float(elem.get('lat')),
                'lon': float(elem.get('lon')),  # floats, so coordinates can be used in computations
            }

            # Only direct <tag> children of this node are considered, so tags
            # of ways/relations can never be attributed to (or crash on) a node.
            for tag in elem.iterfind('tag'):
                tag_key = tag.get('k')
                if tag_key in tags_extract:  # only store specific tags
                    node_data[tag_key] = tag.get('v')

            nodes.append(node_data)

        elem.clear()  # drop attributes and children of finished elements to save memory

    return nodes

In [10]:
# repeat ^^ with slo, davis, athens

tags_extract = ['name', 'amenity', 'shop', 'cuisine', 'brand']

# Fixed column set for every parquet file. Without it the columns depend on which tags
# happen to occur in an extract, so a file with no `brand`/`cuisine` (or no nodes at all)
# would be written with a different, or empty, schema. Missing values are stored as nulls.
parquet_columns = ['id', 'timestamp', 'lat', 'lon'] + tags_extract

osm_files = [
    'data/slo_food.osm', 'data/slo_dining.osm', 'data/davis_food.osm', 'data/davis_dining.osm', 'data/athens_food.osm', 'data/athens_dining.osm'
]

for osm_file in osm_files:
    nodes = extract_osm_nodes(osm_file, tags_extract)

    osm_df = pd.DataFrame(nodes, columns=parquet_columns)
    # 'data/slo_food.osm' -> 'slo_food.parquet' (written to the current working directory)
    parquet_file = osm_file.replace('data/', '').replace('.osm', '.parquet')
    osm_df.to_parquet(parquet_file, engine="pyarrow", index=False)