# OSM Data Conversion Approaches

In [9]:
from pathlib import Path # resolve file paths against the notebook's working directory

import folium # plot geo data
import geopandas as gpd # like pandas but also geospatial data
import polars as pl
from pyrosm import OSM # python library that reads and converts .osm.pbf files

## Some documentation / exploring before cleaning:

In [10]:
# shows general metadata about file, including size, number of objects, and other useful information
# NOTE: the recorded output below contains only the File/Header sections (no object counts).
!osmium fileinfo north-america-latest.osm.pbf

# Reading the recorded output:
# - Size: 16799084065 bytes (~16.8 GB)
# - bounding box represents the geographic bounding box for the data (min and max latitudes and longitudes for area covered in file)
#   reported as (-180,5.57228,180,85.04177), i.e. (min lon, min lat, max lon, max lat):
#     - Longitude: -180° (west) to 180° (east)
#     - Latitude: ~5.57° to ~85.04° (north-south)

File:
  Name: north-america-latest.osm.pbf
  Format: PBF
  Compression: none
  Size: 16799084065
Header:
  Bounding boxes:
    (-180,5.57228,180,85.04177)
  With history: no
  Options:
    generator=osmium/1.14.0
    osmosis_replication_base_url=https://download.geofabrik.de/north-america-updates
    osmosis_replication_sequence_number=4339
    osmosis_replication_timestamp=2025-02-19T21:20:47Z
    pbf_dense_nodes=true
    pbf_optional_feature_0=Sort.Type_then_ID
    sorting=Type_then_ID
    timestamp=2025-02-19T21:20:47Z


In [11]:
# # FILTER `north-america-latest.osm.pbf` file for areas with:
# # (every command in this cell is commented out; kept for reference only)
# # NOTE(review): the input path is inconsistent -- the first command reads
# # `north-america-latest.osm.pbf`, the rest read `final_analysis/north-america-latest.osm.pbf`.
# # NOTE(review): all expressions use the `n/` prefix, which restricts matching to nodes;
# # presumably landuse is mostly mapped on areas (ways/relations) -- confirm before re-enabling.

# # landuse=residential tags
# !osmium tags-filter north-america-latest.osm.pbf n/landuse=residential -o residential_sample.osm.pbf

# # landuse=residential, landuse=commercial, landuse=industrial tags
# # NOTE(review): in an osmium filter expression a comma separates *values* of one key, so this
# # probably needs to be `n/landuse=residential,commercial,industrial` (or three space-separated
# # expressions as in the commands below) -- verify against the osmium-tags-filter manual.
# !osmium tags-filter final_analysis/north-america-latest.osm.pbf n/landuse=residential,n/landuse=commercial,n/landuse=industrial -o landuse_sample.osm.pbf

# # landuse=commercial, landuse=retail, landuse=business tags
# !osmium tags-filter final_analysis/north-america-latest.osm.pbf n/landuse=commercial n/landuse=retail n/landuse=business -o commercial_retail_business.osm.pbf

# # restaurant, cafe, or fast_food under the amenity tag
# !osmium tags-filter final_analysis/north-america-latest.osm.pbf n/amenity=restaurant n/amenity=cafe n/amenity=fast_food -o food_commercial.osm.pbf

In [12]:
# # convert to osm files for readability
# # (disabled: these inputs are only produced by the commented-out filter commands in the cell above)

# !osmium cat residential_sample.osm.pbf -o residential_sample.osm
# !osmium cat landuse_sample.osm.pbf -o landuse_sample.osm
# !osmium cat commercial_retail_business.osm.pbf -o commercial_retail_business.osm
# !osmium cat food_commercial.osm.pbf -o food_commercial.osm

**TODO:** the commented-out filter and conversion cells above probably need to change.

In [13]:
# filters `north-america-latest.osm.pbf` file for: (SLO specific)

# entirety of SLO (bounded box around SLO) (Format is LONG1,LAT1,LONG2,LAT2)
!osmium extract -b "-120.7,34.9,-120.4,35.3" north-america-latest.osm.pbf -o slo_places.osm.pbf

# dining places in SLO (restaurant, cafe, fast food, bar/pub, food court)
!osmium tags-filter slo_places.osm.pbf n/amenity=restaurant n/amenity=cafe n/amenity=fast_food n/amenity=ice_cream n/amenity=bar n/amenity=pub n/amenity=food_court -o slo_dining.osm.pbf

# food places in SLO (dining and bakery/convenience store/market)
!osmium tags-filter slo_places.osm.pbf n/amenity=restaurant n/amenity=cafe n/amenity=fast_food n/amenity=ice_cream n/amenity=pub n/amenity=bar n/amenity=food_court n/shop=supermarket n/shop=bakery n/shop=convenience -o slo_food_places.osm.pbf




In [14]:
# # filters `north-america-latest.osm.pbf` file for: 
# # (disabled: California-wide version of the SLO extract + food-places filter)

# # entirety of CA (bounding box, LONG1,LAT1,LONG2,LAT2)
# # NOTE(review): this is a rectangle, so it presumably also covers areas outside California -- confirm.
# !osmium extract -b "-124.5,32.5,-113.5,42.0" north-america-latest.osm.pbf -o ca_places.osm.pbf
# !osmium tags-filter ca_places.osm.pbf n/amenity=restaurant n/amenity=cafe n/amenity=fast_food n/amenity=ice_cream n/amenity=pub n/amenity=bar n/amenity=food_court n/shop=supermarket n/shop=bakery n/shop=convenience -o ca_food_places.osm.pbf

In [15]:
# conversions

!osmium cat slo_places.osm.pbf -o slo_places.osm
!osmium cat slo_dining.osm.pbf -o slo_dining.osm
!osmium cat slo_food_places.osm.pbf -o slo_food_places.osm



## Data Conversion Approaches

### approach 1: convert pbf to parquet using json or csv in between

I need to convert the data to a more memory-efficient format (open question: which one?). Plan:
1. convert pbf to json/csv
2. convert json/csv to parquet

Then print out the structure of the parquet file to inspect it.

In [18]:
# convert slo -> csv -> parquet
!osmium export slo_places.osm -f text -o slo_places.csv # convert to csv

slo_df = pl.read_csv("slo_places.csv", truncate_ragged_lines=True)
# error: found more fields than defined in 'Schema' -- inconsistent row lengths, bad for csv tabular data

slo_df.write_parquet("slo_places.parquet")

In [19]:
# Check how the export was parsed by listing the column names of `slo_df`.
print(slo_df.columns) # recorded output: saved as 1 column, named after the first data row :(

['POINT(-120.589124 35.067871) highway=turning_circle']


In [None]:
# Read the parquet file back to confirm what was actually written to disk.
slo_parquet = pl.read_parquet("slo_places.parquet")
slo_parquet.head() # look at parquet; the recorded output shows just 1 string column :(

POINT(-120.589124 35.067871) highway=turning_circle
str
"""POINT(-120.648701 35.271668) h…"
"""POINT(-120.668631 35.284601) h…"
"""POINT(-120.66746 35.285105) di…"
"""POINT(-120.6165137 35.1293) hi…"
"""POINT(-120.5273139 35.0518052)…"


In [None]:
# convert slo -> json -> parquet
# (disabled: this attempt did not work)
# NOTE(review): the export line below is a copy of the CSV attempt (`-f text -o slo_places.csv`),
# so it never produces the `slo_places.json` that `pl.read_json` then reads; presumably it should
# be `-f geojson -o slo_places.json` -- verify against the osmium-export manual.
# !osmium export slo_places.osm -f text -o slo_places.csv # convert to csv

# slo_json = pl.read_json("slo_places.json") # broke
# slo_json.write_parquet("slo_places.parquet")

### approach 2: use osm-parquetizer
https://wiki.openstreetmap.org/wiki/Osm-parquetizer

- run the commands given in its documentation
- the output is split into 3 parquet files: node, way, relation

In [31]:
# use osm-parqueter to convert entire north-america?

!git clone https://github.com/adrianulbona/osm-parquetizer.git
%cd osm-parquetizer
!mvn package
!mvn clean package
!java -jar target/osm-parquetizer-1.0.1-SNAPSHOT.jar /Users/bernettechan/Desktop/osm_northamerica/north-america-latest.osm.pbf

Cloning into 'osm-parquetizer'...
remote: Enumerating objects: 219, done.[K
remote: Counting objects: 100% (20/20), done.[K
remote: Compressing objects: 100% (13/13), done.[K
remote: Total 219 (delta 2), reused 15 (delta 0), pack-reused 199 (from 1)[K
Receiving objects: 100% (219/219), 80.63 KiB | 635.00 KiB/s, done.
Resolving deltas: 100% (70/70), done.
/Users/bernettechan/Desktop/osm_northamerica/osm-parquetizer


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


[[1;34mINFO[m] Scanning for projects...
[[1;34mINFO[m] 
[[1;34mINFO[m] [1m---------------< [0;36mio.github.adrianulbona:osm-parquetizer[0;1m >---------------[m
[[1;34mINFO[m] [1mBuilding OSM Parquetizer 1.0.1-SNAPSHOT[m
[[1;34mINFO[m]   from pom.xml
[[1;34mINFO[m] [1m--------------------------------[ jar ]---------------------------------[m
[ERROR] 'dependencies.dependency.version' for jdk.tools:jdk.tools:jar is missing. @ 
[ERROR] 'dependencies.dependency.systemPath' for jdk.tools:jdk.tools:jar is missing. @ 

[[1;34mINFO[m] 
[[1;34mINFO[m] [1m--- [0;32mresources:3.3.1:resources[m [1m(default-resources)[m @ [36mosm-parquetizer[0;1m ---[m
[[1;34mINFO[m] Copying 1 resource from src/main/resources to target/classes
[[1;34mINFO[m] 
[[1;34mINFO[m] [1m--- [0;32mcompiler:3.3:compile[m [1m(default-compile)[m @ [36mosm-parquetizer[0;1m ---[m
[[1;34mINFO[m] Changes detected - recompiling the module!
[[1;34mINFO[m] Compiling 8 source files to /Us

In [34]:
# Try to load the node table that osm-parquetizer wrote next to the input .pbf.
# NOTE(review): hardcoded absolute, user-specific path -- only works on the original machine.
pl.read_parquet('/Users/bernettechan/Desktop/osm_northamerica/north-america-latest.osm.pbf.node.parquet')

# empty parquet file: the recorded ComputeError reports that the file lacks the parquet header/footer

ComputeError: parquet: File out of specification: A parquet file must contain a header and footer with at least 12 bytes

In [36]:
# use osm-parquetizer to convert slo_places

# clone is left commented out here: the repository is cloned by an earlier cell
# !git clone https://github.com/adrianulbona/osm-parquetizer.git
# NOTE: in the recorded output this %cd reports "No such file or directory" because the
# kernel's working directory was already inside osm-parquetizer; the build still ran from there.
%cd osm-parquetizer
!mvn clean package
# NOTE(review): hardcoded absolute, user-specific input path.
!java -jar target/osm-parquetizer-1.0.1-SNAPSHOT.jar /Users/bernettechan/Desktop/osm_northamerica/slo_places.osm.pbf

[Errno 2] No such file or directory: 'osm-parquetizer'
/Users/bernettechan/Desktop/osm_northamerica/osm-parquetizer
[[1;34mINFO[m] Scanning for projects...
[[1;34mINFO[m] 
[[1;34mINFO[m] [1m---------------< [0;36mio.github.adrianulbona:osm-parquetizer[0;1m >---------------[m
[[1;34mINFO[m] [1mBuilding OSM Parquetizer 1.0.1-SNAPSHOT[m
[[1;34mINFO[m]   from pom.xml
[[1;34mINFO[m] [1m--------------------------------[ jar ]---------------------------------[m
[ERROR] 'dependencies.dependency.version' for jdk.tools:jdk.tools:jar is missing. @ 
[ERROR] 'dependencies.dependency.systemPath' for jdk.tools:jdk.tools:jar is missing. @ 

[[1;34mINFO[m] 
[[1;34mINFO[m] [1m--- [0;32mclean:3.2.0:clean[m [1m(default-clean)[m @ [36mosm-parquetizer[0;1m ---[m
[[1;34mINFO[m] Deleting /Users/bernettechan/Desktop/osm_northamerica/osm-parquetizer/target
[[1;34mINFO[m] 
[[1;34mINFO[m] [1m--- [0;32mresources:3.3.1:resources[m [1m(default-resources)[m @ [36mosm-parque

^^ osm-parquetizer creates empty parquet files