
# Export Building Footprints to GeoParquet and Shapefile

This notebook exports building footprints data from a Databricks silver table to:
* **GeoParquet** format (for efficient geospatial data storage)
* **Zipped Shapefile** format (for compatibility with GIS tools like Esri)

Data is split by `county_fips` (one output file per county) and written to year-specific subfolders (`parquet/<year>` and `shp/<year>`) of a Unity Catalog volume.

## Parameters
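* **tiger_year**: Year suffix; it is appended to `source_table` and used as the output subfolder name
* **source_table**: Fully qualified table name prefix (catalog.schema.table); the value of `tiger_year` is appended to it to form the table that is read
* **volume_path**: Unity Catalog volume path (e.g., `/Volumes/catalog/schema/volume_name`)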

In [0]:
# Install the geospatial packages used by this notebook.
# Note: specific package versions can be pinned for reproducibility.
# According to the original author's note, all of these packages already exist
# in the serverless runtime; the install is kept here for illustration.
%pip install geopandas pyarrow shapely --quiet
# Restart the Python process so the freshly installed packages are the ones imported below.
dbutils.library.restartPython()

In [0]:
import os
import geopandas as gpd
import pandas as pd
from pyspark.sql import functions as F
from shapely import wkb
import zipfile
import tempfile
import shutil

In [0]:
# Read the notebook parameters from widgets and derive the table name and output paths.
#
# For interactive runs the widgets can be defined first, for example:
#   dbutils.widgets.text("tiger_year", "<YYYY>")
#   dbutils.widgets.text("source_table", "odi_datalake.odi_silver.building_footprints_with_blocks_and_places_")
#   dbutils.widgets.text("volume_path", "/Volumes/odi_datalake/odi_gold/global_ml_building_footprints")
year = dbutils.widgets.get("tiger_year").strip()
source_table_prefix = dbutils.widgets.get("source_table").strip()
# Drop trailing slashes so the derived paths never contain "//".
volume_path = dbutils.widgets.get("volume_path").strip().rstrip("/")

# Fail fast on blank parameters. In particular, a blank year would make the
# output paths point at the parent "parquet/" and "shp/" folders, which the
# clean-up cell removes recursively -- wiping the exports of every year.
missing_params = [
    name
    for name, value in (
        ("tiger_year", year),
        ("source_table", source_table_prefix),
        ("volume_path", volume_path),
    )
    if not value
]
if missing_params:
    raise ValueError(f"Missing required parameter(s): {', '.join(missing_params)}")

# The year is appended to the table name prefix to select the year-specific table.
source_table = source_table_prefix + year

# Create subdirectories for parquet and shapefile outputs
parquet_path = f"{volume_path}/parquet/{year}"
shp_path = f"{volume_path}/shp/{year}"

print(f"Source table: {source_table}")
print("Output paths:")
print(f"  - GeoParquet: {parquet_path}")
print(f"  - Shapefiles: {shp_path}")

In [0]:
# Reference the year-specific source table once; the per-county work later
# filters this same DataFrame instead of looking the table up again.
print(f"Loading data for year {year}...")
all_data_df = spark.table(source_table)

# Build the sorted list of distinct, non-null county FIPS codes found in the table.
counties_df = (
    all_data_df
    .select("county_fips")
    .distinct()
    .filter(F.col("county_fips").isNotNull())
    .orderBy("county_fips")
)
counties = [row["county_fips"] for row in counties_df.collect()]

# Show only the first ten codes, with an ellipsis when the list is longer.
counties_preview = counties[:10]
preview_suffix = "..." if len(counties) > 10 else ""
print(f"Found {len(counties)} counties to process for year {year}")
print(f"Counties: {counties_preview}{preview_suffix}")

In [0]:
# Clear existing files in this year's output directories so results from
# previous runs do not accumulate. The directories are not recreated here.
print("Cleaning output directories...")

for output_dir in (parquet_path, shp_path):
    try:
        # dbutils.fs.rm returns a boolean; a falsy result is treated as
        # "nothing was there to remove".
        removed = dbutils.fs.rm(output_dir, recurse=True)
    except Exception as exc:
        # Stay best-effort like the original, but report the real cause
        # (e.g. a permissions problem) instead of claiming the path is missing.
        print(f"  ⚠ Could not clear {output_dir}: {exc}")
    else:
        if removed:
            print(f"  ✓ Cleared {output_dir}")
        else:
            print(f"  ℹ {output_dir} doesn't exist yet")

print("Ready to process counties.")

In [0]:
# Create output directories if they don't exist
dbutils.fs.mkdirs(parquet_path)
dbutils.fs.mkdirs(shp_path)

# Export every county to one GeoParquet file and one zipped shapefile.
for index, county in enumerate(counties):
    print(f"Processing county_fips {county} ({index + 1}/{len(counties)})...")

    # Restrict the source DataFrame to this county and pull it to the driver.
    # NOTE(review): all_data_df is never explicitly cached in this notebook, so
    # each iteration may re-read the source table -- confirm this is acceptable.
    county_df = all_data_df.filter(F.col("county_fips") == county)
    pdf = county_df.toPandas()

    # Convert the geometry column to shapely geometries in one vectorized call.
    # Assumes every non-null value exposes its WKB bytes through a `.wkb`
    # attribute -- TODO confirm against the source table's geometry type.
    pdf['geometry'] = gpd.GeoSeries.from_wkb(pdf['geometry'].apply(lambda x: x.wkb if x is not None else None))

    # Create GeoDataFrame with EPSG:4326 (WGS84) coordinate reference system
    gdf = gpd.GeoDataFrame(pdf, geometry='geometry', crs='EPSG:4326')

    # Drop GeometryCollection rows; all other geometry types are exported.
    gdf = gdf[gdf.geometry.geom_type != 'GeometryCollection']

    # Define file prefix
    file_prefix = f"county_fips_{county}"

    # Write to GeoParquet (an existing file with the same name is replaced)
    parquet_file = f"{parquet_path}/{file_prefix}.parquet"
    gdf.to_parquet(parquet_file)
    print(f"  ✓ Wrote {len(gdf)} records to {parquet_file}")

    # Write the zipped shapefile to local scratch space first, then copy it to
    # the volume with a .zip extension (for Esri compatibility).
    # NOTE(review): presumably the shapefile cannot be written to the volume
    # directly -- confirm before removing the intermediate copy.
    shp_file = f"{shp_path}/{file_prefix}.zip"

    # A private temporary directory avoids name clashes with stale or
    # concurrent runs and is removed even when writing or copying fails.
    with tempfile.TemporaryDirectory() as temp_dir:
        # .shz suffix triggers GDAL to write zipped shapefile automatically
        temp_shz = os.path.join(temp_dir, f"{file_prefix}.shz")
        gdf.to_file(temp_shz)

        # Use shutil.copy to copy the binary file directly
        shutil.copy(temp_shz, shp_file)

    print(f"  ✓ Wrote shapefile to {shp_file}")

print(f"\n✅ Processing complete! Processed {len(counties)} counties for year {year}.")