# Save a file for each HUC with reach geoms and ids

To make things more cloud-friendly, we want to save a portion of the NHD database as a set of GeoJSON files that can be stored on S3. There is one file per HUC12, and each file holds the reach geometries and their associated ids.

In [48]:
import json
import os

import psycopg2
import shapely
import shapely.wkt
import geopandas as gpd
import xarray as xr
import fsspec
import numpy as np
import pyproj
import dask.bag as db
from dask.distributed import Client

%matplotlib inline

In [2]:
def get_cursor(database):
    """Open a new connection to ``database`` and return a cursor on it.

    Each call creates a fresh psycopg2 connection to host ``noaa-db`` as
    user ``postgres``. Only the cursor is returned; the connection object
    itself is not handed back to the caller.
    """
    conn_params = {
        "host": "noaa-db",
        "database": database,
        "user": "postgres",
        "password": "mysecretpassword",
    }
    return psycopg2.connect(**conn_params).cursor()

# Get all HUC12s and write a GeoJSON file for each one.

In [50]:
# Collect the huc12 value of every row in the wbdhu12 table.
cursor = get_cursor('nhdplushr')
query = "SELECT huc12 from wbdhu12"
cursor.execute(query)
# Each result row is a 1-tuple; keep only its single column.
huc12s = [row[0] for row in cursor.fetchall()]

In [45]:
def _close_cursor(cursor):
    """Close a cursor from get_cursor and the connection it was opened on."""
    connection = cursor.connection
    try:
        cursor.close()
    finally:
        connection.close()


def save_huc_extract(huc12, out_dir):
    """Write a GeoJSON file holding the reaches that intersect one HUC12.

    The HUC12 boundary is read from ``wbdhu12`` in the ``nhdplushr``
    database, then every ``nhdflowline`` row in ``nhdplusv2`` whose geometry
    intersects that boundary is fetched. The output file
    ``<out_dir>/<huc12>.json`` contains one feature per reach (``comid`` and
    2D geometry) plus one extra feature with ``comid`` 0 whose geometry is
    the HUC12 boundary itself.

    Args:
        huc12: Value of the ``huc12`` column identifying the unit to extract.
        out_dir: Directory the GeoJSON file is written into.

    Raises:
        ValueError: If ``wbdhu12`` has no row for ``huc12``.
    """
    # Import the submodules explicitly; the file-level imports only cover
    # `shapely` and `shapely.wkt`.
    import shapely.geometry
    import shapely.wkb

    # get huc12 boundary
    cursor = get_cursor('nhdplushr')
    try:
        query = "SELECT wkb_geometry from wbdhu12 WHERE huc12=%s"
        cursor.execute(query, [huc12])
        boundary_row = cursor.fetchone()
    finally:
        # This function is mapped over many HUCs, so release the connection
        # instead of leaving it open.
        _close_cursor(cursor)
    if boundary_row is None:
        raise ValueError(f'no wbdhu12 row found for huc12 {huc12!r}')
    huc_geom = shapely.wkb.loads(boundary_row[0].tobytes())

    # get reaches intersecting with huc boundary
    query = '''
        SELECT comid, ST_Force2D(wkb_geometry) from nhdflowline WHERE ST_Intersects(
            ST_GeomFromWKB(wkb_geometry, 4326), ST_GeomFromGeoJSON(%s))
        '''
    huc_geom_str = json.dumps(shapely.geometry.mapping(huc_geom))
    cursor = get_cursor('nhdplusv2')
    try:
        cursor.execute(query, [huc_geom_str])
        reach_rows = cursor.fetchall()
    finally:
        _close_cursor(cursor)

    reach_ids = [int(reach_id) for reach_id, _ in reach_rows]
    # The reach geometries are parsed as hex-encoded WKB.
    reach_geoms = [
        shapely.wkb.loads(reach_geom, hex=True) for _, reach_geom in reach_rows
    ]

    # make dataframe with comid and geometries and save to GeoJSON
    # (the HUC boundary is appended as a final feature with comid 0)
    df = gpd.GeoDataFrame({'comid': reach_ids + [0], 'geometry': reach_geoms + [huc_geom]})
    out_path = os.path.join(out_dir, f'{huc12}.json')
    df.to_file(out_path, driver='GeoJSON', index=False)

In [51]:
# Local output directory; created if it does not exist yet.
out_dir = '/opt/data/noaa/huc12-extracts/'
os.makedirs(out_dir, exist_ok=True)
# Alternative S3 destination (not enabled):
# out_dir = 's3://research-lf-dev/noaa/huc12-extracts/'

# Only the first `limit` HUCs are processed, one bag partition per HUC.
limit = 10
npartitions = limit

# Parallelize across HUCs
client = Client()
selected_huc12s = huc12s[:limit]
huc_bag = db.from_sequence(selected_huc12s, npartitions=npartitions)
extract_jobs = huc_bag.map(save_huc_extract, out_dir)
out = extract_jobs.compute()

Perhaps you already have a cluster running?
Hosting the HTTP server on port 34689 instead
