# HUC8 Streamflow Aggregation Query

This notebook shows how to get a HUC8 by id, query NHD for all reaches within the HUC, and then query the NWM reanalysis dataset to get aggregated streamflow values.

Note: this is a work in progress. See TODOs.

# Setup

In [47]:
import json
from os.path import join

import psycopg2
import shapely
import shapely.wkt
import geopandas as gpd
import xarray as xr
import fsspec
import numpy as np
import pyproj
from dask.distributed import Client
import numpy as np
import pandas as pd

%matplotlib inline

In [4]:
client = Client(n_workers=8)

2022-06-27 16:37:16,383 - distributed.diskutils - INFO - Found stale lock file and directory '/opt/src/notebooks/dask-worker-space/worker-2rllskz2', purging
2022-06-27 16:37:16,400 - distributed.diskutils - INFO - Found stale lock file and directory '/opt/src/notebooks/dask-worker-space/worker-7ab3y1_b', purging
2022-06-27 16:37:16,412 - distributed.diskutils - INFO - Found stale lock file and directory '/opt/src/notebooks/dask-worker-space/worker-cgu4aad0', purging
2022-06-27 16:37:16,425 - distributed.diskutils - INFO - Found stale lock file and directory '/opt/src/notebooks/dask-worker-space/worker-e65jk5tw', purging
2022-06-27 16:37:16,438 - distributed.diskutils - INFO - Found stale lock file and directory '/opt/src/notebooks/dask-worker-space/worker-hmki76hz', purging
2022-06-27 16:37:16,451 - distributed.diskutils - INFO - Found stale lock file and directory '/opt/src/notebooks/dask-worker-space/worker-ifu_yjyy', purging
2022-06-27 16:37:16,464 - distributed.diskutils - INFO - F

In [13]:
# COMIDs for a HUC12 in West Philly.
reach_ids= [
    4494440,
    4494462,
    4494500,
    4494526,
    4495680,
    4489136,
    4495656,
    4496602,
    4489138,
    4494480
]

In [45]:
! pip install smart_open

Collecting smart_open
  Downloading smart_open-6.0.0-py3-none-any.whl (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.4/58.4 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: smart_open
Successfully installed smart_open-6.0.0
[0m

In [54]:
# Get COMIDs for a HUC8 around Philly from a HUC8 extract on S3.
from smart_open import open

philly_huc8 = '02040202'
huc8_root_uri = 's3://azavea-noaa-hydro-data/noaa/huc8-extracts/transformed/'
huc8_uri = join(huc8_root_uri, f'{philly_huc8}.json')

with open(huc8_uri) as fd:
    huc8_dict = json.loads(fd.read())
    reach_ids = huc8_dict['features'][0]['properties']['comids']

# Query NWM by reach ids

In [66]:
# Load the CHRTOUT data from the NWM Retrospective Zarr 2.1 dataset
# This has "Streamflow values at points associated with flow lines" 
# See https://registry.opendata.aws/nwm-archive/
nwm_uri = 's3://noaa-nwm-retrospective-2-1-zarr-pds/chrtout.zarr'
ds = xr.open_zarr(fsspec.get_mapper(nwm_uri, anon=True, requester_pays=True))
ds

Unnamed: 0,Array,Chunk
Bytes,10.59 MiB,10.59 MiB
Shape,"(2776738,)","(2776738,)"
Count,2 Tasks,1 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 10.59 MiB 10.59 MiB Shape (2776738,) (2776738,) Count 2 Tasks 1 Chunks Type float32 numpy.ndarray",2776738  1,

Unnamed: 0,Array,Chunk
Bytes,10.59 MiB,10.59 MiB
Shape,"(2776738,)","(2776738,)"
Count,2 Tasks,1 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,39.72 MiB,39.72 MiB
Shape,"(2776738,)","(2776738,)"
Count,2 Tasks,1 Chunks
Type,|S15,numpy.ndarray
"Array Chunk Bytes 39.72 MiB 39.72 MiB Shape (2776738,) (2776738,) Count 2 Tasks 1 Chunks Type |S15 numpy.ndarray",2776738  1,

Unnamed: 0,Array,Chunk
Bytes,39.72 MiB,39.72 MiB
Shape,"(2776738,)","(2776738,)"
Count,2 Tasks,1 Chunks
Type,|S15,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,10.59 MiB,10.59 MiB
Shape,"(2776738,)","(2776738,)"
Count,2 Tasks,1 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 10.59 MiB 10.59 MiB Shape (2776738,) (2776738,) Count 2 Tasks 1 Chunks Type float32 numpy.ndarray",2776738  1,

Unnamed: 0,Array,Chunk
Bytes,10.59 MiB,10.59 MiB
Shape,"(2776738,)","(2776738,)"
Count,2 Tasks,1 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,10.59 MiB,10.59 MiB
Shape,"(2776738,)","(2776738,)"
Count,2 Tasks,1 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 10.59 MiB 10.59 MiB Shape (2776738,) (2776738,) Count 2 Tasks 1 Chunks Type float32 numpy.ndarray",2776738  1,

Unnamed: 0,Array,Chunk
Bytes,10.59 MiB,10.59 MiB
Shape,"(2776738,)","(2776738,)"
Count,2 Tasks,1 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,10.59 MiB,10.59 MiB
Shape,"(2776738,)","(2776738,)"
Count,2 Tasks,1 Chunks
Type,int32,numpy.ndarray
"Array Chunk Bytes 10.59 MiB 10.59 MiB Shape (2776738,) (2776738,) Count 2 Tasks 1 Chunks Type int32 numpy.ndarray",2776738  1,

Unnamed: 0,Array,Chunk
Bytes,10.59 MiB,10.59 MiB
Shape,"(2776738,)","(2776738,)"
Count,2 Tasks,1 Chunks
Type,int32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,7.42 TiB,153.81 MiB
Shape,"(367439, 2776738)","(672, 30000)"
Count,50872 Tasks,50871 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 7.42 TiB 153.81 MiB Shape (367439, 2776738) (672, 30000) Count 50872 Tasks 50871 Chunks Type float64 numpy.ndarray",2776738  367439,

Unnamed: 0,Array,Chunk
Bytes,7.42 TiB,153.81 MiB
Shape,"(367439, 2776738)","(672, 30000)"
Count,50872 Tasks,50871 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,7.42 TiB,153.81 MiB
Shape,"(367439, 2776738)","(672, 30000)"
Count,50872 Tasks,50871 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 7.42 TiB 153.81 MiB Shape (367439, 2776738) (672, 30000) Count 50872 Tasks 50871 Chunks Type float64 numpy.ndarray",2776738  367439,

Unnamed: 0,Array,Chunk
Bytes,7.42 TiB,153.81 MiB
Shape,"(367439, 2776738)","(672, 30000)"
Count,50872 Tasks,50871 Chunks
Type,float64,numpy.ndarray


In [57]:
# Note that the chunks are 2D as opposed to whole rows or whole columns.
ds.streamflow

Unnamed: 0,Array,Chunk
Bytes,7.42 TiB,153.81 MiB
Shape,"(367439, 2776738)","(672, 30000)"
Count,50872 Tasks,50871 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 7.42 TiB 153.81 MiB Shape (367439, 2776738) (672, 30000) Count 50872 Tasks 50871 Chunks Type float64 numpy.ndarray",2776738  367439,

Unnamed: 0,Array,Chunk
Bytes,7.42 TiB,153.81 MiB
Shape,"(367439, 2776738)","(672, 30000)"
Count,50872 Tasks,50871 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,10.59 MiB,10.59 MiB
Shape,"(2776738,)","(2776738,)"
Count,2 Tasks,1 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 10.59 MiB 10.59 MiB Shape (2776738,) (2776738,) Count 2 Tasks 1 Chunks Type float32 numpy.ndarray",2776738  1,

Unnamed: 0,Array,Chunk
Bytes,10.59 MiB,10.59 MiB
Shape,"(2776738,)","(2776738,)"
Count,2 Tasks,1 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,39.72 MiB,39.72 MiB
Shape,"(2776738,)","(2776738,)"
Count,2 Tasks,1 Chunks
Type,|S15,numpy.ndarray
"Array Chunk Bytes 39.72 MiB 39.72 MiB Shape (2776738,) (2776738,) Count 2 Tasks 1 Chunks Type |S15 numpy.ndarray",2776738  1,

Unnamed: 0,Array,Chunk
Bytes,39.72 MiB,39.72 MiB
Shape,"(2776738,)","(2776738,)"
Count,2 Tasks,1 Chunks
Type,|S15,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,10.59 MiB,10.59 MiB
Shape,"(2776738,)","(2776738,)"
Count,2 Tasks,1 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 10.59 MiB 10.59 MiB Shape (2776738,) (2776738,) Count 2 Tasks 1 Chunks Type float32 numpy.ndarray",2776738  1,

Unnamed: 0,Array,Chunk
Bytes,10.59 MiB,10.59 MiB
Shape,"(2776738,)","(2776738,)"
Count,2 Tasks,1 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,10.59 MiB,10.59 MiB
Shape,"(2776738,)","(2776738,)"
Count,2 Tasks,1 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 10.59 MiB 10.59 MiB Shape (2776738,) (2776738,) Count 2 Tasks 1 Chunks Type float32 numpy.ndarray",2776738  1,

Unnamed: 0,Array,Chunk
Bytes,10.59 MiB,10.59 MiB
Shape,"(2776738,)","(2776738,)"
Count,2 Tasks,1 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,10.59 MiB,10.59 MiB
Shape,"(2776738,)","(2776738,)"
Count,2 Tasks,1 Chunks
Type,int32,numpy.ndarray
"Array Chunk Bytes 10.59 MiB 10.59 MiB Shape (2776738,) (2776738,) Count 2 Tasks 1 Chunks Type int32 numpy.ndarray",2776738  1,

Unnamed: 0,Array,Chunk
Bytes,10.59 MiB,10.59 MiB
Shape,"(2776738,)","(2776738,)"
Count,2 Tasks,1 Chunks
Type,int32,numpy.ndarray


In [16]:
# Queries can be seen as lying on a spectrum that ranges from "a single time step and all features" to "all time steps and one feature."
# This is to get an idea how the current chunking pattern supports these extremes.
shape = np.array(ds.streamflow.shape)
chunk_sizes = np.array([ds.streamflow.chunks[0][0], ds.streamflow.chunks[1][0]])
nb_chunks = shape // chunk_sizes
gb_per_chunk = (chunk_sizes.prod() * 8) / (10 ** 9)

print(f'The number of chunks for each dimension: {nb_chunks}')
print(f'For a whole time series and single feature: {nb_chunks[0]} chunks, and {nb_chunks[0] * gb_per_chunk :.1f} GB')
print(f'For a single time step and all features: {nb_chunks[1]} chunks, and {nb_chunks[1] * gb_per_chunk :.1f} GB')

The number of chunks for each dimension: [546  92]
For a whole time series and single feature: 546 chunks, and 88.1 GB
For a single time step and all features: 92 chunks, and 14.8 GB


In [58]:
# Apparently, only some of the reach ids in NHDPlus V2 are available in NWM.
# Question: why is that?
avail_reach_ids = list(set(ds.feature_id.values).intersection(set(reach_ids)))
print(
    f'There are {len(reach_ids)} reaches in the HUC and {len(avail_reach_ids)} of those are in NWM.')

There are 2153 reaches in the HUC and 1787 of those are in NWM.


In [59]:
# Lazily compute the subset of the dataset that contains a certain time range.
time_range = slice('1980-01-01', '1980-01-03')
sub_ds = ds.sel(feature_id=avail_reach_ids, time=time_range)
sub_ds

Unnamed: 0,Array,Chunk
Bytes,6.98 kiB,6.98 kiB
Shape,"(1787,)","(1787,)"
Count,3 Tasks,1 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 6.98 kiB 6.98 kiB Shape (1787,) (1787,) Count 3 Tasks 1 Chunks Type float32 numpy.ndarray",1787  1,

Unnamed: 0,Array,Chunk
Bytes,6.98 kiB,6.98 kiB
Shape,"(1787,)","(1787,)"
Count,3 Tasks,1 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,26.18 kiB,26.18 kiB
Shape,"(1787,)","(1787,)"
Count,3 Tasks,1 Chunks
Type,|S15,numpy.ndarray
"Array Chunk Bytes 26.18 kiB 26.18 kiB Shape (1787,) (1787,) Count 3 Tasks 1 Chunks Type |S15 numpy.ndarray",1787  1,

Unnamed: 0,Array,Chunk
Bytes,26.18 kiB,26.18 kiB
Shape,"(1787,)","(1787,)"
Count,3 Tasks,1 Chunks
Type,|S15,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,6.98 kiB,6.98 kiB
Shape,"(1787,)","(1787,)"
Count,3 Tasks,1 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 6.98 kiB 6.98 kiB Shape (1787,) (1787,) Count 3 Tasks 1 Chunks Type float32 numpy.ndarray",1787  1,

Unnamed: 0,Array,Chunk
Bytes,6.98 kiB,6.98 kiB
Shape,"(1787,)","(1787,)"
Count,3 Tasks,1 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,6.98 kiB,6.98 kiB
Shape,"(1787,)","(1787,)"
Count,3 Tasks,1 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 6.98 kiB 6.98 kiB Shape (1787,) (1787,) Count 3 Tasks 1 Chunks Type float32 numpy.ndarray",1787  1,

Unnamed: 0,Array,Chunk
Bytes,6.98 kiB,6.98 kiB
Shape,"(1787,)","(1787,)"
Count,3 Tasks,1 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,6.98 kiB,6.98 kiB
Shape,"(1787,)","(1787,)"
Count,3 Tasks,1 Chunks
Type,int32,numpy.ndarray
"Array Chunk Bytes 6.98 kiB 6.98 kiB Shape (1787,) (1787,) Count 3 Tasks 1 Chunks Type int32 numpy.ndarray",1787  1,

Unnamed: 0,Array,Chunk
Bytes,6.98 kiB,6.98 kiB
Shape,"(1787,)","(1787,)"
Count,3 Tasks,1 Chunks
Type,int32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,0.98 MiB,192.17 kiB
Shape,"(72, 1787)","(49, 502)"
Count,51246 Tasks,188 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 0.98 MiB 192.17 kiB Shape (72, 1787) (49, 502) Count 51246 Tasks 188 Chunks Type float64 numpy.ndarray",1787  72,

Unnamed: 0,Array,Chunk
Bytes,0.98 MiB,192.17 kiB
Shape,"(72, 1787)","(49, 502)"
Count,51246 Tasks,188 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,0.98 MiB,192.17 kiB
Shape,"(72, 1787)","(49, 502)"
Count,51246 Tasks,188 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 0.98 MiB 192.17 kiB Shape (72, 1787) (49, 502) Count 51246 Tasks 188 Chunks Type float64 numpy.ndarray",1787  72,

Unnamed: 0,Array,Chunk
Bytes,0.98 MiB,192.17 kiB
Shape,"(72, 1787)","(49, 502)"
Count,51246 Tasks,188 Chunks
Type,float64,numpy.ndarray


In [62]:
import time
import tqdm

def time_func(func, repeats=1):
    times = []
    for _ in tqdm.tqdm(range(repeats), leave=False, desc='repeat'):
        start_time = time.time()
        func()
        elapsed = time.time() - start_time
        times.append(elapsed)
    return np.array(times)

In [63]:
repeats = 1
query_map = {
    'mean_features_mean_day': (lambda: sub_ds.streamflow.mean(dim='feature_id').groupby('time.dayofyear').mean().values),
    'mean_day': (lambda: sub_ds.streamflow.groupby('time.dayofyear').mean().values),
    'mean_week': (lambda: sub_ds.streamflow.groupby('time.weekofyear').mean().values)
}

query = []
time_mean = []
time_std = []
for qname, qfunc in tqdm.tqdm(query_map.items(), desc='query'):
    times = time_func(qfunc, repeats=repeats)
    query.append(qname)
    time_mean.append(times.mean())
    time_std.append(times.std())
df = pd.DataFrame(data={'query': query, 'time_mean': time_mean, 'time_std': time_std})

query: 100%|██████████| 3/3 [02:32<00:00, 50.87s/it]


In [64]:
nb_reaches = len(avail_reach_ids)
nb_days = (pd.to_datetime(time_range.stop) - pd.to_datetime(time_range.start)).days
format = 'zarr'
time_chunk_sz = chunk_sizes[0]
feature_id_chunk_sz = chunk_sizes[1]

df['nb_reaches'] = nb_reaches
df['nb_days'] = nb_days
df['nb_repeats'] = repeats
df['format'] = format
df['time_chunk_sz'] = time_chunk_sz
df['feature_id_chunk_sz'] = feature_id_chunk_sz

In [65]:
df

Unnamed: 0,query,time_mean,time_std,nb_reaches,nb_days,nb_repeats,format,time_chunk_sz,feature_id_chunk_sz
0,mean_features_mean_day,48.186932,0.0,1787,2,1,zarr,672,30000
1,mean_day,52.038361,0.0,1787,2,1,zarr,672,30000
2,mean_week,51.890872,0.0,1787,2,1,zarr,672,30000


In [67]:
benchmark_root_uri = 's3://azavea-noaa-hydro-data/esip-experiments/benchmarks/'
benchmark_uri = join(benchmark_root_uri, 'test-06-27-2021a.csv')
df.to_csv(benchmark_uri)