# Save subset of NWM in Zarr and Parquet format

This saves a subset of the NWM CHRTOUT dataset in Zarr and Parquet format. The data is chunked along the `feature_id` dimension in order to optimize for querying a subset of the reaches. This can be used to support benchmarks that compare the speed of queries using these different formats.

# Setup

In [27]:
import json
from os.path import join
from os import makedirs
import os
import shutil

import dask.dataframe as dd
import psycopg2
import shapely
import shapely.wkt
import geopandas as gpd
import xarray as xr
import fsspec
import numpy as np
import pyproj
import pandas as pd

%matplotlib inline

In [28]:
def get_cursor(database):
    """Connect to the Postgres server and return a cursor on the given database.

    Connection settings default to the local development values, and can be
    overridden with the ``NOAA_DB_HOST``, ``NOAA_DB_USER`` and
    ``NOAA_DB_PASSWORD`` environment variables so that real credentials do not
    have to be written into the notebook.

    Args:
        database: name of the database to connect to.

    Returns:
        A psycopg2 cursor. The connection it was created from is not closed
        here; it remains reachable as ``cursor.connection`` so the caller can
        close it when finished.
    """
    connection = psycopg2.connect(
        host=os.environ.get('NOAA_DB_HOST', 'noaa-db'),
        database=database,
        user=os.environ.get('NOAA_DB_USER', 'postgres'),
        password=os.environ.get('NOAA_DB_PASSWORD', 'mysecretpassword'))
    return connection.cursor()

# Make sample of dataset

To speed up downstream processes, we create a small subset of NWM CHRTOUT that only covers the reaches in a portion of the country that is around Philly.

In [29]:
# Collect the COMID of every flowline stored in the local nhdplusv2 sample.
cursor = get_cursor('nhdplusv2')
query = '''
    SELECT comid from nhdflowline
    '''
cursor.execute(query)
# Each fetched row is a 1-tuple holding the comid; keep it as a plain int.
reach_ids = [int(row[0]) for row in cursor.fetchall()]
print(f'There are {len(reach_ids)} reach ids in the local copy of NHDPlusV2.')

There are 139808 reach ids in the local copy of NHDPlusV2.


In [36]:
# Open the CHRTOUT variables of the NWM Retrospective Zarr 2.1 dataset.
# The registry describes them as "Streamflow values at points associated
# with flow lines"; see https://registry.opendata.aws/nwm-archive/
nwm_uri = 's3://noaa-nwm-retrospective-2-1-zarr-pds/chrtout.zarr'
# Build the key/value store for the bucket first, then hand it to xarray.
nwm_store = fsspec.get_mapper(nwm_uri, anon=True, requester_pays=True)
ds = xr.open_zarr(nwm_store)
ds

Unnamed: 0,Array,Chunk
Bytes,10.59 MiB,10.59 MiB
Shape,"(2776738,)","(2776738,)"
Count,2 Tasks,1 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 10.59 MiB 10.59 MiB Shape (2776738,) (2776738,) Count 2 Tasks 1 Chunks Type float32 numpy.ndarray",2776738  1,

Unnamed: 0,Array,Chunk
Bytes,10.59 MiB,10.59 MiB
Shape,"(2776738,)","(2776738,)"
Count,2 Tasks,1 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,39.72 MiB,39.72 MiB
Shape,"(2776738,)","(2776738,)"
Count,2 Tasks,1 Chunks
Type,|S15,numpy.ndarray
"Array Chunk Bytes 39.72 MiB 39.72 MiB Shape (2776738,) (2776738,) Count 2 Tasks 1 Chunks Type |S15 numpy.ndarray",2776738  1,

Unnamed: 0,Array,Chunk
Bytes,39.72 MiB,39.72 MiB
Shape,"(2776738,)","(2776738,)"
Count,2 Tasks,1 Chunks
Type,|S15,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,10.59 MiB,10.59 MiB
Shape,"(2776738,)","(2776738,)"
Count,2 Tasks,1 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 10.59 MiB 10.59 MiB Shape (2776738,) (2776738,) Count 2 Tasks 1 Chunks Type float32 numpy.ndarray",2776738  1,

Unnamed: 0,Array,Chunk
Bytes,10.59 MiB,10.59 MiB
Shape,"(2776738,)","(2776738,)"
Count,2 Tasks,1 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,10.59 MiB,10.59 MiB
Shape,"(2776738,)","(2776738,)"
Count,2 Tasks,1 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 10.59 MiB 10.59 MiB Shape (2776738,) (2776738,) Count 2 Tasks 1 Chunks Type float32 numpy.ndarray",2776738  1,

Unnamed: 0,Array,Chunk
Bytes,10.59 MiB,10.59 MiB
Shape,"(2776738,)","(2776738,)"
Count,2 Tasks,1 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,10.59 MiB,10.59 MiB
Shape,"(2776738,)","(2776738,)"
Count,2 Tasks,1 Chunks
Type,int32,numpy.ndarray
"Array Chunk Bytes 10.59 MiB 10.59 MiB Shape (2776738,) (2776738,) Count 2 Tasks 1 Chunks Type int32 numpy.ndarray",2776738  1,

Unnamed: 0,Array,Chunk
Bytes,10.59 MiB,10.59 MiB
Shape,"(2776738,)","(2776738,)"
Count,2 Tasks,1 Chunks
Type,int32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,7.42 TiB,153.81 MiB
Shape,"(367439, 2776738)","(672, 30000)"
Count,50872 Tasks,50871 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 7.42 TiB 153.81 MiB Shape (367439, 2776738) (672, 30000) Count 50872 Tasks 50871 Chunks Type float64 numpy.ndarray",2776738  367439,

Unnamed: 0,Array,Chunk
Bytes,7.42 TiB,153.81 MiB
Shape,"(367439, 2776738)","(672, 30000)"
Count,50872 Tasks,50871 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,7.42 TiB,153.81 MiB
Shape,"(367439, 2776738)","(672, 30000)"
Count,50872 Tasks,50871 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 7.42 TiB 153.81 MiB Shape (367439, 2776738) (672, 30000) Count 50872 Tasks 50871 Chunks Type float64 numpy.ndarray",2776738  367439,

Unnamed: 0,Array,Chunk
Bytes,7.42 TiB,153.81 MiB
Shape,"(367439, 2776738)","(672, 30000)"
Count,50872 Tasks,50871 Chunks
Type,float64,numpy.ndarray


In [31]:
# Build the sample: only the reaches that exist both in the local DB and in
# NWM, and only a single day of data. Selecting a whole month makes the
# notebook die.
avail_reach_ids = sorted(
    int(fid) for fid in set(reach_ids).intersection(ds.feature_id.values))
print(
    f'There are {len(reach_ids)} reaches in the DB and {len(avail_reach_ids)} of those are in NWM.')

sub_ds = ds.sel(feature_id=avail_reach_ids, time='1979-02-01')


def _approx_gb(data_array):
    """Rough size of an array in GB, counting 4 bytes per value."""
    # NOTE(review): the dataset repr lists the 2-D arrays as float64, so
    # 4 bytes/value may understate the in-memory size -- confirm the intent.
    return (4 * data_array.size) / (10 ** 9)


orig_gb = _approx_gb(ds.streamflow)
sub_gb = _approx_gb(sub_ds.streamflow)

print(f'Original dataset is {orig_gb:.2f} gb')
print(f'Sample dataset is {sub_gb:.2f} gb')

sub_ds

There are 139808 reaches in the DB and 125391 of those are in NWM.
Original dataset is 4081.13 gb
Sample dataset is 0.01 gb


Unnamed: 0,Array,Chunk
Bytes,489.81 kiB,489.81 kiB
Shape,"(125391,)","(125391,)"
Count,3 Tasks,1 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 489.81 kiB 489.81 kiB Shape (125391,) (125391,) Count 3 Tasks 1 Chunks Type float32 numpy.ndarray",125391  1,

Unnamed: 0,Array,Chunk
Bytes,489.81 kiB,489.81 kiB
Shape,"(125391,)","(125391,)"
Count,3 Tasks,1 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.79 MiB,1.79 MiB
Shape,"(125391,)","(125391,)"
Count,3 Tasks,1 Chunks
Type,|S15,numpy.ndarray
"Array Chunk Bytes 1.79 MiB 1.79 MiB Shape (125391,) (125391,) Count 3 Tasks 1 Chunks Type |S15 numpy.ndarray",125391  1,

Unnamed: 0,Array,Chunk
Bytes,1.79 MiB,1.79 MiB
Shape,"(125391,)","(125391,)"
Count,3 Tasks,1 Chunks
Type,|S15,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,489.81 kiB,489.81 kiB
Shape,"(125391,)","(125391,)"
Count,3 Tasks,1 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 489.81 kiB 489.81 kiB Shape (125391,) (125391,) Count 3 Tasks 1 Chunks Type float32 numpy.ndarray",125391  1,

Unnamed: 0,Array,Chunk
Bytes,489.81 kiB,489.81 kiB
Shape,"(125391,)","(125391,)"
Count,3 Tasks,1 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,489.81 kiB,489.81 kiB
Shape,"(125391,)","(125391,)"
Count,3 Tasks,1 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 489.81 kiB 489.81 kiB Shape (125391,) (125391,) Count 3 Tasks 1 Chunks Type float32 numpy.ndarray",125391  1,

Unnamed: 0,Array,Chunk
Bytes,489.81 kiB,489.81 kiB
Shape,"(125391,)","(125391,)"
Count,3 Tasks,1 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,489.81 kiB,489.81 kiB
Shape,"(125391,)","(125391,)"
Count,3 Tasks,1 Chunks
Type,int32,numpy.ndarray
"Array Chunk Bytes 489.81 kiB 489.81 kiB Shape (125391,) (125391,) Count 3 Tasks 1 Chunks Type int32 numpy.ndarray",125391  1,

Unnamed: 0,Array,Chunk
Bytes,489.81 kiB,489.81 kiB
Shape,"(125391,)","(125391,)"
Count,3 Tasks,1 Chunks
Type,int32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,22.00 MiB,4.24 MiB
Shape,"(23, 125391)","(23, 24143)"
Count,50991 Tasks,26 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 22.00 MiB 4.24 MiB Shape (23, 125391) (23, 24143) Count 50991 Tasks 26 Chunks Type float64 numpy.ndarray",125391  23,

Unnamed: 0,Array,Chunk
Bytes,22.00 MiB,4.24 MiB
Shape,"(23, 125391)","(23, 24143)"
Count,50991 Tasks,26 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,22.00 MiB,4.24 MiB
Shape,"(23, 125391)","(23, 24143)"
Count,50991 Tasks,26 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 22.00 MiB 4.24 MiB Shape (23, 125391) (23, 24143) Count 50991 Tasks 26 Chunks Type float64 numpy.ndarray",125391  23,

Unnamed: 0,Array,Chunk
Bytes,22.00 MiB,4.24 MiB
Shape,"(23, 125391)","(23, 24143)"
Count,50991 Tasks,26 Chunks
Type,float64,numpy.ndarray


# Save sample of dataset in Zarr and Parquet format

In [6]:
# Root directory for the reformatted sample, with one sub-directory per format.
reformat_dir = '/opt/data/noaa/reformat-sample/'
parq_path, zarr_path = (
    join(reformat_dir, subdir)
    for subdir in ('streamflow-parquet', 'streamflow-zarr'))

In [11]:
%%time

# Write the sample as Parquet, starting from an empty output directory.
# Open question: the last recorded run took about 8.5 minutes of wall time.
# NOTE(review): presumably the remote reads and/or the set_index step
# dominate -- not profiled, confirm before optimizing.
if os.path.isdir(parq_path):
    shutil.rmtree(parq_path)
makedirs(parq_path)

# Open question: it is unclear why unify_chunks() has to be called before
# the conversion to a dask dataframe.
df = sub_ds.unify_chunks().to_dask_dataframe().set_index('feature_id')
df.to_parquet(parq_path)

CPU times: user 1min 19s, sys: 44.3 s, total: 2min 3s
Wall time: 8min 30s


(None,)

In [26]:
%%time

# Write the sample as Zarr, starting from an empty output directory.
if os.path.isdir(zarr_path):
    shutil.rmtree(zarr_path)
makedirs(zarr_path)
# Prepare the dataset under its own name instead of overwriting `sub_ds`, so
# that re-running this cell (or running cells out of order) never changes
# what the other cells see in `sub_ds`.
# TODO not sure why unify_chunks().chunk() or safe_chunks=False is needed.
# NOTE(review): possibly the variables still carry chunk encoding from the
# source store that conflicts with the dask chunks -- confirm.
zarr_ds = sub_ds.unify_chunks().chunk()
zarr_ds.to_zarr(zarr_path, safe_chunks=False)

CPU times: user 196 ms, sys: 85.8 ms, total: 282 ms
Wall time: 664 ms


<xarray.backends.zarr.ZarrStore at 0x7f08628dac10>

In [40]:
# Show the per-dimension chunk sizes of the sample to check how it is
# chunked across the feature_ids.
unified_sub_ds = sub_ds.unify_chunks()
unified_sub_ds.chunksizes

Frozen({'feature_id': (445, 5348, 802, 2192, 3915, 11722, 12946, 3414, 7466, 13248, 6817, 24143, 2, 12126, 1156, 3632, 1017, 1517, 1666, 1633, 7, 508, 4965, 3148, 766, 790), 'time': (23,)})