# Demo of archiving NWM predictions 

This notebook demonstrates how to download NWM predictions and append them to Parquet files, and could form the basis of an NWM archive service. Zarr works well for gridded data, but Parquet seems preferable for point-based data. Appending to Parquet is done here through Dask's `to_parquet(..., append=True)`; it is still unclear how well this approach works for a growing archive, and a future iteration may investigate appending to Parquet further.

In [46]:
from os.path import join, exists, basename
import tempfile
from urllib import request
from os import makedirs
import os
import shutil

from tqdm import tqdm
import numpy as np
import xarray as xr
import pandas as pd
import dask.dataframe as ddf

In [47]:
# Root directory for everything this notebook writes. It defaults to the
# original location; set the NWM_OUT_DIR environment variable to run the
# notebook against a different directory without editing this cell.
out_dir = os.environ.get('NWM_OUT_DIR', '/opt/data/noaa/nwm-preds')
# Parquet datasets produced by the archiving cell are written here.
archive_dir = join(out_dir, 'archive')
# Downloaded NetCDF files are stored here.
tmp_dir = join(out_dir, 'tmp')
for required_dir in (archive_dir, tmp_dir):
    # exist_ok=True keeps the cell safe to re-run.
    makedirs(required_dir, exist_ok=True)

In [4]:
from dask.distributed import Client

# Start a Dask distributed client with a fixed number of workers.
n_dask_workers = 8
client = Client(n_workers=n_dask_workers)

distributed.diskutils - INFO - Found stale lock file and directory '/opt/src/notebooks/dask-worker-space/worker-04x414_k', purging
distributed.diskutils - INFO - Found stale lock file and directory '/opt/src/notebooks/dask-worker-space/worker-4haftelk', purging
distributed.diskutils - INFO - Found stale lock file and directory '/opt/src/notebooks/dask-worker-space/worker-6rje9dof', purging
distributed.diskutils - INFO - Found stale lock file and directory '/opt/src/notebooks/dask-worker-space/worker-7jmi76wt', purging
distributed.diskutils - INFO - Found stale lock file and directory '/opt/src/notebooks/dask-worker-space/worker-809kio3s', purging
distributed.diskutils - INFO - Found stale lock file and directory '/opt/src/notebooks/dask-worker-space/worker-80gt7t9s', purging
distributed.diskutils - INFO - Found stale lock file and directory '/opt/src/notebooks/dask-worker-space/worker-8dmyjsdf', purging
distributed.diskutils - INFO - Found stale lock file and directory '/opt/src/notebo

In [5]:
def get_nwm_uri(date, data_type, cycle_runtime, forecast_hour):
    """Build the NOMADS URL of one NWM short-range CONUS prediction file.

    Args:
        date: run date as it should appear in the URL (the notebook passes a
            ``%Y%m%d`` string).
        data_type: product name inserted into the file name, e.g.
            ``'channel_rt'``.
        cycle_runtime: cycle hour; zero-padded to width 2 in the file name.
        forecast_hour: forecast offset in hours; zero-padded to width 3 in
            the file name.

    Returns:
        The URL as a string.
    """
    base_url = 'https://nomads.ncep.noaa.gov/pub/data/nccf/com/nwm/prod'
    day_dir = f'nwm.{date}/short_range'
    file_name = (
        f'nwm.t{cycle_runtime:02}z.short_range.{data_type}'
        f'.f{forecast_hour:03}.conus.nc')
    return f'{base_url}/{day_dir}/{file_name}'

## Download a subset of NWM predictions for today

For each day, there is a prediction file for each point within a 2D space with the following dimensions:
* `cycle_runtime`: a time when the predictions were generated; values are in [0-23]
* `forecast_hour`: how far into the future the predictions are, indexed by the hour offset from the `cycle_runtime`; values are in [1-18].

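Here we download a 2x2 grid (cycle runtimes 1 and 2, forecast hours 1 and 2) for testing purposes. The download only runs when `download_hour_files` is set to `True`.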

In [8]:
# Current time as a timezone-aware UTC timestamp. `Timestamp.utcnow()` is
# deprecated in recent pandas releases; `Timestamp.now(tz='UTC')` is the
# supported equivalent.
ts = pd.Timestamp.now(tz='UTC')
# The date string is embedded in the download URL built by get_nwm_uri.
date = ts.strftime("%Y%m%d")
# The set of data_types includes ['channel_rt', 'land', 'reservoir', 'terrain_rt']
data_type = 'channel_rt'
# When False the download loop below is skipped entirely; later cells then
# expect the files to already be present in tmp_dir.
download_hour_files = False

if download_hour_files:
    for cycle_runtime in [1, 2]:
        for forecast_hour in [1, 2]:
            nwm_uri = get_nwm_uri(date, data_type, cycle_runtime, forecast_hour)
            # Store each file in tmp_dir under the same name it has on the server.
            nwm_path = join(tmp_dir, basename(nwm_uri))
            print(f'Downloading {nwm_uri}')
            request.urlretrieve(nwm_uri, nwm_path)

In [6]:
# Open one of the NetCDF files from tmp_dir and display it.
sample_name = 'nwm.t01z.short_range.channel_rt.f001.conus.nc'
sample_path = join(tmp_dir, sample_name)
ds = xr.open_dataset(sample_path)
ds

In [48]:
chunks = {'time': 1, 'feature_id': 100_000}
out_path = join(archive_dir, f'{data_type}.parquet')
out_index_path = join(archive_dir, f'{data_type}-index.parquet')

# Write every forecast file from tmp_dir into two Parquet datasets in the
# archive directory: one with the default index and one indexed by feature_id.
# The first file is written with append=False; every later file is appended.
append = False
for cycle_runtime in tqdm([1, 2]):
    for forecast_hour in tqdm([1, 2]):
        nwm_uri = get_nwm_uri(date, data_type, cycle_runtime, forecast_hour)
        nwm_path = join(tmp_dir, basename(nwm_uri))

        with xr.open_dataset(nwm_path, chunks=chunks) as ds:
            # `Dataset.drop` is deprecated; `drop_vars` is the supported way
            # to remove a variable. The result gets its own name so the
            # dataset managed by the `with` statement is not rebound.
            ds_hour = ds.drop_vars('crs')
            # Replace with offset hour instead of absolute time.
            ds_hour = ds_hour.assign_coords(time=np.array([forecast_hour]))
            df = ds_hour.to_dask_dataframe()

            # Note Parquet version of file is much larger than NetCDF (200 mb vs. 40 mb)
            df.to_parquet(out_path, append=append, ignore_divisions=True)

            # Second copy of the same rows, indexed by feature_id.
            df = df.set_index('feature_id')
            df.to_parquet(out_index_path, append=append, ignore_divisions=True)

            # Everything after the first file is appended.
            append = True

100%|██████████| 2/2 [00:26<00:00, 13.42s/it]
100%|██████████| 2/2 [00:29<00:00, 14.76s/it]
100%|██████████| 2/2 [00:56<00:00, 28.20s/it]


# Test reading under various conditions.

It takes about 25 secs for the first execution with or without an index.

In [49]:
# Read the un-indexed Parquet output and keep only the rows for feature 101,
# using a boolean mask on the feature_id column.
df = ddf.read_parquet(out_path)
is_feature_101 = df['feature_id'] == 101
df = df[is_feature_101]
df.compute()

Unnamed: 0,time,reference_time,feature_id,streamflow,nudge,velocity,qSfcLatRunoff,qBucket,qBtmVertRunoff
0,1,2022-03-21 01:00:00,101,0.12,0.0,0.07,0.0,0.00224,7.839
0,2,2022-03-21 01:00:00,101,0.12,0.0,0.07,0.0,0.00224,7.833
0,1,2022-03-21 02:00:00,101,0.12,0.0,0.07,0.0,0.00224,7.833
0,2,2022-03-21 02:00:00,101,0.12,0.0,0.07,0.0,0.00224,7.825


In [51]:
# Read the feature_id-indexed Parquet output and select feature 101 by index
# label.
indexed = ddf.read_parquet(out_index_path)
df = indexed.loc[101]
df.compute()

Unnamed: 0_level_0,time,reference_time,streamflow,nudge,velocity,qSfcLatRunoff,qBucket,qBtmVertRunoff
feature_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
101,1,2022-03-21 01:00:00,0.12,0.0,0.07,0.0,0.00224,7.839
101,2,2022-03-21 01:00:00,0.12,0.0,0.07,0.0,0.00224,7.833
101,1,2022-03-21 02:00:00,0.12,0.0,0.07,0.0,0.00224,7.833
101,2,2022-03-21 02:00:00,0.12,0.0,0.07,0.0,0.00224,7.825
