# Demo of archiving NWM predictions 

This notebook demonstrates how to download NWM predictions and append them to Zarr files, and could form the basis of an NWM archive service. Zarr works well for gridded data, but Parquet seems preferable for point-based data. However, it is unclear if we can append to Parquet. A future iteration of this may investigate how to append to Parquet.

In [11]:
from os.path import join, exists, basename
import tempfile
from urllib import request
from os import makedirs
import os
import progressbar

import numpy as np
import xarray as xr
import pandas as pd

In [12]:
# Root of all notebook output; the Zarr archive and the scratch space for
# downloaded NetCDF files live in sub-directories beneath it.
out_dir = '/opt/data/noaa/nwm-preds'
archive_dir, tmp_dir = (join(out_dir, name) for name in ('archive', 'tmp'))

# exist_ok=True keeps this cell safe to re-run.
makedirs(archive_dir, exist_ok=True)
makedirs(tmp_dir, exist_ok=True)

In [13]:
# From https://stackoverflow.com/questions/37748105/how-to-use-progressbar-module-with-urlretrieve
class MyProgressBar():
    """Report hook for ``urllib.request.urlretrieve`` that draws a progress bar.

    An instance is passed as the ``reporthook`` argument and is then called
    with ``(block_num, block_size, total_size)`` as the download proceeds.
    The bar is created lazily on the first call because the total size is
    only known at that point. Use a fresh instance for every download.
    """

    def __init__(self):
        # Created on the first __call__, once the total size has been reported.
        self.pbar = None

    def __call__(self, block_num, block_size, total_size):
        """Advance the bar to ``block_num * block_size`` downloaded bytes.

        Args:
            block_num: Number of blocks transferred so far.
            block_size: Size of one block in bytes.
            total_size: Total download size in bytes. A negative value means
                the size is unknown (``urlretrieve`` reports -1 when the
                server does not provide one).
        """
        size_known = total_size >= 0

        # Compare against None explicitly so the check never depends on the
        # truthiness of the progress bar object itself.
        if self.pbar is None:
            # A negative maximum is not a valid bar length; fall back to an
            # indeterminate bar when the size is unknown.
            maxval = total_size if size_known else progressbar.UnknownLength
            self.pbar = progressbar.ProgressBar(maxval=maxval)
            self.pbar.start()

        downloaded = block_num * block_size
        if not size_known or downloaded < total_size:
            # With an unknown size there is no way to detect the last block,
            # so the bar just keeps counting bytes.
            self.pbar.update(downloaded)
        else:
            # The last block usually overshoots total_size; treat it as done.
            self.pbar.finish()

In [14]:
def get_nwm_uri(date, data_type, cycle_runtime, forecast_hour):
    """Build the NOMADS URL of one short-range NWM forecast file.

    Args:
        date: Date string used in the ``nwm.<date>`` directory name.
        data_type: Product name placed in the file name, e.g. 'terrain_rt'.
        cycle_runtime: Model cycle hour; zero-padded to two digits.
        forecast_hour: Forecast hour; zero-padded to three digits.

    Returns:
        The full HTTPS URL of the CONUS NetCDF file.
    """
    cycle_tag = format(cycle_runtime, '02')
    hour_tag = format(forecast_hour, '03')
    directory = f'https://nomads.ncep.noaa.gov/pub/data/nccf/com/nwm/prod/nwm.{date}/short_range/'
    file_name = f'nwm.t{cycle_tag}z.short_range.{data_type}.f{hour_tag}.conus.nc'
    return directory + file_name

def cache_nwm_file(date, data_type, cycle_runtime, forecast_hour,
                   tmp_dir, archive_dir, append=True):
    """Download one NWM forecast file and write it to a Zarr store.

    The NetCDF file is downloaded into ``tmp_dir``, written to
    ``<archive_dir>/<data_type>.zarr`` and then deleted again, whether or not
    the write succeeded.

    Args:
        date: Date string used to build the download URL.
        data_type: NWM product name, e.g. 'terrain_rt'; also names the store.
        cycle_runtime: Model cycle hour used to build the download URL.
        forecast_hour: Forecast hour used to build the download URL.
        tmp_dir: Existing directory that holds the temporary download.
        archive_dir: Existing directory that holds the Zarr store.
        append: If True, drop the 'crs' variable and append along the 'time'
            dimension. If False, write with no append dimension.
    """
    nwm_uri = get_nwm_uri(date, data_type, cycle_runtime, forecast_hour)
    nwm_path = join(tmp_dir, basename(nwm_uri))
    out_path = join(archive_dir, f'{data_type}.zarr')

    try:
        request.urlretrieve(nwm_uri, nwm_path, MyProgressBar())

        # The NetCDF and Zarr files are both ~50 MB, but the RAM usage of the
        # notebook goes from 1.5 to 9 GB while this is running. Occasionally,
        # the notebook kernel dies.
        # NOTE(review): presumably the decoded arrays are fully materialised in
        # memory here; opening with dask chunks may help -- TODO confirm.
        # The context manager closes the file handle before the temporary
        # file is removed.
        with xr.open_dataset(nwm_path) as ds:
            append_dim = None
            if append:
                # Remove CRS since it is always the same and we can't append
                # to it since it doesn't have a time dimension.
                # drop_vars replaces the deprecated Dataset.drop.
                ds = ds.drop_vars('crs')
                append_dim = 'time'

            ds.to_zarr(out_path, append_dim=append_dim)
    finally:
        # Clean up even when the download or the write failed; the file may
        # not exist if the download never started.
        if exists(nwm_path):
            os.remove(nwm_path)

## Download and archive three time steps of the current day's `terrain_rt` predictions. Opening the dataset lazily over HTTP does not work with `to_zarr`, so each file is downloaded first.

In [15]:
# Current time in UTC; its date selects the NOMADS directory to download from.
# Timestamp.utcnow() is deprecated in recent pandas, so ask for UTC explicitly.
ts = pd.Timestamp.now(tz='UTC')
date = ts.strftime("%Y%m%d")
# Available data types: 'channel_rt', 'land', 'reservoir', 'terrain_rt'
data_type = 'terrain_rt'
# model cycle runtime (0-23)
cycle_runtime = 0
# forecast hour (1-18)
forecast_hour = 1

In [16]:
# Archive the first three forecast hours of the selected cycle.
for forecast_hour in (1, 2, 3):
    # Nothing has been written before hour 1, so that step must not append;
    # every later step appends to what is already there.
    append = not forecast_hour == 1
    cache_nwm_file(
        date, data_type, cycle_runtime, forecast_hour,
        tmp_dir, archive_dir,
        append=append,
    )

100% (39636182 of 39636182) |############| Elapsed Time: 0:00:07 Time:  0:00:07
100% (39605767 of 39605767) |############| Elapsed Time: 0:00:07 Time:  0:00:07
100% (39575593 of 39575593) |############| Elapsed Time: 0:00:10 Time:  0:00:10


## Open the Zarr file that was written and examine it. Note the time dimension is length 3.

In [17]:
# Path of the store written for data_type under the archive directory.
zarr_path = join(archive_dir, '{}.zarr'.format(data_type))
ds = xr.open_zarr(zarr_path)
# Bare last expression so the notebook renders the dataset summary.
ds

Unnamed: 0,Array,Chunk
Bytes,6.33 GiB,4.22 MiB
Shape,"(3, 15360, 18432)","(1, 480, 1152)"
Count,1537 Tasks,1536 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 6.33 GiB 4.22 MiB Shape (3, 15360, 18432) (1, 480, 1152) Count 1537 Tasks 1536 Chunks Type float64 numpy.ndarray",18432  15360  3,

Unnamed: 0,Array,Chunk
Bytes,6.33 GiB,4.22 MiB
Shape,"(3, 15360, 18432)","(1, 480, 1152)"
Count,1537 Tasks,1536 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,6.33 GiB,4.22 MiB
Shape,"(3, 15360, 18432)","(1, 480, 1152)"
Count,1537 Tasks,1536 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 6.33 GiB 4.22 MiB Shape (3, 15360, 18432) (1, 480, 1152) Count 1537 Tasks 1536 Chunks Type float64 numpy.ndarray",18432  15360  3,

Unnamed: 0,Array,Chunk
Bytes,6.33 GiB,4.22 MiB
Shape,"(3, 15360, 18432)","(1, 480, 1152)"
Count,1537 Tasks,1536 Chunks
Type,float64,numpy.ndarray


In [18]:
# Display a single variable from the archived dataset.
ds['zwattablrt']

Unnamed: 0,Array,Chunk
Bytes,6.33 GiB,4.22 MiB
Shape,"(3, 15360, 18432)","(1, 480, 1152)"
Count,1537 Tasks,1536 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 6.33 GiB 4.22 MiB Shape (3, 15360, 18432) (1, 480, 1152) Count 1537 Tasks 1536 Chunks Type float64 numpy.ndarray",18432  15360  3,

Unnamed: 0,Array,Chunk
Bytes,6.33 GiB,4.22 MiB
Shape,"(3, 15360, 18432)","(1, 480, 1152)"
Count,1537 Tasks,1536 Chunks
Type,float64,numpy.ndarray
