# Build UHE-daily Zarr
Inspired by this [great blog and repo](https://earthmover.io/blog/serverless-datacube-pipeline/#What-Could-be-Improved).

## How this works
The input data consists of more than 12,000 GeoTIFF files (one per day). Instead of opening them all with Xarray's `open_mfdataset` + rasterio, this approach builds the Zarr store in two steps:
1. Create a 'skeleton' (or 'template') describing the structure of the final Zarr store and write it to cloud storage.
2. Insert slices into the template to fill out the Zarr store. This is done with Coiled functions.

In [1]:
import warnings

import coiled
import numpy as np
import pandas as pd
import xarray as xr
from tqdm import tqdm

## Functions to create input dataset urls and parse Xarray datasets

In [2]:
def gen_wget_strings(start_year: int, end_year: int) -> list:
    """
    Build the download URL of every daily WBGTmax GeoTIFF in the UHE-daily
    gridded data product (the basis for Tuholske et al., 2021).

    One URL is produced per calendar day, from January 1 of ``start_year``
    through December 31 of ``end_year`` (both inclusive), in chronological
    order.
    """
    first_day = f"{start_year}-01-01"
    last_day = f"{end_year}-12-31"

    urls = []
    for day in pd.date_range(first_day, last_day):
        # Files live in a per-year directory and carry a dotted date stamp.
        year_dir = day.strftime("%Y")
        datestamp = day.strftime("%Y.%m.%d")
        urls.append(
            f"https://data.chc.ucsb.edu/people/cascade/UHE-daily/wbgtmax/{year_dir}/wbgtmax.{datestamp}.tif"
        )
    return urls


def parse_ds(ds: xr.Dataset) -> xr.Dataset:
    """
    Reshape a single-day raster dataset into the layout used by the Zarr store.

    The date is recovered from the file name recorded in
    ``ds.encoding["source"]``, which must end in ``.YYYY.MM.DD.<ext>`` (the
    pattern produced by ``gen_wget_strings``). The dataset is then given a
    length-1 ``time`` dimension, stripped of its ``band`` dimension and
    ``spatial_ref`` variable, renamed (``band_data`` -> ``WBGT``, ``x`` ->
    ``lon``, ``y`` -> ``lat``), sorted by ``lat`` and chunked as one block per
    time step.
    """
    # NOTE: this installs a process-wide "ignore" filter, so warnings raised
    # after the first call are silenced everywhere, not only inside this
    # function.
    warnings.simplefilter("ignore")

    # ".../wbgtmax.1983.01.01.tif" -> ["1983", "01", "01"] -> "1983-01-01"
    date_parts = ds.encoding["source"].split(".")[-4:-1]
    timestamp = np.datetime64("-".join(date_parts))

    ds = (
        ds.expand_dims(time=[timestamp])
        .squeeze(dim=["band"], drop=True)
        # `Dataset.drop` is deprecated; `drop_vars` is the supported
        # equivalent for removing a variable by name.
        .drop_vars("spatial_ref")
        .rename({"band_data": "WBGT", "x": "lon", "y": "lat"})
        .sortby("lat")
    )
    return ds.chunk({"time": 1, "lat": 2600, "lon": 7200})

## Open a single dataset and use it to create the template Zarr

In [None]:
# Years covered by the pipeline (inclusive); one GeoTIFF URL is generated per day.
start_year = 1983
end_year = 2016
uhe_urls = gen_wget_strings(start_year, end_year)
first_url = uhe_urls[0]

# Only the first day's raster is opened: it supplies the grid, variable
# name and chunking that the template is modelled on.
ds = parse_ds(xr.open_dataset(first_url, engine="rasterio", chunks={}))

# Full daily time axis the finished store will span.
daterange = pd.date_range(f"{start_year}-01-01", f"{end_year}-12-31")

# Zero-filled copy of the sample day, with its single time step removed and
# replaced by the full date range, chunked one time step per chunk.
template = xr.zeros_like(ds)
template = template.isel(time=0, drop=True)
template = template.expand_dims(time=daterange)
template = template.chunk({"time": 1, "lat": 2600, "lon": 7200})

# compute=False defers writing the array data, so this call lays down the
# store's structure without materialising the zero-filled values.
template.to_zarr(
    "s3://carbonplan-climate-impacts/extreme-heat-extension/v1.0/inputs/uhe_daily.zarr",
    mode="w",
    compute=False,
    consolidated=False,
    zarr_format=3,
)

## Fill the Zarr template with slices

In [None]:
@coiled.function(vm_type="m7g.medium", region="us-west-2", n_workers=1)
def delayed_write_region(url: str) -> tuple[str, str]:
    """
    Download one daily GeoTIFF and write it into its slot of the Zarr store.

    The raster at ``url`` is opened, reshaped with ``parse_ds`` and written
    with ``region="auto"``, so the destination region is derived from the
    dataset's own coordinates rather than passed explicitly.

    Returns the tuple ``("success", url)`` once the write has completed.
    """
    # The context manager closes the source file as soon as the write has
    # finished, instead of leaving the handle open on the worker.
    with xr.open_dataset(url, engine="rasterio") as source:
        ds = parse_ds(source)
        ds.to_zarr(
            "s3://carbonplan-climate-impacts/extreme-heat-extension/v1.0/inputs/uhe_daily.zarr",
            zarr_format=3,
            region="auto",
            compute=True,
            consolidated=False,
        )
    return ("success", url)


def run_coiled() -> list:
    """
    Run ``delayed_write_region`` over every URL in ``uhe_urls`` and collect
    the results.

    Each task is submitted with ``retries=5``. Results are consumed through a
    tqdm progress bar labelled "Jobs Completed", and the function returns
    them as a list once the iterator is exhausted.
    """
    pending = delayed_write_region.map(uhe_urls, retries=5)
    progress = tqdm(pending, total=len(uhe_urls), desc="Jobs Completed")
    # Draining the progress bar is what waits for the mapped results.
    return [outcome for outcome in progress]


# Run the whole fan-out. Each entry of `results_list` is the
# ("success", url) tuple returned by `delayed_write_region` for one URL.
results_list = run_coiled()

## Check the final Zarr store

In [None]:
# Re-open the finished store to inspect the result of the pipeline.
# NOTE(review): the store is written with consolidated=False elsewhere in this
# notebook; passing consolidated=False here as well may avoid a fallback
# lookup for consolidated metadata — confirm against xarray's open_zarr docs.
mds = xr.open_zarr(
    "s3://carbonplan-climate-impacts/extreme-heat-extension/v1.0/inputs/uhe_daily.zarr",
    chunks={},
)
# Bare expression so the notebook cell displays the dataset summary.
mds