In [1]:
import logging
import os
import pdb
import re
import tempfile
import warnings
from pathlib import Path

import metpy.constants
import numpy as np
import pandas as pd
import seaborn as sns
import xarray
from hwt import firstRun, fv3, helicityThresholds, mpas, windThresholds
from matplotlib.colors import ListedColormap
from metpy.units import units
from sklearn.neighbors import BallTree
from tqdm import tqdm

# Notebook-wide plotting and logging defaults.
sns.set_theme()
# force=True replaces any handlers already attached to the root logger so the
# format below takes effect even when the cell is re-run.
logging.basicConfig(level=logging.WARNING, format="%(asctime)s %(message)s", force=True)
# Directory for cached intermediate netCDF files. Fall back to the platform
# temporary directory when TMPDIR is unset or empty; Path(None) would raise
# TypeError.
tmpdir = Path(os.getenv("TMPDIR") or tempfile.gettempdir())

In [2]:
def assigncoords(ds: xarray.Dataset) -> xarray.Dataset:
    """
    Assign member number, dayForecast, forecast hour,
    initialization time, and valid time to an xarray Dataset.

    - Member number: parsed from the ``mem_<N>`` component of the file path
      held in ``ds.encoding["source"]``.
    - Forecast hour: global attribute ``forecastHour``.
    - Initialization time: global attribute ``initializationTime``
      (format ``%Y%m%d%H``).
    - Valid time: initialization time plus forecast hour.
    - dayForecast: ``int((forecastHour + 11) / 24)``, so forecast hours
      13-36 map to day 1 and 37-60 map to day 2.

    Parameters
    ----------
    ds : xarray.Dataset
        Dataset whose ``encoding["source"]`` holds the path it was read from
        and whose attributes include ``forecastHour`` and
        ``initializationTime``.

    Returns
    -------
    xarray.Dataset
        New Dataset with ``mem`` and ``dayForecast`` added as length-1
        dimensions, ``forecastHour`` and ``initializationTime`` added as
        variables, and a length-1 ``valid_time``.

    Raises
    ------
    ValueError
        If no component of the source path has the form ``mem_<N>``.
    """

    # get member number from original name of file, which is
    # held in Dataset.encoding["source"].
    filename = Path(ds.encoding["source"])
    # Match the member component explicitly. str.lstrip("mem_") strips a *set
    # of characters* rather than a prefix, and startswith("mem") would also
    # accept unrelated components (e.g. "members"), so use a regex instead.
    member_matches = [re.fullmatch(r"mem_?(\d+)", part) for part in filename.parts]
    member_matches = [match for match in member_matches if match]
    if not member_matches:
        raise ValueError(f"no 'mem_<N>' component in source path {filename}")
    # keep the digits after "mem_" and convert to integer
    mem = int(member_matches[0].group(1))

    # read forecastHour from global attribute
    forecastHour = float(ds.attrs["forecastHour"])

    # assign dayForecast (after forecastHour is defined)
    # add 11 instead of 12 because we want forecastHours 13-36 to map to day 1
    # because uh max covers the previous hour. so does 10-m wind max
    # forecastHours 37-60 map to day 2
    dayForecast = int((forecastHour + 11) / 24)

    # read initializationTime from global attribute
    initializationTime = pd.to_datetime(ds.attrs["initializationTime"], format="%Y%m%d%H")

    # valid_time = initializationTime + forecastHour
    valid_time = initializationTime + pd.to_timedelta(forecastHour, unit="hour")

    # mem and dayForecast become coordinates first ...
    ds = ds.assign_coords(mem=mem, dayForecast=dayForecast)
    # ... and assign_coords alone does not add them as dimensions of the data
    # variables, so expand_dims does that explicitly.
    ds = ds.expand_dims(dim=["mem", "dayForecast"])
    # forecastHour and initializationTime are passed as scalars (no square
    # brackets) so they do not become dimensions; valid_time is wrapped in a
    # list so it does.
    ds = ds.assign(
        forecastHour=forecastHour,
        initializationTime=initializationTime,
        valid_time=[valid_time],
    )
    return ds

In [6]:
# Variable group to verify: "uh" selects the helicity thresholds and names;
# any other value selects the wind thresholds and names.
group = "uh"
if group == "uh":
    thresholds = helicityThresholds
    genericnames = ["0-1km UH", "0-3km UH", "2-5km UH"]
else:
    thresholds = windThresholds
    genericnames = ["updraft max", "10m speed max"]

# Record the active group on each model object
# (presumably this selects the variables exposed as model.v - confirm in hwt).
fv3.group = mpas.group = group
models = [fv3, mpas]

# Every model must provide one variable per generic name.
nvars = len(genericnames)
assert all(
    len(candidate.v) == nvars for candidate in models
), "Models have different number of variables to analyze"

# Forecast hours 13 through 36 inclusive.
day1_forecast_hours = range(13, 37)

In [7]:
# One post-processed file supplies the latitude/longitude grid used by the
# cells below (the grid is assumed to be the same for every file opened later).
example = "/glade/campaign/mmm/parc/schwartz/HWT2024/mpas/2024052100/post/mem_1/interp_mpas_3km_2024052100_mem1_f018.nc"
# Close the file once the grid has been read instead of leaving the handle
# open for the life of the kernel; .load() pulls the values into memory
# before the file is closed.
with xarray.open_dataset(example) as example_ds:
    ds = example_ds.squeeze()  # squeeze 1-element time dimension
    latitudes = ds.latitude.load()
    longitudes = ds.longitude.load()

In [7]:
# Tear down a Dask client left over from an earlier run of this notebook.
# `client` is created in a later cell, so on a fresh kernel there is nothing
# to close; without this guard the cell raises NameError under
# Restart & Run All.
if "client" in globals():
    client.cluster.close()
    client.shutdown()
    client.close()

2024-09-19 15:48:48,026 Scheduler closing due to unknown reason...
2024-09-19 15:48:48,027 Scheduler closing all comms


In [29]:
from dask.distributed import Client

# Start a local Dask cluster; open_mfdataset(parallel=True) below uses it.
# Resource note: a casper batch job with 32 CPUs and 30G of memory worked well
# (36 CPUs were possible but took a long time in the queue).
# The trailing "70" in the original comment looks like an earlier worker count.
n_workers = 8
client = Client(n_workers=n_workers)
client  # last expression: display the cluster summary

2024-09-26 12:56:39,184 State start
2024-09-26 12:56:39,209 Found stale lock file and directory '/glade/derecho/scratch/ahijevyc/tmp/dask-scratch-space/scheduler-8r5q8dww', purging
2024-09-26 12:56:39,218   Scheduler at:     tcp://127.0.0.1:37851
2024-09-26 12:56:39,219   dashboard at:  https://jupyterhub.hpc.ucar.edu/stable/user/ahijevyc/proxy/8787/status
2024-09-26 12:56:39,220 Registering Worker plugin shuffle
2024-09-26 12:56:39,245         Start Nanny at: 'tcp://127.0.0.1:39775'
2024-09-26 12:56:39,252         Start Nanny at: 'tcp://127.0.0.1:34461'
2024-09-26 12:56:39,255         Start Nanny at: 'tcp://127.0.0.1:36929'
2024-09-26 12:56:39,259         Start Nanny at: 'tcp://127.0.0.1:46783'
2024-09-26 12:56:39,261         Start Nanny at: 'tcp://127.0.0.1:44819'
2024-09-26 12:56:39,263         Start Nanny at: 'tcp://127.0.0.1:43273'
2024-09-26 12:56:39,265         Start Nanny at: 'tcp://127.0.0.1:40647'
2024-09-26 12:56:39,267         Start Nanny at: 'tcp://127.0.0.1:45113'
2024-09

0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/ahijevyc/proxy/8787/status,

0,1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/ahijevyc/proxy/8787/status,Workers: 8
Total threads: 8,Total memory: 400.00 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:37851,Workers: 8
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/ahijevyc/proxy/8787/status,Total threads: 8
Started: Just now,Total memory: 400.00 GiB

0,1
Comm: tcp://127.0.0.1:42969,Total threads: 1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/ahijevyc/proxy/33707/status,Memory: 50.00 GiB
Nanny: tcp://127.0.0.1:39775,
Local directory: /glade/derecho/scratch/ahijevyc/tmp/dask-scratch-space/worker-yo0w8ahp,Local directory: /glade/derecho/scratch/ahijevyc/tmp/dask-scratch-space/worker-yo0w8ahp

0,1
Comm: tcp://127.0.0.1:42477,Total threads: 1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/ahijevyc/proxy/37121/status,Memory: 50.00 GiB
Nanny: tcp://127.0.0.1:34461,
Local directory: /glade/derecho/scratch/ahijevyc/tmp/dask-scratch-space/worker-eajp5l7e,Local directory: /glade/derecho/scratch/ahijevyc/tmp/dask-scratch-space/worker-eajp5l7e

0,1
Comm: tcp://127.0.0.1:41663,Total threads: 1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/ahijevyc/proxy/38105/status,Memory: 50.00 GiB
Nanny: tcp://127.0.0.1:36929,
Local directory: /glade/derecho/scratch/ahijevyc/tmp/dask-scratch-space/worker-thq_7w9i,Local directory: /glade/derecho/scratch/ahijevyc/tmp/dask-scratch-space/worker-thq_7w9i

0,1
Comm: tcp://127.0.0.1:39781,Total threads: 1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/ahijevyc/proxy/35259/status,Memory: 50.00 GiB
Nanny: tcp://127.0.0.1:46783,
Local directory: /glade/derecho/scratch/ahijevyc/tmp/dask-scratch-space/worker-265iviqu,Local directory: /glade/derecho/scratch/ahijevyc/tmp/dask-scratch-space/worker-265iviqu

0,1
Comm: tcp://127.0.0.1:42271,Total threads: 1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/ahijevyc/proxy/46797/status,Memory: 50.00 GiB
Nanny: tcp://127.0.0.1:44819,
Local directory: /glade/derecho/scratch/ahijevyc/tmp/dask-scratch-space/worker-say06085,Local directory: /glade/derecho/scratch/ahijevyc/tmp/dask-scratch-space/worker-say06085

0,1
Comm: tcp://127.0.0.1:37507,Total threads: 1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/ahijevyc/proxy/39147/status,Memory: 50.00 GiB
Nanny: tcp://127.0.0.1:43273,
Local directory: /glade/derecho/scratch/ahijevyc/tmp/dask-scratch-space/worker-qgo3791z,Local directory: /glade/derecho/scratch/ahijevyc/tmp/dask-scratch-space/worker-qgo3791z

0,1
Comm: tcp://127.0.0.1:35561,Total threads: 1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/ahijevyc/proxy/42485/status,Memory: 50.00 GiB
Nanny: tcp://127.0.0.1:40647,
Local directory: /glade/derecho/scratch/ahijevyc/tmp/dask-scratch-space/worker-e1234h7l,Local directory: /glade/derecho/scratch/ahijevyc/tmp/dask-scratch-space/worker-e1234h7l

0,1
Comm: tcp://127.0.0.1:40733,Total threads: 1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/ahijevyc/proxy/32839/status,Memory: 50.00 GiB
Nanny: tcp://127.0.0.1:45113,
Local directory: /glade/derecho/scratch/ahijevyc/tmp/dask-scratch-space/worker-dmqzrel2,Local directory: /glade/derecho/scratch/ahijevyc/tmp/dask-scratch-space/worker-dmqzrel2


In [31]:
# Daily valid dates for each season, from firstRun(year) through 31 May.
d2023 = pd.date_range(start=firstRun(2023), end="20230531", freq="1D")
d2024 = pd.date_range(start=firstRun(2024), end="20240531", freq="1D")
valid_dates = d2023.union(d2024)

# Settings that do not change from one date or report type to the next.
rpt_types = ["torn", "wind", "hail"]
rpt_dist_thresh_miles = 25 * units.miles
# BallTree's haversine metric works in radians on a unit sphere, so the search
# radius is distance / Earth radius. Hand sklearn a plain float (.magnitude)
# rather than a pint Quantity so no units are stripped implicitly.
Re = metpy.constants.earth_avg_radius
r = (rpt_dist_thresh_miles.to("km") / Re).to_base_units().magnitude
# The model grid is the same for every date, so the spatial index is built at
# most once (on first use) instead of once per report type per date.
uh_tree = None
fmt = "%Y%m%d%H"
oneday = pd.to_timedelta(1, unit="day")

for valid_date in valid_dates:
    # --- Part 1: mask of grid points near a storm report, cached per date ---
    # NOTE: str(valid_date) puts a space and colons in the file name
    # (e.g. "near_rpt.2023-04-24 00:00:00.nc"). Left unchanged so that cache
    # files written by earlier runs are still found.
    rptfile = tmpdir / f"near_rpt.{valid_date}.nc"
    if os.path.exists(rptfile):
        logging.warning(f"open existing {rptfile}")
        near_rpts = xarray.open_dataarray(rptfile)
    else:
        logging.warning(f"create new {rptfile}")
        # Load severe weather reports
        near_rpts = []
        for rpt_type in rpt_types:
            reports = pd.read_csv(
                f"https://www.spc.noaa.gov/climo/reports/{valid_date.strftime('%y%m%d')}_rpts_{rpt_type}.csv"
            )
            print(f"read {len(reports)} {rpt_type} reports {valid_date}")

            logging.info("Create DataArray like latitudes with all elements set to False")
            near_rpt = xarray.full_like(latitudes, False, dtype=bool)
            near_rpt.name = rpt_type
            # Drop attributes copied from the latitude array. pop() tolerates
            # a grid file in which one of them is absent (del would raise
            # KeyError).
            near_rpt.attrs.pop("units", None)
            near_rpt.attrs.pop("long_name", None)
            near_rpt.attrs["range"] = str(rpt_dist_thresh_miles)

            # If at least one report, put True in neighboring values
            if not reports.empty:
                # Prepare report coordinates for spatial indexing
                rpt_coords = reports[["Lat", "Lon"]]

                if uh_tree is None:
                    # Prepare model grid coordinates for spatial indexing
                    # .ravel reads 2-d array as if it were a 1-d array.
                    uh_coords = np.c_[latitudes.values.ravel(), longitudes.values.ravel()]
                    # BallTree (with metric = "haversine") assumes spherical
                    # coordinates in radians, ordered (lat, lon).
                    uh_tree = BallTree(np.deg2rad(uh_coords), metric="haversine")

                # Find all grid points whose distance from a report is at most r
                results = uh_tree.query_radius(np.deg2rad(rpt_coords), r=r)
                for result in results:
                    # result holds flat (raveled) grid indices for one report
                    near_rpt.values.put(result, True)

            near_rpts.append(near_rpt)
        # tried concat with dim argument but it didn't preserve coord labels
        near_rpts = xarray.merge(near_rpts).to_dataarray(dim="rpt_type", name="near_rpts")
        near_rpts.to_netcdf(rptfile)

    # --- Part 2: threshold exceedance per model, cached per date ---
    for model in models:
        idir = Path(f"/glade/campaign/mmm/parc/schwartz/HWT{valid_date.strftime('%Y')}/{model}")
        ncfile = (
            tmpdir
            / f"forecast_yes.{model}.{thresholds.attrs['short_name']}.{valid_date.strftime('%Y%m%d')}.nc"
        )
        # An existing file of 50000 bytes or less is not trusted and is
        # regenerated.
        if os.path.exists(ncfile) and (os.stat(ncfile).st_size > 50000):
            logging.warning(f"skip existing {ncfile}")
            continue

        logging.warning(f"create new {ncfile}")

        # Create list of input files.
        # This is a nested list comprehension; from outermost to innermost
        # loop it iterates over
        #    members (1 through Model.nmem)
        #        lead time in days (0 through Model.lead_time_days - 1)
        #            day1_forecast_hours (iterable of forecast hours)
        # A lead time of N days means the run initialized N days before
        # valid_date, read at forecast hour fhr + 24 * N.
        ifiles = [
            idir
            / (valid_date - lead_time_day * oneday).strftime(fmt)
            / "post"
            / f"mem_{mem}"
            / f"interp_{model}_3km_{(valid_date-lead_time_day*oneday).strftime(fmt)}_mem{mem}_f{fhr+lead_time_day*24:03d}.nc"
            for mem in range(1, model.nmem + 1)
            for lead_time_day in range(model.lead_time_days)
            for fhr in day1_forecast_hours
        ]
        ifiles = [f for f in ifiles if os.path.exists(f)]
        print(f"open {len(ifiles)} files")
        print(ifiles[0:8])
        assert len(ifiles) % 24 == 0, "# ifiles should be multiple of 24"
        ds0 = xarray.open_mfdataset(
            ifiles,
            preprocess=assigncoords,
            drop_variables=["total_precip_hrly"],
            combine_attrs="drop",
            compat="override",
            coords="minimal",
            parallel=True,
            decode_cf=False,
            decode_coords=False,
            engine="h5netcdf",  # helped with dask and HDF/lock errors
        )

        # remove time dimension
        ds0 = ds0.squeeze(dim="time", drop=True)

        logging.warning(model.v)

        # Combine the DataArrays of group "uh" or "wind" into a single
        # DataArray with a new "variable" dimension and take the maximum
        # along valid_time. This is a 24-hour maximum per member because the
        # files cover `day1_forecast_hours` plus a multiple of 24 hours.
        # The dayForecast dimension is kept: it separates the day-1, day-2,
        # ... forecasts that are valid for the same 24-hour period.
        # Computed once and reused for both products below.
        v = ds0[model.v].max(dim="valid_time").to_dataarray()

        # Ensemble maximum: additionally reduce over members, then flag where
        # it meets or exceeds each threshold.
        vmax = v.max(dim="mem")
        fy = xarray.concat(
            [vmax >= t for t in thresholds],
            dim="thresh",
        ).assign_coords(thresh=thresholds)
        fy.name = "ensmax"

        # Count the number of members that meet or exceed each threshold.
        # This count divided by the number of members Model.nmem is the
        # ensemble probability.
        nfy = xarray.concat(
            [(v >= t).sum(dim="mem") for t in thresholds],
            dim="thresh",
        ).assign_coords(thresh=thresholds)
        nfy.name = "enssum"

        # No combine_attrs="drop" here: don't lose units and shortname
        # attributes of thresholds.
        ds = xarray.merge([fy, nfy])
        ds.attrs["model"] = str(model)
        ds.attrs["nmem"] = model.nmem
        ds.to_netcdf(ncfile)
        logging.warning(f"saved {ncfile}")

2024-09-26 13:00:52,809 open existing /glade/derecho/scratch/ahijevyc/tmp/near_rpt.2023-04-24 00:00:00.nc
2024-09-26 13:00:52,880 skip existing /glade/derecho/scratch/ahijevyc/tmp/forecast_yes.fv3.uh.20230424.nc
2024-09-26 13:00:52,881 skip existing /glade/derecho/scratch/ahijevyc/tmp/forecast_yes.mpas.uh.20230424.nc
2024-09-26 13:00:52,881 open existing /glade/derecho/scratch/ahijevyc/tmp/near_rpt.2023-04-25 00:00:00.nc
2024-09-26 13:00:52,949 skip existing /glade/derecho/scratch/ahijevyc/tmp/forecast_yes.fv3.uh.20230425.nc
2024-09-26 13:00:52,950 skip existing /glade/derecho/scratch/ahijevyc/tmp/forecast_yes.mpas.uh.20230425.nc
2024-09-26 13:00:52,951 open existing /glade/derecho/scratch/ahijevyc/tmp/near_rpt.2023-04-26 00:00:00.nc
2024-09-26 13:00:52,997 skip existing /glade/derecho/scratch/ahijevyc/tmp/forecast_yes.fv3.uh.20230426.nc
2024-09-26 13:00:52,998 skip existing /glade/derecho/scratch/ahijevyc/tmp/forecast_yes.mpas.uh.20230426.nc
2024-09-26 13:00:52,998 open existing /glad

In [26]:
ls -li /glade/derecho/scratch/ahijevyc/tmp/forecast_yes.mpas.uh.20240510.nc

ls: cannot access '/glade/derecho/scratch/ahijevyc/tmp/forecast_yes.mpas.uh.20240510.nc': No such file or directory


In [27]:
ls -li /glade/derecho/scratch/ahijevyc/tmp/forecast_yes.mpas.uh.20240510.nc_bad

ls: cannot access '/glade/derecho/scratch/ahijevyc/tmp/forecast_yes.mpas.uh.20240510.nc_bad': No such file or directory
