In [2]:
import logging
import os
import pdb
import warnings
from pathlib import Path

import metpy.constants
import numpy as np
import pandas as pd
import seaborn as sns
import xarray
from hwt import assigncoords, firstRun, fv3, helicityThresholds, mpas, windThresholds
from matplotlib.colors import ListedColormap
from metpy.units import units
from sklearn.neighbors import BallTree
from tqdm import tqdm

sns.set_theme()
logging.basicConfig(level=logging.WARNING, format="%(asctime)s %(message)s", force=True)
tmpdir = Path(os.getenv("TMPDIR")) / "hwt"

In [3]:
group = "wind"
thresholds = helicityThresholds if group == "uh" else windThresholds
genericnames = ["updraft max", "10m speed max"]
if group == "uh":
    genericnames = ["0-1km UH", "0-3km UH", "2-5km UH"]
fv3.group = group
mpas.group = group

models = [fv3, mpas]
# assert models all have same number of variables
nvars = len(genericnames)
assert all(
    [len(model.v) == nvars for model in models]
), "Models have different number of variables to analyze"
day1_forecast_hours = range(13, 37)
swindow = 25 # size of smoothing window in lon and lat dimensions

In [4]:
models

[fv3, mpas]

In [5]:
example = "/glade/campaign/mmm/parc/schwartz/HWT2024/mpas/2024052100/post/mem_1/interp_mpas_3km_2024052100_mem1_f018.nc"
ds = xarray.open_dataset(example).squeeze()  # squeeze 1-element time dimension
latitudes = ds.latitude
longitudes = ds.longitude

In [7]:
client.cluster.close()
client.shutdown()
client.close()

NameError: name 'client' is not defined

In [8]:
# worked well on casper batch with 32 CPUs (36 possible but took long time in queue)
# and 30G memory
from dask.distributed import Client

client = Client()
client

2025-03-18 14:05:21,369 State start
2025-03-18 14:05:21,386 Found stale lock file and directory '/glade/derecho/scratch/ahijevyc/tmp/dask-scratch-space/scheduler-fgs4e3ab', purging
2025-03-18 14:05:21,396   Scheduler at:     tcp://127.0.0.1:35429
2025-03-18 14:05:21,397   dashboard at:  https://jupyterhub.hpc.ucar.edu/stable/user/ahijevyc/proxy/8787/status
2025-03-18 14:05:21,397 Registering Worker plugin shuffle
2025-03-18 14:05:21,419         Start Nanny at: 'tcp://127.0.0.1:34145'
2025-03-18 14:05:21,427         Start Nanny at: 'tcp://127.0.0.1:39217'
2025-03-18 14:05:21,429         Start Nanny at: 'tcp://127.0.0.1:44521'
2025-03-18 14:05:21,432         Start Nanny at: 'tcp://127.0.0.1:38437'
2025-03-18 14:05:23,103 Register worker addr: tcp://127.0.0.1:34811 name: 2
2025-03-18 14:05:23,104 Starting worker compute stream, tcp://127.0.0.1:34811
2025-03-18 14:05:23,105 Starting established connection to tcp://127.0.0.1:48922
2025-03-18 14:05:23,106 Register worker addr: tcp://127.0.0.

0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/ahijevyc/proxy/8787/status,

0,1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/ahijevyc/proxy/8787/status,Workers: 4
Total threads: 4,Total memory: 40.00 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:35429,Workers: 4
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/ahijevyc/proxy/8787/status,Total threads: 4
Started: Just now,Total memory: 40.00 GiB

0,1
Comm: tcp://127.0.0.1:34445,Total threads: 1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/ahijevyc/proxy/41735/status,Memory: 10.00 GiB
Nanny: tcp://127.0.0.1:34145,
Local directory: /glade/derecho/scratch/ahijevyc/tmp/dask-scratch-space/worker-mii19wsw,Local directory: /glade/derecho/scratch/ahijevyc/tmp/dask-scratch-space/worker-mii19wsw

0,1
Comm: tcp://127.0.0.1:38099,Total threads: 1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/ahijevyc/proxy/34099/status,Memory: 10.00 GiB
Nanny: tcp://127.0.0.1:39217,
Local directory: /glade/derecho/scratch/ahijevyc/tmp/dask-scratch-space/worker-7q8dgyhs,Local directory: /glade/derecho/scratch/ahijevyc/tmp/dask-scratch-space/worker-7q8dgyhs

0,1
Comm: tcp://127.0.0.1:34811,Total threads: 1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/ahijevyc/proxy/34465/status,Memory: 10.00 GiB
Nanny: tcp://127.0.0.1:44521,
Local directory: /glade/derecho/scratch/ahijevyc/tmp/dask-scratch-space/worker-vmg88d43,Local directory: /glade/derecho/scratch/ahijevyc/tmp/dask-scratch-space/worker-vmg88d43

0,1
Comm: tcp://127.0.0.1:33115,Total threads: 1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/ahijevyc/proxy/36515/status,Memory: 10.00 GiB
Nanny: tcp://127.0.0.1:38437,
Local directory: /glade/derecho/scratch/ahijevyc/tmp/dask-scratch-space/worker-8a49lyu4,Local directory: /glade/derecho/scratch/ahijevyc/tmp/dask-scratch-space/worker-8a49lyu4


2025-03-18 18:34:21,839 Closing Nanny gracefully at 'tcp://127.0.0.1:34145'. Reason: worker-close
2025-03-18 18:34:21,848 Closing Nanny gracefully at 'tcp://127.0.0.1:38437'. Reason: worker-close
2025-03-18 18:34:21,850 Closing Nanny gracefully at 'tcp://127.0.0.1:44521'. Reason: worker-close
2025-03-18 18:34:21,853 Received 'close-stream' from tcp://127.0.0.1:48910; closing.
2025-03-18 18:34:21,854 Closing Nanny gracefully at 'tcp://127.0.0.1:39217'. Reason: worker-close
2025-03-18 18:34:21,862 Received 'close-stream' from tcp://127.0.0.1:48938; closing.
2025-03-18 18:34:21,863 Remove worker addr: tcp://127.0.0.1:34445 name: 0 (stimulus_id='handle-worker-cleanup-1742344461.8633797')
2025-03-18 18:34:21,864 Remove worker addr: tcp://127.0.0.1:33115 name: 3 (stimulus_id='handle-worker-cleanup-1742344461.8641171')
2025-03-18 18:34:21,865 Received 'close-stream' from tcp://127.0.0.1:48922; closing.
2025-03-18 18:34:21,865 Remove worker addr: tcp://127.0.0.1:34811 name: 2 (stimulus_id='han

In [9]:
d2023 = pd.date_range(start=firstRun(2023), end="20230531", freq="1D")
d2024 = pd.date_range(start=firstRun(2024), end="20240531", freq="1D")
valid_dates = d2023.union(d2024)

logging.warning(f"look in {tmpdir}")
for valid_date in valid_dates:
    rptfile = tmpdir / f"near_rpt.{valid_date}.nc"
    if os.path.exists(rptfile):
        logging.warning(f"open existing {rptfile.name}")
        near_rpts = xarray.open_dataarray(rptfile)
    else:
        logging.warning(f"create new {rptfile}")
        # Load severe weather reports
        near_rpts = []
        rpt_types = ["torn", "wind", "hail"]
        for rpt_type in rpt_types:
            rpt_dist_thresh_miles = 25 * units.miles
            reports = pd.read_csv(
                f"https://www.spc.noaa.gov/climo/reports/{valid_date.strftime('%y%m%d')}_rpts_{rpt_type}.csv"
            )
            print(f"read {len(reports)} {rpt_type} reports {valid_date}")

            logging.info("Create DataArray like latitudes with all elements set to False")
            near_rpt = xarray.full_like(latitudes, False, dtype=bool)
            near_rpt.name = rpt_type
            del near_rpt.attrs['units']
            del near_rpt.attrs['long_name']
            near_rpt.attrs["range"] = str(rpt_dist_thresh_miles)

            # If at least one report, put True in neighboring values
            if not reports.empty:
                # Prepare wind report coordinates for spatial indexing
                rpt_coords = reports[["Lat", "Lon"]]

                # Prepare model grid coordinates for spatial indexing
                # .ravel reads 2-d array as if it were a 1-d array.
                uh_coords = np.stack(latitudes.values.ravel(), longitudes.values.ravel()).T

                # BallTree (with metric = "haversine") assumes spherical coordinates
                uh_tree = BallTree(np.deg2rad(uh_coords), metric="haversine")

                Re = metpy.constants.earth_avg_radius
                r = rpt_dist_thresh_miles.to("km") / Re
                r = r.to_base_units()
                # Find all uh points whose distance is at most threshold_distance from wind report
                results = uh_tree.query_radius(np.deg2rad(rpt_coords), r=r)
                for result in results:
                    near_rpt.values.put(result, True)

            near_rpts.append(near_rpt)
        # tried concat with dim argument but it didn't preserve coord labels
        near_rpts = xarray.merge(near_rpts).to_dataarray(dim="rpt_type", name="near_rpts")
        near_rpts.to_netcdf(rptfile)
    for model in models:
        idir = Path(f"/glade/campaign/mmm/parc/schwartz/HWT{valid_date.strftime('%Y')}/{model}")
        ncfile = (
            tmpdir
            / f"forecast_yes.{model}.{thresholds.attrs['short_name']}.{swindow}.{valid_date.strftime('%Y%m%d')}.nc"
        )
        if os.path.exists(ncfile) and (os.stat(ncfile).st_size > 50000):
            logging.warning(f"open existing {ncfile.name}")
            continue
        else:
            logging.warning(f"create new {ncfile}")
            # Load updraft helicity data from netCDF
            fmt = "%Y%m%d%H"
            oneday = pd.to_timedelta(1, unit="day")

            # Create list of input files
            # This is a nested list comprehension, looping through
            # day1_forecast_hours (iterable of forecast hours)
            #    Model.lead_time_days (iterable of lead times in days)
            #        members (1 through max_members)
            max_members = 10
            ifiles = [
                idir
                / (valid_date - lead_time_day * oneday).strftime(fmt)
                / "post"
                / f"mem_{mem}"
                / f"interp_{model}_3km_{(valid_date-lead_time_day*oneday).strftime(fmt)}_mem{mem}_f{fhr+lead_time_day*24:03d}.nc"
                for mem in range(1, max_members + 1)
                for lead_time_day in range(model.lead_time_days)
                for fhr in day1_forecast_hours
            ]
            ifiles = [f for f in ifiles if os.path.exists(f)]
            logging.warning(f"open {len(ifiles)} files")
            logging.info(ifiles[0:8])
            assert len(ifiles) % 24 == 0, "len(ifiles) should be multiple of 24"
            ds = xarray.open_mfdataset(
                ifiles,
                preprocess=assigncoords,
                data_vars=model.v,
                drop_variables=["total_precip_hrly"],
                combine_attrs="drop",
                compat="override",
                coords="minimal",
                parallel=True,
                decode_cf=False,
                decode_coords=False,
                engine="h5netcdf",  # "h5netcdf" helped with dask and HDF/lock errors
            )

            # remove time dimension
            ds = ds.squeeze(dim="time", drop=True)
        
            # Take the maximum along the valid_time dimension.
            # This is the 24-hour ensemble maximum. It is a 24-hour max because
            # we extracted slice `day1_forecast_hours` plus a multiple of 24
            # hours. Don't forget the dayForecast dimension. Each file contains
            # multiple forecasts of different lead times valid for the same
            # 24-hour period.
            # Each file contains the day-1, day-2, ..., day-Model.lead_time_day
            # forecasts.
            logging.warning("24-h max")
            ds = ds.max(dim=["valid_time"])
        
            # Combine multiple DataArrays associated with group "uh" or "wind"
            # into a single DataArray with a new "variable" dimension.
            da = ds.to_dataarray()
            
            
            center = True
            logging.warning(f"spatial smooth {swindow} {center}")
            da = da.rolling(lat=swindow, lon=swindow, min_periods=1, center=center).max()
        
            nmem = da.notnull().sum(dim="mem")
            # Count the number of members that exceed the threshold.
            # This count divided by the number of members nmem is the
            # ensemble probability.
            logging.warning(f"enssum {len(thresholds)} thresh")
            da = xarray.concat(
                [(da >= t).sum(dim="mem") for t in thresholds],
                dim="thresh",
            ).assign_coords(thresh=thresholds)
            da.name = "enssum"
            da.attrs["rolling_spatial_window"] = swindow
            da.attrs["model"] = str(model)
            da["nmem"] = nmem
        
            da.to_netcdf(ncfile)
        
            logging.warning(f"saved {ncfile}")

2025-03-18 14:05:23,169 look in /glade/derecho/scratch/ahijevyc/tmp/hwt
2025-03-18 14:05:23,171 open existing near_rpt.2023-04-24 00:00:00.nc
2025-03-18 14:05:23,179 open existing forecast_yes.fv3.wind.25.20230424.nc
2025-03-18 14:05:23,181 open existing forecast_yes.mpas.wind.25.20230424.nc
2025-03-18 14:05:23,181 open existing near_rpt.2023-04-25 00:00:00.nc
2025-03-18 14:05:23,188 open existing forecast_yes.fv3.wind.25.20230425.nc
2025-03-18 14:05:23,189 open existing forecast_yes.mpas.wind.25.20230425.nc
2025-03-18 14:05:23,189 open existing near_rpt.2023-04-26 00:00:00.nc
2025-03-18 14:05:23,195 open existing forecast_yes.fv3.wind.25.20230426.nc
2025-03-18 14:05:23,196 open existing forecast_yes.mpas.wind.25.20230426.nc
2025-03-18 14:05:23,197 open existing near_rpt.2023-04-27 00:00:00.nc
2025-03-18 14:05:23,203 open existing forecast_yes.fv3.wind.25.20230427.nc
2025-03-18 14:05:23,204 open existing forecast_yes.mpas.wind.25.20230427.nc
2025-03-18 14:05:23,204 open existing near_r