Code used to sanity-check data: plotting raw and written files, calculating various statistics

In [1]:
stored_jhf_hr_path = "/home/idies/workspace/turbulence-ceph-staging/sciserver-turbulence/stsabl2048high/stsabl2048high.zarr"
raw_jhf_hr_0_path = "/home/idies/workspace/turbulence-ceph-staging/ncar-jhf/hr/jhf.000.nc"
raw_jhf_hr_1_path = "/home/idies/workspace/turbulence-ceph-staging/ncar-jhf/hr/jhf.001.nc"
raw_jhf_hr_104_path = "/home/idies/workspace/turbulence-ceph-staging/ncar-jhf/hr/jhf.104.nc"

In [2]:
import zarr
import xarray as xr
import numpy as np

jhf_hr_zarr = zarr.open(stored_jhf_hr_path)
jhf_hr_netcdf_t0 = xr.open_dataset(raw_jhf_hr_0_path)
jhf_hr_netcdf_t1 = xr.open_dataset(raw_jhf_hr_1_path)
jhf_hr_netcdf_t104 = xr.open_dataset(raw_jhf_hr_104_path)

In [3]:
stored_jhf_lr_path = "/home/idies/workspace/turbulence-ceph-staging/sciserver-turbulence/stsabl2048low/stsabl2048low.zarr"
raw_jhf_lr_0_path = "/home/idies/workspace/turbulence-ceph-staging/ncar-jhf/lr/jhf.000.nc"
raw_jhf_lr_1_path = "/home/idies/workspace/turbulence-ceph-staging/ncar-jhf/lr/jhf.001.nc"
raw_jhf_lr_19_path = "/home/idies/workspace/turbulence-ceph-staging/ncar-jhf/lr/jhf.019.nc"


jhf_lr_zarr = zarr.open(stored_jhf_lr_path)
jhf_lr_netcdf_t0 = xr.open_dataset(raw_jhf_lr_0_path)
jhf_lr_netcdf_t1 = xr.open_dataset(raw_jhf_lr_1_path)
jhf_lr_netcdf_t19 = xr.open_dataset(raw_jhf_lr_19_path)

<font color="cyan">

# Remember, data is saved in `nnz-nny-nnx`
    
    
</font>

# Quick-Verify Correctness of data

1. Check if `data == 0`

2. Pick one $64^3$ chunk and compare it to raw NetCDF

### High-Rate

#### `zarr.info`

In [None]:
jhf_hr_zarr

In [None]:
jhf_hr_zarr['energy'].info

In [None]:
jhf_hr_zarr['velocity'].info

In [None]:
jhf_hr_zarr.info

#### Indexing, Compare to 0

In [None]:
print("Checking whether field all zeros - True is bad!")

for t in range(105):
    print("t=", t, " - ", np.all(jhf_hr_zarr['temperature'][t,:64,:64,:64,0] == 0))

#### Comparing Real Values

In [None]:
jhf_hr_zarr['temperature'][0,:10,0,0,0]

In [None]:
jhf_lr_zarr['temperature'][0,:10,0,0,0]

In [None]:
jhf_hr_zarr['temperature'][1,:10,0,0,0]

In [10]:
for t in range(93, 105):
    zarr_comparison_data = jhf_hr_zarr['temperature'][t,:64,:64,:64,0]

    raw_t_path = f"/home/idies/workspace/turbulence-ceph-staging/ncar-jhf/hr/jhf.{t:03d}.nc"
    raw_xr = xr.open_dataset(raw_t_path)
    raw_t = raw_xr['t'].isel(nnx=slice(0, 64), nny=slice(0, 64), nnz=slice(0, 64)).values
    
    print("t: ", t, np.all(zarr_comparison_data == raw_t))

t:  97 True
t:  98 True
t:  99 True
t:  100 True
t:  101 True
t:  102 True
t:  103 True
t:  104 True


In [None]:
jhf_hr_zarr['temperature'][0,0,0,:10,0]

In [None]:
jhf_hr_zarr['energy'][0,:10,0,0,0]

In [None]:
np.array(jhf_hr_netcdf_t0['e'][:10,0,0])

Xarray complains about missing metadata. NOT Fixed by GPT o1

`written_ds_xr = xr.open_dataset(stored_jhf_path, engine='zarr', consolidated=False)`

In [None]:
import numpy as np

np.all(ds['velocity'][0, :10,0,0, 0] == 0)

In [None]:
import numpy as np

np.all(ds['velocity'][100, :10,0,0, 0] == 0)

In [None]:
import numpy as np

np.all(ds['energy'][100, :10,0,0, 0] == 0)

### Low-Rate

#### Comparing Real Values

In [None]:
jhf_lr_zarr['temperature'][0,:10,0,0,0]

In [None]:
np.array(jhf_lr_netcdf_t0['t'][:10,0,0])

# Efficient Zarr full data Slices

This can take a few minutes

 Sciserver doesn't allow localhost connections, so can't use Dask Cluster console Sciserver doesn't allow localhost connections, so can't use Dask Cluster console
 
<font color="green"> Lazy loading speeds up reading times 10x+</font>

In [11]:
import os
import zarr
import dask.array as da
import matplotlib.pyplot as plt
from dask import compute
from dask.diagnostics import ProgressBar

plt.rcParams['image.cmap'] = 'inferno'

def process_zarr(stored_jhf_path, dataset):
    if dataset not in ['hr', 'lr']:
        raise ValueError("Dataset must be 'hr' or 'lr'.")

    # Open the Zarr group
    store = zarr.open_group(stored_jhf_path, mode='r')

    # Create Dask arrays
    data_arrays = {}
    variables = list(store.array_keys())
    for var_name in variables:
        zarr_array = store[var_name]
        dask_array = da.from_zarr(zarr_array)
        # Squeeze scalar variables
        if dask_array.shape[-1] == 1:
            dask_array = dask_array.squeeze(axis=-1)
        data_arrays[var_name] = dask_array

    # Function to collect data slices
    def collect_data_slices(data_arrays, variables, timesteps):
        data_slices = []
        titles = []
        
        for variable in variables:
            for timestep in timesteps:
                dask_array = data_arrays[variable]
                
                # Get slices along each dimension (assuming shape = [time, Z, Y, X] or similar)
                x_slice = dask_array[timestep, :, :, 0]
                y_slice = dask_array[timestep, :, 0, :]
                z_slice = dask_array[timestep, 0, :, :]
                
                # For vector variables, handle components
                if variable == 'velocity':
                    for component in range(3):
                        x_comp_slice = x_slice[..., component]
                        y_comp_slice = y_slice[..., component]
                        z_comp_slice = z_slice[..., component]
                        
                        data_slices.extend([x_comp_slice, y_comp_slice, z_comp_slice])
                        titles.extend([
                            f"{variable} (component {component}) nnx=0 Timestep={timestep}",
                            f"{variable} (component {component}) nny=0 Timestep={timestep}",
                            f"{variable} (component {component}) nnz=0 Timestep={timestep}"
                        ])
                else:
                    data_slices.extend([x_slice, y_slice, z_slice])
                    titles.extend([
                        f"{variable} nnx=0 Timestep={timestep}",
                        f"{variable} nny=0 Timestep={timestep}",
                        f"{variable} nnz=0 Timestep={timestep}"
                    ])
        return data_slices, titles

    # Choose timesteps: here every 5th until the end
    timesteps = range(0, data_arrays['energy'].shape[0], 5)  

    # Collect data slices
    data_slices, titles = collect_data_slices(data_arrays, ['energy', 'temperature', 'pressure', 'velocity'], timesteps)

    # Compute all data slices at once (lazy -> single compute call)
    with ProgressBar():
        computed_slices = compute(*data_slices)

    # Plot (and now save) all images
    # Create a master folder for all slices
    master_folder = os.path.join("zarr_slices", dataset)
    os.makedirs(master_folder, exist_ok=True)

    idx = 0
    for variable in ['energy', 'temperature', 'pressure', 'velocity']:
        # Make a folder per variable
        var_folder = os.path.join(master_folder, variable)
        os.makedirs(var_folder, exist_ok=True)
        
        # We'll have 3 slices per timestep if scalar,
        # or 9 slices per timestep if velocity (3 components * 3 slices).
        # So we figure out how many slices belong to each variable:
        if variable == 'velocity':
            slices_per_timestep = 9
        else:
            slices_per_timestep = 3
        
        for timestep in timesteps:
            # For velocity, we handle 9 images; for others, 3.
            subset = computed_slices[idx : idx + slices_per_timestep]
            subset_titles = titles[idx : idx + slices_per_timestep]
            
            for data_slice, title in zip(subset, subset_titles):
                plt.figure()
                plt.imshow(data_slice)
                plt.title(title)
                plt.gca().invert_yaxis()
                plt.colorbar()
                
                # Create a filename for saving
                safe_title = title.replace(" ", "_").replace("=", "_")
                filename = os.path.join(var_folder, f"Timestep_{timestep}_{safe_title}.png")
                plt.savefig(filename, dpi=150, bbox_inches='tight')
                plt.close()
            
            idx += slices_per_timestep

In [12]:
process_zarr(stored_jhf_lr_path, "lr")

[########################################] | 100% Completed | 52.64 s


# Data Statistics - Mean Temp. across Axis

## Zarr

### High Rate

In [13]:
stored_jhf_path = stored_jhf_hr_path

process_dataset = "high_rate"

ds = jhf_hr_zarr

In [14]:
import os
import zarr
import dask.array as da
import matplotlib.pyplot as plt
from dask import compute
from dask.diagnostics import ProgressBar

plt.rcParams['image.cmap'] = 'inferno'

# ==============
# 1) OPEN ZARR
# ==============
store = zarr.open_group(stored_jhf_path, mode='r')

data_arrays = {}
for var_name in store.array_keys():
    zarr_array = store[var_name]
    dask_array = da.from_zarr(zarr_array)
    # Squeeze scalar variables
    if dask_array.shape[-1] == 1:
        dask_array = dask_array.squeeze(axis=-1)
    data_arrays[var_name] = dask_array

# ==============
# 2) SLICE PLOTS
# ==============
def collect_data_slices(data_arrays, variables, timesteps):
    data_slices = []
    titles = []
    for variable in variables:
        for timestep in timesteps:
            dask_array = data_arrays[variable]
            # Slices for [time, Z, Y, X], we pick X=0, Y=0, Z=0
            x_slice = dask_array[timestep, :, :, 0]
            y_slice = dask_array[timestep, :, 0, :]
            z_slice = dask_array[timestep, 0, :, :]

            if variable == 'velocity':
                for component in range(3):
                    data_slices.extend([
                        x_slice[..., component],
                        y_slice[..., component],
                        z_slice[..., component]
                    ])
                    titles.extend([
                        f"{variable} (component {component}) nnx=0 Timestep={timestep}",
                        f"{variable} (component {component}) nny=0 Timestep={timestep}",
                        f"{variable} (component {component}) nnz=0 Timestep={timestep}"
                    ])
            else:
                data_slices.extend([x_slice, y_slice, z_slice])
                titles.extend([
                    f"{variable} nnx=0 Timestep={timestep}",
                    f"{variable} nny=0 Timestep={timestep}",
                    f"{variable} nnz=0 Timestep={timestep}"
                ])
    return data_slices, titles

variables = ['energy', 'temperature', 'pressure', 'velocity']
timesteps = range(0, data_arrays['energy'].shape[0], 5)

data_slices, titles = collect_data_slices(data_arrays, variables, timesteps)

# Single compute for all slice data
with ProgressBar():
    computed_slices = compute(*data_slices)

# Save slice images: "zarr_slices/variable/*.png"
master_folder = "zarr_slices"
os.makedirs(master_folder, exist_ok=True)

idx = 0
for var in variables:
    var_folder = os.path.join(master_folder, var)
    os.makedirs(var_folder, exist_ok=True)
    
    # For velocity, 9 per timestep (3 comps × 3 slices),
    # for scalars, 3 per timestep.
    slices_per_timestep = 9 if var == "velocity" else 3
    
    for t in timesteps:
        subset = computed_slices[idx : idx + slices_per_timestep]
        subset_titles = titles[idx : idx + slices_per_timestep]
        
        for data_slice, title in zip(subset, subset_titles):
            plt.figure()
            plt.imshow(data_slice)
            plt.title(title)
            plt.gca().invert_yaxis()
            plt.colorbar()

            fname = title.replace(" ", "_").replace("=", "_") + ".png"
            plt.savefig(os.path.join(var_folder, fname), dpi=150, bbox_inches='tight')
            plt.close()
        
        idx += slices_per_timestep

# ==============
# 3) MEAN TEMP
# ==============
def plot_mean_temp_across_z(
    ds, 
    timesteps,
    data_type: Literal["zarr", "original"],
    spacing: int = 128
):
    if data_type not in ("zarr", "original"):
        raise ValueError("data_type must be either 'zarr' or 'original'")

    output_folder = "mean_temperature_plots"
    os.makedirs(output_folder, exist_ok=True)

    z_indices = range(0, ds['temperature'].shape[1], spacing)

    if data_type == "zarr":
        # Build lazy ops for all timesteps
        lazy_means_per_t = []
        for t in timesteps:
            temp_slice = ds['temperature'][t, :, :, :, 0]
            temp_sliced = temp_slice[list(z_indices), :, :]
            z_means = temp_sliced.mean(axis=(1,2))
            lazy_means_per_t.append(z_means)

        # One big array: shape [num_timesteps, len(z_indices)]
        stacked = da.stack(lazy_means_per_t, axis=0)
        # Single compute
        computed = stacked.compute()

        # Plot each row
        for i, t in enumerate(timesteps):
            xy_means = computed[i]
            plt.figure()
            plt.plot(list(z_indices), xy_means, marker='o')
            plt.title(f"Mean Temperature across Z (t={t}, data_type={data_type})")
            plt.xlabel("Slice (Z)")
            plt.ylabel("Avg Temperature")
            
            fname = os.path.join(output_folder, f"mean_z_temperature_{data_type}_t_{t}.png")
            plt.savefig(fname, dpi=150, bbox_inches='tight')
            plt.close()
            print(f"Saved plot as {fname}")

    else:  # data_type == 'original'
        # Possibly still a loop, but it's real NumPy so no Dask
        for t in timesteps:
            xy_means = []
            for z in z_indices:
                xy_means.append(ds['temperature'][t, z, :, :, 0].mean())
            
            # Plot
            plt.figure()
            plt.plot(list(z_indices), xy_means, marker='o')
            plt.title(f"Mean Temperature across Z (t={t}, data_type={data_type})")
            plt.xlabel("Slice (Z)")
            plt.ylabel("Avg Temperature")
            
            fname = os.path.join(output_folder, f"mean_z_temperature_{data_type}_t_{t}.png")
            plt.savefig(fname, dpi=150, bbox_inches='tight')
            plt.close()
            print(f"Saved plot as {fname}")


plot_mean_temp_across_z(data_arrays, timesteps=range(0, 106, 5), data_type="zarr", spacing=128)

[########################################] | 100% Completed | 487.52 s


IndexError: Too many indices for array

In [17]:
ds['temperature'].info

0,1
Name,/temperature
Type,zarr.core.Array
Data type,float32
Shape,"(105, 2048, 2048, 2048, 1)"
Chunk shape,"(1, 64, 64, 64, 1)"
Order,C
Read-only,False
Compressor,
Store type,zarr.storage.DirectoryStore
No. bytes,3607772528640 (3.3T)


## Across Z

In [None]:
# Suppose you've already got a dictionary-like `ds` with:
# ds["energy"], ds["temperature"], ds["pressure"], ds["velocity"] 
# as Dask arrays (from .from_zarr), each shape ~ [time, z, y, x, (maybe 3 for velocity)]

variables = ["energy", "temperature", "pressure", "velocity"]
timesteps = range(0, ds["energy"].shape[0], 5)  # or [0, 5, 10, ..., 105]

plot_slices_and_mean(
    ds,
    variables,
    timesteps,
    data_type="zarr",    # so we do the lazy means
    spacing=128,         # how often to sample Z dimension for the mean
    master_slice_folder="zarr_slices",
    mean_output_folder="mean_temperature_plots"
)

### Original NetCDF

In [None]:
plot_mean_temp_across_z(data_arrays, timesteps=range(0, 106, 5), data_type="original", spacing=128)

## Across X

- [ ] TODO if necessary

# Compare Zarr to NetCDF correctness

In [None]:
import os
import xarray as xr
import dask.array as da
import numpy as np

def compare_zarr_and_netcdf(
    zarr_ds, 
    netcdf_path_pattern, 
    times=range(93, 105),
    z_slice=64,  # how much of Z dimension to compare
    y_slice=64,
    x_slice=64
):
    """
    Compare temperature data between a Zarr dataset and multiple NetCDF files
    for all requested timesteps at once. Prints True/False per timestep.

    Parameters
    ----------
    zarr_ds : xarray.Dataset or dict-like of Dask arrays
        Should have zarr_ds['temperature'] as a Dask-backed array of shape
        [time, z, y, x, ...].
    netcdf_path_pattern : str
        File path pattern for NetCDF files, e.g. "path/to/jhf.*.nc". 
        We'll open them with xarray.open_mfdataset(...).
    times : iterable
        Timesteps to compare. E.g. range(93, 105).
    z_slice, y_slice, x_slice : int
        Number of grid cells in each dimension to compare
        from [0:z_slice], etc.
    """

    #---------------------
    # 1) OPEN NETCDFs
    #---------------------
    # The idea is to let xarray + dask do all the heavy lifting in parallel.
    # 'concat_dim="time"' or 'combine="nested"' depends on your file structure.
    # Adjust as needed if the times are encoded differently.
    ds_nc = xr.open_mfdataset(
        netcdf_path_pattern, 
        # If each file is a single time, and you want them stacked on "time":
        concat_dim="time",
        combine="nested",
        parallel=True,
        # It's a good idea to chunk so we don't read the entire file at once
        chunks={
            "nnz": 64, 
            "nny": 64, 
            "nnx": 64  # or whatever chunk sizes make sense
        }
    )

    #---------------------
    # 2) BUILD DASK ARRAYS
    #---------------------
    # We'll slice all requested times in one shot.
    
    # Zarr data: shape might be [time, Z, Y, X, 1] if scalar
    zarr_temp = zarr_ds['temperature'].isel(
        time=times,  # e.g. [93..105)
        z=slice(0, z_slice),
        y=slice(0, y_slice),
        x=slice(0, x_slice),
        component=0  # or .squeeze(axis=-1), if it’s always 1
    )
    
    # NetCDF data: shape might be [time, nnz, nny, nnx]
    nc_temp = ds_nc['t'].isel(
        time=times,
        nnz=slice(0, z_slice),
        nny=slice(0, y_slice),
        nnx=slice(0, x_slice)
    )

    # Both 'zarr_temp' and 'nc_temp' are still lazy Dask arrays.
    # They should align to shape: [nTimes, Z, Y, X].

    #---------------------
    # 3) LAZY COMPARISON
    #---------------------
    # Instead of np.all(...) in a loop, do a single big comparison.
    # eq => shape [nTimes, Z, Y, X], a bool Dask array
    eq = (zarr_temp == nc_temp)

    # If you want a per-timestep True/False, reduce over spatial dims only:
    # eq_per_timestep => shape [nTimes]
    eq_per_timestep = eq.all(axis=(1, 2, 3))

    #---------------------
    # 4) TRIGGER COMPUTE
    #---------------------
    # This is ONE pass that will pull all the data needed from both 
    # Zarr and NetCDF, using Dask’s parallel IO.
    result = eq_per_timestep.compute()

    #---------------------
    # 5) REPORT RESULTS
    #---------------------
    # 'result' is a boolean numpy array, one entry per requested timestep
    for idx, t in enumerate(times):
        print(f"t: {t}, match: {result[idx]}")

    # Optionally close the NetCDF dataset
    ds_nc.close()

#-------------------------------------------------------------
# USAGE EXAMPLE
#-------------------------------------------------------------
if __name__ == "__main__":
    import zarr
    import xarray as xr
    
    # Suppose you already have your zarr dataset open as `jhf_hr_zarr`:
    # jhf_hr_zarr = xarray.open_zarr("path/to/zarr_dir") 
    # or zarr.open_group(...) and wrapped in an xarray Dataset
    #
    # netcdf_path_pattern might be:
    # "/home/idies/workspace/turbulence-ceph-staging/ncar-jhf/hr/jhf.*.nc"

    times_to_compare = range(93, 105)  # 12 timesteps
    compare_zarr_and_netcdf(
        zarr_ds=jhf_hr_zarr, 
        netcdf_path_pattern="/home/idies/workspace/turbulence-ceph-staging/ncar-jhf/hr/jhf.*.nc",
        times=times_to_compare,
        z_slice=64,
        y_slice=64,
        x_slice=64
    )