## Convert all NCAR netCDF timestep files to 512³ Zarr arrays with grouped velocity components and (64, 64, 64) chunks, distributed round-robin across FileDB nodes (spatially, using Z-order)

<font color="red">Older Dask versions raise the error described in https://github.com/dask/distributed/issues/3955</font>

<font color='red'>Note: Be careful — setting Dask's `local_directory` to a location on a remote server (e.g. Temporary) will HUGELY slow down functions</font>

In [1]:
# --- Input -------------------------------------------------------------
# Folder holding the raw NCAR netCDF timestep files.
raw_ncar_folder_path = '/home/idies/workspace/turb/data02_02/ncar-high-rate-fixed-dt'

# Timestep to convert in this run.
timestep_nr = 1
# Earlier runs looped over several timesteps instead:
# timestep_range = range(1) # Ned's new High-rate fixed-dt has only 5 timesteps
# timestep_range = range(3,5)

# --- Output layout -----------------------------------------------------
desired_cube_side = 512  # side length of each sub-cube the full volume is split into
chunk_size = 64  # side length of one Zarr chunk
dest_folder_name = "sabl2048b" # B is the high-rate data
write_type = "prod" # or "back"

# --- Dask --------------------------------------------------------------
use_dask = True
n_dask_workers = 4

In [2]:
# Switch the working directory to the zarr_writing code folder.
# NOTE(review): the `from utils import write_tools` import in the next cell
# presumably relies on this working directory - confirm.
%cd /home/idies/workspace/Storage/ariel4/persistent/ncar-zarr-code/zarr_writing

/home/idies/workspace/Storage/ariel4/persistent/ncar-zarr-code/zarr_writing


In [3]:
import xarray as xr
from utils import write_tools
import dask
import os

### Get target Folder list

In [4]:
# Candidate destination folders, as reported by write_tools.
folders = write_tools.list_fileDB_folders()

# Avoiding 7-2 and 9-2 - they're too full as of May 2023
for full_volume in (
    "/home/idies/workspace/turb/data09_02/zarr/",
    "/home/idies/workspace/turb/data07_02/zarr/",
):
    folders.remove(full_volume)

# Extend every remaining folder in place with its numbered dataset
# sub-folder: <dest_folder_name>_<2-digit position>_<write_type>/
for i, base_path in enumerate(folders):
    folders[i] = base_path + dest_folder_name + "_" + str(i + 1).zfill(2) + "_" + write_type + "/"

# Quick visual check of the result:
# folders[:5]

# Create top-level dirs

# for folder_path in folders:
#     os.makedirs(folder_path, exist_ok=False)

<font color="orange">Don't delete the CD cell!</font>

In [5]:
# Change into the raw NCAR data folder (the same path as raw_ncar_folder_path).
# NOTE(review): the notebook warns not to delete this cell; presumably a later
# step depends on this working directory - confirm which one.
%cd /home/idies/workspace/turb/data02_02/ncar-high-rate-fixed-dt

/home/idies/workspace/turb/data02_02/ncar-high-rate-fixed-dt


In [6]:
# Prepare one timestep for writing: open the raw netCDF file, group the
# velocity components, rename the scalar variables, split the volume into
# sub-cubes and compute the node assignment and Zarr encoding.
#
# Earlier runs wrapped this cell in a loop:
# for timestep_nr in timestep_range:
data_xr = xr.open_dataset(raw_ncar_folder_path + "/jhd." + str(timestep_nr).zfill(3) + ".nc")

# Group 3 velocity components together
# This fails with Dask bcs. of write permission error on SciServer Job
# Never use dask with remote location on this!!
# Honor the notebook-level `use_dask` switch instead of a hard-coded True,
# so that changing the configuration cell actually takes effect here.
merged_velocity = write_tools.merge_velocities(
    data_xr,
    chunk_size_base=chunk_size,
    use_dask=use_dask,
    n_dask_workers=n_dask_workers,
)

# Unabbreviate 'e', 'p', 't' variable names
merged_velocity = merged_velocity.rename({'e': 'energy', 't': 'temperature', 'p': 'pressure'})

# Split 2048^3 into smaller 512^3 arrays
dims = list(data_xr.dims)
dims.reverse() # use (nnz, nny, nnx) instead of (nnx, nny, nnz)

smaller_groups, _ = write_tools.split_zarr_group(merged_velocity, desired_cube_side, dims)

# Given up in favor of Ryan's node coloring technique
#     z_order = write_tools.morton_order_cube(cube_side=4)

node_assignments = write_tools.node_assignment(cube_side=4)

# Distribute them across FileDB
cubes = smaller_groups

# Uncompressed Zarr encoding: every variable is chunked as
# (chunk_size, chunk_size, chunk_size, <components>), where velocity has 3
# components and each scalar field has 1.
spatial_chunks = (chunk_size, chunk_size, chunk_size)
components_per_variable = {
    "velocity": 3,
    "pressure": 1,
    "temperature": 1,
    "energy": 1,
}
encoding = {
    variable: dict(chunks=spatial_chunks + (n_components,), compressor=None)
    for variable, n_components in components_per_variable.items()
}

print('Done preparing data. Starting to write...')

Done preparing data. Starting to write...


In [11]:
# Do not use Dask here! Always Kernel Died, even with only 2 workers
# The sub-cubes are therefore written one after another in this loop.

tasks = []

# The zero-padded timestep suffix is identical for every store in this pass.
timestep_suffix = "_" + str(timestep_nr).zfill(3) + ".zarr"

for i, plane in enumerate(cubes):
    for j, row in enumerate(plane):
        for k, current_array in enumerate(row):
            # ryan's node assignment accounts for nr. nodes on filedb,
            # so no `% len(folders)` wrap-around is applied.
            filedb_index = node_assignments[i][j][k]

            # Linear index of this sub-cube (strides 16 / 4 / 1).
            chunk_nr = 16 * i + 4 * j + k

            # turb/data02_02/sabl2048b_prod/ + sabl2048b + 05 + _ + 001.zarr
            store_name = dest_folder_name + str(chunk_nr + 1).zfill(2) + timestep_suffix
            # NOTE(review): the `- 1` suggests node assignments are 1-based - confirm.
            dest_groupname = os.path.join(folders[filedb_index - 1], store_name)

            write_tools.write_to_disk(dest_groupname, current_array, encoding)

Done writing /home/idies/workspace/turb/data01_01/zarr/sabl2048b_01_prod/sabl2048b01_001.zarr
Done writing /home/idies/workspace/turb/data02_01/zarr/sabl2048b_02_prod/sabl2048b02_001.zarr
Done writing /home/idies/workspace/turb/data03_01/zarr/sabl2048b_03_prod/sabl2048b03_001.zarr
Done writing /home/idies/workspace/turb/data04_01/zarr/sabl2048b_04_prod/sabl2048b04_001.zarr
Done writing /home/idies/workspace/turb/data05_01/zarr/sabl2048b_05_prod/sabl2048b05_001.zarr
Done writing /home/idies/workspace/turb/data06_01/zarr/sabl2048b_06_prod/sabl2048b06_001.zarr
Done writing /home/idies/workspace/turb/data07_01/zarr/sabl2048b_07_prod/sabl2048b07_001.zarr
Done writing /home/idies/workspace/turb/data08_01/zarr/sabl2048b_08_prod/sabl2048b08_001.zarr
Done writing /home/idies/workspace/turb/data09_01/zarr/sabl2048b_09_prod/sabl2048b09_001.zarr
Done writing /home/idies/workspace/turb/data10_01/zarr/sabl2048b_10_prod/sabl2048b10_001.zarr
Done writing /home/idies/workspace/turb/data11_01/zarr/sabl2