## Convert all NCAR netCDF timestep files to 512-side Zarr arrays with grouped velocity components and (64, 64, 64) chunks, distributed round-robin across FileDB nodes (spatially, in Z-order)

<font color="red">Older Dask versions raise the error described in https://github.com/dask/distributed/issues/3955</font>

<font color='orange'>Note: be careful when setting Dask's `local_directory` — pointing it at a remote server (e.g. Temporary) will HUGELY slow down functions</font>

<font color='cyan'>The parallel version needs a Large job</font>

<font color='gold'>TODO: fix `MemoryError: Unable to allocate 32.0 GiB for an array with shape (2048, 2048, 2048) and data type float32`, raised when looping over multiple timesteps</font>

In [1]:
# --- Grid geometry ---
array_cube_side = 2048   # side of the full input cube (arrays are 2048 x 2048 x 2048)
desired_cube_side = 512  # side of each output Zarr cube
chunk_size = 64          # Zarr chunk side along each spatial axis

# --- Paths and naming ---
raw_ncar_folder_path = '/home/idies/workspace/turb/data02_02/ncar-high-rate-fixed-dt'
dask_local_dir = '/home/idies/workspace/turb/data02_02'
dest_folder_name = "sabl2048b"  # the trailing "b" marks the high-rate data
write_type = "prod"  # use "back" for the backup copy

# --- Parallelism ---
use_dask = True
n_dask_workers = 4  # Dask workers used for rechunking
num_threads = 34  # threads used for writing to FileDB

# Per-variable Zarr encoding: uncompressed chunks of chunk_size^3 points.
# The last chunk dimension is the number of components stored per grid
# point (3 for the grouped velocity, 1 for every scalar field).
encoding = {
    variable: dict(
        chunks=(chunk_size, chunk_size, chunk_size, n_components),
        compressor=None,
    )
    for variable, n_components in (
        ("velocity", 3),
        ("pressure", 1),
        ("temperature", 1),
        ("energy", 1),
    )
}

# Timestep processed by this run of the notebook.
timestep_nr = 0
# Looping over several timesteps (e.g. timestep_range = range(1)) is
# disabled: it fails with "MemoryError: Unable to allocate 32.0 GiB for an
# array with shape (2048, 2048, 2048) and data type float32".

In [2]:
# Switch to the zarr_writing code directory before the imports in the next
# cell; presumably `utils` (imported there) lives in this directory - confirm.
%cd /home/idies/workspace/Storage/ariel4/persistent/ncar-zarr-code/zarr_writing

[Errno 2] No such file or directory: '/home/idies/workspace/Storage/ariel4/persistent/ncar-zarr-code/zarr_writing'
/Users/ariellubonja/prog/ncar-zarr-code/zarr_writing


In [3]:
from utils import write_tools
import os
import threading
import queue

### Get target Folder list

In [4]:
# Start from the folder list reported by write_tools.
folders = write_tools.list_fileDB_folders()

# Leave out data09_02 and data07_02 - they were too full as of May 2023.
for full_node in (
    "/home/idies/workspace/turb/data09_02/zarr/",
    "/home/idies/workspace/turb/data07_02/zarr/",
):
    folders.remove(full_node)

# Extend every entry in place with "<dest_folder_name>_<NN>_<write_type>/",
# where NN is the entry's 1-based position, zero-padded to two digits.
for position, base_path in enumerate(folders, start=1):
    folders[position - 1] = f"{base_path}{dest_folder_name}_{position:02d}_{write_type}/"

# folders[:5]

# Create top-level dirs

# for folder_path in folders:
#     os.makedirs(folder_path, exist_ok=False)

<font color="orange">Don't delete the CD cell!</font>

In [5]:
# Work from the raw NCAR data folder (the same path as raw_ncar_folder_path).
# A later cell passes a bare "jhd.NNN.nc" file name to
# write_tools.prepare_data, so that name presumably resolves against this
# directory - which is why this cell must stay.
%cd /home/idies/workspace/turb/data02_02/ncar-high-rate-fixed-dt

[Errno 2] No such file or directory: '/home/idies/workspace/turb/data02_02/ncar-high-rate-fixed-dt'
/Users/ariellubonja/prog/ncar-zarr-code/zarr_writing


In [6]:
import xarray as xr

# Open the netCDF file for the selected timestep; the file name pattern is
# jhd.NNN.nc with the timestep number zero-padded to three digits.
# NOTE(review): this reads from ~/Downloads, whereas the prepare_data call
# further down is given a bare file name relative to the working directory -
# confirm which copy is meant to be used.
timestep_file = f"~/Downloads/jhd.{timestep_nr:03d}.nc"
data_xr = xr.open_dataset(timestep_file)

# Last expression of the cell: display the dataset summary.
data_xr

In [13]:
import dask.array as da

# Stack a one-element list holding the 'u' variable along a new axis 3, so
# the result gains a trailing length-1 dimension (the shape shown in the
# next cell's output is (2048, 2048, 2048, 1)).
b = da.stack([data_xr['u']], axis=3)

In [14]:
# Display the array summary (size, shape, chunking) before rechunking.
b

Unnamed: 0,Array,Chunk
Bytes,32.00 GiB,127.36 MiB
Shape,"(2048, 2048, 2048, 1)","(322, 322, 322, 1)"
Dask graph,343 chunks in 2 graph layers,343 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 32.00 GiB 127.36 MiB Shape (2048, 2048, 2048, 1) (322, 322, 322, 1) Dask graph 343 chunks in 2 graph layers Data type float32 numpy.ndarray",2048  1  1  2048  2048,

Unnamed: 0,Array,Chunk
Bytes,32.00 GiB,127.36 MiB
Shape,"(2048, 2048, 2048, 1)","(322, 322, 322, 1)"
Dask graph,343 chunks in 2 graph layers,343 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


In [15]:
# Rechunk to the configured chunk size along the three spatial axes, keeping
# the trailing length-1 axis as a single chunk. chunk_size (config cell) is
# used instead of a literal 64 so this cannot drift away from the chunk
# shapes declared in `encoding`.
b = b.rechunk((chunk_size, chunk_size, chunk_size, 1))

In [16]:
# Display the array summary again to check the new chunk layout.
b

Unnamed: 0,Array,Chunk
Bytes,32.00 GiB,1.00 MiB
Shape,"(2048, 2048, 2048, 1)","(64, 64, 64, 1)"
Dask graph,32768 chunks in 3 graph layers,32768 chunks in 3 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 32.00 GiB 1.00 MiB Shape (2048, 2048, 2048, 1) (64, 64, 64, 1) Dask graph 32768 chunks in 3 graph layers Data type float32 numpy.ndarray",2048  1  1  2048  2048,

Unnamed: 0,Array,Chunk
Bytes,32.00 GiB,1.00 MiB
Shape,"(2048, 2048, 2048, 1)","(64, 64, 64, 1)"
Dask graph,32768 chunks in 3 graph layers,32768 chunks in 3 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


In [6]:
# Hand the selected timestep's file (jhd.NNN.nc, a bare name resolved against
# the current working directory) to write_tools.prepare_data, which yields
# the cubes and the matching list of coordinate ranges.
timestep_filename = f"jhd.{timestep_nr:03d}.nc"
cubes, range_list = write_tools.prepare_data(timestep_filename)

Done preparing data. Starting to write...


In [15]:
# Replace `cubes` with its flattened form (presumably a 3-D nested list turned
# into a flat list - confirm in write_tools). NOTE: this overwrites `cubes`,
# so re-running the cell passes the already-flattened value back in.
cubes = write_tools.flatten_3d_list(cubes)
# Flattened node assignment, later indexed by (chunk number - 1) when queueing
# writes. The literal 4 presumably equals array_cube_side // desired_cube_side
# (2048 // 512) - TODO confirm against write_tools.node_assignment.
flattened_node_assgn = write_tools.flatten_3d_list(write_tools.node_assignment(4))

In [None]:
# Build the mapping that the queueing cell searches by value, using a
# (min Morton code, max Morton code) tuple, to recover each cube's chunk name.
chunk_morton_mapping = write_tools.get_chunk_morton_mapping(range_list, dest_folder_name)

In [16]:
# Work queue of (array, destination, encoding) tuples, one per cube, consumed
# by the writer threads started in the next cell.
q = queue.Queue()


# Populate the queue with Write to FileDB tasks
for i, cube_range in enumerate(range_list):
    # Every element of cube_range is indexed as a (start, stop) pair; the -1
    # turns the stop value into the last coordinate contained in the cube
    # (presumably the ranges are half-open - confirm in prepare_data).
    min_coord = [a[0] for a in cube_range]
    max_coord = [a[1] - 1 for a in cube_range]

    # Morton codes of the cube's two opposite corners. The coordinates are
    # passed in reversed order (index 2, then 1, then 0).
    morton = (
        write_tools.morton_pack(array_cube_side, min_coord[2], min_coord[1], min_coord[0]),
        write_tools.morton_pack(array_cube_side, max_coord[2], max_coord[1], max_coord[0]),
    )

    chunk_name = write_tools.search_dict_by_value(chunk_morton_mapping, morton)
    if chunk_name is None:
        # Presumably the helper returns None when nothing matches (TODO
        # confirm). Fail with a readable message instead of the TypeError
        # that slicing None would raise below.
        raise KeyError(f"No chunk name found for Morton range {morton} (cube {i})")

    # The chunk number is taken from the last two characters of the name.
    # int() accepts leading zeros, so no stripping is needed (stripping
    # turned "00" into "" and made int() raise).
    idx = int(chunk_name[-2:])

    # idx is used 1-based here, and the assigned node number is shifted by
    # one to index `folders`.
    filedb_index = flattened_node_assgn[idx - 1] - 1

    destination = os.path.join(
        folders[filedb_index],
        f"{dest_folder_name}{idx:02d}_{timestep_nr:03d}.zarr",
    )

    current_array = cubes[i]

    q.put((current_array, destination, encoding))

In [None]:
# Build one writer thread per configured slot; each runs
# write_tools.write_to_disk with the shared task queue as its only argument.
threads = [
    threading.Thread(target=write_tools.write_to_disk, args=(q,))
    for _ in range(num_threads)
]
for worker in threads:
    worker.start()

# Block until every queued task has been marked done. Queue.join() only
# returns once task_done() has been called for each item, so this relies on
# write_to_disk doing that - presumably it does; confirm in write_tools.
q.join()

# Then wait for the worker threads themselves to exit.
for worker in threads:
    worker.join()