## Convert all netCDF NCAR timestep files to Zarr 512 arrays, with Grouped Velocity components, with (64,64,64) chunk size, round-robined across FileDB nodes (spatially using Z-order)

In [1]:
desired_cube_side = 512
chunk_size = 64
raw_ncar_folder_path = '/home/idies/workspace/turb/data02_02/ariel-6-timestep-ncar-netCDF'

In [None]:
!pip install "dask[complete]"
!pip install "xarray[complete]"

In [2]:
import xarray as xr
import os
import write_tools
import math

<font color="orange">Don't delete the CD cell!</font>

In [3]:
%cd /home/idies/workspace/turb/data02_02/ariel-6-timestep-ncar-netCDF

/home/idies/workspace/turb/data02_02/ariel-6-timestep-ncar-netCDF


In [4]:
data_xr = xr.open_dataset("/home/idies/workspace/turb/data02_02/ariel-6-timestep-ncar-netCDF/jhd.000.nc")
data_xr.info

<bound method Dataset.info of <xarray.Dataset>
Dimensions:  (nnx: 2048, nny: 2048, nnz: 2048)
Dimensions without coordinates: nnx, nny, nnz
Data variables:
    u        (nnz, nny, nnx) float32 ...
    v        (nnz, nny, nnx) float32 ...
    w        (nnz, nny, nnx) float32 ...
    t        (nnz, nny, nnx) float32 ...
    p        (nnz, nny, nnx) float32 ...
    e        (nnz, nny, nnx) float32 ...
Attributes:
    Simulation conducted by:  Peter Sullivan, pps@ucar.edu
    Dataset built by:         Edward Patton, patton@ucar.edu
    Affiliation:              NCAR/MMM
    Code:                     NCAR-LES
    Case Description:         GABLS 1, Cooling Rate: 0.25 K/hr
    Created:                  2023-05-02 10:25:55 -0600 UTC>

## Group 3 velocity components together

<font color="red">Old Dask version gives this error https://github.com/dask/distributed/issues/3955</font>

In [None]:
merged_velocity = write_tools.merge_velocities(data_xr, chunk_size_base=chunk_size, use_dask=False)

## Unabbreviate 'e', 'p', 't' variable names

In [15]:
merged_velocity = merged_velocity.rename({'e': 'energy', 't': 'temperature', 'p': 'pressure'})

In [16]:
merged_velocity

Unnamed: 0,Array,Chunk
Bytes,25.17 MB,3.15 MB
Shape,"(128, 128, 128, 3)","(64, 64, 64, 3)"
Count,38 Tasks,8 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 25.17 MB 3.15 MB Shape (128, 128, 128, 3) (64, 64, 64, 3) Count 38 Tasks 8 Chunks Type float32 numpy.ndarray",128  1  3  128  128,

Unnamed: 0,Array,Chunk
Bytes,25.17 MB,3.15 MB
Shape,"(128, 128, 128, 3)","(64, 64, 64, 3)"
Count,38 Tasks,8 Chunks
Type,float32,numpy.ndarray


### Split 2048^3 into smaller 512^3 arrays

In [8]:
dims = [dim for dim in data_xr.dims]
smaller_groups = write_tools.split_zarr_group(merged_velocity, 64, dims)

In [10]:
# smaller_groups should be a cube (list of lists of lists)
smaller_groups[0][0][0]

Unnamed: 0,Array,Chunk
Bytes,3.15 MB,3.15 MB
Shape,"(64, 64, 64, 3)","(64, 64, 64, 3)"
Count,39 Tasks,1 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 3.15 MB 3.15 MB Shape (64, 64, 64, 3) (64, 64, 64, 3) Count 39 Tasks 1 Chunks Type float32 numpy.ndarray",64  1  3  64  64,

Unnamed: 0,Array,Chunk
Bytes,3.15 MB,3.15 MB
Shape,"(64, 64, 64, 3)","(64, 64, 64, 3)"
Count,39 Tasks,1 Chunks
Type,float32,numpy.ndarray


## Z-order the smaller Arrays

In [None]:
smaller_groups_count = len(smaller_groups)

# Z-order the cube of points so they "linearize" far from each other
cube_root = math.pow(smaller_groups_count, 1/3)

## Distribute them across FileDB

### Get target Folder list

In [4]:
folders=write_tools.list_fileDB_folders()

# Avoiding 7-2 and 9-2 - they're too full as of May 2023
# folders.remove("/home/idies/workspace/turb/data02_02/zarr/ncar-zarr/")
folders.remove("/home/idies/workspace/turb/data09_02/zarr/")
folders.remove("/home/idies/workspace/turb/data07_02/zarr/") # This is already created

folders[:5]

['/home/idies/workspace/turb/data01_01/zarr/',
 '/home/idies/workspace/turb/data02_01/zarr/',
 '/home/idies/workspace/turb/data03_01/zarr/',
 '/home/idies/workspace/turb/data04_01/zarr/',
 '/home/idies/workspace/turb/data05_01/zarr/']

In [None]:
i = 0

# Possible Parallel implementation issue:
# Ariel: possible contention on data_xr = xr.open_dataset(file_name)
#     bcs all source data live on data02_02
for file_name in os.listdir(raw_ncar_folder_path):
    if os.path.isfile(os.path.join(raw_ncar_folder_path, file_name)):
        # https://github.com/pangeo-data/pangeo/issues/150
        data_xr = xr.open_dataset(file_name)
    
        chunk_size_base = 64

        # Disable compression, set chunk size
        encoding = {variable_name: {'compressor': None, 'chunks': (chunk_size_base, chunk_size_base, chunk_size_base)} for variable_name in data_xr.variables}
        
        target_dir = folders[i]

        # overwrite if exists
        data_xr.to_zarr(store=target_dir + "ncar_" + str(i) + "_" + "chunk_" + str(chunk_size_base) + ".zarr",
                        mode="w",
                       encoding = encoding)
        
        print(file_name)
        i += 1
