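This notebook creates a zarr dataset from all the netCDF files stored in the directory `'/storage/GOES/orthorectified/Fog2022_withtime/'`. The final zarr dataset is written to the path `'/storage/GOES/orthorectified/Fog2022_withtime.zarr'`; it is re-chunked from an intermediate zarr store (`'/storage/GOES/orthorectified/Fog2022_withtime_tmp.zarr'`). The saved zarr dataset is optimized for access along the time index: each chunk holds the full time series for a small block of pixels, which makes it very quick to grab time series for individual pixels. 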

In [1]:
import os
import glob
import shutil
import xarray as xr
import zarr
from dask.distributed import Client, LocalCluster

In [2]:
# Directory that is globbed for the input `*.nc` files
fixed_image_folder = '/storage/GOES/orthorectified/Fog2022_withtime/'
# Destination of the final zarr store; any existing store at this path is removed before writing
zarr_output_path = '/storage/GOES/orthorectified/Fog2022_withtime.zarr'

In [None]:
# Start a local Dask cluster and attach a client to it, so that progress can be
# followed on the Dask dashboard while the dataset is processed.
workers = 6   # number of worker processes
threads = 2   # threads per worker

# NOTE(review): `ip_addres` is not used anywhere in this cell; presumably it is the
# host to browse to (host + port) to reach the dashboard -- confirm or remove.
ip_addres = 'http://j-lundquist-3.ce.washington.edu'
port=':8787'  # address the dashboard is served on

cluster = LocalCluster(
    n_workers=workers,
    threads_per_worker=threads,
    dashboard_address=port,
)
client = Client(cluster)

In [16]:
# Grab all the netcdf files and sort them by the order of the start time when the images were taken
def get_start_date_from_G17_filename(s):
    """Extract the start-time stamp from a GOES-17 file name or path.

    The stamp is the text that follows the first ``'_G17_s'`` marker in ``s``,
    up to (but not including) the next underscore, e.g. ``'20221210001177'``.
    It is returned as a string, so sorting on it is lexicographic.

    Raises:
        IndexError: if ``s`` does not contain the ``'_G17_s'`` marker.
    """
    after_marker = s.split('_G17_s')[1]
    start_stamp, _, _ = after_marker.partition('_')
    return start_stamp

# Collect every netCDF file in the input folder, then order the list in place
# by the start-time stamp embedded in each file name.
nc_files = glob.glob(os.path.join(fixed_image_folder, '*.nc'))
nc_files.sort(key=get_start_date_from_G17_filename)

In [18]:
# Sanity check: the first files should be from May 1st, the 121st day of the year,
# so the start-time field in their names should begin with "s2022121"
nc_files[:10]

['/storage/GOES/orthorectified/Fog2022_withtime/OR_ABI-L2-ACHAC-M6_G17_s20221210001177_e20221210003550_c20221210007095_o.nc',
 '/storage/GOES/orthorectified/Fog2022_withtime/OR_ABI-L2-ACHAC-M6_G17_s20221210006177_e20221210008550_c20221210012545_o.nc',
 '/storage/GOES/orthorectified/Fog2022_withtime/OR_ABI-L2-ACHAC-M6_G17_s20221210011177_e20221210013550_c20221210016212_o.nc',
 '/storage/GOES/orthorectified/Fog2022_withtime/OR_ABI-L2-ACHAC-M6_G17_s20221210016177_e20221210018550_c20221210022416_o.nc',
 '/storage/GOES/orthorectified/Fog2022_withtime/OR_ABI-L2-ACHAC-M6_G17_s20221210021177_e20221210023550_c20221210026311_o.nc',
 '/storage/GOES/orthorectified/Fog2022_withtime/OR_ABI-L2-ACHAC-M6_G17_s20221210026177_e20221210028550_c20221210031390_o.nc',
 '/storage/GOES/orthorectified/Fog2022_withtime/OR_ABI-L2-ACHAC-M6_G17_s20221210031177_e20221210033550_c20221210037043_o.nc',
 '/storage/GOES/orthorectified/Fog2022_withtime/OR_ABI-L2-ACHAC-M6_G17_s20221210036177_e20221210038550_c20221210042054

In [17]:
# Sanity check: the last files should be from September 30th, the 273rd day of the year,
# so the start-time field in their names should begin with "s2022273"
nc_files[-10:]

['/storage/GOES/orthorectified/Fog2022_withtime/OR_ABI-L2-ACHAC-M6_G17_s20222732306176_e20222732308549_c20222732311046_o.nc',
 '/storage/GOES/orthorectified/Fog2022_withtime/OR_ABI-L2-ACHAC-M6_G17_s20222732311176_e20222732313550_c20222732316375_o.nc',
 '/storage/GOES/orthorectified/Fog2022_withtime/OR_ABI-L2-ACHAC-M6_G17_s20222732316176_e20222732318549_c20222732322081_o.nc',
 '/storage/GOES/orthorectified/Fog2022_withtime/OR_ABI-L2-ACHAC-M6_G17_s20222732321176_e20222732323549_c20222732326046_o.nc',
 '/storage/GOES/orthorectified/Fog2022_withtime/OR_ABI-L2-ACHAC-M6_G17_s20222732326176_e20222732328549_c20222732331338_o.nc',
 '/storage/GOES/orthorectified/Fog2022_withtime/OR_ABI-L2-ACHAC-M6_G17_s20222732331177_e20222732333549_c20222732336418_o.nc',
 '/storage/GOES/orthorectified/Fog2022_withtime/OR_ABI-L2-ACHAC-M6_G17_s20222732336177_e20222732338549_c20222732341278_o.nc',
 '/storage/GOES/orthorectified/Fog2022_withtime/OR_ABI-L2-ACHAC-M6_G17_s20222732341177_e20222732343550_c20222732346022

In [19]:
# Lazily open every netCDF file as one combined, dask-backed dataset.
# Note: although a chunk size of 500 is requested along `time`, the resulting
# dask chunks still hold a single time step each (see the `HT` repr below).
ds = xr.open_mfdataset(
    nc_files,
    chunks={'time': 500},
)

In [29]:
# Inspect the dask-backed `HT` variable: total size, shape and current chunk layout
ds['HT']

Unnamed: 0,Array,Chunk
Bytes,12.25 GiB,315.06 kiB
Shape,"(40785, 284, 284)","(1, 284, 284)"
Dask graph,40785 chunks in 81571 graph layers,40785 chunks in 81571 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 12.25 GiB 315.06 kiB Shape (40785, 284, 284) (1, 284, 284) Dask graph 40785 chunks in 81571 graph layers Data type float32 numpy.ndarray",284  284  40785,

Unnamed: 0,Array,Chunk
Bytes,12.25 GiB,315.06 kiB
Shape,"(40785, 284, 284)","(1, 284, 284)"
Dask graph,40785 chunks in 81571 graph layers,40785 chunks in 81571 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


In [31]:
ds['HT'].data.shape # These are the dimensions of the dask array backing the DataArray

(40785, 284, 284)

In [32]:
# Reference: dask's rechunk documentation
# https://docs.dask.org/en/stable/generated/dask.array.rechunk.html
#
# The chunk specification is keyed by axis number:
#   0: -1                -> do not split axis 0 (time) at all, so every chunk spans
#                           all ~40 thousand time steps
#   1: 'auto', 2: 'auto' -> let dask choose the chunk length along the two spatial
#                           axes, subject to `block_size_limit`
# balance=True asks dask to make the resulting chunks uniform in size.
arr = ds['HT'].data.rechunk(
    chunks={0:-1, 1:'auto', 2:'auto'},
    block_size_limit=1e8,
    balance=True,
)

# Keep the length of the first chunk along each axis; these sizes are reused
# when the dataset is re-opened and written out with the new chunking.
t, y, x = (axis_chunks[0] for axis_chunks in arr.chunks)

In [34]:
# Remove anything that may exist in this output path.
# The existence check keeps a first run (no previous output) from failing with
# FileNotFoundError, while a genuine deletion failure still raises.
if os.path.exists(zarr_output_path):
    shutil.rmtree(zarr_output_path)

In [35]:
# NOTE(review): `tmp_zarr_output_path` was referenced here without being defined
# anywhere in the visible notebook, so this cell failed with NameError on a fresh
# kernel. The path below is assumed to be the intermediate (un-rechunked) zarr
# store -- confirm that it exists and is the intended input.
tmp_zarr_output_path = '/storage/GOES/orthorectified/Fog2022_withtime_tmp.zarr'

# Re-open the intermediate store lazily, using the chunk sizes computed by the
# rechunk step (full time axis per chunk, small spatial tiles).
ds = xr.open_dataset(
    tmp_zarr_output_path,
    chunks={'time': t, 'latitude': y, 'longitude': x},
    engine='zarr'
)

# Replace the encoding of HT so the on-disk zarr chunks use the same
# (time, latitude, longitude) chunk sizes as the dask chunks above.
ds['HT'].encoding = {'chunks': (t, y, x)}

# Write the re-chunked dataset to the final store
ds.to_zarr(zarr_output_path)



<xarray.backends.zarr.ZarrStore at 0x7fccfd0499a0>

In [36]:
# Inspect the store that was just written: print the group hierarchy, followed by
# the storage details of the HT array (chunk shape, compressor, bytes stored, ...)
source_group = zarr.open(zarr_output_path)
source_array = source_group['HT']
print(source_group.tree(), source_array.info, sep='\n')

# Drop the references to the opened store once the report has been printed
del source_group, source_array

/
 ├── HT (40785, 284, 284) float32
 ├── latitude (284,) float64
 ├── longitude (284,) float64
 ├── spatial_ref (40785,) int64
 └── time (40785,) int64
Name               : /HT
Type               : zarr.core.Array
Data type          : float32
Shape              : (40785, 284, 284)
Chunk shape        : (40785, 24, 24)
Order              : C
Read-only          : False
Compressor         : Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)
Store type         : zarr.storage.DirectoryStore
No. bytes          : 13158219840 (12.3G)
No. bytes stored   : 124624822 (118.9M)
Storage ratio      : 105.6
Chunks initialized : 144/144

