In [2]:
import numpy as np
import xarray as xr
import pandas as pd
import numcodecs
from dask.distributed import Client, progress, LocalCluster
import zarr
import glob
from datetime import datetime
import time

def generate_file_list(start_doy, end_doy, data_dir=None):
    """
    Given a start day and an end day, generate a list of file locations.

    One file is returned for every day of year in
    ``range(start_doy, end_doy)``; ``end_doy`` itself is NOT included.

    Parameters
    ----------
    start_doy : int
        First day of year to include.
    end_doy : int
        Day of year to stop at (exclusive).
    data_dir : str, optional
        Directory holding one zero-padded, three-digit sub-directory per
        day of year (e.g. ``001``, ``152``). When omitted, the module-level
        ``prefix`` variable is used and must already be defined. Files are
        located with ``glob``, so this has to be a local filesystem path.

    Returns
    -------
    list of str
        For each day, the first ``*.nc`` path (in sorted order) found in
        that day's directory.

    Raises
    ------
    FileNotFoundError
        If a day's directory contains no ``*.nc`` file.
    """
    base_dir = prefix if data_dir is None else data_dir
    fileObjs = []
    for doy in range(start_doy, end_doy):
        pattern = f"{base_dir}/{doy:03d}/*.nc"
        # glob returns matches in arbitrary order; sort so the chosen file
        # is deterministic when a directory holds more than one.
        matches = sorted(glob.glob(pattern))
        if not matches:
            raise FileNotFoundError(
                f"no NetCDF file found for day of year {doy}: {pattern}"
            )
        fileObjs.append(matches[0])
    return fileObjs

# Invariants - but should be made configurable
year = 2002
prefix = f"/fsx/eodc/eodc/mursst_netcdf/{year}"
chunks = {
    'time': 5,
    'lat': 1799,
    'lon': 3600,
}
# The chunk sizes, joined with "x", become part of the store directory name.
path = 'x'.join(str(size) for size in chunks.values())
store_dir = f"/fsx/eodc/eodc/mursst_zarr/{path}_encoded"
numcodecs.blosc.use_threads = False
print(f"zarr store directory: {store_dir}")

zarr store directory: /fsx/eodc/eodc/mursst_zarr/5x1799x3600_encoded


In [3]:
# Start a local Dask cluster with 4 workers and attach a client to it.
cluster = LocalCluster(n_workers=4)
client = Client(cluster)
# Print the client summary.
print(f"Dask client {client}")

Dask client <Client: 'tcp://127.0.0.1:34197' processes=4 threads=32, memory=125.83 GB>


In [14]:
%%time
# Loop and append
#
# Writes the NetCDF files to the Zarr store in batches of `batch_size` days,
# starting at `start_doy`. Relies on `year`, `chunks`, `store_dir` and
# `generate_file_list` being defined by earlier cells.
start_doy = 362
end_doy = start_doy
number_batches_to_append = 1
batch_size = 4

# The store is created (mode 'w') only by the batch that starts on this
# day/year; every other batch is appended to the existing store.
initial_doy = 152
initial_year = 2002

# generate_file_list treats its end day as exclusive, so the largest valid
# end is one past the last day of the year (366, or 367 in a leap year).
year_end_doy = datetime(year, 12, 31).timetuple().tm_yday + 1
# Clamp the overall end as well; otherwise a run that crosses the end of the
# year would request an empty batch and fail on fileObjs[0].
final_end_doy = min(start_doy + (number_batches_to_append * batch_size), year_end_doy)

while start_doy < final_end_doy:
    end_doy = min(start_doy + batch_size, year_end_doy)
    fileObjs = generate_file_list(start_doy, end_doy)
    first_file = fileObjs[0].split('/')[-1]
    last_file = fileObjs[-1].split('/')[-1]
    print(f"start doy: {start_doy}, file: {first_file}")
    print(f"end doy: {end_doy}, file: {last_file}")
    args = {'consolidated': True}
    # Either append or initiate store
    if start_doy == initial_doy and year == initial_year:
        ds = xr.open_mfdataset(fileObjs, parallel=True, combine='by_coords', mask_and_scale=False)
        ds = ds.chunk(chunks)
        args['mode'] = 'w'
    else:
        # Check here that the next day we will append is the next day in the year.
        # File names start with the date as YYYYMMDD, which is compared with
        # the day after the last time step already in the store.
        current_ds = xr.open_zarr(store_dir, consolidated=True)
        next_day = current_ds.time[-1].values + np.timedelta64(1, 'D')
        next_day_str = str(next_day)[0:10].replace('-', '')
        if first_file[0:8] != next_day_str:
            raise ValueError(
                "starting file is not the next day of the year "
                f"(expected {next_day_str}, got {first_file[0:8]})"
            )
        # NOTE(review): the initial write opens the files with
        # mask_and_scale=False while this append path uses the default
        # decoding -- confirm the difference is intentional.
        ds = xr.open_mfdataset(fileObjs, parallel=True, combine='by_coords')
        ds = ds.chunk(chunks)
        args['mode'] = 'a'
        args['append_dim'] = 'time'
    ds.to_zarr(store_dir, **args)
    start_doy = end_doy
    print("Done with this batch")
    print()

start doy: 362, file: 20021228090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc
end doy: 366, file: 20021231090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc
Done with this batch

CPU times: user 4.5 s, sys: 467 ms, total: 4.96 s
Wall time: 50.9 s


In [15]:
# Re-open the consolidated store with mask_and_scale=False and display it.
ds_zarr = xr.open_zarr(store_dir, consolidated=True, mask_and_scale=False)
ds_zarr

## Test the output

Assuming we are using the chunk shape configured in `chunks` above (5x1799x3600 in this run; this note previously said 1x1799x3600).

In [5]:
%%time
# Time window (4 June to 8 June of `year`) used by the mean comparisons below.
date_format = '%Y-%m-%d'
time_slice = slice(
    datetime.strptime(f"{year}-06-04", date_format),
    datetime.strptime(f"{year}-06-08", date_format),
)

start_doy = 152
end_doy = 162

# Open the source NetCDF files for the same days without mask/scale decoding,
# so they can be compared with the store opened the same way.
fileObjs = generate_file_list(start_doy, end_doy)
first_name = fileObjs[0].split('/')[-1]
last_name = fileObjs[-1].split('/')[-1]
print(f"start doy: {start_doy}, file: {first_name}")
print(f"end doy: {end_doy}, file: {last_name}")
ds_netcdf = xr.open_mfdataset(
    fileObjs,
    chunks=chunks,
    parallel=True,
    combine='by_coords',
    mask_and_scale=False,
)
assert ds_netcdf.dims == ds_zarr.dims

start doy: 152, file: 20020601090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc
end doy: 162, file: 20020610090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc
CPU times: user 97.2 ms, sys: 20.2 ms, total: 117 ms
Wall time: 839 ms


In [6]:
# Same files as ds_netcdf, but opened with xarray's default decoding
# (no mask_and_scale=False).
ds_netcdf_masked = xr.open_mfdataset(
    fileObjs,
    chunks=chunks,
    parallel=True,
    combine='by_coords',
)

In [7]:
%%time
# Mean over time steps 5..7 within lat 0..50, lon -170..-110.
sst_region_decoded = ds_netcdf_masked.analysed_sst[5:8,:,:].sel(lat=slice(0,50),lon=slice(-170,-110))
print(sst_region_decoded.mean().values)
# Mean over the whole grid for the time_slice window (shown as the cell output).
ds_netcdf_masked.analysed_sst.sel(time=time_slice).mean().values

294.50433
CPU times: user 1.56 s, sys: 93.1 ms, total: 1.65 s
Wall time: 13.1 s


array(286.7686, dtype=float32)

In [10]:
%%time
# ds_netcdf was opened with mask_and_scale=False, so drop the -32768 entries
# and apply the 0.001 factor and 298.15 offset by hand.
sst_netcdf_valid = ds_netcdf.where(ds_netcdf.analysed_sst != -32768).analysed_sst
print(sst_netcdf_valid[5:8,:,:].sel(lat=slice(0,50),lon=slice(-170,-110)).mean().values * 0.001 + 298.15)
sst_netcdf_valid.sel(time=time_slice).mean().values * 0.001 + 298.15

294.5043725203148
CPU times: user 2.15 s, sys: 211 ms, total: 2.36 s
Wall time: 12 s


286.7688028626251

In [11]:
%%time
# ds_zarr was opened with mask_and_scale=False, so drop the -32768 entries
# and apply the 0.001 factor and 298.15 offset by hand.
# The mask is built from ds_zarr itself for both statistics, so this check
# does not depend on the NetCDF dataset it is being compared against.
sst_zarr_valid = ds_zarr.where(ds_zarr.analysed_sst != -32768).analysed_sst
print(sst_zarr_valid[5:8,:,:].sel(lat=slice(0,50),lon=slice(-170,-110)).mean().values * 0.001 + 298.15)
sst_zarr_valid.sel(time=time_slice).mean().values * 0.001 + 298.15

294.5043725203148
CPU times: user 2.89 s, sys: 222 ms, total: 3.11 s
Wall time: 12.3 s


286.7688028626251