In [1]:
import numpy as np
import xarray as xr
import pandas as pd
import numcodecs
from dask.distributed import Client, progress, LocalCluster
import zarr
import glob
from datetime import datetime
import time

def generate_file_list(start_doy, end_doy, year, prefix=None):
    """
    Build the list of NetCDF file paths for a run of days within one year.

    Each day is looked up as ``{prefix}/{year}/{doy}/*.nc`` where ``doy`` is
    the day of year zero-padded to three digits (``001`` ... ``366``). One
    file is taken per daily directory.

    Parameters
    ----------
    start_doy : int
        First day of year to include (inclusive).
    end_doy : int
        Day of year at which to stop (exclusive, same convention as
        ``range``). Pass ``366`` to cover a 365-day year in full.
    year : int or str
        Year used as the directory component after the prefix.
    prefix : str, optional
        Root directory holding the per-year directories. When omitted, the
        module-level ``netcdf_prefix`` is used, which must then be defined
        before the call.

    Returns
    -------
    list of str
        One path per day, in day order. Empty when ``start_doy >= end_doy``.

    Raises
    ------
    IndexError
        If a daily directory contains no ``*.nc`` file (or does not exist).
    """
    if prefix is None:
        # Looked up at call time so the config cell may be (re)run after this
        # function has been defined.
        prefix = netcdf_prefix
    fileObjs = []
    for doy in range(start_doy, end_doy):
        pattern = f"{prefix}/{year}/{doy:03d}/*.nc"
        # glob returns matches in arbitrary order; sort so that the chosen
        # file is deterministic if a directory ever holds more than one.
        matches = sorted(glob.glob(pattern))
        if not matches:
            # Same exception type as indexing an empty list, but the message
            # names the day that is missing.
            raise IndexError(f"no NetCDF file matches {pattern}")
        fileObjs.append(matches[0])
    return fileObjs

# Fixed settings for this run (candidates for real configuration later).
# Root directory of the source NetCDF files, laid out as <year>/<doy>/*.nc.
netcdf_prefix = "/fsx/eodc/mursst_netcdf"
# Target chunk length along each dimension of the Zarr store.
chunks = dict(time=5, lat=1799, lon=3600)
# Chunk sizes joined into a label, i.e. '5x1799x3600'.
path = 'x'.join(str(size) for size in chunks.values())
# Would be a CLI argument: directory of the Zarr store to read and append to.
store_dir = "/fsx/eodc/mursst_zarr/2002-2004"
# Switch off Blosc's own threading; presumably because the Dask workers
# already provide the parallelism -- confirm.
numcodecs.blosc.use_threads = False
print(f"zarr store directory: {store_dir}")

zarr store directory: /fsx/eodc/mursst_zarr/2002-2004


In [3]:
# Start a Dask cluster on this machine with four workers and connect a client
# to it. The xarray calls in later cells that pass parallel=True presumably
# run on this client -- confirm.
cluster = LocalCluster(n_workers=4)
client = Client(cluster)
# Printing the client shows the scheduler address and the worker resources.
print(f"Dask client {client}")

Dask client <Client: 'tcp://127.0.0.1:39705' processes=4 threads=64, memory=264.58 GB>


In [21]:
# Open the existing Zarr store using its consolidated metadata; as the last
# expression of the cell, the dataset is displayed as the cell output.
# NOTE(review): this cell is In [21] but sits above In [20] -- the notebook was
# executed out of order, so re-check it under Restart & Run All.
xr.open_zarr(store_dir, consolidated=True)

In [20]:
%%time
# Loop and append
# Command Line Argument - year, start_day, number_batches_to_append, batch_size
year = 2005
start_doy = 6
end_doy = start_doy
number_batches_to_append = 72
batch_size = 5
final_end_doy = start_doy + (number_batches_to_append * batch_size)

while start_doy < final_end_doy:
    end_doy = start_doy + batch_size
    end_doy = min(367, end_doy)
    fileObjs = generate_file_list(start_doy, end_doy, year)
    first_file = fileObjs[0].split('/')[-1]
    last_file = fileObjs[-1].split('/')[-1]
    print(f"start doy: {start_doy}, starting file: {first_file}")
    print(f"end doy: {end_doy}, ending file: {last_file}")
    args = {'consolidated': True}
    # Either append or initiate store
    if start_doy == 152 and year == 2002:
        ds = xr.open_mfdataset(fileObjs, parallel=True, combine='by_coords', mask_and_scale=False)
        ds = ds.chunk(chunks)       
        args['mode'] = 'w'
    else:
        # Check here that the next day we will append is the next day in the year
        current_ds = xr.open_zarr(store_dir, consolidated=True)
        next_day = current_ds.time[-1].values + np.timedelta64(1, 'D')
        next_day_str = str(next_day)[0:10].replace('-', '') 
        if not (first_file[0:8] == next_day_str):
            raise Exception("starting file is not the next day of the year")
            break
        ds = xr.open_mfdataset(fileObjs, parallel=True, combine='by_coords')
        ds = ds.chunk(chunks)        
        args['mode'] = 'a'
        args['append_dim'] = 'time'
    #ds.to_zarr(store_dir, **args)
    start_doy = end_doy
    print(f"Done with this batch")
    print()

start doy: 6, starting file: 20050106090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc
end doy: 11, ending file: 20050110090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc
Done with this batch

start doy: 11, starting file: 20050111090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc
end doy: 16, ending file: 20050115090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc
Done with this batch

start doy: 16, starting file: 20050116090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc
end doy: 21, ending file: 20050120090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc
Done with this batch

start doy: 21, starting file: 20050121090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc
end doy: 26, ending file: 20050125090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc
Done with this batch

start doy: 26, starting file: 20050126090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc
end doy: 31, ending file: 20050130090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc
Done with this batch

s

In [25]:
%%time
time_slice = slice(datetime.strptime(f"2005-12-21", '%Y-%m-%d'), datetime.strptime(f"2005-12-31", '%Y-%m-%d'))

fileObjs = generate_file_list(152, 366, 2002)
fileObjs_2003 = generate_file_list(1, 366, 2003)
fileObjs_2004 = generate_file_list(1, 367, 2004)
fileObjs_2005 = generate_file_list(1, 366, 2005)
fileObjs.extend(fileObjs_2003)
fileObjs.extend(fileObjs_2004)
fileObjs.extend(fileObjs_2005)
print(f"starting file: {fileObjs[0].split('/')[-1]}")
print(f"ending file: {fileObjs[-1].split('/')[-1]}")          
ds_netcdf = xr.open_mfdataset(fileObjs, parallel=True, combine='by_coords', mask_and_scale=False)
ds_zarr = xr.open_zarr(store_dir, consolidated=True, mask_and_scale=False)
assert(ds_netcdf.dims == ds_zarr.dims)

starting file: 20020601090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc
ending file: 20051231090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc
CPU times: user 17.8 s, sys: 2.78 s, total: 20.6 s
Wall time: 3min 56s


In [26]:
%%time
print(ds_netcdf.where(ds_netcdf.analysed_sst != -32768).analysed_sst[1300:1310,:,:].sel(lat=slice(40,50),lon=slice(-170,-160)).mean().values)
ds_netcdf.where(ds_netcdf.analysed_sst != -32768).analysed_sst.sel(time=time_slice).mean().values

-16808.569164701432
CPU times: user 2.04 s, sys: 139 ms, total: 2.17 s
Wall time: 45.6 s


array(-11383.50954429)

In [27]:
%%time
print(ds_zarr.where(ds_zarr.analysed_sst != -32768).analysed_sst[1300:1310,:,:].sel(lat=slice(40,50),lon=slice(-170,-160)).mean().values)
ds_zarr.where(ds_zarr.analysed_sst != -32768).analysed_sst.sel(time=time_slice).mean().values

-16808.569164701432
CPU times: user 3.96 s, sys: 289 ms, total: 4.25 s
Wall time: 7.07 s


array(-11383.50954429)

In [28]:
# Echo the store location the cells above read from.
# NOTE(review): the directory is named 2002-2004, yet the validation cell
# compares it against files through 2005 -- confirm the name is still accurate.
store_dir

'/fsx/eodc/mursst_zarr/2002-2004'