In [1]:
import numpy as np
import xarray as xr
import pandas as pd
import numcodecs
from dask.distributed import Client, progress, LocalCluster
import zarr
import glob
from datetime import datetime
import time
import s3fs

In [2]:
cluster = LocalCluster(n_workers=4)
client = Client(cluster)
print(f"Dask client {client}")

Dask client <Client: 'tcp://127.0.0.1:33511' processes=4 threads=64, memory=535.16 GB>


In [3]:
def generate_file_list(start_doy, end_doy, year):   
    """
    Given a start day and end end day, generate a list of file locations.
    Assumes a 'prefix' and 'year' variables have already been defined.
    'Prefix' should be a local directory or http url and path.
    'Year' should be a 4 digit year.
    """
    days_of_year = list(range(start_doy, end_doy))
    fileObjs = []
    for doy in days_of_year:
        if doy < 10:
            doy = f"00{doy}"
        elif doy >= 10 and doy < 100:
            doy = f"0{doy}"            
        file = glob.glob(f"{netcdf_prefix}/{year}/{doy}/*.nc")[0]
        fileObjs.append(file)
    return fileObjs

In [4]:
# Invariants - but should be made configurable
netcdf_prefix = f"/s3fsx/eodc/mursst_netcdf"
chunks = {'time': 5, 'lat': 1799, 'lon': 3600}
path = 'x'.join(map(str, chunks.values()))
# CLI Argument - zarr directory
store_dir = f"/s3fsx/eodc/mursst_zarr/5x1799x3600"
numcodecs.blosc.use_threads = False
print(f"zarr store directory: {store_dir}")

zarr store directory: /s3fsx/eodc/mursst_zarr/5x1799x3600


In [9]:
%%time
ds_zarr = xr.open_zarr(store_dir, consolidated=True, mask_and_scale=False)
ds_zarr



CPU times: user 3.47 s, sys: 140 ms, total: 3.61 s
Wall time: 6.25 s


In [8]:
%%time
# Loop and append
# Command Line Argument - year, start_day, number_batches_to_append, batch_size
year = 2016
start_doy = 366
end_doy = start_doy
number_batches_to_append = 1
batch_size = 5
final_end_doy = start_doy + (number_batches_to_append * batch_size)

while start_doy < final_end_doy:
    end_doy = start_doy + batch_size
    end_doy = min(367, end_doy)
    fileObjs = generate_file_list(start_doy, end_doy, year)
    first_file = fileObjs[0].split('/')[-1]
    last_file = fileObjs[-1].split('/')[-1]
    print(f"start doy: {start_doy}, starting file: {first_file}")
    print(f"end doy: {end_doy}, ending file: {last_file}")
    args = {'consolidated': True}
    # Either append or initiate store
    if start_doy == 152 and year == 2002:
        ds = xr.open_mfdataset(fileObjs, parallel=True, combine='by_coords', mask_and_scale=False)
        ds = ds.chunk(chunks)       
        args['mode'] = 'w'
    else:
        # Check here that the next day we will append is the next day in the year
        current_ds = xr.open_zarr(store_dir, consolidated=True)
        next_day = current_ds.time[-1].values + np.timedelta64(1, 'D')
        next_day_str = str(next_day)[0:10].replace('-', '') 
        if not (first_file[0:8] == next_day_str):
            raise Exception("starting file is not the next day of the year")
            break
        drop_vars = ['dt_1km_data', 'sst_anomaly']
        ds = xr.open_mfdataset(fileObjs, parallel=True, combine='by_coords', drop_variables=drop_vars)
        ds = ds.chunk(chunks)        
        args['mode'] = 'a'
        args['append_dim'] = 'time'
    ds.to_zarr(store_dir, **args)
    start_doy = end_doy
    print(f"Done with this batch")
    print()

start doy: 366, starting file: 20161231090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc
end doy: 367, ending file: 20161231090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc




Done with this batch



IndexError: list index out of range

In [10]:
time_slice = slice(datetime.strptime(f"2016-12-21", '%Y-%m-%d'), datetime.strptime(f"2016-12-31", '%Y-%m-%d'))

In [11]:
%%time
fileObjs = generate_file_list(1, 367, 2016)
print(f"starting file: {fileObjs[0].split('/')[-1]}")
print(f"ending file: {fileObjs[-1].split('/')[-1]}")
drop_vars = ['dt_1km_data', 'sst_anomaly']
ds_netcdf = xr.open_mfdataset(fileObjs, parallel=True, combine='by_coords', mask_and_scale=False, drop_variables=drop_vars)
ds_netcdf

starting file: 20160101090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc
ending file: 20161231090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc
CPU times: user 1.89 s, sys: 760 ms, total: 2.65 s
Wall time: 8.7 s


In [12]:
%%time
print(ds_netcdf.where(ds_netcdf.analysed_sst != -32768).analysed_sst[356:366,:,:].sel(lat=slice(40,50),lon=slice(-170,-160)).mean().values)
ds_netcdf.where(ds_netcdf.analysed_sst != -32768).analysed_sst.sel(time=time_slice).mean().values

-17803.61192264279
CPU times: user 973 ms, sys: 155 ms, total: 1.13 s
Wall time: 38.9 s


array(-11290.59658304)

In [13]:
%%time
print(ds_zarr.where(ds_zarr.analysed_sst != -32768).analysed_sst[5318:5328,:,:].sel(lat=slice(40,50),lon=slice(-170,-160)).mean().values)
ds_zarr.where(ds_zarr.analysed_sst != -32768).analysed_sst.sel(time=time_slice).mean().values

-17803.61192264279




CPU times: user 10.8 s, sys: 481 ms, total: 11.3 s
Wall time: 28.1 s


array(-11290.59658304)

In [None]:
import s3fs

s3 = s3fs.S3FileSystem(anon=True, client_kwargs=dict(region_name='us-east-1'))
s3_store = s3fs.S3Map(root='nasa-eodc/eodc/mursst_zarr/5x1799x3600', s3=s3, check=False)
ds_zarr = xr.open_zarr(s3_store, consolidated=True, mask_and_scale=False)

In [None]:
%%time
print(ds_zarr.where(ds_zarr.analysed_sst != -32768).analysed_sst[4577:4597,:,:].sel(lat=slice(40,50),lon=slice(-170,-160)).mean().values)
ds_zarr.where(ds_zarr.analysed_sst != -32768).analysed_sst.sel(time=time_slice).mean().values

In [14]:
!ls /s3fsx/eodc/mursst_netcdf/2017/**/*.nc | wc -l

11
