In [None]:
import warnings
import numpy as np
import pandas as pd
import xarray as xr
import fsspec

# Silence every warning so library chatter does not clutter the cell outputs.
warnings.simplefilter(action='ignore')
# Ask xarray to render datasets with its HTML representation.
xr.set_options(display_style='html')

In [None]:
# Reference granule: the same-named file reached over OPeNDAP and as a local copy.
file_opendap = (
    'https://podaac-opendap.jpl.nasa.gov/opendap/allData/ghrsst/data/GDS2/L4/GLOB/JPL/MUR/v4.1'
    '/2015/001/20150101090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc'
)
file_local = './../../data/20150101090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc'

# Directory that receives the zarr stores written by the tests in this notebook.
dir_out = './../../data/zarr_testing/'

# The two zarr stores on S3 that are compared throughout ("zarr-v1" and "zarr").
file_aws = 'https://mur-sst.s3.us-west-2.amazonaws.com/zarr-v1'
file_aws2 = 'https://mur-sst.s3.us-west-2.amazonaws.com/zarr'

# Output stores, one per experiment.
fout1 = f'{dir_out}test_normalread_v1.zarr'
fout2 = f'{dir_out}test_rawread_v1.zarr'
fout3 = f'{dir_out}test_subset_v1.zarr'

In [None]:
%%time
ds_aws = xr.open_zarr(file_aws,consolidated=True)
#ds_aws

In [None]:
%%time
ds_aws2 = xr.open_zarr(file_aws2,consolidated=True)
#ds_aws2

In [None]:
# Open the reference granule over OPeNDAP.
ds_odap = xr.open_dataset(filename_or_obj=file_opendap)
#ds_odap

In [None]:
# Open the local copy of the reference granule.
ds_local = xr.open_dataset(filename_or_obj=file_local)
#ds_local

In [None]:
#ds_local.analysed_sst.sel(lat=slice(20,30),lon=slice(-120,-100)).isel(time=0).plot()

In [None]:
# Re-declare both S3 store URLs and re-open the stores before comparing values.
file_aws = 'https://mur-sst.s3.us-west-2.amazonaws.com/zarr-v1'
file_aws2 = 'https://mur-sst.s3.us-west-2.amazonaws.com/zarr'
ds_aws = xr.open_zarr(store=file_aws, consolidated=True)
ds_aws2 = xr.open_zarr(store=file_aws2, consolidated=True)

# print out value near land
xlat = 24
xlon = -100
date = '2015-01-01'
point = {'time': date, 'lat': xlat, 'lon': xlon}
#print('local file',ds_local.analysed_sst.sel(time=date,lat=xlat,lon=xlon).isel(time=0).data)
print('opendap file', ds_odap.analysed_sst.sel(point).data)
# Both zarr-backed datasets are computed explicitly before their value is printed.
for label, store_ds in (('aws v1', ds_aws), ('aws v2', ds_aws2)):
    print(label, store_ds.analysed_sst.sel(point).compute().data)

In [7]:
# print out value near land
xlat = 24
xlon = -100
date = '2015-01-01'
#print('local file',ds_local.analysed_sst.sel(time=date,lat=xlat,lon=xlon).isel(time=0).data)
# Same point from each source: OPeNDAP granule first, then the two S3 zarr stores.
sst_odap = ds_odap.analysed_sst.sel({'time': date, 'lat': xlat, 'lon': xlon})
print('opendap file', sst_odap.data)
sst_v1 = ds_aws.analysed_sst.sel({'time': date, 'lat': xlat, 'lon': xlon})
print('aws v1', sst_v1.compute().data)
sst_v2 = ds_aws2.analysed_sst.sel({'time': date, 'lat': xlat, 'lon': xlon})
print('aws v2', sst_v2.compute().data)

opendap file nan
aws v1 nan
aws v2 265.382


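AWS v2 is the re-chunked version of the zarr data, calculated using Ryan's rechunker library.
For some reason the fill_value entry in its .json metadata was changed (the fill value is -32768).
It looks like the xarray/zarr library is no longer masking that fill value but is still applying the scale_factor (0.001) and add_offset (298.15) to it, because the result isn't NaN: -32768 × 0.001 + 298.15 = 265.382, which is exactly the value printed above for v2.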

## Can we recreate rechunker issue using a smaller version of the data?

In [None]:
%%time
ds_aws = xr.open_zarr(file_aws,consolidated=True)
#ds_aws = ds_aws.isel(time=slice(0,10),lat=slice(1000,3000),lon=slice(0,1000))
ds_aws2 = xr.open_zarr(file_aws2,consolidated=True)
#ds_aws2 = ds_aws2.isel(time=slice(0,10),lat=slice(1000,3000),lon=slice(0,1000))
print('examine chunking in arrays')
print('v1',ds_aws.analysed_sst.data)
print('v2',ds_aws2.analysed_sst.data)

# print out value near land
xlat = 24
xlon = -100
date = '2015-01-01'
print('aws v1',ds_aws.analysed_sst.sel(time=date,lat=xlat,lon=xlon).compute().data)
print('aws v2',ds_aws2.analysed_sst.sel(time=date,lat=xlat,lon=xlon).compute().data)

In [None]:
#ds_aws.analysed_sst[0,6000:7000,19000:20000].plot()

## test creating new zarr data store by 
- reading in zarr store
- rechunking
- writing out
- reading it back in from the new store
- outputting a data point

# testing aimee rechunker code

In [None]:
# create a subset of the data to test rechunker
# Steps: open the v1 store, cut out a 20 x 1000 x 1000 (time, lat, lon) window,
# write it to the local store fout3, then read that store back for comparison.
ds_aws = xr.open_zarr(file_aws,consolidated=True) #read in data
ds_aws = ds_aws.isel(time=slice(0,20),lat=slice(6000,7000),lon=slice(19000,20000)) #subset to reasonable size
# mode='w' replaces a store left behind by an earlier run. With the default mode
# to_zarr refuses to write into an existing store, so re-running this cell failed.
ds_aws.to_zarr(fout3,mode='w',consolidated=True)  #output data
ds_aws_subset = xr.open_zarr(fout3,consolidated=True) #test read back in
# print out value near land
xlat = -26
xlon = 18
date = '2002-06-01'
# The date string is followed by .isel(time=0) to reduce the time selection to a single step.
# 'v1' is the in-memory subset, 'v2' the full second S3 store, 'new' the store just written.
print('v1',ds_aws.analysed_sst.sel(time=date,lat=xlat,lon=xlon).isel(time=0).compute().data)
print('v2',ds_aws2.analysed_sst.sel(time=date,lat=xlat,lon=xlon).isel(time=0).compute().data)
print('new',ds_aws_subset.analysed_sst.sel(time=date,lat=xlat,lon=xlon).isel(time=0).compute().data)

In [None]:
from rechunker import rechunk
import s3fs
#import xarray as xr
import zarr
import dask.array as dsa
import shutil
from dask.diagnostics import ProgressBar
import numpy as np
import xarray as xr
import os
import fsspec
import pandas as pd

# Scratch location taken from the environment; a KeyError is raised here if
# PANGEO_SCRATCH is not set. The value is printed so the reader can see it.
PANGEO_SCRATCH = os.environ['PANGEO_SCRATCH']
print(PANGEO_SCRATCH)
# Unused alternative: an fsspec mapper rooted under the scratch location.
#mapper = fsspec.get_mapper(f'{PANGEO_SCRATCH}/data.zarr')

In [None]:
# load the data
# Open the local subset store read-only via its consolidated metadata, then print its layout.
ds_zarr = zarr.open_consolidated(store=fout3, mode='r')
print(zarr.tree(ds_zarr))

In [None]:
#rechunker plan
# Authenticated (anon=False) S3 filesystem for us-west-2, with the fill cache and
# the instance cache both switched off.
s3 = s3fs.S3FileSystem(
    anon=False,
    client_kwargs={'region_name': 'us-west-2'},
    default_fill_cache=False,
    skip_instance_cache=True,
)

# Destination store for the rechunked output, under the scratch location.
s3_rechunk_store = s3fs.S3Map(root=f'{PANGEO_SCRATCH}/new-group_v5.zarr', s3=s3, create=True)

# Note this path must exist in S3 or will raise rechunker assertion, `assert temp_store_or_group is not None`
s3_tmp_store = s3fs.S3Map(root=f'{PANGEO_SCRATCH}/tmp_v5.zarr', s3=s3, create=True)

In [None]:
# Show the keys currently present in the rechunk target store.
list(s3_rechunk_store.keys())

In [None]:
# Target chunking per array: every data variable gets time=20, lat=100, lon=100.
# The coordinate arrays map to None -- presumably meaning "leave as is"; confirm
# against the rechunker documentation.
target_chunks = {
    'analysed_sst': {'time': 20, 'lat': 100, 'lon': 100},
    'analysis_error': {'time': 20, 'lat': 100, 'lon': 100},
    'mask': {'time': 20, 'lat': 100, 'lon': 100},
    'sea_ice_fraction': {'time': 20, 'lat': 100, 'lon': 100},
    'lat': None,
    'lon': None,
    'time': None
}
# Memory limit handed to rechunker for the plan.
max_mem = '1GB'

# The stores are passed by keyword: in rechunker releases whose signature has
# `target_options` before `temp_store`, a positional fifth argument would be taken
# as the options and the temp store would be silently dropped.
array_plan = rechunk(
    ds_zarr,
    target_chunks,
    max_mem,
    target_store=s3_rechunk_store,
    temp_store=s3_tmp_store,
)
array_plan

In [None]:
# Execute the rechunk plan held in array_plan; the dask.diagnostics ProgressBar
# context is active for the duration of the call.
with ProgressBar():
    array_plan.execute()

## looking at xarray .sel round off error that seems to be not selecting all the valid coordinate points

In [None]:
# Show six consecutive latitude values (indices 10043..10048) from the local file.
print(ds_local.lat.isel(lat=slice(10043, 10049)).data)

The results above show that (as expected) the lat coordinate has .01 precision

In [None]:
print('these should all be len=3 since slice is inclusive of first/last points')
# Each case: (lower bound, upper bound, note printed after the values, or None for no note).
len3_cases = (
    (10.46, 10.48, None),
    (10.45, 10.47, 'missing first&last points'),
    (10.43, 10.45, None),
    (10.47, 10.49, None),
    (11.45, 11.47, 'missing first&last points'),
)
for lower, upper, note in len3_cases:
    selected = ds_local.lat.sel(lat=slice(lower, upper)).data
    if note is None:
        print(selected)
    else:
        print(selected, note)

In [None]:
print('these should all be len=5 since slice is inclusive of first/last points')
# Each case: (lower bound, upper bound, note printed after the values, or None for no note).
len5_cases = (
    (11.46, 11.50, None),
    (11.45, 11.49, 'missing first point'),
    (11.43, 11.47, 'missing last point'),
    (11.41, 11.45, 'missing first point'),
)
for lower, upper, note in len5_cases:
    selected = ds_local.lat.sel(lat=slice(lower, upper)).data
    if note is None:
        print(selected)
    else:
        print(selected, note)

In [None]:
#put in exact lats and test
# Swap the file's lat coordinate for values generated with np.arange, then repeat the len=5 checks.
ds_local = xr.open_dataset(filename_or_obj=file_local)
# NOTE(review): np.arange with a float step can come out one element long or short;
# presumably assign_coords fails if the length differs from the lat dimension -- confirm.
test_lat = np.arange(-89.99, 90, .01)
ds_test = ds_local.copy(deep=True).assign_coords(lat=test_lat)
print('these should all be len=5 since slice is inclusive of first/last points')
for lower, upper in ((11.46, 11.50), (11.45, 11.49), (11.43, 11.47), (11.41, 11.45)):
    print(ds_test.lat.sel(lat=slice(lower, upper)).data, 'missing last point')

In [None]:
#put in exact lats and test
# Control case: re-assign the file's own lat values as the coordinate and repeat the len=5 checks.
ds_local = xr.open_dataset(filename_or_obj=file_local)
ds_test = ds_local.copy(deep=True).assign_coords(lat=ds_local.lat.data)
print('these should all be len=5 since slice is inclusive of first/last points')
# Each case: (lower bound, upper bound, note printed after the values, or None for no note).
reassigned_cases = (
    (11.46, 11.50, None),
    (11.45, 11.49, 'missing first point'),
    (11.43, 11.47, 'missing last point'),
    (11.41, 11.45, 'missing first point'),
)
for lower, upper, note in reassigned_cases:
    selected = ds_test.lat.sel(lat=slice(lower, upper)).data
    if note is None:
        print(selected)
    else:
        print(selected, note)

In [None]:
# test by reading REGULAR 
#first remove temp files
!rm -r ../../data/zarr_testing/*.*
#now make temp rechunked file
ds_aws = xr.open_zarr(file_aws,consolidated=True) #read in data
ds_aws = ds_aws.isel(time=slice(0,20),lat=slice(6000,7000),lon=slice(19000,20000)) #subset to reasonable size
ds_aws_rechunked = ds_aws.chunk({'time':20,'lat':100,'lon':100}).compute() #rechunk data
ds_aws_rechunked.to_zarr(fout1,consolidated=True)  #output data
ds_aws3 = xr.open_zarr(fout1,consolidated=True) #test read back in
# print out value near land
xlat = -26
xlon = 18
date = '2002-06-01'
print('v1',ds_aws.analysed_sst.sel(time=date,lat=xlat,lon=xlon).isel(time=0).compute().data)
print('v2',ds_aws2.analysed_sst.sel(time=date,lat=xlat,lon=xlon).isel(time=0).compute().data)
print('new',ds_aws3.analysed_sst.sel(time=date,lat=xlat,lon=xlon).isel(time=0).compute().data)