In [1]:
import warnings
import numpy as np
import pandas as pd
import xarray as xr
import fsspec

# Notebook-wide settings.
# NOTE: simplefilter('ignore') is called with no category, so it suppresses
# every warning raised after this point, not just a selected few.
warnings.simplefilter('ignore') # filter some warning messages
# Ask xarray to use its HTML representation when objects are displayed.
xr.set_options(display_style="html")  #display dataset nicely

<xarray.core.options.set_options at 0x7fe52034f940>

In [2]:
# --- Input locations -------------------------------------------------------
# The same 2015-01-01 granule file name is referenced over OPeNDAP and on disk.
file_opendap = (
    'https://podaac-opendap.jpl.nasa.gov/opendap/allData/ghrsst/data/GDS2/L4/GLOB/JPL/MUR/v4.1/2015/001/'
    '20150101090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc'
)
file_local = (
    './../../data/'
    '20150101090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc'
)

# Two zarr stores hosted in the same S3 bucket ("zarr-v1" and "zarr").
file_aws = 'https://mur-sst.s3.us-west-2.amazonaws.com/zarr-v1'
file_aws2 = 'https://mur-sst.s3.us-west-2.amazonaws.com/zarr'

# --- Output locations ------------------------------------------------------
# Scratch directory for the zarr stores written by the rechunking tests below.
dir_out = './../../data/zarr_testing/'
fout1 = f'{dir_out}test_normalread_v1.zarr'
fout2 = f'{dir_out}test_rawread_v1.zarr'

In [3]:
%%time
ds_aws = xr.open_zarr(file_aws,consolidated=True)
#ds_aws

CPU times: user 1.57 s, sys: 99 ms, total: 1.67 s
Wall time: 2.03 s


In [4]:
%%time
ds_aws2 = xr.open_zarr(file_aws2,consolidated=True)
#ds_aws2

CPU times: user 638 ms, sys: 43.5 ms, total: 681 ms
Wall time: 802 ms


In [5]:
# Open the single-day granule remotely through the OPeNDAP URL.
ds_odap = xr.open_dataset(filename_or_obj=file_opendap)
# ds_odap   # uncomment to display the dataset

In [6]:
# Open the copy of the same granule stored on the local disk.
ds_local = xr.open_dataset(filename_or_obj=file_local)
# ds_local   # uncomment to display the dataset

In [7]:
#ds_local.analysed_sst.sel(lat=slice(20,30),lon=slice(-120,-100)).isel(time=0).plot()

In [8]:
# Print analysed_sst at one grid point near land from every data source,
# so the four readings can be compared line by line.
xlat = 24
xlon = -100
date = '2015-01-01'
point = dict(time=date, lat=xlat, lon=xlon)

# (label, dataset, needs_compute): the two AWS zarr datasets are explicitly
# computed before their data is read; the other two are read directly.
sources = [
    ('local file', ds_local, False),
    ('opendap file', ds_odap, False),
    ('aws v1', ds_aws, True),
    ('aws v2', ds_aws2, True),
]
for label, ds, needs_compute in sources:
    sst = ds.analysed_sst.sel(**point).isel(time=0)
    if needs_compute:
        sst = sst.compute()
    print(label, sst.data)

local file nan
opendap file nan
aws v1 nan
aws v2 265.382


In [9]:
#ds_local.analysed_sst.sel(lat=slice(20,30),lon=slice(100,120)).isel(time=0).plot()

In [10]:
# Repeat the four-way comparison at a second grid point near land.
xlat = 21
xlon = 101
date = '2015-01-01'


def _point_sst(ds):
    """Select analysed_sst at (date, xlat, xlon) and keep the first time step."""
    return ds.analysed_sst.sel(time=date, lat=xlat, lon=xlon).isel(time=0)


# File-backed datasets: read the selected value directly.
for label, ds in (('local file', ds_local), ('opendap file', ds_odap)):
    print(label, _point_sst(ds).data)

# AWS zarr datasets: compute the selection explicitly before reading it.
for label, ds in (('aws v1', ds_aws), ('aws v2', ds_aws2)):
    print(label, _point_sst(ds).compute().data)

local file nan
opendap file nan
aws v1 nan
aws v2 265.382


## Looking at AWS v1 versus AWS v2

In [11]:
%%time
ds_aws = xr.open_zarr(file_aws,consolidated=True)
#ds_aws = ds_aws.isel(time=slice(0,10),lat=slice(1000,3000),lon=slice(0,1000))
ds_aws2 = xr.open_zarr(file_aws2,consolidated=True)
#ds_aws2 = ds_aws2.isel(time=slice(0,10),lat=slice(1000,3000),lon=slice(0,1000))
print('examine chunking in arrays')
print('v1',ds_aws.analysed_sst.data)
print('v2',ds_aws2.analysed_sst.data)

# print out value near land
xlat = 24
xlon = -100
date = '2015-01-01'
print('aws v1',ds_aws.analysed_sst.sel(time=date,lat=xlat,lon=xlon).compute().data)
print('aws v2',ds_aws2.analysed_sst.sel(time=date,lat=xlat,lon=xlon).compute().data)

examine chunking in arrays
v1 dask.array<xarray-analysed_sst, shape=(6443, 17999, 36000), dtype=float32, chunksize=(5, 1799, 3600), chunktype=numpy.ndarray>
v2 dask.array<xarray-analysed_sst, shape=(6443, 17999, 36000), dtype=float32, chunksize=(6443, 100, 100), chunktype=numpy.ndarray>
aws v1 [nan]
aws v2 [265.382]
CPU times: user 2.16 s, sys: 256 ms, total: 2.41 s
Wall time: 2.77 s


In [12]:
#ds_aws.analysed_sst[0,6000:7000,19000:20000].plot()

## Test creating a new zarr data store by
- reading in the zarr store
- rechunking
- writing out
- reading it back in from the new store
- outputting a data point

In [13]:
# test by reading REGULAR (default CF decoding), rechunking, writing and reading back
ds_aws = xr.open_zarr(file_aws,consolidated=True) #read in data
ds_aws = ds_aws.isel(time=slice(0,10),lat=slice(6000,7000),lon=slice(19000,20000)) #subset to reasonable size

# Rechunk lazily. Do NOT call .compute() here: it loads everything into NumPy
# arrays and throws away the dask chunking that was just requested, so the
# store would not be written with these chunks.
ds_aws_rechunked = ds_aws.chunk({'time':10,'lat':100,'lon':100}) #rechunk data

# Variables read from a zarr store remember the source store's chunk layout in
# .encoding['chunks'], and to_zarr honours that over (or rejects it against)
# the dask chunks. Drop it so the new dask chunks define the on-disk layout.
for var in ds_aws_rechunked.variables.values():
    var.encoding.pop('chunks', None)

# mode='w' overwrites a store left by a previous run, so the cell can be re-run.
ds_aws_rechunked.to_zarr(fout1, mode='w')  #output data
ds_aws3 = xr.open_zarr(fout1) #test read back in

# Confirm the new store really was written with the requested chunking.
assert tuple(ds_aws3.analysed_sst.encoding['chunks']) == (10, 100, 100)

# print out value near land
xlat = -26
xlon = 18
date = '2002-06-01'
print('v1',ds_aws.analysed_sst.sel(time=date,lat=xlat,lon=xlon).isel(time=0).compute().data)
print('v2',ds_aws2.analysed_sst.sel(time=date,lat=xlat,lon=xlon).isel(time=0).compute().data)
print('new',ds_aws3.analysed_sst.sel(time=date,lat=xlat,lon=xlon).isel(time=0).compute().data)

v1 nan
v2 265.382
new nan


In [14]:
# test by reading WITHOUT cf-convention decoding, rechunking, writing and reading back
ds_aws_raw = xr.open_zarr(file_aws,consolidated=True,decode_cf=False, mask_and_scale=False, decode_times=False) #read in data
ds_aws_raw = ds_aws_raw.isel(time=slice(0,10),lat=slice(6000,7000),lon=slice(19000,20000)) #subset to reasonable size

# Rechunk lazily. Do NOT call .compute() here: it loads everything into NumPy
# arrays and throws away the dask chunking that was just requested, so the
# store would not be written with these chunks.
ds_aws_raw_rechunked = ds_aws_raw.chunk({'time':10,'lat':100,'lon':100}) #rechunk data

# Variables read from a zarr store remember the source store's chunk layout in
# .encoding['chunks'], and to_zarr honours that over (or rejects it against)
# the dask chunks. Drop it so the new dask chunks define the on-disk layout.
for var in ds_aws_raw_rechunked.variables.values():
    var.encoding.pop('chunks', None)

# mode='w' overwrites a store left by a previous run, so the cell can be re-run.
ds_aws_raw_rechunked.to_zarr(fout2, mode='w')  #output data
ds_aws4 = xr.open_zarr(fout2) #test read back in (with default decoding)

# Confirm the new store really was written with the requested chunking.
assert tuple(ds_aws4.analysed_sst.encoding['chunks']) == (10, 100, 100)

# print out value near land
xlat = -26
xlon = 18
date = '2002-06-01'
print('v1',ds_aws.analysed_sst.sel(time=date,lat=xlat,lon=xlon).isel(time=0).compute().data)
print('v2',ds_aws2.analysed_sst.sel(time=date,lat=xlat,lon=xlon).isel(time=0).compute().data)
print('new',ds_aws3.analysed_sst.sel(time=date,lat=xlat,lon=xlon).isel(time=0).compute().data)
print('new_raw',ds_aws4.analysed_sst.sel(time=date,lat=xlat,lon=xlon).isel(time=0).compute().data)

v1 nan
v2 265.382
new nan
new_raw nan


## Looking at an xarray `.sel` round-off error that appears to drop some valid coordinate points

In [15]:
# Print six consecutive latitude coordinate values by positional index.
lat_window = ds_local.lat[10043:10049]
print(lat_window.data)

[10.44 10.45 10.46 10.47 10.48 10.49]


The results above show that (as expected) the lat coordinate values are spaced 0.01 apart

In [16]:
print('these should all be len=3 since slice is inclusive of first/last points')

# (start, stop, note): each label-based slice spans three grid values; the note
# is printed after the result for the cases that come back short.
len3_cases = [
    (10.46, 10.48, ()),
    (10.45, 10.47, ('missing first&last points',)),
    (10.43, 10.45, ()),
    (10.47, 10.49, ()),
    (11.45, 11.47, ('missing first&last points',)),
]
for start, stop, note in len3_cases:
    selected = ds_local.lat.sel(lat=slice(start, stop))
    print(selected.data, *note)

these should all be len=3 since slice is inclusive of first/last points
[10.46 10.47 10.48]
[10.46] missing first&last points
[10.43 10.44 10.45]
[10.47 10.48 10.49]
[11.46] missing first&last points


In [17]:
print('these should all be len=5 since slice is inclusive of first/last points')

# Map each (start, stop) slice to the note printed after its result;
# None means the slice is expected to return all five points.
len5_cases = {
    (11.46, 11.50): None,
    (11.45, 11.49): 'missing first point',
    (11.43, 11.47): 'missing last point',
    (11.41, 11.45): 'missing first point',
}
for (start, stop), note in len5_cases.items():
    selected = ds_local.lat.sel(lat=slice(start, stop)).data
    if note is None:
        print(selected)
    else:
        print(selected, note)

these should all be len=5 since slice is inclusive of first/last points
[11.46 11.47 11.48 11.49 11.5 ]
[11.46 11.47 11.48 11.49] missing first point
[11.43 11.44 11.45 11.46] missing last point
[11.42 11.43 11.44 11.45] missing first point
