# Convert to Zarr files

In [1]:
import dask.bag as db
import dask
from dask.diagnostics import ProgressBar
from dask.diagnostics import Profiler, ResourceProfiler, CacheProfiler

In [2]:
import xarray as xr
import fsspec
import glob
#xr.set_options(display_style='text');

In [3]:
from bokeh.io import output_notebook, show
# Configure Bokeh so that its output is rendered inline in notebook cells.
output_notebook()

In [4]:
# Collect the 2020 reference JSON files under ./refs, sorted so the order is deterministic.
references = sorted(glob.glob('./refs/2020*.json'))
# Show how many reference files were found.
len(references)

326

In [5]:
def open_zarr(r):
    """Open the dataset described by one reference file.

    Parameters
    ----------
    r : str
        Path to a reference file, handed to fsspec's ``reference://``
        filesystem as ``fo``.

    Returns
    -------
    xarray.Dataset
        The result of ``xr.open_zarr`` on the reference mapper.
    """
    # Options forwarded to the remote (S3) filesystem: anonymous access,
    # with the two fsspec cache switches set as in the original call.
    s3_options = {
        'anon': True,
        'skip_instance_cache': True,
        'use_listings_cache': False,
    }
    mapper = fsspec.get_mapper(
        "reference://",
        fo=r,
        remote_protocol='s3',
        remote_options=s3_options,
    )
    return xr.open_zarr(mapper)

In [6]:
# Put the reference paths in a dask bag and map open_zarr over them;
# nothing is opened until .compute() is called below.
b = db.from_sequence(references, npartitions=200)
zarrs = b.map(open_zarr)

# Open everything with the multiprocessing scheduler, recording resource
# usage in `prof` and showing a progress bar while it runs.
with ResourceProfiler() as prof, dask.config.set(scheduler='processes'), ProgressBar():
    zarrs = zarrs.compute()

[########################################] | 100% Completed | 31.7s


A local Dask distributed cluster is no good here because it runs out of memory during the conversion (later on in this notebook). A proper cluster would be nice to make the conversion happen faster.

In [7]:
# Drop these variables from every dataset; errors='ignore' means datasets
# that lack any of them are left as they are.
zarrs = [
    dataset.drop_vars(
        ['sea_ice_fraction', 'sea_ice_fraction_dtime_from_sst', 'sst_dtime'],
        errors='ignore',
    )
    for dataset in zarrs
]

In [8]:
## %%time
# Concatenate all opened datasets along the 'time' dimension into one dataset.
# coords='minimal' with compat='override' and combine_attrs='override' are passed
# so that coordinates/attributes are not compared across all the inputs.
# NOTE(review): fill_value='' would only matter if alignment has to introduce
# missing values; an empty string looks odd for float32 data — confirm it is intended.
ds = xr.concat(zarrs,dim='time',coords='minimal',compat='override',combine_attrs='override', fill_value='')

In [9]:
# Rechunk with a block size of 64 along every dimension
# (the repr below reports (64, 64, 64) chunks of about 1.05 MB each).
chunked = ds.chunk(chunks=64)

In [10]:
# Display one variable to check the resulting shape and chunk layout.
chunked.sea_surface_temperature

Unnamed: 0,Array,Chunk
Bytes,35.21 GB,1.05 MB
Shape,"(326, 4500, 6000)","(64, 64, 64)"
Count,92489 Tasks,40044 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 35.21 GB 1.05 MB Shape (326, 4500, 6000) (64, 64, 64) Count 92489 Tasks 40044 Chunks Type float32 numpy.ndarray",6000  4500  326,

Unnamed: 0,Array,Chunk
Bytes,35.21 GB,1.05 MB
Shape,"(326, 4500, 6000)","(64, 64, 64)"
Count,92489 Tasks,40044 Chunks
Type,float32,numpy.ndarray


## Convert To Zarr

In [11]:
%%time
# zarrs[0].sses_count.encoding
# Reset the encoding of every data variable to an empty dict before writing.
# NOTE(review): presumably this stops encoding carried over from the source
# stores (e.g. their chunk sizes) from conflicting with the new chunking — confirm.
for var in chunked.data_vars:
    chunked[var].encoding = {}

CPU times: user 297 µs, sys: 10 µs, total: 307 µs
Wall time: 310 µs


In [12]:
%%time

import s3fs

# anon=False: connect to S3 with credentials rather than anonymously.
s3 = s3fs.S3FileSystem(anon=False)

# The output folder key has to already exist in the S3 bucket.
# (Plain string literal: the original f-string had no placeholders.)
store = s3fs.S3Map(root='s3://imos-data-pixeldrill/zarr_output_folder/', s3=s3, check=False)

# consolidated=True also writes a single consolidated metadata key, so readers
# can open the store without listing and fetching every per-array metadata
# object from S3. This is the long-running step (about two hours in the
# recorded run).
chunked.to_zarr(store, consolidated=True)

CPU times: user 1h 16min 26s, sys: 7min 59s, total: 1h 24min 26s
Wall time: 2h 2min 57s


<xarray.backends.zarr.ZarrStore at 0x7f39300fed50>