In [1]:
from dask_jobqueue import SLURMCluster
from kerchunk.hdf import SingleHdf5ToZarr 
from kerchunk.combine import MultiZarrToZarr
from dask.distributed import Client
import dask

  from distributed.utils import tmpfile


In [2]:
import xarray as xr
import numpy as np
import matplotlib.pyplot as plt
import s3fs
import datetime as dt
import zipfile
import logging
import fsspec
import ujson
from tqdm import tqdm
from glob import glob
import os

In [3]:
# Describe the SLURM jobs that back the dask workers: each job is submitted to
# the "seseml" queue with 10 cores and 10GB of memory, runs a single worker
# process, and has a 2h30 walltime.
# NOTE(review): the scheduler host/port and dashboard port are hard-coded for
# one specific machine; construction fails if they are unavailable there.
cluster = SLURMCluster(
    queue="seseml",
    memory="10GB",
    cores=10,
    processes=1,
    walltime="02:30:00",
    scheduler_options=dict(
        host="172.22.179.3:7333",
        dashboard_address="7999",
    ),
)

Perhaps you already have a cluster running?
Hosting the HTTP server on port 36063 instead


ValueError: cannot get address of non-running Server

In [None]:
# Ask the cluster to scale to 8.
# NOTE(review): presumably 8 workers (one per SLURM job, given processes=1 in
# the cluster definition) -- confirm against the installed dask-jobqueue version.
cluster.scale(8)

In [None]:
# Last expression of the cell: display the cluster object's notebook repr.
cluster

In [None]:
# Connect a distributed Client to the cluster created above; later
# dask.compute / parallel=True calls in this notebook rely on it being active.
client = Client(cluster)

In [None]:
# Last expression of the cell: display the client object's notebook repr.
client

In [None]:
# Anonymous S3 filesystem handle used to list the source files.
fs = fsspec.filesystem("s3", anon=True)

# List the .nc objects matching the pattern below and prepend "s3://" to each
# result so that every entry is a full URL.
urls = [
    f"s3://{key}"
    for key in fs.glob("s3://noaa-goes16/ABI-L2-SSTF/2020/210/*/*.nc")
]

In [None]:
def gen_json(u, out_dir="jsons"):
    """Write a kerchunk reference JSON for one remote HDF5/NetCDF4 file.

    The file at ``u`` is opened anonymously through fsspec with caching
    disabled, scanned with ``SingleHdf5ToZarr``, and the resulting reference
    set is serialized to ``<out_dir>/<basename of u>.json``.

    Parameters
    ----------
    u : str
        URL of the source file. Its last ``/``-separated component is used as
        the stem of the output file name.
    out_dir : str, optional
        Directory that receives the JSON file. It must already exist.
        Defaults to ``"jsons"``, matching the directory created elsewhere in
        this notebook.

    Returns
    -------
    None
        The function is called for its side effect (the file on disk).
    """
    so = dict(
        mode="rb", anon=True, default_fill_cache=False, default_cache_type="none"
    )
    with fsspec.open(u, **so) as inf:
        # NOTE(review): per the kerchunk docs, chunks smaller than
        # inline_threshold bytes are embedded directly in the references.
        h5chunks = SingleHdf5ToZarr(inf, u, inline_threshold=300)
        # Translate while the source file is still open, and BEFORE the output
        # file is created: if translation raises, no empty/truncated JSON is
        # left behind for the later glob + ujson.load steps to trip over.
        refs = h5chunks.translate()

    out_path = f"{out_dir}/{u.split('/')[-1]}.json"
    with open(out_path, 'wb') as outf:
        outf.write(ujson.dumps(refs).encode())

In [None]:
# Create the jsons/ folder if it doesn't already exist (exist_ok=True makes
# re-running this cell harmless). gen_json writes its output files there.
import pathlib
pathlib.Path('./jsons/').mkdir(exist_ok=True)

In [None]:
# Wrap one gen_json call per URL in dask.delayed and run them all in a single
# compute; the results are discarded because gen_json only writes files.
_ = dask.compute(*(dask.delayed(gen_json)(url) for url in urls))

In [None]:
# Collect the reference files found under ./jsons/ and put them in
# lexicographic order.
json_list = glob("./jsons/*.json")
json_list.sort()

In [None]:
# Build one "reference://" mapper per JSON file, in json_list order. Each
# mapper is created from the parsed reference dict and reads the underlying
# data from S3 anonymously.
m_list = []
for j in tqdm(json_list):
    with open(j) as f:
        mapper = fsspec.get_mapper(
            "reference://",
            fo=ujson.load(f),
            remote_protocol="s3",
            remote_options={"anon": True},
        )
    m_list.append(mapper)

In [None]:
%%time
# Open all reference mappers in m_list as a single dataset through the zarr
# engine, concatenating along 't' (combine='nested', concat_dim='t').
# NOTE(review): coords='minimal', data_vars='minimal' and compat='override'
# presumably avoid comparing/concatenating variables that lack the 't'
# dimension -- confirm against the xarray open_mfdataset documentation.
# NOTE(review): parallel=True presumably relies on dask for the per-file
# opens -- confirm.
ds = xr.open_mfdataset(m_list, engine='zarr', combine='nested', concat_dim='t', 
                        coords='minimal', data_vars='minimal', compat='override',
                        parallel=True)
# Last expression of the cell: display the dataset repr.
ds

In [None]:
# Combine the per-file reference JSONs into one MultiZarrToZarr object.
# Every xarray decoding step is switched off when the inputs are opened, and
# the inputs are concatenated along 't' using the minimal/override settings.
# NOTE(review): xarray_open_kwargs / xarray_concat_args belong to an older
# kerchunk MultiZarrToZarr signature -- confirm the installed version accepts them.
mzz = MultiZarrToZarr(
    json_list,
    remote_protocol="s3",
    remote_options=dict(anon=True),
    xarray_open_kwargs=dict(
        decode_cf=False,
        mask_and_scale=False,
        decode_times=False,
        decode_timedelta=False,
        use_cftime=False,
        decode_coords=False,
    ),
    xarray_concat_args=dict(
        data_vars="minimal",
        coords="minimal",
        compat="override",
        join="override",
        combine_attrs="override",
        dim="t",
    ),
)