# Benchmark a year pixel drill

In [None]:
import json
import fsspec
import os
import fsspec_reference_maker.hdf as fshdf

In [None]:
import dask.bag as db
import dask
from dask.diagnostics import ProgressBar

In [None]:
# Unsigned (anonymous) S3 filesystem handle.
fs = fsspec.filesystem('s3', anon=True)
# The pattern contains no placeholders, so it is a plain string literal
# rather than an f-string.
all_files = fs.glob('s3://imos-data-pixeldrill/IMOS/SRS/SST/ghrsst/L3S-1d/ngt/*/*.nc')
# Last expression of the cell: shows how many objects matched.
len(all_files)

In [None]:
from fsspec_reference_maker.hdf import SingleHdf5ToZarr
import os
# Local output directory for the generated reference files; the trailing
# slash matters because paths are later built by plain string concatenation.
refdir = "./refs/"
os.makedirs(refdir, exist_ok=True)

In [None]:
def save_refs(s3_fn):
    """Create (or reuse) the JSON reference file for one object on S3.

    Parameters
    ----------
    s3_fn : str
        Object path without the ``s3://`` scheme; the scheme is prepended
        here before the object is opened.

    Returns
    -------
    str
        Path of the JSON file inside ``refdir``, named after the object's
        basename with its extension swapped for ``.json``.
    """
    s3_fn = 's3://' + s3_fn

    # Swap only the trailing extension. A blanket ``.replace('.nc', ...)``
    # would also rewrite any '.nc' that occurs earlier in the file name.
    stem, _ext = os.path.splitext(os.path.basename(s3_fn))
    references = refdir + stem + '.json'

    # An existing file is treated as a finished result and is not rebuilt.
    if os.path.exists(references):
        return references

    with fsspec.open(s3_fn,
                     anon=True,
                     mode='rb',
                     default_fill_cache=False,
                     default_cache_type='none') as f:
        # NOTE(review): the third positional argument is kept exactly as
        # before; confirm its meaning against the installed library version.
        h5chunks = SingleHdf5ToZarr(f, s3_fn, True)
        refs = h5chunks.translate()

    # Write to a side file and rename it into place, so an interrupted run
    # never leaves a truncated JSON that the existence check above would
    # later mistake for a complete one.
    partial = references + '.part'
    with open(partial, 'wt') as out:
        json.dump(refs, out, indent=4)
    os.replace(partial, references)

    return references

In [None]:
# Spread the file list over 200 partitions and map save_refs over every
# entry; the work itself is triggered later by calling .compute().
b = db.from_sequence(all_files, npartitions=200)
references = b.map(save_refs)

In [None]:
# Profilers from dask.diagnostics used to instrument the run below.
from dask.diagnostics import Profiler, ResourceProfiler, CacheProfiler

# One ``with`` statement; the managers are entered left to right and exited
# in reverse, exactly as the equivalent nested form would do.
with dask.config.set(scheduler='threads'), \
        ProgressBar(), \
        Profiler() as prof, \
        ResourceProfiler() as rprof, \
        CacheProfiler() as cprof:
    # Run the mapped save_refs tasks; the returned list of paths is not kept.
    references.compute()

In [None]:
from dask.diagnostics import visualize

# Profiler objects collected during the compute step.
profilers = [prof, rprof, cprof]
# Kept as the final bare expression so the notebook displays the result.
visualize(profilers)