# Convert to Zarr files

In [None]:
import dask.bag as db
import dask

In [None]:
import xarray as xr
import fsspec

In [None]:
import s3fs
# Non-anonymous (anon=False) S3 filesystem handle; reused below both for
# listing the reference files and for building the output store.
s3 = s3fs.S3FileSystem(anon=False)

In [None]:
# Build a {year: [reference URLs]} lookup for 1992..2020 by globbing the
# refs/ prefix of the bucket once per year.
# NOTE: this is a plain module-level loop, so `year` stays bound to the last
# value (2020) after it finishes.
references = {}
for year in range(1992, 2021):
    matched_keys = s3.glob(f'imos-data-pixeldrill/refs/{year}*')
    references[year] = [f's3://{key}' for key in matched_keys]

In [None]:
def open_zarr(r):
    """Open one reference file as an xarray dataset.

    Args:
        r: Reference location, passed to fsspec's ``reference://``
            filesystem as ``fo``.

    Returns:
        The dataset returned by ``xr.open_zarr`` for the reference mapper,
        opened with ``consolidated=False``.
    """
    # Options forwarded to the remote (S3) filesystem: anonymous access with
    # the instance cache and the listings cache both switched off.
    s3_options = {
        'anon': True,
        'skip_instance_cache': True,
        'use_listings_cache': False,
    }
    mapper = fsspec.get_mapper(
        "reference://",
        fo=r,
        remote_protocol='s3',
        remote_options=s3_options,
    )
    return xr.open_zarr(mapper, consolidated=False)

In [None]:
from dask_cloudprovider.aws import FargateCluster
# Start the Dask cluster on AWS Fargate from the pixeldrill image, with the
# IAM task role and scheduler sizing spelled out one option per line.
cluster = FargateCluster(
    image="ghcr.io/aodn/pixeldrill:main",
    task_role_arn="arn:aws:iam::615645230945:role/ManualDaskZarrCreation",
    scheduler_timeout="60 minutes",
    scheduler_cpu=2048,
    scheduler_mem=16384,
)

In [None]:
# Enable adaptive scaling on the cluster, bounded by minimum=1 and maximum=20.
cluster.adapt(minimum=1,maximum=20)

In [None]:
%%time
from dask.distributed import Client
# Connect a distributed client to `cluster`; later cells submit their work
# through this `client`.
client = Client(cluster)
# Render the client summary in the notebook output.
display(client)

In [None]:
# Year to convert. Bind it explicitly before loading: the loop that builds
# `references` leaves `year` set to its last value (2020), so relying on the
# leftover name would load a different year than the one the output store
# path is built from. Keep this in sync with the year used for the store.
year = 2017

# Index directly so an unknown year raises KeyError instead of handing None
# to the loop below.
year_refs = references[year]

# Open every reference file on the cluster, then pull the datasets back to
# this process in one gather call.
futures = [client.submit(open_zarr, ref) for ref in year_refs]
zarrs = client.gather(futures)

# Variables excluded from the output; errors='ignore' tolerates datasets that
# do not carry all of them.
unused_vars = [
    'sea_ice_fraction',
    'sea_ice_fraction_dtime_from_sst',
    'sst_dtime',
    'sst_count',
    'sst_mean',
    'sst_standard_deviation',
    'wind_speed',
    'wind_speed_dtime_from_sst',
]
zarrs = [z.drop_vars(unused_vars, errors='ignore') for z in zarrs]

# Stack the per-file datasets along time.
# NOTE(review): fill_value='' is an empty string, which looks unusual for
# numeric fields; left unchanged here, confirm it is intentional.
ds = xr.concat(
    zarrs,
    dim='time',
    coords='minimal',
    compat='override',
    combine_attrs='override',
    fill_value='',
)

In [None]:
# Year used in the output path; it has to be the same year whose references
# were concatenated into `ds`.
year = 2017

# Rechunk the combined dataset with a requested chunk size of 64.
chunked = ds.chunk(chunks=64)

# Reset the per-variable encoding on every data variable before writing.
for var in chunked.data_vars:
    chunked[var].encoding = {}

# The output folder key has to already exist in the S3 bucket.
output_root = f's3://imos-data-pixeldrill/FOLDER/{year}/'
store = s3fs.S3Map(root=output_root, s3=s3, check=False)

# compute=False only builds the write; it is executed by a later
# `delayed.compute()` call.
delayed = chunked.to_zarr(store, mode='w', compute=False)

In [None]:
%%time
# Run the deferred Zarr write that was prepared with compute=False.
computed = delayed.compute()