# Combine and split archive data

This script takes the zarr archives for the states and forcings, processed with *data_02_nextsim_to_zarr.ipynb*, and writes them into the *train.zarr*, *validation.zarr*, and *test.zarr* archives to make them ready for training.

The *archives_path* specifies where the data to be loaded is stored and *store_path* specifies where the data will be written. Reading and writing are performed with dask.

In [None]:
# Output directory that receives the train/validation/test zarr stores.
store_path = "../data/train_data/"

# Input directory that holds the pre-processed zarr archives.
# NOTE: the value below is a placeholder and has to be set to the real
# location before the notebook can be run.
archives_path = "*** MISSING ***"

# Import and define cluster for processing

In [None]:
from itertools import product
import xarray as xr
import numpy as np
from distributed import LocalCluster, Client
from zarr.storage import DirectoryStore, ZipStore
import zarr
import shutil
from numcodecs import Blosc
from tqdm.notebook import tqdm

import cartopy.crs as ccrs

In [None]:
# Start a local dask cluster for the read/write work: 32 workers with one
# thread each, spilling scratch data to /tmp. According to the dask
# documentation the memory limit applies per worker.
cluster = LocalCluster(
    n_workers=32,
    threads_per_worker=1,
    memory_limit="8 GB",
    local_directory="/tmp",
)
# Connect a client to the cluster; the bare expression at the end displays
# the client summary as the cell output.
client = Client(cluster)
client

# Load neXtSIM data

In [None]:
# Date ranges (start, stop) of the training, validation, and test periods.
# They are used further down as label slices over the "time" coordinate.
train_slice, val_slice, test_slice = (
    slice(first_day, last_day)
    for first_day, last_day in (
        ("1995-01-01", "2014-12-31"),
        ("2015-01-01", "2015-12-31"),
        ("2016-01-01", "2018-12-31"),
    )
)

In [None]:
# Glob pattern matching every archive to combine below ``archives_path``.
archive_pattern = f"{archives_path:s}/*_creg025_v02_*.zarr"

# Lazily open all matching zarr archives as one combined dataset, using the
# consolidated metadata of each archive and opening the archives in parallel.
dataset = xr.open_mfdataset(
    archive_pattern,
    engine="zarr",
    consolidated=True,
    parallel=True,
)

In [None]:
# Variables that are stacked into the datacube, in the order in which they
# appear along the new "var_names" dimension.
data_vars = [
    "sit", "sic", "sid", "siu", "siv", "snt",
    "tus", "huss", "uas", "vas", "rhus",
    "pdd_month", "fdd_month", "pdd_year", "fdd_year",
]

# Build a single ("time", "var_names", "grid") array from the selected
# variables: one chunk per time step that holds all variables and the full
# grid, without the latitude/longitude variables, stored as "datacube".
dataset = (
    dataset[data_vars]
    .to_dataarray("var_names")
    .chunk({"time": 1, "var_names": -1, "grid": -1})
    .transpose("time", "var_names", "grid")
    .drop_vars(["latitude", "longitude"])
    .to_dataset(name="datacube")
)

In [None]:
# Convert every object-typed coordinate into a unicode string array.
# The names are snapshotted first because entries are reassigned in the loop.
for coord_name in list(dataset.coords.keys()):
    if dataset.coords[coord_name].dtype != object:
        continue
    dataset.coords[coord_name] = dataset.coords[coord_name].astype("unicode")

# Same conversion for all remaining object-typed variables of the dataset.
for var_name in list(dataset.variables.keys()):
    if dataset[var_name].dtype != object:
        continue
    dataset[var_name] = dataset[var_name].astype("unicode")

# Write datasets

In [None]:
# NOTE(review): the compressor is instantiated but not referenced by the
# encoding below, so it has no influence on what is written.
compressor = Blosc()

# One (empty) encoding entry per data variable, i.e. no per-variable
# encoding options are overridden when writing.
encoding = {var_name: {} for var_name in dataset.data_vars}

In [None]:
# Select the training period and write it to a fresh store (mode "w"
# replaces an existing one) with consolidated metadata. The return value is
# bound to ``_`` so that it is not echoed as the cell output.
train_subset = dataset.sel(time=train_slice)
_ = train_subset.to_zarr(
    f"{store_path:s}/train.zarr",
    mode="w",
    consolidated=True,
    encoding=encoding,
)

In [None]:
# Select the validation period and write it to a fresh store (mode "w"
# replaces an existing one) with consolidated metadata. The return value is
# bound to ``_`` so that it is not echoed as the cell output.
validation_subset = dataset.sel(time=val_slice)
_ = validation_subset.to_zarr(
    f"{store_path:s}/validation.zarr",
    mode="w",
    consolidated=True,
    encoding=encoding,
)

In [None]:
# Select the test period and write it to a fresh store (mode "w" replaces an
# existing one) with consolidated metadata. As in the other write cells, the
# return value is bound to ``_`` so that the returned store object is not
# echoed as the cell output.
_ = dataset.sel(time=test_slice).to_zarr(
    f"{store_path:s}/test.zarr", mode="w",
    encoding=encoding, consolidated=True
)