# Estimate normalization values

This script reads the training dataset written by the *data_03_split_paths.ipynb* script and estimates the normalisation values, which are also stored in *config.yaml*.

The *train_path* variable points to the training dataset from which the normalisation values are estimated.

This script uses a __dask cluster__ to process the data efficiently. The cluster settings have to be tuned to your own system.

In [None]:
# Path to the training dataset (opened further below with the zarr engine);
# all normalisation statistics in this script are estimated from this store.
train_path = "../data/train_data/train.zarr"

# Import and define cluster for processing

In [None]:
import glob

import xarray as xr
import numpy as np

from distributed import LocalCluster, Client

In [None]:
# Spin up a local dask cluster and attach a client to it.
# The scratch directory and the dashboard port are hard-coded here and
# have to be adapted to the machine this script runs on.
cluster = LocalCluster(
    dashboard_address=":4356",
    local_directory="/tmp",
)
client = Client(cluster)
# Bare expression: displays the client summary as the cell output.
client

In [None]:
# Open the training store through the zarr engine and keep only the
# "datacube" variable; every statistic below is derived from `ds`.
ds = xr.open_mfdataset(
    train_path,
    engine="zarr",
    parallel=True,
)["datacube"]

## General

In [None]:
# Average over the "time" and "grid" dimensions, leaving one value per
# entry of the remaining dimension(s); `.compute()` evaluates the result.
mean = (
    ds
    .mean(dim=["time", "grid"])
    .compute()
)

In [None]:
# Sample standard deviation (ddof=1) over the "time" and "grid"
# dimensions, reduced the same way as the mean.
std = (
    ds
    .std(dim=["time", "grid"], ddof=1)
    .compute()
)

# Dynamics

In [None]:
# Difference between the data two steps ahead along "time" and the current
# data: shift(time=-2) moves the value from t+2 to position t, so the last
# two time steps of the difference hold fill values (NaN) instead of data.
# Afterwards only the first six entries along the second axis are kept.
# NOTE(review): the step of 2 and the count of 6 are hard-coded; presumably
# they match the lead time and the number of state variables -- confirm.
dynamics = (ds.shift(time=-2) - ds)[:, :6]

In [None]:
# Sample standard deviation (ddof=1) of the dynamics over the "time" and
# "grid" dimensions.
dyn_std = (
    dynamics
    .std(dim=["time", "grid"], ddof=1)
    .compute()
)

In [None]:
# Print the results as comma-separated rows: first the variable names,
# then the mean, the standard deviation and the standard deviation of the
# dynamics, each formatted with four decimals.
print(", ".join(f"{name:s}" for name in mean.var_names.values))
for stats in (mean, std, dyn_std):
    print(", ".join(f"{value:.4f}" for value in stats.round(4).values))

sit, sic, sid, siu, siv, snt, tus, huss, uas, vas, rhus, pdd_month, fdd_month, pdd_year, fdd_year
0.8760, 0.4922, 0.3684, -0.0025, -0.0079, 0.0766, 268.4102, 0.0030, 0.4186, -0.3956, 81.1145, 3.7189, 6.6620, 3.7086, 6.6906
1.0540, 0.4746, 0.4307, 0.0621, 0.0653, 0.1283, 12.8402, 0.0025, 5.4348, 5.4350, 10.3725, 4.9611, 8.9729, 4.2826, 5.1771
1.3705, 0.6838, 0.5668, 0.0621, 0.0658, 0.1495, 268.7171, 0.0039, 5.4509, 5.4494, 81.7750, 6.2003, 11.1757, 5.6652, 8.4597
1.8334, 0.9147, 0.7582, 0.0831, 0.0880, 0.1999, 260.6847, 0.0021, 4.6308, 4.6288, 82.3110, 1.2776, 14.9133, 1.4096, 10.7739
0.1102, 0.0432, 0.0709, 0.0547, 0.0547, 0.0154
