In [1]:
import numpy as np
import xarray as xr
import torch
from torch.utils.data import Dataset, DataLoader

from data.transform import Normalize

# Print mean and std.

def print_ms(x):
    """Print the mean and standard deviation of one or several arrays.

    Args:
        x: Either a single object exposing ``.mean()`` and ``.std()``, or a
            dict mapping names to such objects. A bare (non-dict) object is
            reported under the name ``x``.
    """
    # Wrap a bare input so that both cases share the same printing loop.
    named = x if isinstance(x, dict) else {'x': x}
    for name, values in named.items():
        print(f'{name}: mean={values.mean()}, std={values.std()}')

In [2]:
# Create dummy xarray data.

seq_len = 500
num_sites = 3

# Seed NumPy's global RNG so that a fresh "Restart Kernel -> Run All"
# regenerates exactly the same dummy data (and the same statistics downstream).
np.random.seed(0)

ds = xr.Dataset()
for x in range(3):
    # Random data with different mean (=x)
    ds[f'x{x}'] = xr.DataArray(
        np.random.normal(loc=x, scale=10., size=(seq_len, num_sites)).astype('float32'),
        dims=['time', 'site'],
        coords={'time': range(seq_len), 'site': [f'S{s}' for s in range(num_sites)]},
    )

print(ds)

<xarray.Dataset>
Dimensions:  (site: 3, time: 500)
Coordinates:
  * time     (time) int64 0 1 2 3 4 5 6 7 8 ... 492 493 494 495 496 497 498 499
  * site     (site) <U2 'S0' 'S1' 'S2'
Data variables:
    x0       (time, site) float32 1.09 10.31 5.584 ... -11.04 -23.95 -2.894
    x1       (time, site) float32 19.0 0.7931 -6.42 ... -14.48 1.759 -5.892
    x2       (time, site) float32 1.05 8.91 -15.48 -4.477 ... -1.247 16.42 -30.0


In [3]:
# Split into training and validation set (site 'S0' only): the first half of
# the time axis is used for training, the second half for validation.

# Derive the split point from `seq_len` instead of hard-coding 250 / 500, so the
# two halves stay contiguous and complete if the sequence length is changed.
split_idx = seq_len // 2
train_ds = ds.sel(site='S0').isel(time=range(split_idx))
valid_ds = ds.sel(site='S0').isel(time=range(split_idx, seq_len))

In [4]:
# Create a normalizer and register the training data with it.

norm = Normalize()
# One entry per variable, keyed by the variable name, holding the raw
# training values of that variable.
norm.register_dict({name: train_ds[name].values for name in ('x0', 'x1', 'x2')})
print(norm)

Normalize(dtype=float32)
----------------------------------------
 * x0: 0.373 (9.863 std)
 * x1: 0.866 (9.996 std)
 * x2: 1.483 (9.983 std)



In [5]:
# Build a simple pytorch dataset.

class SiteData(Dataset):
    """Map-style PyTorch dataset that serves one time step per item.

    Args:
        ds: Dataset-like object with a ``time`` coordinate that supports
            ``ds.isel(time=...)`` and item access by variable name.
        dtype: NumPy dtype every returned array is cast to.
        variables: Names of the variables returned for each sample; a single
            name may be passed as a plain string. Defaults to
            ``('x0', 'x1', 'x2')``.
    """

    def __init__(self, ds, dtype=np.float32, variables=('x0', 'x1', 'x2')):
        super().__init__()

        # A bare string would otherwise be split into single characters below.
        if isinstance(variables, str):
            variables = (variables,)

        self.ds = ds
        self.dtype = dtype
        # Stored as a tuple so the selection cannot be mutated from outside.
        self.variables = tuple(variables)

    def __len__(self):
        """Return the number of time steps, i.e. the number of samples."""
        return len(self.ds.time)

    def __getitem__(self, ind):
        """Return the sample at time index ``ind`` as a ``{variable: array}`` dict."""
        d = self.ds.isel(time=ind)
        return {var: d[var].values.astype(self.dtype) for var in self.variables}

# Wrap the training split (site 'S0'); each item corresponds to one time step.
data = SiteData(train_ds)

In [6]:
# Load one sample: indexing the dataset returns a dict holding one 0-d
# float32 array per variable.

d = data[0]
print(d)

{'x0': array(1.0901693, dtype=float32), 'x1': array(19.000296, dtype=float32), 'x2': array(1.0497253, dtype=float32)}


In [7]:
# Normalize sample. With `return_stack=True` the result is a single stacked
# array (one entry per variable) instead of a dict.

d_norm = norm.normalize_dict(d, return_stack=True)
print(d_norm)

[ 0.07275067  1.814245   -0.04344677]


In [15]:
# Build a pytorch dataloader that collates 16 samples per batch.

dl = DataLoader(data, batch_size=16)

# Get the first batch.

batch = next(iter(dl))

# Access the data: the batch is a dict with one tensor per variable.

x0, x1, x2 = (batch[key] for key in ('x0', 'x1', 'x2'))

In [16]:
# Validity check: normalizing and then unnormalizing a variable should give
# back the original values (compared element-wise, up to floating-point
# tolerance).

torch.isclose(x2, norm.unnormalize('x2', norm.normalize('x2', x2)))

tensor([True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True])

In [17]:
# Normalize single variable and print its statistics. The batch mean / std are
# only approximately 0 / 1: the normalizer was registered with the whole
# training split, while this batch holds just 16 samples.

print_ms(norm.normalize('x2', x2))

x: mean=0.10862809419631958, std=1.0552358627319336


In [18]:
# Normalize entire batch: without a `variables` argument every entry of the
# batch dict is normalized and a dict is returned.

print_ms(norm.normalize_dict(batch))

x0: mean=0.1362379938364029, std=1.0460177659988403
x1: mean=0.01050223782658577, std=1.21055006980896
x2: mean=0.10862809419631958, std=1.0552358627319336


In [19]:
# Normalize a subset of variables in batch; only the requested keys appear in
# the returned dict.

norm.normalize_dict(batch, variables=['x0', 'x1'])

{'x0': tensor([ 0.0728, -2.0438, -1.1695,  0.7588,  0.0594, -0.6604,  0.2844,  0.0445,
          1.9129, -0.8462,  0.2230,  1.7483, -0.3065, -0.1191,  1.0643,  1.1569]),
 'x1': tensor([ 1.8142, -0.5482, -1.2990,  0.0083,  0.2917,  1.1285,  0.5940, -2.5928,
          1.0833,  0.8032,  1.7588,  0.1065, -1.3017,  0.1294, -0.8937, -0.9145])}

In [20]:
# Normalize a subset of variables in batch and stack them along the last
# dimension, so a single tensor is returned instead of a dict.

x = norm.normalize_dict(batch, variables=['x0', 'x1'], return_stack=True)

# The first dimension is the batch size (16), the second holds the two
# variables that we selected, x0 and x1.

x.shape

torch.Size([16, 2])