In [1]:
import sys
import os

# Remember the notebook's own directory the first time this cell runs in a
# kernel session.  Without this guard, re-running the cell would climb one
# directory further each time, because os.getcwd() already points at the
# parent directory after the first execution.
if "NOTEBOOK_DIR" not in globals():
    NOTEBOOK_DIR = os.getcwd()
PROJECT_ROOT = os.path.dirname(NOTEBOOK_DIR)

# Make modules located in the parent directory importable
# (skip the append if the entry is already present, e.g. on a re-run).
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

# Relative paths used later in the notebook (e.g. "data/<dataset>/...") are
# resolved against the parent directory.
os.chdir(PROJECT_ROOT)

# ERA5 Dataset

In [2]:
import xarray as xr
import zarr
import numcodecs
import numpy as np
import glob
import torch
import era5_data_proc
from neural_lam import constants

import os

# Directory containing the raw ERA5 NetCDF (.nc) files, as configured in the
# project constants; the monthly files and static_variables.nc are read from it.
RAW_ERA5_PATH = constants.ERA5UKConstants.RAW_ERA5_PATH

In [3]:
# VAR_LEADS_METRICS_WATCH is keyed by entries that index into PARAM_NAMES
# (presumably the variables watched during validation -- confirm in constants).
val_metrics = constants.ERA5UKConstants.VAR_LEADS_METRICS_WATCH

# Translate every key into its human-readable parameter name.
val_metric_names = [
    constants.ERA5UKConstants.PARAM_NAMES[var_key]
    for var_key in val_metrics.keys()
]
val_metric_names

['z500', 't850', 'q700', 'u850', 'v850', 'w850']

## Verify nwp_xy.npy and time step files

In [4]:
# Collect every 2022 NetCDF file in the raw ERA5 directory, in sorted order,
# and work with the first one.
nc_files = sorted(glob.glob(f'{RAW_ERA5_PATH}/2022*.nc'))
nc_file = nc_files[0]
print(nc_file)

/vol/bitbucket/bet20/dataset/era5/global_full/2022_01.nc


In [5]:
# Dataset name and the matching era5_data_proc subset function (presumably a
# spatial crop of the raw file).  Other pairs that can be swapped in here:
#   "era5_uk"            -> era5_data_proc.uk_subset
#   "era5_uk_small"      -> era5_data_proc.uk_small_subset
#   "era5_uk_big_coarse" -> era5_data_proc.uk_big_subset
#   "era5_uk_max"        -> era5_data_proc.uk_max_subset
#   "era5_uk_max_coarse" -> era5_data_proc.uk_max_subset
dataset = "era5_uk_big"
subset = era5_data_proc.uk_big_subset

# Open the raw file and apply the selected subset function in one step.
data = subset(xr.open_dataset(nc_file))

# Take the third time step in the file
# (alternative: time = np.datetime64('2022-06-01T00:00:00')).
time = data['time'].values[2]
print(time)

# Single-time-step view used for the checks in the following cells.
sample = data.sel(time=time)
sample

2022-01-01T12:00:00.000000000


In [6]:
# Verify data variable shapes: after selecting a single time step, "z" is
# indexed by (level, latitude, longitude).
sample["z"].shape

(8, 81, 81)

In [7]:
# Load one pre-processed time step from the training samples on disk.
sample_dir_path = f"data/{dataset}/samples/train"
sample_files = sorted(glob.glob(f'{sample_dir_path}/*.npy'))

# Third file in sorted order.
sample_file = sample_files[2]
print(sample_file)

# Array for that time step; its shape is printed and the array is displayed.
time_step_data = np.load(sample_file)
print(time_step_data.shape)
time_step_data

data/era5_uk_big/samples/train/20220101120000.npy
(6561, 48)


array([[ 1.92961335e+05,  1.28589659e+05,  9.83679591e+04, ...,
        -5.38659383e-01, -7.77642869e-01, -2.44279342e-01],
       [ 1.92980607e+05,  1.28608931e+05,  9.83872311e+04, ...,
        -5.89935533e-01, -6.17862722e-01, -1.23414131e-01],
       [ 1.92999879e+05,  1.28628203e+05,  9.84065031e+04, ...,
        -6.46705557e-01, -4.09095539e-01, -2.40665898e-02],
       ...,
       [ 2.01000968e+05,  1.35142138e+05,  1.04473970e+05, ...,
         8.43882685e-03, -2.72713492e-02, -3.82590957e-02],
       [ 2.00984908e+05,  1.35116442e+05,  1.04457910e+05, ...,
        -6.52706391e-02,  2.30891555e-02,  5.28476355e-02],
       [ 2.00972060e+05,  1.35097170e+05,  1.04445062e+05, ...,
        -2.58471848e-01,  3.64167119e-01,  4.24599725e-01]])

In [8]:
# Read the static grid coordinates stored in nwp_xy.npy.
static_dir_path = f"data/{dataset}/static"
nwp_xy_file = os.path.join(static_dir_path, "nwp_xy.npy")
grid_xy = torch.tensor(np.load(nwp_xy_file))  # (2, N_y, N_x)
print(grid_xy.shape)

# Collapse the two spatial axes and move the coordinate pair to the last axis.
grid_xy = grid_xy.reshape(2, -1).transpose(0, 1)  # (N_y * N_x, 2)
print(grid_xy.shape)


torch.Size([2, 81, 81])
torch.Size([6561, 2])


In [9]:
# Pick an arbitrary grid point and verify that its stored features match the
# raw data.
test_point = 201

# Each grid_xy row is (longitude, latitude), with longitude stored in the
# -180/180 convention.  The raw dataset is queried with longitude in the
# 0/360 convention, so only the longitude is wrapped.  Latitude must be left
# untouched: applying the modulo to it would turn any negative latitude into
# an invalid value (e.g. -10 -> 350).
test_xy = grid_xy[test_point].clone()  # clone so grid_xy itself is not modified
test_xy[0] = test_xy[0] % 360
print(test_xy)

# Feature vector stored for this grid point in the loaded time step.
time_step_data[test_point]

tensor([356.7500,  64.5000])


array([ 1.93841422e+05,  1.29472959e+05,  9.92416229e+04,  6.84128549e+04,
        5.25873350e+04,  3.92029345e+04,  1.28099373e+04,  6.47246025e+01,
        3.17917909e-06,  6.52814198e-06,  3.36919521e-05,  4.97337258e-04,
        1.17308355e-03,  1.81980549e-03,  2.26261281e-03,  4.28613060e-03,
        2.02711196e+02,  2.02304310e+02,  2.15295749e+02,  2.41512017e+02,
        2.52142691e+02,  2.58528943e+02,  2.67817099e+02,  2.78222642e+02,
        3.51302392e+01,  2.22409918e+01,  2.47674919e+01,  1.88486534e+01,
        1.35033986e+01,  6.61766172e+00, -1.39603398e+01, -1.10104803e+01,
        1.14797139e+01,  1.05864326e+01,  1.56646758e+01,  1.46673822e+01,
        1.24984217e+01,  1.64478265e+01,  1.26054931e+01,  6.27910339e+00,
        9.35447239e-03, -6.21150179e-03,  1.17400646e-01, -2.32375950e-01,
       -4.92877106e-01, -6.75090569e-01, -1.98039242e-01, -7.30536262e-02])

In [10]:
# Look up the same grid point in the xarray sample for comparison with the
# .npy row; test_xy holds (longitude, latitude) in that order.
sample.sel(longitude=test_xy[0], latitude=test_xy[1])

## ERA5 Dataset Names

In [13]:
# Show the data variables of the selected time step together with their dimensions.
sample.data_vars

Data variables:
    z        (level, latitude, longitude) float64 420kB 1.93e+05 ... 2.069e+03
    q        (level, latitude, longitude) float64 420kB 3.179e-06 ... 0.004964
    t        (level, latitude, longitude) float64 420kB 201.4 201.5 ... 297.0
    u        (level, latitude, longitude) float64 420kB 29.75 29.83 ... 1.054
    v        (level, latitude, longitude) float64 420kB 12.92 12.8 ... -0.2492
    w        (level, latitude, longitude) float64 420kB 0.006608 ... 0.4246

In [14]:
# Variable symbols only, in the order they appear in the dataset.
list(sample.data_vars.keys())

['z', 'q', 't', 'u', 'v', 'w']

In [15]:
# Values of the `level` coordinate (presumably pressure levels in hPa -- confirm against the file metadata).
sample.level.values

array([  50,  150,  250,  400,  500,  600,  850, 1000], dtype=int32)

In [16]:
from neural_lam.constants import ERA5UKConstants

# Variable symbols and level values, taken directly from the sample.
PARAM_SYMBOLS = list(sample.data_vars.keys())
LEVELS = list(sample.level.values)

# One name per (variable, level) pair, variable-major: "z50", "z150", ...
PARAM_NAMES = [
    f"{symbol}{level}"
    for symbol in PARAM_SYMBOLS
    for level in LEVELS
]

# Unit string of each variable, repeated once per level so that it lines up
# element-for-element with PARAM_NAMES.
PARAM_UNITS = [
    sample[symbol].attrs["units"]
    for symbol in PARAM_SYMBOLS
    for _level in LEVELS
]

print(PARAM_NAMES, PARAM_UNITS, sep="\n")

['z50', 'z150', 'z250', 'z400', 'z500', 'z600', 'z850', 'z1000', 'q50', 'q150', 'q250', 'q400', 'q500', 'q600', 'q850', 'q1000', 't50', 't150', 't250', 't400', 't500', 't600', 't850', 't1000', 'u50', 'u150', 'u250', 'u400', 'u500', 'u600', 'u850', 'u1000', 'v50', 'v150', 'v250', 'v400', 'v500', 'v600', 'v850', 'v1000', 'w50', 'w150', 'w250', 'w400', 'w500', 'w600', 'w850', 'w1000']
['m**2 s**-2', 'm**2 s**-2', 'm**2 s**-2', 'm**2 s**-2', 'm**2 s**-2', 'm**2 s**-2', 'm**2 s**-2', 'm**2 s**-2', 'kg kg**-1', 'kg kg**-1', 'kg kg**-1', 'kg kg**-1', 'kg kg**-1', 'kg kg**-1', 'kg kg**-1', 'kg kg**-1', 'K', 'K', 'K', 'K', 'K', 'K', 'K', 'K', 'm s**-1', 'm s**-1', 'm s**-1', 'm s**-1', 'm s**-1', 'm s**-1', 'm s**-1', 'm s**-1', 'm s**-1', 'm s**-1', 'm s**-1', 'm s**-1', 'm s**-1', 'm s**-1', 'm s**-1', 'm s**-1', 'Pa s**-1', 'Pa s**-1', 'Pa s**-1', 'Pa s**-1', 'Pa s**-1', 'Pa s**-1', 'Pa s**-1', 'Pa s**-1']


## Grid Features

In [17]:
import torch

### Simple Grid Features

In [18]:
# Dataset whose pre-computed static grid features are loaded in this section
# (the same name that was selected near the top of the notebook).
dataset = "era5_uk_big"

In [19]:
# Load the simple grid features saved for this dataset and show their shape
# (one row per grid point).
# Note: `grid_features` is rebound by later cells in this notebook.
grid_features_simple_path = f"./data/{dataset}/static/grid_features_simple.pt"
grid_features = torch.load(grid_features_simple_path)
grid_features.shape

torch.Size([6561, 3])

### Grid Features with Static Variables

In [20]:
# Load the grid features that also include the static variables and show
# their shape; this replaces the simple features bound to `grid_features`.
grid_features_path = f"./data/{dataset}/static/grid_features.pt"
grid_features = torch.load(grid_features_path)
grid_features.shape

torch.Size([6561, 5])

## Verify static variables

In [21]:
# Open the static-variables file from the raw ERA5 directory.
# Note: this rebinds `data`, which previously referred to the subset of the
# monthly ERA5 file opened earlier in the notebook.
dataset_path = f"{RAW_ERA5_PATH}/static_variables.nc"
data = xr.open_dataset(dataset_path)
data

In [22]:
# Surface geopotential ("z") should not depend on time, yet the file shows
# that it does: the values changed at some point in October 2022.
# Comparing two time steps, about 99.7% of the values still agree.

# "z" fields at time indices 3 and 4.
sample1, sample2 = (
    data.sel(time=data['time'].values[step])["z"].values
    for step in (3, 4)
)

# Fraction of values that are numerically close between the two time steps.
np.isclose(sample1, sample2).mean()

0.9972260748959778

### Select a spatial location from xarray and .npy to test

In [23]:
# Pick an arbitrary grid point and verify that its static grid features match
# the raw static variables.
test_point = 100

# Each grid_xy row is (longitude, latitude).  Only the longitude is wrapped
# into the 0/360 convention used when querying the raw data; applying the
# modulo to the latitude as well would corrupt any negative latitude
# (e.g. -10 -> 350).
test_xy = grid_xy[test_point].clone()  # clone so grid_xy itself is not modified
test_xy[0] = test_xy[0] % 360
print(test_xy)

# Static variables at that location, taken from the first time step of the file.
data.sel(time=data['time'].values[0], longitude=test_xy[0], latitude=test_xy[1]).to_array()

tensor([351.7500,  64.7500])


In [24]:
# Grid-feature row for the same test point.  The first three values appear to
# be cos(lat), sin(lon), cos(lon) of the coordinates printed above; the last
# two are presumably the (possibly normalised) static variables -- TODO confirm.
grid_features[test_point]

tensor([ 0.4266, -0.1435,  0.9897, -5.4628,  0.0000])

## Verify Grid Features Code

In [None]:
# Point at the static directory of the "era5_uk" dataset; this overrides the
# static_dir_path set earlier for the dataset selected at the top.
static_dir_path = "data/era5_uk/static"

In [None]:
# Read the grid coordinates of the dataset selected by static_dir_path.
nwp_xy_file = os.path.join(static_dir_path, "nwp_xy.npy")
grid_xy = torch.tensor(np.load(nwp_xy_file))  # (2, N_x, N_y)
print(grid_xy.shape)

# Collapse the spatial axes so that every row is one (lon, lat) pair.
grid_xy = grid_xy.reshape(2, -1).transpose(0, 1)  # (N_x * N_y, 2)
print(grid_xy.shape)
grid_xy

torch.Size([2, 57, 65])
torch.Size([3705, 2])


tensor([[-10.0000,  63.0000],
        [-10.0000,  62.7500],
        [-10.0000,  62.5000],
        ...,
        [  4.0000,  47.5000],
        [  4.0000,  47.2500],
        [  4.0000,  47.0000]])

In [None]:
# Convert the (longitude, latitude) pairs from degrees to radians.
# torch.deg2rad keeps the computation inside PyTorch rather than relying on a
# NumPy ufunc being applied to a torch.Tensor and wrapped back into a tensor.
# Note: the cell rebinds grid_xy, so re-run the loading cell before running
# this one again, otherwise the conversion is applied twice.
grid_xy = torch.deg2rad(grid_xy)
grid_xy

tensor([[-0.1745,  1.0996],
        [-0.1745,  1.0952],
        [-0.1745,  1.0908],
        ...,
        [ 0.0698,  0.8290],
        [ 0.0698,  0.8247],
        [ 0.0698,  0.8203]])

In [None]:
# Split the coordinate pairs (already in radians) into longitude and latitude.
grid_lons = grid_xy[:, 0]
grid_lats = grid_xy[:, 1]

# Build the three simple grid features per grid point.  The torch
# trigonometric functions are used so that torch.stack receives genuine
# tensors instead of results of NumPy ufuncs applied to tensors.
grid_features = torch.stack(
    (
        torch.cos(grid_lats),
        torch.sin(grid_lons),
        torch.cos(grid_lons),
    ),
    dim=1,
)
print(grid_features.shape)

torch.Size([3705, 3])


In [None]:
# PyTorch and NumPy
# Shape experiment: concatenate two previous states with a zero-width forcing
# tensor along the feature axis and inspect the resulting shape.
import torch

batch_size, N_grid, d_features = 4, 250, 42

# Two consecutive input states, each (batch_size, N_grid, d_features).
prev_prev_state = torch.rand((batch_size, N_grid, d_features))
prev_state = torch.rand((batch_size, N_grid, d_features))

# sample_len = 5
# Three random target steps, stacked and then repeated for every batch entry:
# (batch_size, 3, N_grid, d_features).
target_states = torch.stack(
    [torch.rand((N_grid, d_features)) for _ in range(3)],
    dim=0,
)
target_states = target_states.unsqueeze(0).repeat(batch_size, 1, 1, 1)

# (batch_size, sample_len-2, N_grid, d_forcing) with d_forcing = 0
forcings = torch.zeros(batch_size, *target_states.shape[1:3], 0)
forcing = forcings[:, 2]

print(
    prev_prev_state.shape,
    prev_state.shape,
    target_states.shape,
    forcings.shape,
    forcing.shape,
    sep="\n",
)

# The zero-width forcing contributes no features to the concatenation.
grid_features = torch.cat((prev_prev_state, prev_state, forcing), dim=-1)
grid_features.shape