In [1]:
import sys
import os

# Make the parent of the notebook's starting directory importable and use it
# as the working directory, so that the relative "data/..." paths used by the
# cells below resolve against it (presumably this parent is the repository
# root that contains `neural_lam` -- confirm for your checkout).
#
# The starting directory is remembered in NOTEBOOK_DIR so that re-running this
# cell is harmless: without the guard every re-run would climb one more
# directory level and append another entry to sys.path.
if "NOTEBOOK_DIR" not in globals():
    NOTEBOOK_DIR = os.getcwd()
PROJECT_ROOT = os.path.dirname(NOTEBOOK_DIR)

if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

os.chdir(PROJECT_ROOT)

# ERA5 Dataset

In [2]:
import xarray as xr
import zarr
import numcodecs
import numpy as np
import glob
import torch
import era5_data_proc
from neural_lam import constants

import os

# Directory holding the raw ERA5 NetCDF files, taken from the project
# constants. The cells below read the monthly "2021*.nc" files and
# "static_variables.nc" from it.
RAW_ERA5_PATH = constants.ERA5UKConstants.RAW_ERA5_PATH

In [3]:
# VAR_LEADS_METRICS_WATCH is keyed by indices into PARAM_NAMES; translate
# those keys into the readable variable names that are being watched.
val_metrics = constants.ERA5UKConstants.VAR_LEADS_METRICS_WATCH
val_metric_names = [
    constants.ERA5UKConstants.PARAM_NAMES[var_idx] for var_idx in val_metrics
]
val_metric_names

['z500', 't850', 'q700', 'u850', 'v850', 'w850']

## Verify nwp_xy.npy and time step files

In [74]:
# Collect the raw 2021 NetCDF files in sorted (lexicographic) order and use
# the first one as the reference file for the checks below.
nc_files = sorted(glob.glob(f'{RAW_ERA5_PATH}/2021*.nc'))
nc_file = nc_files[0]
print(nc_file)

/work/ec249/ec249/bet20/dataset/era5/global_full/2021_01.nc


In [82]:
# Choose which processed dataset to verify, together with the subset function
# that is applied to the raw ERA5 file for that dataset. Exactly one
# (dataset, subset) pair must be active; the alternatives stay commented out.
# Note that each "*_coarse" dataset reuses the subset function of its
# non-coarse counterpart.

# dataset = "era5_uk"
# subset = era5_data_proc.uk_subset

# dataset = "era5_uk_small"
# subset = era5_data_proc.uk_small_subset

# dataset = "era5_uk_big"
# subset = era5_data_proc.uk_big_subset

# dataset = "era5_uk_big_coarse"
# subset = era5_data_proc.uk_big_subset

# dataset = "era5_uk_max"
# subset = era5_data_proc.uk_max_subset

dataset = "era5_uk_max_coarse"
subset = era5_data_proc.uk_max_subset

# Open the raw file and restrict it with the selected subset function.
data = xr.open_dataset(nc_file)
data = subset(data)

# Take the time step at index 2. The processed sample file loaded further
# down is also picked at index 2 of its sorted list, so that both refer to
# the same timestamp.
time = data['time'].values[2]
# time = np.datetime64('2022-06-01T00:00:00')
print(time)
sample = data.sel(time=time)
sample

2021-01-01T12:00:00.000000000


In [83]:
# Verify Data Variable Shapes
# With the time step already selected, "z" has the dimensions
# (level, latitude, longitude) -- see the data_vars listing further down --
# so the last two entries are the spatial grid size of the raw subset.
sample["z"].shape

(8, 137, 137)

In [84]:
# load time step data
# The processed training samples are one .npy file per time step. The file
# names appear to be timestamps, so the sorted list is chronological and
# index 2 lines up with the time step picked from the raw file above.
sample_dir_path = f"data/{dataset}/samples/train"
sample_files = sorted(glob.glob(f'{sample_dir_path}/*.npy'))
sample_file = sample_files[2]
print(sample_file)

# Each row is one grid node, each column one flattened feature.
time_step_data = np.load(sample_file)
print(time_step_data.shape)
time_step_data

data/era5_uk_max_coarse/samples/train/20210101120000.npy
(2116, 48)


array([[ 1.93014344e+05,  1.29171798e+05,  9.91444858e+04, ...,
         1.31227838e-01,  6.61468965e-02,  3.60087421e-02],
       [ 1.93014344e+05,  1.29200689e+05,  9.91701666e+04, ...,
         1.50883156e-01, -2.07732877e-02, -2.81986301e-02],
       [ 1.93007924e+05,  1.29226369e+05,  9.91830070e+04, ...,
         1.41273890e-01, -5.57160753e-02, -2.16468574e-02],
       ...,
       [ 2.00076572e+05,  1.32170035e+05,  1.00120357e+05, ...,
         3.67091655e-01,  1.21181787e-01, -3.56239725e-02],
       [ 2.00076572e+05,  1.32221396e+05,  1.00197400e+05, ...,
        -1.85893635e-02,  5.55345923e-01,  3.60539882e-01],
       [ 2.00086203e+05,  1.32308069e+05,  1.00268022e+05, ...,
        -1.99153939e-03,  1.29043914e-01,  2.16400883e-01]])

In [85]:
# load nwp_xy.npy
# The file stores the grid coordinates as a 3-D array whose first axis has
# length 2: channel 0 is used as longitude and channel 1 as latitude by the
# cells below.
static_dir_path = f"data/{dataset}/static"
grid_xy = torch.tensor(np.load(os.path.join(static_dir_path, "nwp_xy.npy")))
print(grid_xy.shape)

# Collapse the two spatial axes and put the coordinate pair last, giving one
# (lon, lat) row per grid node in the same node order as the sample files.
grid_xy = grid_xy.flatten(start_dim=1).transpose(0, 1)
print(grid_xy.shape)


torch.Size([2, 46, 46])
torch.Size([2116, 2])


In [86]:
# choose random lon/lat point and verify grid features match
test_point = 201
# Each grid_xy row is (lon, lat) in degrees, with longitude in the -180/180
# convention; the raw ERA5 file is queried with longitudes in 0/360.
# Wrap ONLY the longitude: applying `% 360` to the whole row would also remap
# a negative (southern-hemisphere) latitude, e.g. -10 -> 350.
test_xy = grid_xy[test_point].clone()
test_xy[0] = test_xy[0] % 360
print(test_xy)
# Feature row of the processed sample for the same node, to compare against
# the raw xarray values selected in the next cell.
time_step_data[test_point]

tensor([352.7500,  69.0000])


array([ 1.93338564e+05,  1.29242420e+05,  9.83226992e+04,  6.80482094e+04,
        5.26782320e+04,  3.97350931e+04,  1.38969669e+04,  1.43855367e+03,
        2.91104215e-06,  2.91104215e-06,  1.16982119e-05,  2.20830489e-05,
        5.04416421e-05,  1.44304592e-04,  1.86938578e-03,  1.89494845e-03,
        1.96339286e+02,  2.09881939e+02,  2.14161984e+02,  2.34999409e+02,
        2.44173464e+02,  2.51232095e+02,  2.66179783e+02,  2.70926206e+02,
        2.01078242e+01,  2.13689685e+00, -5.17501693e-02, -2.87563865e+00,
       -3.03044539e+00, -3.84985348e+00, -9.21870815e-01,  3.33264547e+00,
       -1.52565728e+01, -1.36584708e+01, -3.76897941e+01, -3.28329773e+01,
       -1.81103264e+01, -8.30156758e+00, -3.87417267e+00, -5.59457843e+00,
        7.18094236e-03, -1.42215151e-02,  2.47849392e-01,  1.19434648e-01,
        2.20331947e-01,  3.42631703e-01,  1.12009305e-01, -3.30189393e-03])

In [87]:
# find the sample from xarray
# test_xy holds (lon, lat) for the chosen node. No `method=` is passed to
# `.sel`, so both coordinates have to match grid labels of the raw data
# exactly. The values returned here are meant to be compared by eye with the
# time_step_data row printed by the previous cell.
sample.sel(longitude=test_xy[0], latitude=test_xy[1])

## ERA5 Dataset Names

In [38]:
# Show the data variables of the selected time step, with their dimensions.
sample.data_vars

Data variables:
    z        (level, latitude, longitude) float64 420kB 1.995e+05 ... 2.45e+03
    q        (level, latitude, longitude) float64 420kB 2.885e-06 ... 0.003213
    t        (level, latitude, longitude) float64 420kB 214.6 214.5 ... 285.3
    u        (level, latitude, longitude) float64 420kB 15.08 14.92 ... -0.9491
    v        (level, latitude, longitude) float64 420kB 4.846 5.077 ... 0.01943
    w        (level, latitude, longitude) float64 420kB -0.03721 ... -0.3348

In [39]:
# Variable short names only; iterating the data_vars mapping yields its keys.
list(sample.data_vars)

['z', 'q', 't', 'u', 'v', 'w']

In [40]:
# Values of the `level` coordinate present in the sample (presumably pressure
# levels in hPa -- TODO confirm against the dataset attributes).
sample.level.values

array([  50,  150,  250,  400,  500,  600,  850, 1000], dtype=int32)

In [18]:
from neural_lam.constants import ERA5UKConstants

# Build the flattened feature naming used for the processed samples: one entry
# per (variable, level) pair, ordered variable-major (all levels of the first
# variable, then all levels of the second, ...).
PARAM_SYMBOLS = list(sample.data_vars.keys())
LEVELS = list(sample.level.values)

# e.g. "z" at level 500 -> "z500"
PARAM_NAMES = [
    f"{symbol}{level}"
    for symbol in PARAM_SYMBOLS
    for level in LEVELS
]

# The unit depends on the variable only, so it is repeated once per level to
# stay aligned with PARAM_NAMES.
PARAM_UNITS = [
    sample[symbol].attrs["units"]
    for symbol in PARAM_SYMBOLS
    for _ in LEVELS
]

print(PARAM_NAMES)
print(PARAM_UNITS)

['z50', 'z150', 'z250', 'z400', 'z500', 'z600', 'z850', 'z1000', 'q50', 'q150', 'q250', 'q400', 'q500', 'q600', 'q850', 'q1000', 't50', 't150', 't250', 't400', 't500', 't600', 't850', 't1000', 'u50', 'u150', 'u250', 'u400', 'u500', 'u600', 'u850', 'u1000', 'v50', 'v150', 'v250', 'v400', 'v500', 'v600', 'v850', 'v1000', 'w50', 'w150', 'w250', 'w400', 'w500', 'w600', 'w850', 'w1000']
['m**2 s**-2', 'm**2 s**-2', 'm**2 s**-2', 'm**2 s**-2', 'm**2 s**-2', 'm**2 s**-2', 'm**2 s**-2', 'm**2 s**-2', 'kg kg**-1', 'kg kg**-1', 'kg kg**-1', 'kg kg**-1', 'kg kg**-1', 'kg kg**-1', 'kg kg**-1', 'kg kg**-1', 'K', 'K', 'K', 'K', 'K', 'K', 'K', 'K', 'm s**-1', 'm s**-1', 'm s**-1', 'm s**-1', 'm s**-1', 'm s**-1', 'm s**-1', 'm s**-1', 'm s**-1', 'm s**-1', 'm s**-1', 'm s**-1', 'm s**-1', 'm s**-1', 'm s**-1', 'm s**-1', 'Pa s**-1', 'Pa s**-1', 'Pa s**-1', 'Pa s**-1', 'Pa s**-1', 'Pa s**-1', 'Pa s**-1', 'Pa s**-1']


## Grid Features

In [3]:
import torch

### Simple Grid Features

In [13]:
# Switch to the "era5_uk_big" dataset for the grid-feature checks. This
# rebinds `dataset` chosen in the earlier verification section; objects
# loaded there (e.g. grid_xy) still belong to the previously selected dataset.
dataset = "era5_uk_big"

In [14]:
# Load the "simple" static grid features of the current dataset and check
# their shape: one row per grid node, one column per feature.
# Note: the next section rebinds `grid_features` to the full feature tensor.
grid_features_simple_path = f"./data/{dataset}/static/grid_features_simple.pt"
grid_features = torch.load(grid_features_simple_path)
grid_features.shape

torch.Size([6561, 3])

### Grid Features with Static Variables

In [15]:
# Load the full static grid features of the current dataset. This rebinds
# `grid_features` (previously the "simple" variant), and the shape shows the
# extra feature columns compared with the simple file.
grid_features_path = f"./data/{dataset}/static/grid_features.pt"
grid_features = torch.load(grid_features_path)
grid_features.shape

torch.Size([6561, 5])

## Verify static variables

In [22]:
# Open the raw static-variables file. Note that this rebinds `data`, which
# earlier held the subsetted ERA5 time series.
dataset_path = f"{RAW_ERA5_PATH}/static_variables.nc"
data = xr.open_dataset(dataset_path)
data

In [23]:
# surface geopotential should be time invariant but we find that it is not.
# values changed sometime in 2022 October
# 99.7% of the values are the same however

# Compare the "z" field at two consecutive time indices (3 and 4) and report
# the fraction of grid values that agree within np.isclose tolerances.
sample1 = data["z"].sel(time=data["time"].values[3]).values
sample2 = data["z"].sel(time=data["time"].values[4]).values
np.isclose(sample1, sample2).mean()

0.9972260748959778

### Select a spatial location from xarray and .npy to test

In [24]:
# choose random lon/lat point and verify grid features match
# The grid features checked here were loaded for the current `dataset`, so
# load that same dataset's coordinate grid instead of reusing a `grid_xy` left
# over from another section: that one may belong to a different dataset with
# a different number of nodes, in which case `test_point` would address two
# unrelated locations.
static_grid_xy = torch.tensor(
    np.load(os.path.join(f"data/{dataset}/static", "nwp_xy.npy"))
).reshape(2, -1).T  # one (lon, lat) row per grid node

test_point = 100
# Convert the longitude from -180/180 to 0/360 for the xarray lookup; the
# latitude must stay untouched (a blanket `% 360` would remap negative values).
test_xy = static_grid_xy[test_point].clone()
test_xy[0] = test_xy[0] % 360
print(test_xy)

# Raw static variables at that location (first time step), to compare with
# the trailing columns of grid_features[test_point].
data.sel(time=data['time'].values[0], longitude=test_xy[0], latitude=test_xy[1]).to_array()

tensor([357.0000,  60.5000])


In [25]:
# Feature row of the same node. Judging from the construction in the
# "Verify Grid Features Code" section, the first three entries are presumably
# cos(lat), sin(lon), cos(lon); the remaining two are presumably the
# static variables -- TODO confirm against the preprocessing code.
grid_features[test_point]

tensor([ 0.4924, -0.0523,  0.9986,  3.5166,  0.0000])

## Verify Grid Features Code

In [None]:
# Static directory used to re-derive the simple grid features by hand.
# NOTE: hard-coded to "era5_uk"; it does not follow the `dataset` variable.
static_dir_path = "data/era5_uk/static"

In [None]:
# Load the coordinate grid, shaped (2, N_x, N_y): channel 0 holds longitudes,
# channel 1 latitudes.
grid_xy = torch.tensor(np.load(os.path.join(static_dir_path, "nwp_xy.npy")))
print(grid_xy.shape)

# Flatten the spatial axes and move the coordinate pair last -> (N_x * N_y, 2).
# Latitude varies fastest along the rows, as the printed tensor shows.
grid_xy = grid_xy.flatten(start_dim=1).transpose(0, 1)
print(grid_xy.shape)
grid_xy

torch.Size([2, 57, 65])
torch.Size([3705, 2])


tensor([[-10.0000,  63.0000],
        [-10.0000,  62.7500],
        [-10.0000,  62.5000],
        ...,
        [  4.0000,  47.5000],
        [  4.0000,  47.2500],
        [  4.0000,  47.0000]])

In [None]:
# Convert the (lon, lat) coordinates from degrees to radians.
# torch.deg2rad keeps the computation inside PyTorch instead of passing a
# tensor through a NumPy ufunc and relying on it being wrapped back.
# NOTE: this rebinds grid_xy, so the cell must run exactly once after the
# coordinates were loaded -- running it again would convert a second time.
grid_xy = torch.deg2rad(grid_xy)
grid_xy

tensor([[-0.1745,  1.0996],
        [-0.1745,  1.0952],
        [-0.1745,  1.0908],
        ...,
        [ 0.0698,  0.8290],
        [ 0.0698,  0.8247],
        [ 0.0698,  0.8203]])

In [None]:
# Re-derive the three "simple" grid features from the coordinates, which are
# expected to be in radians at this point (converted by the previous cell).
grid_lons = grid_xy[:, 0]
grid_lats = grid_xy[:, 1]

# Use the torch trigonometric functions rather than NumPy ufuncs: the inputs
# are torch tensors and the results go straight into torch.stack.
grid_features = torch.stack(
    (
        torch.cos(grid_lats),
        torch.sin(grid_lons),
        torch.cos(grid_lons),
    ),
    dim=1,
)  # (N_grid, 3): cos(lat), sin(lon), cos(lon)
print(grid_features.shape)

torch.Size([3705, 3])


In [None]:
# PyTorch and NumPy
import torch

# Toy dimensions for a shape check of the per-node input concatenation.
batch_size, N_grid, d_features = 4, 250, 42

# The two most recent states, each (batch_size, N_grid, d_features).
prev_prev_state = torch.rand((batch_size, N_grid, d_features))
prev_state = torch.rand((batch_size, N_grid, d_features))

# sample_len = 5
# Three random target steps, stacked to (3, N_grid, d_features) and then
# copied across the batch -> (batch_size, 3, N_grid, d_features).
target_states = torch.stack(
    [torch.rand((N_grid, d_features)) for _ in range(3)], dim=0
)
target_states = target_states.unsqueeze(0).repeat(batch_size, 1, 1, 1)

# Forcing with zero feature columns: (batch_size, sample_len-2, N_grid, d_forcing)
forcings = torch.zeros((batch_size, *target_states.shape[1:3], 0))
# Forcing of the step at index 2 -> (batch_size, N_grid, 0)
forcing = forcings[:, 2]

for _tensor in (prev_prev_state, prev_state, target_states, forcings, forcing):
    print(_tensor.shape)

# Concatenate along the feature axis; the empty forcing contributes nothing,
# so the result has 2 * d_features columns.
grid_features = torch.cat([prev_prev_state, prev_state, forcing], dim=-1)
grid_features.shape