In [1]:
import sys
import os

# Resolve the project root (the parent of the working directory at the time
# this cell is first executed) exactly once per kernel. Without the guard,
# every re-run of the cell would climb one directory higher.
if "PROJECT_ROOT" not in globals():
    PROJECT_ROOT = os.path.dirname(os.getcwd())

# Make modules that live in the project root importable, without adding a
# duplicate entry on re-runs.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

# Relative paths used later in the notebook (e.g. "data/era5_uk/...") are
# resolved from the project root.
os.chdir(PROJECT_ROOT)

# PyTorch and NumPy

In [None]:
import torch

In [None]:
# Toy dimensions used only to walk through tensor shapes.
batch_size, N_grid, d_features = 4, 250, 42

# Two previous states, each (batch_size, N_grid, d_features).
prev_prev_state = torch.rand(batch_size, N_grid, d_features)
prev_state = torch.rand(batch_size, N_grid, d_features)

# Three random target steps (the original note mentions sample_len = 5),
# stacked on a new leading axis and then copied for every batch element:
# (batch_size, 3, N_grid, d_features).
target_states = torch.stack(
    [torch.rand(N_grid, d_features) for _ in range(3)], dim=0
)
target_states = target_states[None].repeat(batch_size, 1, 1, 1)

# Forcing with zero features: (batch_size, sample_len-2, N_grid, d_forcing)
forcings = torch.zeros((batch_size, *target_states.shape[1:3], 0))
# Forcing of the step at index 2: (batch_size, N_grid, d_forcing)
forcing = forcings[:, 2]

In [None]:
# Print the shape of every toy tensor, one per line.
for tensor in (prev_prev_state, prev_state, target_states, forcings, forcing):
    print(tensor.shape)

In [None]:
# Join the two previous states and the forcing along the last (feature) axis.
grid_features = torch.cat([prev_prev_state, prev_state, forcing], dim=-1)

In [None]:
# Shape of the concatenated features (bare last expression, so it is displayed).
grid_features.shape

# ERA5 Dataset

In [2]:
import xarray as xr
import zarr
import numcodecs
import numpy as np
import glob
import torch
import era5_data_proc
import os

# Directory that holds the raw ERA5 NetCDF files globbed later in the notebook.
# The RAW_ERA5_PATH environment variable overrides the default, so the notebook
# can run on machines where the data lives elsewhere.
RAW_ERA5_PATH = os.environ.get(
    "RAW_ERA5_PATH", "/vol/bitbucket/bet20/dataset/era5/global_full"
)

## What does coarsen mean

In [3]:
# Gather the 2022 NetCDF files in sorted order and open the first one.
nc_files = sorted(glob.glob(f'{RAW_ERA5_PATH}/2022*.nc'))

data = xr.open_dataset(nc_files[0])
data

In [4]:
# Average over windows of 8 points along latitude and longitude. The result is
# only displayed; `data` is not reassigned.
# NOTE(review): boundary="pad" presumably allows a final incomplete window when
# a dimension is not a multiple of 8 - confirm against the xarray docs.
data.coarsen(latitude=8, longitude=8, boundary="pad").mean()

: 

In [None]:
# Re-display `data`; the coarsened result was not assigned back, so this should
# still be the dataset as it was opened.
data

## Verify nwp_xy.npy and time step files

In [3]:
# Gather the 2022 NetCDF files in sorted order and show the first path.
nc_files = sorted(glob.glob(f'{RAW_ERA5_PATH}/2022*.nc'))
print(nc_files[0])

/vol/bitbucket/bet20/dataset/era5/global_full/2022_01.nc


In [4]:
nc_file = nc_files[0]

# Open the file and reduce it with the project's `uk_subset` helper
# (presumably a crop to the UK region - see era5_data_proc).
data = era5_data_proc.uk_subset(xr.open_dataset(nc_file))

# Take the first timestamp in the file and select that single time step.
time = data['time'].values[0]
print(time)
sample = data.sel(time=time)
sample

2022-01-01T00:00:00.000000000


In [51]:
# Verify Data Variable Shapes
# The recorded result (8, 65, 57) follows the (level, latitude, longitude)
# dimension order listed for `z` in the dataset's data variables.
sample["z"].shape

(8, 65, 57)

In [33]:
# Load the first saved training sample (one .npy file per time step).
sample_dir_path = "data/era5_uk/samples/train"
sample_files = sorted(glob.glob(f'{sample_dir_path}/*.npy'))
print(sample_files[0])

# Show the array's shape, then the array itself.
time_step_data = np.load(sample_files[0])
print(time_step_data.shape)
time_step_data

data/era5_uk/samples/train/20220101000000.npy
(3705, 48)


array([[ 1.94368190e+05,  1.29415143e+05,  9.89910869e+04, ...,
        -2.14520862e-01,  3.63660158e-02, -6.21150179e-03],
       [ 1.94477398e+05,  1.29508291e+05,  9.91292029e+04, ...,
        -1.95292306e-01,  1.46701303e-01, -2.22352987e-02],
       [ 1.94576970e+05,  1.29598227e+05,  9.92705309e+04, ...,
        -1.09679448e-01,  1.72797201e-01, -2.17774760e-02],
       ...,
       [ 2.00949577e+05,  1.34872330e+05,  1.04188102e+05, ...,
         1.26557101e-01,  9.03891027e-02,  8.58108750e-02],
       [ 2.00991332e+05,  1.34923722e+05,  1.04226646e+05, ...,
         4.50646484e-02,  1.30219684e-01,  1.13280241e-01],
       [ 2.01026664e+05,  1.34968690e+05,  1.04268402e+05, ...,
         6.60753577e-03,  1.48532594e-01,  9.90877353e-02]])

In [36]:
# Load the static grid coordinates stored in nwp_xy.npy.
static_dir_path = "data/era5_uk/static"
nwp_xy_file = os.path.join(static_dir_path, "nwp_xy.npy")
grid_xy = torch.tensor(np.load(nwp_xy_file))  # (2, N_x, N_y)
print(grid_xy.shape)

# Collapse the two grid axes and transpose: one coordinate pair per row.
grid_xy = grid_xy.flatten(start_dim=1).T  # (N_x * N_y, 2)
print(grid_xy.shape)


torch.Size([2, 57, 65])
torch.Size([3705, 2])


In [37]:
# Pick one grid node and compare its coordinates with its saved features.
test_point = 800
# Wrap both coordinates into [0, 360); presumably this matches the longitude
# convention of the raw dataset. The latitude is wrapped too, which leaves
# positive latitudes unchanged.
test_xy = torch.remainder(grid_xy[test_point], 360)
print(test_xy)
time_step_data[test_point]

tensor([353.,  58.])


array([ 1.96879974e+05,  1.31515791e+05,  1.01294090e+05,  7.01409104e+04,
        5.40680666e+04,  4.04684621e+04,  1.34041571e+04,  2.73504548e+02,
        3.17917909e-06,  7.64446294e-06,  4.48551617e-05,  6.69250687e-04,
        1.58463388e-03,  1.60733240e-03,  5.15872149e-03,  7.26373072e-03,
        2.09010701e+02,  2.03853369e+02,  2.16995582e+02,  2.44705144e+02,
        2.55988487e+02,  2.63428099e+02,  2.76405080e+02,  2.84109065e+02,
        3.44173564e+01,  3.21066331e+01,  2.76873065e+01,  2.76681871e+01,
        2.34263983e+01,  2.09873015e+01,  1.68210971e+00, -2.64161958e+00,
        1.73105161e+01,  1.50528393e+01,  1.69709468e+01,  1.82282708e+01,
        1.61694409e+01,  1.65212469e+01,  1.89349420e+01,  5.59690566e+00,
        2.58360921e-02, -1.07897295e-02, -1.26210206e-02, -4.58082576e-01,
       -1.92545369e-01, -6.48128163e-02, -9.36556508e-02,  2.76673832e-02])

In [38]:
# Look up the same point in the xarray sample: test_xy holds (longitude, latitude).
sample.sel({"longitude": test_xy[0], "latitude": test_xy[1]})

## ERA5 Dataset Names

In [46]:
# Display the data variables of the selected time step.
sample.data_vars

Data variables:
    z        (level, latitude, longitude) float64 237kB 1.944e+05 ... 2.252e+03
    q        (level, latitude, longitude) float64 237kB 3.179e-06 ... 0.006365
    t        (level, latitude, longitude) float64 237kB 204.4 204.4 ... 288.5
    u        (level, latitude, longitude) float64 237kB 33.17 33.23 ... -1.524
    v        (level, latitude, longitude) float64 237kB 14.32 14.18 ... 0.2036
    w        (level, latitude, longitude) float64 237kB -0.01995 ... 0.09909

In [45]:
# Iterating the data-variable mapping yields its keys, i.e. the variable names.
list(sample.data_vars)

['z', 'q', 't', 'u', 'v', 'w']

In [47]:
# Values of the `level` coordinate as a NumPy array.
sample["level"].values

array([  50,  150,  250,  400,  500,  600,  850, 1000], dtype=int32)

In [48]:
# "units" is not a variable of the dataset, so this lookup raises KeyError;
# units are read per variable from `.attrs["units"]` elsewhere in this
# notebook. The error is caught and printed so that the demonstration stays
# visible without stopping a "Restart & Run All".
try:
    sample["units"]
except KeyError as err:
    print(f"KeyError: {err}")

KeyError: "No variable named 'units'. Variables on the dataset include ['z', 'q', 't', 'u', 'v', 'w', 'longitude', 'latitude', 'level', 'time']"

In [49]:
from neural_lam.constants import ERA5UKConstants

# Variable symbols and levels, in the order the dataset provides them.
PARAM_SYMBOLS = list(sample.data_vars)
LEVELS = list(sample.level.values)

# One name per (variable, level) pair, variable-major: z50, z150, ..., w1000.
PARAM_NAMES = [
    f"{symbol}{level}" for symbol in PARAM_SYMBOLS for level in LEVELS
]

# Every level of a variable shares that variable's unit, so repeat the unit
# once per level to stay aligned with PARAM_NAMES.
PARAM_UNITS = []
for symbol in PARAM_SYMBOLS:
    PARAM_UNITS.extend([sample[symbol].attrs["units"]] * len(LEVELS))

print(PARAM_NAMES)
print(PARAM_UNITS)

['z50', 'z150', 'z250', 'z400', 'z500', 'z600', 'z850', 'z1000', 'q50', 'q150', 'q250', 'q400', 'q500', 'q600', 'q850', 'q1000', 't50', 't150', 't250', 't400', 't500', 't600', 't850', 't1000', 'u50', 'u150', 'u250', 'u400', 'u500', 'u600', 'u850', 'u1000', 'v50', 'v150', 'v250', 'v400', 'v500', 'v600', 'v850', 'v1000', 'w50', 'w150', 'w250', 'w400', 'w500', 'w600', 'w850', 'w1000']
['m**2 s**-2', 'm**2 s**-2', 'm**2 s**-2', 'm**2 s**-2', 'm**2 s**-2', 'm**2 s**-2', 'm**2 s**-2', 'm**2 s**-2', 'kg kg**-1', 'kg kg**-1', 'kg kg**-1', 'kg kg**-1', 'kg kg**-1', 'kg kg**-1', 'kg kg**-1', 'kg kg**-1', 'K', 'K', 'K', 'K', 'K', 'K', 'K', 'K', 'm s**-1', 'm s**-1', 'm s**-1', 'm s**-1', 'm s**-1', 'm s**-1', 'm s**-1', 'm s**-1', 'm s**-1', 'm s**-1', 'm s**-1', 'm s**-1', 'm s**-1', 'm s**-1', 'm s**-1', 'm s**-1', 'Pa s**-1', 'Pa s**-1', 'Pa s**-1', 'Pa s**-1', 'Pa s**-1', 'Pa s**-1', 'Pa s**-1', 'Pa s**-1']


## Grid Features

In [55]:
import torch

In [56]:
# Load the saved static grid features and display their shape.
# NOTE(review): absolute, user-specific path - will not resolve on other machines.
# NOTE(review): torch.load deserializes the file (pickle-based, depending on the
# torch version/arguments) - only load files from a trusted source.
grid_features_path = "/vol/bitbucket/bet20/neural-lam/data/era5_uk/static/grid_features.pt"
grid_features = torch.load(grid_features_path)
grid_features.shape

torch.Size([3705, 5])

## Verify static variables

In [57]:
# Open the static-variables file and display it.
# Note: this rebinds `data`, which earlier cells used for the 2022 files.
dataset_path = "/vol/bitbucket/bet20/dataset/era5/global_full/static_variables.nc"
data = xr.open_dataset(dataset_path)
data

In [None]:
# The surface geopotential ("z") should not depend on time, yet the static
# file disagrees: the values changed at some point in October 2022.
# Comparing the time steps at index 3 and 4, about 99.7% of the values still
# agree (fraction of element pairs that np.isclose reports as equal).
static_times = data['time'].values
sample1 = data.sel(time=static_times[3])["z"].values
sample2 = data.sel(time=static_times[4])["z"].values
np.isclose(sample1, sample2).mean()

0.9972260748959778

## Verify Grid Features Code

In [59]:
# Directory with the static files (relative to the current working directory);
# same value as assigned in the nwp_xy.npy verification section.
static_dir_path = "data/era5_uk/static"

In [61]:
# Reload the grid coordinates (in degrees) from nwp_xy.npy.
nwp_xy_array = np.load(os.path.join(static_dir_path, "nwp_xy.npy"))
grid_xy = torch.tensor(nwp_xy_array)  # (2, N_x, N_y)
print(grid_xy.shape)

# Flatten the grid axes and transpose so that each row is one coordinate pair.
grid_xy = grid_xy.reshape(2, -1).t()  # (N_x * N_y, 2)
print(grid_xy.shape)
grid_xy

torch.Size([2, 57, 65])
torch.Size([3705, 2])


tensor([[-10.0000,  63.0000],
        [-10.0000,  62.7500],
        [-10.0000,  62.5000],
        ...,
        [  4.0000,  47.5000],
        [  4.0000,  47.2500],
        [  4.0000,  47.0000]])

In [62]:
# Convert degrees to radians with the torch op: `grid_xy` is a torch tensor, so
# this avoids relying on NumPy ufunc dispatch and re-wrapping for tensors.
# NOTE: `grid_xy` is rebound, so running this cell twice converts twice;
# re-run the cell that loads nwp_xy.npy before re-running this one.
grid_xy = torch.deg2rad(grid_xy)
grid_xy

tensor([[-0.1745,  1.0996],
        [-0.1745,  1.0952],
        [-0.1745,  1.0908],
        ...,
        [ 0.0698,  0.8290],
        [ 0.0698,  0.8247],
        [ 0.0698,  0.8203]])

In [64]:
# Column 0 holds longitudes and column 1 latitudes; both are expected to be in
# radians already (converted in the preceding cell).
grid_lons = grid_xy[:, 0]
grid_lats = grid_xy[:, 1]

# Per-node features: (cos(lat), sin(lon), cos(lon)).
# The third column previously repeated cos(lat), duplicating the first column
# and dropping cos(lon); sin(lon) alone cannot tell lon from pi - lon.
grid_features = torch.stack(
    (
        torch.cos(grid_lats),
        torch.sin(grid_lons),
        torch.cos(grid_lons),
    ),
    dim=1,
)
print(grid_features.shape)

torch.Size([3705, 3])


In [None]:
from create_grid