In [70]:
import sys
import os

# Remember the directory the kernel started in the first time this cell runs.
# A bare `os.chdir("../")` climbs one directory further on every re-execution,
# which silently changes every relative path used later in the notebook.
if "_NOTEBOOK_START_DIR" not in globals():
    _NOTEBOOK_START_DIR = os.getcwd()

# Parent of the notebook directory; assumed to be the repository root that
# holds the project modules and the relative `data/` directory.
_PROJECT_ROOT = os.path.dirname(_NOTEBOOK_START_DIR)

# Make project modules importable without stacking duplicate entries on re-run.
if _PROJECT_ROOT not in sys.path:
    sys.path.append(_PROJECT_ROOT)

# Run the rest of the notebook from that parent directory.
os.chdir(_PROJECT_ROOT)

# PyTorch and NumPy

In [None]:
import torch

In [None]:
# Toy dimensions for a shape-only dry run of the grid-feature concatenation.
batch_size, N_grid, d_features = 4, 250, 42

# Two consecutive "previous" states with independent random values.
state_shape = (batch_size, N_grid, d_features)
prev_prev_state = torch.rand(state_shape)
prev_state = torch.rand(state_shape)
# sample_len = 5
# Three random target steps, stacked on a new leading axis and then copied
# to every batch entry (all batch entries share the same values).
target_states = torch.stack(
    [torch.rand((N_grid, d_features)) for _ in range(3)],
    dim=0,
)
target_states = target_states[None].repeat(batch_size, 1, 1, 1)

# Forcing with zero feature width: it contributes no columns when concatenated.
forcings = torch.zeros(
    batch_size, target_states.size(1), target_states.size(2), 0
)  # (batch_size, sample_len-2, N_grid, d_forcing)
forcing = forcings[:, 2]

In [None]:
# Report the shape of each dummy tensor, one per line.
for tensor in (prev_prev_state, prev_state, target_states, forcings, forcing):
    print(tensor.shape)

In [None]:
# Join the two previous states and the forcing along the last (feature) dimension.
grid_features = torch.cat([prev_prev_state, prev_state, forcing], dim=-1)

In [None]:
# The last dimension is the sum of the widths of the two states and the forcing
# (2 * d_features + 0 here), because the concatenation runs along dim=-1.
grid_features.shape

# ERA5 Dataset

In [71]:
import xarray as xr
import zarr
import numcodecs
import numpy as np
import glob
import torch
import era5_data_proc
import os

# Directory holding the raw ERA5 NetCDF files. It can be overridden with the
# RAW_ERA5_PATH environment variable so the notebook also runs on machines
# where the data lives elsewhere; the default is the original location.
RAW_ERA5_PATH = os.environ.get(
    "RAW_ERA5_PATH", "/vol/bitbucket/bet20/dataset/era5/global_full"
)

## What does `coarsen` mean?

In [72]:
# Gather the 2022 NetCDF files in sorted (lexicographic) order and open the first.
nc_files = sorted(glob.glob(f'{RAW_ERA5_PATH}/2022*.nc'))

data = xr.open_dataset(nc_files[0])
data

In [4]:
# Down-sample the grid by averaging over windows of 8 latitudes x 8 longitudes.
# NOTE(review): per the xarray docs, boundary="pad" pads with NaN when a
# dimension is not a multiple of the window instead of raising -- confirm.
# The result is only displayed; `data` itself is not reassigned.
data.coarsen(latitude=8, longitude=8, boundary="pad").mean()

: 

In [None]:
# Display the dataset again: the coarsened result in the previous cell was
# never assigned, so `data` still refers to the dataset opened earlier.
data

## Verify `nwp_xy.npy` and time-step files

In [73]:
# Same sorted listing of the 2022 NetCDF files; print the first path as a sanity check.
nc_files = sorted(glob.glob(f'{RAW_ERA5_PATH}/2022*.nc'))
print(nc_files[0])

/vol/bitbucket/bet20/dataset/era5/global_full/2022_01.nc


In [74]:
# Open the first file and pass it through the project's UK subsetting helper.
nc_file = nc_files[0]
data = era5_data_proc.uk_subset(xr.open_dataset(nc_file))

# Pick the second timestamp in the file (index 1) and select that time step.
time = data["time"].values[1]
print(time)

sample = data.sel(time=time)
sample

2022-01-01T06:00:00.000000000


In [13]:
# Verify Data Variable Shapes
# Each variable of `sample` has dimensions (level, latitude, longitude),
# as listed by `sample.data_vars`.
sample["z"].shape

(8, 65, 57)

In [15]:
# load time step data
# The second file in sorted order is used (index 1).
sample_dir_path = "data/era5_uk/samples/train"
sample_files = sorted(glob.glob(f'{sample_dir_path}/*.npy'))
sample_file = sample_files[1]
print(sample_file)

# Load the stored array for that time step and show its shape and contents.
time_step_data = np.load(sample_file)
print(time_step_data.shape)
time_step_data

data/era5_uk/samples/train/20220101060000.npy
(3705, 48)


array([[ 1.94165834e+05,  1.29582167e+05,  9.94214948e+04, ...,
        -5.98176343e-01, -1.10595093e-01, -8.40413727e-02],
       [ 1.94278254e+05,  1.29662467e+05,  9.95114308e+04, ...,
        -8.36702006e-01, -2.36496355e-01, -9.36556508e-02],
       [ 1.94390674e+05,  1.29745979e+05,  9.95949428e+04, ...,
        -9.01712840e-01, -3.60566326e-01, -9.45712964e-02],
       ...,
       [ 2.00660497e+05,  1.34753486e+05,  1.04188102e+05, ...,
         8.03170017e-02,  1.01376849e-01,  1.43038721e-01],
       [ 2.00689405e+05,  1.34804878e+05,  1.04220222e+05, ...,
        -9.87408395e-03,  2.17206010e-01,  1.41665253e-01],
       [ 2.00718313e+05,  1.34856270e+05,  1.04249130e+05, ...,
         1.34748773e-02,  1.80580188e-01,  8.53530522e-02]])

In [52]:
# load nwp_xy.npy
static_dir_path = "data/era5_uk/static"
nwp_xy_path = os.path.join(static_dir_path, "nwp_xy.npy")
grid_xy = torch.tensor(np.load(nwp_xy_path))  # (2, N_x, N_y)
print(grid_xy.shape)
# Flatten the two spatial axes, then transpose so each row is one grid node.
grid_xy = grid_xy.flatten(start_dim=1).t()  # (N_x * N_y, 2)
print(grid_xy.shape)


torch.Size([2, 57, 65])
torch.Size([3705, 2])


In [53]:
# choose random lon/lat point and verify grid features match
test_point = 800
# Each grid_xy row is (lon, lat) in degrees, with longitude in -180/180 format.
# The xarray dataset indexes longitude in 0/360 format, so wrap the longitude
# ONLY: taking the whole row modulo 360 would also remap a negative latitude
# (e.g. -10 -> 350).
test_lon, test_lat = grid_xy[test_point]
test_xy = torch.stack((test_lon % 360, test_lat))
print(test_xy)
# Row of the stored time-step array for the same grid node, for comparison.
time_step_data[test_point]

tensor([353.,  58.])


array([ 1.96581258e+05,  1.31335919e+05,  1.00677387e+05,  6.96205666e+04,
        5.35637827e+04,  3.99545423e+04,  1.26172173e+04, -5.84099229e+02,
        3.17917909e-06,  5.03971403e-06,  4.26225198e-05,  6.14923066e-04,
        1.18126990e-03,  3.01612946e-03,  6.19280681e-03,  7.64141931e-03,
        2.06823430e+02,  2.07412073e+02,  2.17003844e+02,  2.44213576e+02,
        2.56535821e+02,  2.63308306e+02,  2.79883233e+02,  2.85579638e+02,
        3.27867396e+01,  1.97718501e+01,  9.91986559e+00,  1.01738813e+01,
        9.49377476e+00,  7.87408337e+00,  6.68594551e+00, -3.88165312e+00,
        1.72095630e+01,  2.57140908e+01,  3.25605416e+01,  2.65308926e+01,
        2.99357629e+01,  3.27165599e+01,  2.92413284e+01,  1.07608060e+01,
        2.21735099e-02, -2.07653521e-01,  1.62725100e-01,  3.84311321e-01,
        9.35938621e-02, -4.59913867e-01, -4.65407740e-01, -3.36808680e-02])

In [54]:
# find the sample from xarray
# test_xy is (longitude, latitude); the longitude was converted to 0/360 format above.
# NOTE(review): .sel() without `method=` presumably requires the coordinates to
# match grid values exactly -- confirm against the xarray docs.
sample.sel(longitude=test_xy[0], latitude=test_xy[1])

## ERA5 Dataset Names

In [55]:
# List the data variables of the selected time step with their dimensions.
sample.data_vars

Data variables:
    z        (level, latitude, longitude) float64 237kB 1.942e+05 ... 2.181e+03
    q        (level, latitude, longitude) float64 237kB 3.179e-06 ... 0.00566
    t        (level, latitude, longitude) float64 237kB 203.5 203.6 ... 288.9
    u        (level, latitude, longitude) float64 237kB 31.84 32.04 ... -2.016
    v        (level, latitude, longitude) float64 237kB 15.69 15.48 ... 0.2005
    w        (level, latitude, longitude) float64 237kB -0.002091 ... 0.08535

In [56]:
# Variable names only; iterating the data_vars mapping yields its keys.
list(sample.data_vars)

['z', 'q', 't', 'u', 'v', 'w']

In [57]:
# Values of the `level` coordinate as a NumPy array.
sample["level"].values

array([  50,  150,  250,  400,  500,  600,  850, 1000], dtype=int32)

In [58]:
# sample["units"]
# NOTE: left disabled -- the units are read per variable via
# `sample[p].attrs["units"]` in the following cell instead.

In [59]:
from neural_lam.constants import ERA5UKConstants

# Variable symbols and level values taken from the selected time step.
PARAM_SYMBOLS = list(sample.data_vars.keys())
LEVELS = list(sample.level.values)

# One "<symbol><level>" name per (variable, level) pair, variable-major order.
PARAM_NAMES = [
    f"{symbol}{level}"
    for symbol in PARAM_SYMBOLS
    for level in LEVELS
]

# Matching unit string for every entry of PARAM_NAMES, read from each
# variable's "units" attribute and repeated once per level.
PARAM_UNITS = [
    sample[symbol].attrs["units"]
    for symbol in PARAM_SYMBOLS
    for _level in LEVELS
]

print(PARAM_NAMES)
print(PARAM_UNITS)

['z50', 'z150', 'z250', 'z400', 'z500', 'z600', 'z850', 'z1000', 'q50', 'q150', 'q250', 'q400', 'q500', 'q600', 'q850', 'q1000', 't50', 't150', 't250', 't400', 't500', 't600', 't850', 't1000', 'u50', 'u150', 'u250', 'u400', 'u500', 'u600', 'u850', 'u1000', 'v50', 'v150', 'v250', 'v400', 'v500', 'v600', 'v850', 'v1000', 'w50', 'w150', 'w250', 'w400', 'w500', 'w600', 'w850', 'w1000']
['m**2 s**-2', 'm**2 s**-2', 'm**2 s**-2', 'm**2 s**-2', 'm**2 s**-2', 'm**2 s**-2', 'm**2 s**-2', 'm**2 s**-2', 'kg kg**-1', 'kg kg**-1', 'kg kg**-1', 'kg kg**-1', 'kg kg**-1', 'kg kg**-1', 'kg kg**-1', 'kg kg**-1', 'K', 'K', 'K', 'K', 'K', 'K', 'K', 'K', 'm s**-1', 'm s**-1', 'm s**-1', 'm s**-1', 'm s**-1', 'm s**-1', 'm s**-1', 'm s**-1', 'm s**-1', 'm s**-1', 'm s**-1', 'm s**-1', 'm s**-1', 'm s**-1', 'm s**-1', 'm s**-1', 'Pa s**-1', 'Pa s**-1', 'Pa s**-1', 'Pa s**-1', 'Pa s**-1', 'Pa s**-1', 'Pa s**-1', 'Pa s**-1']


## Grid Features

In [60]:
import torch

### Simple Grid Features

In [61]:
# Load the "simple" static grid features saved by the project and show their shape.
# NOTE(review): torch.load unpickles the file; only load files from a trusted source.
grid_features_simple_path = "/vol/bitbucket/bet20/neural-lam/data/era5_uk/static/grid_features_simple.pt"
grid_features = torch.load(grid_features_simple_path)
grid_features.shape

torch.Size([3705, 3])

### Grid Features with Static Variables

In [62]:
# Load the grid features that include static variables and show their shape.
# This rebinds `grid_features`, replacing the "simple" variant loaded before.
# NOTE(review): torch.load unpickles the file; only load files from a trusted source.
grid_features_path = "/vol/bitbucket/bet20/neural-lam/data/era5_uk/static/grid_features.pt"
grid_features = torch.load(grid_features_path)
grid_features.shape

torch.Size([3705, 5])

## Verify static variables

In [63]:
# The static-variable file sits in the raw ERA5 directory, so build its path
# from RAW_ERA5_PATH instead of repeating the absolute directory.
dataset_path = os.path.join(RAW_ERA5_PATH, "static_variables.nc")
# NOTE: this rebinds `data`, which previously held the UK-subset dataset.
data = xr.open_dataset(dataset_path)
data

In [64]:
# surface geopotential should be time invariant but we find that it is not.
# values changed sometime in 2022 October
# 99.7% of the values are the same however

# Compare the "z" field at the time steps with index 3 and 4.
surface_z = data["z"]
timestamps = data["time"].values
sample1 = surface_z.sel(time=timestamps[3]).values
sample2 = surface_z.sel(time=timestamps[4]).values

# Fraction of grid values that are numerically close between the two time steps.
np.isclose(sample1, sample2).mean()

0.9972260748959778

### Select a spatial location from xarray and .npy to test

In [67]:
# choose random lon/lat point and verify grid features match
test_point = 421
# Each grid_xy row is (lon, lat) in degrees. Wrap only the longitude into the
# 0/360 format used by the dataset; applying `% 360` to the whole row would
# also remap a negative latitude (e.g. -10 -> 350).
test_lon, test_lat = grid_xy[test_point]
test_xy = torch.stack((test_lon % 360, test_lat))
print(test_xy)

# Static variables at that location for the time step with index 4.
data.sel(time=data['time'].values[4], longitude=test_xy[0], latitude=test_xy[1]).to_array()

tensor([351.5000,  55.2500])


In [68]:
# Feature row of the same grid node, to compare with the xarray values above.
# NOTE(review): the first three columns presumably follow the cos(lat),
# sin(lon), cos(lon) layout built in "Verify Grid Features Code", and the last
# two are presumably the static variables -- confirm against the generating script.
grid_features[test_point]

tensor([  0.5700,  -0.1478,   0.9890, -22.5237,   0.0238])

## Verify Grid Features Code

In [36]:
# Directory containing nwp_xy.npy; a relative path, so it is resolved against
# the current working directory.
static_dir_path = "data/era5_uk/static"

In [39]:
# Reload the grid coordinates from disk so this section starts from the stored values.
nwp_xy_path = os.path.join(static_dir_path, "nwp_xy.npy")
grid_xy = torch.tensor(np.load(nwp_xy_path))  # (2, N_x, N_y)
print(grid_xy.shape)
# Flatten the two spatial axes, then transpose so each row is one (lon, lat) node.
grid_xy = grid_xy.flatten(start_dim=1).t()  # (N_x * N_y, 2)
print(grid_xy.shape)
grid_xy

torch.Size([2, 57, 65])
torch.Size([3705, 2])


tensor([[-10.0000,  63.0000],
        [-10.0000,  62.7500],
        [-10.0000,  62.5000],
        ...,
        [  4.0000,  47.5000],
        [  4.0000,  47.2500],
        [  4.0000,  47.0000]])

In [40]:
# Convert the (lon, lat) grid from degrees to radians with the native torch op;
# applying a NumPy ufunc to a torch.Tensor only works through the tensor's
# NumPy interop hooks.
# NOTE: this rebinds grid_xy, replacing the degree-valued grid, so the cell is
# not idempotent -- reload nwp_xy.npy (previous cell) before running it again.
grid_xy = torch.deg2rad(grid_xy)
grid_xy

tensor([[-0.1745,  1.0996],
        [-0.1745,  1.0952],
        [-0.1745,  1.0908],
        ...,
        [ 0.0698,  0.8290],
        [ 0.0698,  0.8247],
        [ 0.0698,  0.8203]])

In [69]:
# Build the three positional grid features from the node coordinates.
# grid_xy must already be in radians here (converted in the previous cell);
# column 0 is longitude and column 1 is latitude.
grid_lons = grid_xy[:, 0]
grid_lats = grid_xy[:, 1]
# Use the torch trigonometric ops rather than NumPy ufuncs, because the inputs
# are torch tensors and torch.stack needs tensors back.
grid_features = torch.stack(
    (
        torch.cos(grid_lats),
        torch.sin(grid_lons),
        torch.cos(grid_lons),
    ),
    dim=1,
)  # (N_x * N_y, 3)
print(grid_features.shape)

torch.Size([3705, 3])
