In [1]:
import xeofs
import numpy as np

# Import the derived EOF
eof = xeofs.single.EOF.load("/cluster/home/haroldh/spGDMM/1_data/4_interim/EOFs/EOF_8tsuv.nc")

# Generate features
features = eof.components() * eof.scores()
features = features.sum(dim="variable")
del features.attrs['solver_kwargs']

# Selecting number of modes
- Based on variance
- Based on broken stick model

In [2]:
# Evaluate cumulative variance
explained_variance = eof.explained_variance_ratio().values
cumulative_variance = np.cumsum(explained_variance)

# Select modes up to a threshold (e.g., 70%)
threshold = 0.6
modes_60 = np.argmax(cumulative_variance >= threshold) + 1
for threshold in [0.6, 0.7, 0.8]:
    modes = np.argmax(cumulative_variance >= threshold) + 1
    print(f"Number of modes explaining {threshold*100}% variance: {modes}")
    if modes == 1:
        modes = ''
    features.attrs[f'{int(threshold*100)}% threshold'] = modes

# Broken stick model
n_modes = len(explained_variance)
broken_stick = np.array([sum(1 / (k + 1) for k in range(i, n_modes)) for i in range(n_modes)])
broken_stick_normalised = broken_stick / broken_stick.sum()

# Find significant modes
significant_modes = np.where(explained_variance > broken_stick_normalised)[0] + 1
print(f"Significant modes based on the broken stick model: {significant_modes}")

# Identify the largest continuous block (where the difference is 1)
if len(significant_modes) < 2 or significant_modes[0] != 1:
    broken_stick_modes = ''
else:
    broken_stick_modes = significant_modes[:np.argmax(np.diff(significant_modes) > 1) + 1][-1]

features.attrs['broken_stick'] = broken_stick_modes


Number of modes explaining 60.0% variance: 9
Number of modes explaining 70.0% variance: 1
Number of modes explaining 80.0% variance: 1
Significant modes based on the broken stick model: [ 1 13 14]


: 

In [None]:
features.to_netcdf("/cluster/home/haroldh/spGDMM/1_data/4_interim/EOF_14tsuv_features.nc", mode="w")

# Potential extension - average in time

In [30]:
features_averaged = features.mean(dim="time")
features_averaged

Unnamed: 0,Array,Chunk
Bytes,12.07 MiB,823.83 kiB
Shape,"(3, 555, 950)","(1, 111, 950)"
Dask graph,15 chunks in 31 graph layers,15 chunks in 31 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 12.07 MiB 823.83 kiB Shape (3, 555, 950) (1, 111, 950) Dask graph 15 chunks in 31 graph layers Data type float64 numpy.ndarray",950  555  3,

Unnamed: 0,Array,Chunk
Bytes,12.07 MiB,823.83 kiB
Shape,"(3, 555, 950)","(1, 111, 950)"
Dask graph,15 chunks in 31 graph layers,15 chunks in 31 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray


In [1]:
import xarray as xr

features = xr.open_dataarray("/cluster/home/haroldh/spGDMM/1_data/4_interim/EOF_8tsuv_features.nc")

In [3]:
features