### NAO correlations with energy variables ###

Exploring how well the NAO correlates with energy variables on seasonal to decadal timescales during the winter (ONDJFM, DJFM, or DJF). Using the following datasets:

* CLEARHEADS - ERA5-derived energy time series, includes offshore wind in EEZs and Heating Degree Days.
* ERA5 - reanalysis product for deriving the NAO indices at different timescales.
* ENTSO-E - shorter observed time series of capacity factors and other energy variables, used for ground-truthing the CLEARHEADS data above.

In [None]:
# -*- coding: utf-8 -*-
%matplotlib inline
%load_ext autoreload
%autoreload 2

# Import standard-library modules
import sys
import os
import glob

# Import third-party modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xarray as xr
import iris
import cartopy.crs as ccrs
import cartopy.feature as cfeature
import cdsapi
import xesmf as xe
from datetime import datetime
from tqdm import tqdm
from scipy.stats import pearsonr

In [None]:
# Add the energy-met-corr repository to the import path so that its
# dictionaries module can be imported (aliased to `dicts`)
sys.path.append("/home/users/benhutch/energy-met-corr")
import dictionaries_em as dicts

# Add the skill-maps python directory to the import path so that its
# functions module can be imported (aliased to `fnc`)
sys.path.append("/home/users/benhutch/skill-maps/python")
import functions as fnc

### Downloading ERA5 data ###

For calculating the NAO index, we want to query the CDS API for ERA5 data:

* From 1950-2022
* For ONDJFM
* Monthly-means

*Note - this data should be regridded before comparison with the CLEARHEADS/ENTSO-E data*

In [None]:
# Create the CDS API client; it is the object whose `retrieve` method is
# called by the (currently commented-out) ERA5 download cell further down
c = cdsapi.Client()

In [None]:
# Build the CDS request: monthly-averaged mean sea level pressure for
# January-March and October-December of every year from 1950 to 2022
era5_request_dict = dict(
    variable='mean_sea_level_pressure',
    product_type='monthly_averaged_reanalysis',
    year=[str(year) for year in range(1950, 2023)],
    month=[1, 2, 3, 10, 11, 12],
    format='netcdf',
    time='00:00',
)

# Show the request so it can be checked before it is submitted
print(era5_request_dict)

In [None]:
# # Set up the target directory
# target_dir = '/gws/nopw/j04/canari/users/benhutch/ERA5'

# # Assert that the target directory exists
# assert os.path.exists(target_dir)

# # Assert that the target directory is not empty
# assert len(os.listdir(target_dir)) > 0

# # Set up the target file
# target_file = os.path.join(target_dir, 'era5_mslp_monthly_1950_2022_ONDJFM.nc')

# # Print the target file
# print(target_file)

# # If the target file does not exist, download the data
# if not os.path.exists(target_file):
#     c.retrieve(
#         'reanalysis-era5-single-levels',
#         era5_request_dict,
#         target_file)
# else:
#     print('The target file already exists: {}'.format(target_file))

Now we want to plot the observed spatial correlations between the NAO and 10m wind speeds and precipitation.

In [None]:
# Functions for calculating the NAO / variable spatial correlation stats
def _drop_all_nan_years(field, years, label):
    """
    Logs years containing NaNs and removes years that are entirely NaN.

    Args:
    -----

    field:
        Object with a ``time`` coordinate supporting ``.sel`` and
        ``.time.dt.year`` (xarray-style indexing is what the code relies on).

    years:
        The years to inspect.

    label: str
        The name used for the field in the log messages.

    Returns:
    --------

    field:
        The input with every inspected, all-NaN year removed.
    """
    for year in years:
        # Extract the data for the year
        field_year = field.sel(time=f"{year}")
        nan_mask = np.isnan(field_year)

        # If there are any NaNs, log it
        if nan_mask.any():
            print(f"There are NaNs in the {label} for year: ", year)

            # Only years with no valid data at all are dropped
            if nan_mask.all():
                print("All values are NaN for year: ", year)
                print("Removing the year: ", year)
                field = field.sel(time=field.time.dt.year != year)

    return field


def calc_nao_spatial_corr(season: str,
                          forecast_range: str,
                          start_year: int,
                          end_year: int,
                          corr_var: str = "si10",
                          corr_var_obs_file: str = "/home/users/benhutch/ERA5/adaptor.mars.internal-1691509121.3261805-29348-4-3a487c76-fc7b-421f-b5be-7436e2eb78d7.nc",
                          nao_obs_var: str = "msl",
                          nao_obs_file: str = "/home/users/benhutch/ERA5/adaptor.mars.internal-1691509121.3261805-29348-4-3a487c76-fc7b-421f-b5be-7436e2eb78d7.nc",
                          nao_n_grid: dict = dicts.iceland_grid,
                          nao_s_grid: dict = dicts.azores_grid,
                          sig_threshold: float = 0.05,
):
    """
    Calculates the spatial correlations between the NAO index (winter default)
    and the variable to correlate for the observations.

    The NAO index is the box-mean pressure anomaly over ``nao_s_grid`` minus
    the box-mean pressure anomaly over ``nao_n_grid``. At every grid point of
    the correlated variable, the Pearson correlation (and its p-value) with
    the NAO index time series is calculated.

    Args:
    -----

    season: str
        The season to calculate the correlation for. Must be one of
        "DJF", "DJFM" or "ONDJFM".

    forecast_range: str
        The forecast range to calculate the correlation for. Must contain
        a "-" (e.g. "2-9"); it is passed on to ``fnc.select_forecast_range``.

    start_year: int
        The start year to calculate the correlation for.

    end_year: int
        The end year to calculate the correlation for.

    corr_var: str
        The variable to correlate with the NAO index.

    corr_var_obs_file: str
        The file containing the observations of the variable to correlate.

    nao_obs_var: str
        The variable to use for the NAO index.

    nao_obs_file: str
        The file containing the observations of the NAO index.

    nao_n_grid: dict
        The dictionary containing the grid information for the northern node
        of the winter NAO index (keys "lon1", "lon2", "lat1", "lat2").

    nao_s_grid: dict
        The dictionary containing the grid information for the southern node
        of the winter NAO index (keys "lon1", "lon2", "lat1", "lat2").

    sig_threshold: float
        The significance threshold for the correlation. It is only stored in
        the output dictionary; no masking is applied here.

    Returns:
    --------

    stats_dict: dict
        The dictionary containing the correlation statistics:
        "nao" (NAO index values), "corr_var_ts" (anomaly values of the
        correlated variable), "corr_nao_var" and "corr_nao_var_pval"
        (lat x lon arrays of Pearson r and p-value), "init_years",
        "valid_years", "lats", "lons", plus the input settings.

    Raises:
    -------

    AssertionError
        If the season or forecast range is invalid, an input file is missing,
        a node longitude is outside 0-360, or the years of the NAO index and
        of the correlated variable do not match.
    """

    # Local import: calendar is standard library but is not imported at the
    # top of the notebook
    import calendar

    # Form the dictionary
    stats_dict = {
        "nao": [],
        "corr_var_ts": [],
        "corr_var": corr_var,
        "corr_nao_var": [],
        "corr_nao_var_pval": [],
        "init_years": np.arange(start_year, end_year + 1),
        "valid_years": [],
        "lats": [],
        "lons": [],
        "season": season,
        "forecast_range": forecast_range,
        "start_year": start_year,
        "end_year": end_year,
        "sig_threshold": sig_threshold
    }

    # Assert that the season is a winter season
    assert season in ["DJF", "ONDJFM", "DJFM"], "The season must be a winter season."

    # Assert that the forecast range is a valid forecast range
    assert "-" in forecast_range, "The forecast range must be a valid forecast range."

    # Set up the lons and lats for the south grid
    s_lon1, s_lon2 = nao_s_grid["lon1"], nao_s_grid["lon2"]
    s_lat1, s_lat2 = nao_s_grid["lat1"], nao_s_grid["lat2"]

    # and for the north grid
    n_lon1, n_lon2 = nao_n_grid["lon1"], nao_n_grid["lon2"]
    n_lat1, n_lat2 = nao_n_grid["lat1"], nao_n_grid["lat2"]

    # Check that the file exists for the variable to correlate
    assert os.path.exists(corr_var_obs_file), "The file for the variable to correlate does not exist."

    # Check that the file exists for the NAO index
    assert os.path.exists(nao_obs_file), "The file for the NAO index does not exist."

    # Load the observations for the pressure field used for the NAO
    psl = fnc.load_obs(variable=nao_obs_var,
                       regrid_obs_path=nao_obs_file)

    # Load the observations for the variable to correlate
    corr_var_field = fnc.load_obs(variable=corr_var,
                                  regrid_obs_path=corr_var_obs_file)

    # Extract the months of the season
    months = dicts.season_month_map[season]

    # Set up the first and last dates of the period. The last day is looked
    # up from the calendar rather than hard-coded to 31, which raises a
    # ValueError whenever the final month of the season is shorter
    # (e.g. a season ending in February).
    start_date = datetime(int(start_year), months[0], 1)
    last_day = calendar.monthrange(int(end_year), months[-1])[1]
    end_date = datetime(int(end_year), months[-1], last_day)

    # Constrain both fields to the period ...
    time_constraint = iris.Constraint(time=lambda cell: start_date <= cell.point <= end_date)
    psl = psl.extract(time_constraint)
    corr_var_field = corr_var_field.extract(time_constraint)

    # ... and then to the months of the season
    month_constraint = iris.Constraint(time=lambda cell: cell.point.month in months)
    psl = psl.extract(month_constraint)
    corr_var_field = corr_var_field.extract(month_constraint)

    # Calculate the climatologies by collapsing the time dimension
    psl_clim = psl.collapsed("time", iris.analysis.MEAN)
    corr_var_clim = corr_var_field.collapsed("time", iris.analysis.MEAN)

    # Calculate the anomalies
    psl_anom = psl - psl_clim
    corr_var_anom = corr_var_field - corr_var_clim

    # Calculate the annual mean anoms
    # NOTE(review): from here on the fields are indexed xarray-style
    # (.sel, .time.dt), so the helper presumably converts them - confirm
    psl_anom = fnc.calculate_annual_mean_anomalies(obs_anomalies=psl_anom,
                                                   season=season)
    corr_var_anom = fnc.calculate_annual_mean_anomalies(obs_anomalies=corr_var_anom,
                                                        season=season)

    # Select the forecast range
    psl_anom = fnc.select_forecast_range(obs_anomalies_annual=psl_anom,
                                         forecast_range=forecast_range)
    corr_var_anom = fnc.select_forecast_range(obs_anomalies_annual=corr_var_anom,
                                              forecast_range=forecast_range)

    # Drop all-NaN years from the first 10 years of corr_var_anom ...
    corr_var_anom = _drop_all_nan_years(
        corr_var_anom,
        corr_var_anom.time.dt.year.values[:10],
        "corr_var_anom_year",
    )

    # ... and from the last 10 years of what remains
    corr_var_anom = _drop_all_nan_years(
        corr_var_anom,
        corr_var_anom.time.dt.year.values[-10:],
        "corr_var_anom_year",
    )

    # print the type of psl_anom
    print("type of psl_anom: ", type(psl_anom))

    # print the type of corr_var_anom
    print("type of corr_var_anom: ", type(corr_var_anom))

    # Set the time axis for corr_var_anom to the (integer) years
    years_corr_var = corr_var_anom.time.dt.year.values
    corr_var_anom = corr_var_anom.assign_coords(time=years_corr_var)

    # Lat goes from 90 to -90
    # Lon goes from 0 to 360

    # The latitude slices below run from high to low, so make sure lat1 is
    # the larger value for both nodes
    if s_lat1 < s_lat2:
        s_lat1, s_lat2 = s_lat2, s_lat1

    if n_lat1 < n_lat2:
        n_lat1, n_lat2 = n_lat2, n_lat1

    # Assert that the lons are within the range of 0 to 360
    for node_name, node_lon in (("southern", s_lon1),
                                ("southern", s_lon2),
                                ("northern", n_lon1),
                                ("northern", n_lon2)):
        assert 0 <= node_lon <= 360, f"The {node_name} longitude is not within the range of 0 to 360."

    # Box-mean pressure anomaly over the south grid
    psl_anom_s = psl_anom.sel(longitude=slice(s_lon1, s_lon2),
                              latitude=slice(s_lat1, s_lat2)
                              ).mean(dim=["latitude", "longitude"])

    # Box-mean pressure anomaly over the north grid
    psl_anom_n = psl_anom.sel(longitude=slice(n_lon1, n_lon2),
                              latitude=slice(n_lat1, n_lat2)
                              ).mean(dim=["latitude", "longitude"])

    # Calculate the nao index: south node minus north node
    nao_index = psl_anom_s - psl_anom_n

    # Drop all-NaN years from the nao index (every year is checked)
    nao_index = _drop_all_nan_years(
        nao_index,
        nao_index.time.dt.year.values,
        "nao_index_year",
    )

    # Extract the years for nao_index
    years_nao = nao_index.time.dt.year.values

    # Extract the years for corr_var_anom (its time axis already holds years)
    years_corr_var = corr_var_anom.time.values

    # Assert that the years are the same
    assert np.array_equal(years_nao, years_corr_var), "The years for the NAO index and the variable to correlate are not the same."

    # Set the valid years
    stats_dict["valid_years"] = years_nao

    # Extract the lats and lons and store them in the dictionary
    lats = corr_var_anom.latitude.values
    lons = corr_var_anom.longitude.values
    stats_dict["lats"] = lats
    stats_dict["lons"] = lons

    # Extract the values once, up front: pulling `.values` inside the
    # grid-point loop repeats the extraction for every single point
    nao_index_values = nao_index.values
    corr_var_anom_values = corr_var_anom.values

    # Store the time series in the dictionary
    stats_dict["nao"] = nao_index_values
    stats_dict["corr_var_ts"] = corr_var_anom_values

    # Create empty arrays with the correct shape for the correlation
    # and the p-value
    n_lats, n_lons = len(lats), len(lons)
    corr_nao_var = np.empty((n_lats, n_lons))
    corr_nao_var_pval = np.empty((n_lats, n_lons))

    # Loop over the grid points; tqdm wraps the range (not enumerate) so
    # that the progress bar knows the total number of latitudes
    for i in tqdm(range(n_lats)):
        for j in range(n_lons):
            # Correlate the NAO index with the time series at this point
            corr, pval = pearsonr(nao_index_values,
                                  corr_var_anom_values[:, i, j])

            corr_nao_var[i, j] = corr
            corr_nao_var_pval[i, j] = pval

    # Store the correlation and the p-value in the dictionary
    stats_dict["corr_nao_var"] = corr_nao_var
    stats_dict["corr_nao_var_pval"] = corr_nao_var_pval

    # Return the dictionary of statistics
    return stats_dict

In [7]:
# Try the function out for the extended winter (ONDJFM), forecast range
# "2-9", years 1960 to 2014, correlating the NAO with si10
nao_corr_kwargs = {
    "season": "ONDJFM",
    "forecast_range": "2-9",
    "start_year": 1960,
    "end_year": 2014,
    "corr_var": "si10",
}
stats_dict = calc_nao_spatial_corr(**nao_corr_kwargs)

KeyboardInterrupt: 

In [None]:
# Pull the NAO index and the anomaly field of the correlated variable out of
# the dictionary returned by calc_nao_spatial_corr. (The name `test`, which
# this cell used to unpack, is not defined in any earlier cell.)
nao_index_values = stats_dict["nao"]
corr_var_anom_values = stats_dict["corr_var_ts"]

In [None]:
# Report the shapes of the two arrays before correlating them
print("nao_index_values: ", np.shape(nao_index_values))
print("corr_var_anom_values: ", np.shape(corr_var_anom_values))

In [None]:
from tqdm import tqdm
from scipy.stats import pearsonr

# Grid dimensions: axis 1 of the anomaly array is indexed as lat and
# axis 2 as lon
n_lat, n_lon = corr_var_anom_values.shape[1], corr_var_anom_values.shape[2]

# Output arrays for the correlation and its p-value at every grid point
corr_array = np.empty([n_lat, n_lon])
pval_array = np.empty([n_lat, n_lon])

# Correlate the NAO index with the time series at each grid point
for lat_idx in tqdm(range(n_lat)):
    for lon_idx in range(n_lon):
        corr, pval = pearsonr(
            nao_index_values,
            corr_var_anom_values[:, lat_idx, lon_idx],
        )

        # Store both results at the matching position
        corr_array[lat_idx, lon_idx] = corr
        pval_array[lat_idx, lon_idx] = pval

# Print the shapes of the result arrays
print("shape of corr_array: ", corr_array.shape)
print("shape of pval_array: ", pval_array.shape)

In [None]:
# Plot these values
# Create the figure that will hold the correlation map
fig = plt.figure(figsize=(10, 5))

# Add a map axes in the first slot of a 1 x 2 subplot grid, using the
# PlateCarree projection
ax = fig.add_subplot(1, 2, 1, projection=ccrs.PlateCarree())

# Draw the correlation array with a diverging colormap fixed to [-1, 1]
# NOTE(review): no `extent` is passed, so the image is presumably positioned
# by array index rather than by the lat/lon values of the grid - confirm
img = ax.imshow(corr_array, 
                transform=ccrs.PlateCarree(),
                cmap="RdBu_r",
                vmin=-1,
                vmax=1)


# Add coastlines
ax.coastlines()
