# Neural Network Trainings

_Author: Aline Van Driessche_

This notebook contains all code needed to perform Neural Network trainings (1D and 2D) on individual latitude information available from ECCO. Read section headings for more details on setup, preprocessing, and model training.

# Setup

Importing relevant modules and custom scripts. A number of helper functions are also defined to clean up experiment code.

In [None]:
import sys
import pandas as pd

import paths
import pickle

sys.path.append("..")
from models.utils import *
from models.plotting_utils import *
from sklearn.metrics import *
from models import train, CNN1D, CNN2D

import matplotlib.pyplot as plt

In [None]:
# A helper function to subset 30S, since the satellite observable variables do not always exist for all longitudes - the values are taken from an earlier defined basin function
def mask_30S(inputs):
    mask = np.logical_or(np.logical_or(np.logical_and(inputs.longitude >= -180, 
                                                      inputs.longitude <= -71),
                                       np.logical_and(inputs.longitude >= 31, 
                                                      inputs.longitude <= 115)),
                         np.logical_and(inputs.longitude >= 153, 
                                        inputs.longitude <= 180))
    masked = inputs.where(mask, drop = True)

    return masked

In [None]:
# Helper function to align the in-and outputs of two datasets with the inputs
def align_dates(inputs, outputs):

    # Align the dates according to the data available for in- and output
    date_range = (inputs.time.values[0], inputs.time.values[-1]) # grabbing start and end date for inputs
    date_range = tuple([str(d).split('T')[0] for d in date_range]) # extracting just the date, w/o time
    print('Date range to align to:', date_range)
    return align_inputs_outputs(inputs, outputs, date_range = date_range, ecco=False)

In [None]:
# A helper function to format the input files based on a given latitude
def format_lat_lon(value):
    """Format latitude or longitude with N/S or E/W suffix."""
    if value < 0:
        return f"{abs(value)}S"
    else:
        return f"{value}N"

## Experiment Variables

All variables that might be worth changing during experiments. Of particular interest is `no_zonal_averages` and `zonal_avgs` which controls which variables are averaged zonally and which variables are left with all longitudinal information.

In [None]:
data_home = paths.LOCAL_DIR
lats = [26, -30, -55, -60]

lat = lats[2]

zonal_avg = ['time']
no_zonal_avgs = ['time', 'longitude']   #for 2D training this setting is necessary

remove_season = False
remove_trend = False

input_vars = ['OBP', 'ZWS'] #other options are 'SSH', 'SSS', 'SST'

timelag=1

inputs_fp = f"{data_home}/ecco_data_minimal/{format_lat_lon(lat)}.nc"
outputs_fp = f"{data_home}/ecco_data_minimal/{format_lat_lon(lat)}_moc_density.pickle"

# HYPERPARAMETERS
dropout = 0.1
n_pure_layers = 1
n_mix_layers = 1
n_channels_mult = 4
kernel_size = 3           #For 2D input change this to (3x3)
model_iterations = 1

## Loading and processing relevant data
Data loading function that extracted and preprocesses the data as defined in the “Experiment Variables” section and returns the in- and output for training.  

In [None]:
def retrieve_data(lat=lat, inputs_fp=inputs_fp, outputs_fp=outputs_fp, timelag=timelag, coords = zonal_avg, input_vars=input_vars, remove_trend=remove_trend, remove_season=remove_season):

    # Retrieve the input covariates (ecco surface variables)
    inputs = xr.open_dataset(inputs_fp).isel(latitude=slice(1,2))

    # Some latitudes need specific preprocessing because of missing data or land that will obstruct training
    if lat == -30:
        inputs = mask_30S(inputs) 
    if lat == -55:
        inputs = inputs.dropna('longitude', how = 'all')
        
    # Retrieve the output streamfunctions to train on (in density space), this file contains the latitude right above and right below so we take the middle one
    with open(outputs_fp, 'rb') as f:
        outputs = pickle.load(f).astype(np.float64)
    outputs = np.expand_dims(outputs, 1) # grabbing just the middle latitude of interest 

    # Convert the data to an xarray dataset structure similar to the input data
    outputs = xr.Dataset(
                data_vars = {'moc' : (["time", "latitude"], outputs)}, 
                coords = {'time' : inputs.time, 'latitude' : np.atleast_1d(lat),}
            )
    
    # Align the date ranges
    inputs, outputs = align_inputs_outputs(inputs, outputs, ecco=False)
    
    # Input data preprocessing (standardize)
    pp_data_surface = apply_preprocessing(inputs,
                              mode="inputs",
                              remove_season=remove_season,
                              remove_trend=remove_trend,
                              standardize=True,
                              lowpass=False)
    X = reshape_inputs(pp_data_surface, history=timelag, keep_coords=coords, data_vars=input_vars)
    
    # Necessary transpose for convolutions with PyTorch Tensors + adjusting the shape to suit as CNN input
    dims = list(range(X.ndim))  
    dims[1], dims[-1] = dims[-1], dims[1]
    X = np.transpose(X, dims)
    if len(X.shape) == 2:
        X = np.expand_dims(X, axis=-1)
    
    # MOC strength preprocessing (don't standardize)
    strength = apply_preprocessing(outputs,
                               mode="outputs",
                               remove_season=remove_season,
                               remove_trend=remove_trend,
                               standardize=False,
                               lowpass=False)
    strength_np = strength.moc.squeeze().values
    
    # If the history parameter changed the input shape, change the outputs accordingly
    y = strength_np[-X.shape[0]:]
    
    """
    print()
    print('Inputs shape:', X.shape)
    print('Extra inputs shape:', X_extra.shape)
    print('Outputs shape:', y.shape)
    """
    return X, y

In [None]:
X, y = retrieve_data()

## Running Experiments

Data is extracted and preprocessed as defined in the "Experiment Variables" section. Grid search with cross-validation is performed to find the best regularization weights for ElasticNet (`alpha` and `L1_wt`); see the [`statsmodels` documentation](https://www.statsmodels.org/dev/generated/statsmodels.regression.linear_model.OLS.fit_regularized.html#statsmodels.regression.linear_model.OLS.fit_regularized) for more details. Our five metrics (RMSE, MAE, MAPE, $R^2$, correlation) are then calculated and saved to a text file, and two helpul plots are saved as well.

In [54]:
import importlib
importlib.reload(train)

<module 'models.train' from 'C:\\Users\\aline\\OTP\\models\\train.py'>

In [57]:
def train_1DCNN(X, y):

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False, stratify=None, random_state=123456)
    X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.1, shuffle=False, stratify=None, random_state=123456)

    device = "cuda" if t.cuda.is_available() else "cpu"

    Xt_train = t.tensor(X_train, device=device)
    Yt_train = t.tensor(y_train, device=device)
    Xt_valid = t.tensor(X_valid, device=device).double()
    Yt_valid = t.tensor(y_valid, device=device).double()
    Xt_test = t.tensor(X_test, device=device).double()
    Yt_test = t.tensor(y_test, device=device).double()

    model = CNN1D.CNN1D(
        n_pure_layers=n_pure_layers, 
        n_mix_layers=n_mix_layers, 
        n_features=Xt_train.shape[1], 
        n_channels=Xt_train.shape[1]*n_channels_mult,
        kernel_size=kernel_size,
        dropout=dropout, 
    ).double()

    model= train.train_model( #, val_loss
        model=model, 
        X_train = Xt_train, 
        y_train = Yt_train, 
        X_val = Xt_valid,
        y_val = Yt_valid, 
        early_stopping=True,
    )
    
    # Extracting all predictions in time order
    train_set_pred = train.predict(model, Xt_train, Yt_train)
    valid_set_pred = train.predict(model, Xt_valid, Yt_valid)
    test_set_pred = train.predict(model, Xt_test, Yt_test)
    
    predictions = {
        "train_set_pred": train_set_pred, 
        "valid_set_pred": valid_set_pred, 
        "test_set_pred": test_set_pred, 
        "all_preds": np.concatenate([train_set_pred, valid_set_pred, test_set_pred])
    }
    
    print('test set', np.array(test_set_pred).shape)
    print(y_test.shape)
    
    metrics = {
        "rmse": round(root_mean_squared_error(test_set_pred, y_test), 3),
        "mae": round(mean_absolute_error(test_set_pred, y_test), 3),
        "mape": round(mean_absolute_percentage_error(test_set_pred, y_test), 3)*100,        
        "cmape": round(custom_MAPE(np.array(test_set_pred).squeeze(), y_test.detach().cpu().numpy(), threshold=0.5), 3)*100,
        "test_corr": round(np.corrcoef(np.array(test_set_pred).squeeze(), y_test)[0, 1], 3),
        }
    
    return predictions, metrics

In [58]:
X, y = retrieve_data()
train_1DCNN(X, y)

axes: ['time', 'history', 'feature']
variables: ['OBP', 'ZWS']
shape: (288, 1, 2)
device: cpu
137 parameters.


 14%|█▎        | 681/5000 [00:02<00:15, 272.03it/s]

early stopping





AttributeError: module 'models.train' has no attribute 'predict'

In [None]:
# To loop over several models and compare the performances
models = []
history = 6 #np.arange(6)
for i in range(10):
    X, y= get_input(outputs, inputs, history=history, coords=["time", "longitude"])
    Xt_train, Yt_train, Xt_valid, Yt_valid, Xt_test, Yt_test = define_train_and_validation_sets(X, y)
    models.append(train_CNN(Xt_train, Yt_train))

predictions_models = []
metrics_models = []

for i in range(len(models)):
    X, y= get_input(outputs, inputs, history=history)
    Xt_train, Yt_train, Xt_valid, Yt_valid, Xt_test, Yt_test = define_train_and_validation_sets(X, y)
    predictions, metrics = predict_CNN(models[i], Xt_train, Yt_train, Xt_valid, Yt_valid, Xt_test, Yt_test )
    predictions_models.append(predictions)
    metrics_models.append(metrics)

metrics_df = pd.DataFrame(metrics_models)
summary_metrics_df = pd.DataFrame({
                "mean": metrics_df.mean().round(3), 
                "std": metrics_df.std().round(3)}
            )
display(summary_metrics_df)

predictions_df = pd.DataFrame(predictions_models)

In [None]:
metrics_df.to_csv(f"{paths.LOCAL_DIR}/{lat}_history.csv")

In [45]:
plot_30 = pd.read_csv(f"{paths.LOCAL_DIR}/-30_history.csv")
plot_55 = pd.read_csv(f"{paths.LOCAL_DIR}/-55_history.csv")
plot_60 = pd.read_csv(f"{paths.LOCAL_DIR}/-60_history.csv")

FileNotFoundError: [Errno 2] No such file or directory: 'H:/.shortcut-targets-by-id/1wvJjD0RMTujKYaXQapEiGk-Mx03_KSin/GTC/-30_history.csv'

In [None]:
# Plotting
plt.figure(figsize=(10, 3.5))  # Optional: specifies the figure size
plt.plot(plot_30.index, plot_30['mape'], marker='x', linestyle='-', linewidth=1.5, label='30S indo-pacific')  
plt.plot(plot_55.index, plot_55['mape'], marker='x', linestyle='-', linewidth=1.5, label='55S southern') 
plt.plot(plot_60.index, plot_60['mape'], marker='x', linestyle='-', linewidth=1.5, label='60S southern') 
plt.xlabel('amount of historical data included in training [months]', fontsize=12)
plt.ylabel('MAPE [%]', fontsize=12, weight="bold")
plt.legend(fontsize=12) 
plt.tick_params(labelsize=12)
plt.xticks(rotation=45)
plt.tight_layout()  # Adjusts subplot params so that the subplot(s) fits in to the figure area.
plt.savefig(f"C:/Users/aline/OTP/plots/history1DCNN.png", dpi=400)
plt.show()

## Output visualisation

In [None]:
#predictions_df = pd.DataFrame(predictions)
#mean_y = {col: np.mean(np.stack(predictions_df[col].values), axis=0) for col in predictions_df}

predictions = predictions_models[9]

print(len(predictions['all_preds']))

time = outputs.time.values[-X.shape[0]:]
fig, ax = timeseries_comparison(predictions['all_preds'], 
                                y,
                                time, 
                                len(Xt_train))
plt.show()

In [None]:
fig, axs = plt.subplots(nrows=2, ncols=1, sharex=True, squeeze=True, figsize=(10, 6))

all_moc = [y, y]
all_pred = [predictions_models[0]['all_preds'], predictions_models[1]['all_preds']]
sections = ['without historical data: RMSE = 3.97, MAPE$_{0.5}\\downarrow$ = 61.30, correlation = 0.85.', 'with historical data (6m): RMSE = 4.95, MAPE$_{0.5}\\downarrow$ = 42.90, correlation = 0.56.']

for index, ax in enumerate(axs):
    ax.plot(all_moc[index], label="ECCO")
    ax.plot(all_pred[index], label="Predicted")
    # all this stuff is emilio's plotting code
    y_lower, y_upper = ax.get_ylim()
    x_pos = np.arange(len(Xt_train))
    ax.fill_between(x = x_pos, 
                    y1 = np.repeat(y_lower, len(x_pos)), 
                    y2 = np.repeat(y_upper, len(x_pos)),
                    alpha = 0.2, 
                    color = 'gray')
    ax.margins(x = 0, y = 0)
    ax.set_title(sections[index])
# and here also, just emilio's code
ax.set_xticks(np.arange(0, 324, 60), np.arange(1992, 2019, 5), rotation=45)
ax.legend(loc = 'lower right', edgecolor = 'black', framealpha = 1)
fig.supylabel("MOC Strength [Sv]", weight="bold")
plt.tight_layout()
# dpi makes the plot more hd
plt.savefig(f"C:/Users/aline/OTP/plots/55S_historicaldata.png", dpi=400)
plt.show()

In [None]:
# Alternate view: looking at a scatterplot of predicted vs. actual
y_pred_train = np.array(mean_y['train_set_pred']).squeeze()
y_pred_test = np.array(mean_y['test_set_pred']).squeeze()

fig, ax = pred_vs_actual(y_pred_train, y_pred_test, y_train, y_test)
plt.show()