In [1]:
# Modules
import numpy as np
import xarray as xr
import pandas as pd

from data import data_loader
from data import gesla_preprocessing
from data import era5_preprocessing
from data import preprocessing

# Description

1. rf006: combine several timelags of one predictor into a single model input <br>



# Working Area

# Modularized Preprocessing

In [3]:
#---
# Modularize Preprocessing: configuration of predictor and predictand timeseries
#---

# Predictand (GESLA) settings
# season options: ["winter", "autumn",]
season = "winter"
# percentile options: [0.95, 0.99,]
percentile = 0.95
station_names = ["hanko-han-fin-cmems",]

# Predictor (ERA5) settings
predictors = ["sp", "tp", "u10", "v10",]
# preprocess options: ["preprocess1"]
preprocess = "preprocess1"
# range_of_years options: ["1999-2008", "2009-2018", "2019-2022",]
range_of_years = "1999-2008"
# subregion options: ["lon-0530_lat7040"]
subregion = "lon-0530_lat7040"

In [4]:
# Load the ERA5 field of the first configured predictor.
# The data is already preprocessed (preprocessing was done with cdo).
#---
predictor = predictors[0]
era5_predictor = data_loader.load_daymean_era5(
    range_of_years,
    subregion,
    season,
    predictor,
    preprocess,
)

era5_predictor.shape

(903, 121, 141)

In [5]:
#---
# Preprocess GESLA Data
#---

# Load the predictand and restrict it to the selected season
station_data = data_loader.load_gesla(station_names)
seasonal_data = gesla_preprocessing.select_season(station_data, season)

# Keep only the sea_level analysis data
analysis_data = gesla_preprocessing.get_analysis(seasonal_data)

# Detrend per station; detrend expects a pd.Series, hence the column selection
sea_level = analysis_data["sea_level"]
sea_level_detrended = gesla_preprocessing.detrend(sea_level, level="station")

# One-hot encode relative to the configured percentile
sea_level_encoded = gesla_preprocessing.apply_dummies(
    sea_level_detrended,
    percentile=percentile,
    level="station",
)
print(f"Applied one-hot-encoding with Percentile: {percentile}")

# Convert to DataArray.
# NaN values mean: no measurement at that timestamp for the specific station.
gesla_predictand = sea_level_encoded.to_xarray()

Load Predictand from GESLA
Applied one-hot-encoding with Percentile: 0.95


In [6]:
#---
# Restrict predictor and predictand to their overlapping time-series
#---
X, Y, t = preprocessing.intersect_time(era5_predictor, gesla_predictand)

# Report the shape of every returned array
for label, arr in (("X", X), ("Y", Y), ("t", t)):
    print(f"{label}: {arr.shape}")

Get overlapping timeseries of ERA5 and GESLA
X: (903, 121, 141)
Y: (903, 1)
t: (903,)


In [16]:
# Initialize timelags
#---
def combine_timelags(X, Y, timelags):
    """
    Description:
        Returns combined timelagged predictor data X_timelag for predictand Y_timelag.
        The predictand Y is shifted by the maximum timelag given in timelags. For every
        timelag the predictor X is sliced such that X_timelag[i, k] is the predictor field
        that lies timelags_sorted[i] steps before Y_timelag[k].
        Note: Input data X, Y needs to be on the same time-interval (see preprocessing.intersect_time)

    Parameters:
        X (np.array, float): Predictor values as a field time series. Shape:(n_labels, lat, lon)
        Y (np.array, float): Predictand at selected stations. Shape:(n_labels, stations)
        timelags (sequence of int): Non-negative timelags to combine, in any order.
            The passed sequence is NOT modified; a sorted copy is used internally.

    Returns:
        X_timelag (np.array, float): Combined timelagged Predictor values in increasing order of timelags, e.g. t=0, t=1,..., Shape:(n_timelags, n_labels - max_timelag, lat, lon)
        Y_timelag (np.array, float): Timelagged Predictand at selected stations. Shape:(n_labels - max_timelag, stations)

    Raises:
        ValueError: If timelags is empty.
        AssertionError: If any timelag is negative.
    """

    # Initialize
    #---
    # Work on a sorted copy so the caller's list is left untouched (and tuples are accepted).
    ordered_timelags = sorted(timelags)
    if not ordered_timelags:
        raise ValueError("timelags must contain at least one timelag")

    # Validate everything up front, before any timelag is used for slicing.
    for timelag_ in ordered_timelags:
        assert timelag_ >= 0, f"Timelag = {timelag_} needs to be a non-negative integer"

    max_timelag = ordered_timelags[-1]

    # Number of time steps that remain after shifting by the largest timelag.
    # Clamped at 0 so a series shorter than max_timelag yields empty slices.
    n_remaining = max(len(X) - max_timelag, 0)

    # Get timelagged Predictand
    #---
    Y_timelag = Y[max_timelag:]

    # Get timelagged predictors
    #---
    # For a given timelag the window starts (max_timelag - timelag) steps into X and has
    # the same length for every timelag, so all slices line up with Y_timelag.
    X_timelag = np.array([
        X[max_timelag - timelag_ : max_timelag - timelag_ + n_remaining]
        for timelag_ in ordered_timelags
    ])

    return X_timelag, Y_timelag

In [18]:
# Example: combine the timelags 0, 1, 2 and 5 of the loaded predictor
timelags = [0, 1, 2, 5]
X_timelag, Y_timelag = combine_timelags(X, Y, timelags=timelags)
X_timelag.shape

(4, 898, 121, 141)

In [19]:
# Reshape for model input
#---
# Grid dimensions of the stacked predictor (axes: timelag, time, lat, lon)
nlat, nlon = X_timelag.shape[2], X_timelag.shape[3]
# Number of time steps (rows of the model input)
ndim = Y_timelag.shape[0]

# Move the time axis to the front, then flatten timelag/lat/lon into one feature axis
X_timelag = np.reshape(np.swapaxes(X_timelag, 0, 1), (ndim, -1))

# Select one station
y_timelag = Y_timelag[:, 0]

print(X_timelag.shape)
print(y_timelag.shape)

(898, 68244)
(898,)


In [20]:
#---
# Handle NaN Values
#---

# Replace NaNs in place with a numerical sentinel that is not expected to occur
# in the data, in the hope that the ML model learns to recognize it.
nan_mask = np.isnan(X_timelag)
X_timelag[nan_mask] = -999

In [None]:
# Model fit is done as before (see rf005)
# Evaluation is done as before (see rf005)

# Main run

In [2]:
#---
# Main: one rf006 run per (predictor, timelag combination)
#---
from models.random_forest.rf006 import run

model_run = "rf006"

# Predictand settings
season = "winter" # ["winter", "autumn",]
percentile = 0.95 # [0.95, 0.99,]
station_names = ["hanko-han-fin-cmems",]

# Predictor settings
predictors = ["sp", "tp", "u10", "v10",]
# Every entry is one combination of timelags; each is applied to all predictors
timelags_lst = [
    [0, 2, 7,],
    [1, 3,],
    [2, 3,],
    [3, 4,],
    [2, 4, 6,],
    [2, 3, 4, 5,],
]

# run_id counts the runs consecutively across all predictors and timelag combinations
run_id = 0
for predictor, timelags in ((p, lags) for p in predictors for lags in timelags_lst):
    print(f"Starting run for {predictor} and timelags = {timelags}")
    run(season, predictor, timelags, percentile, station_names, run_id, model_run)
    print(f"Ended run for {predictor} and timelags = {timelags}")

    run_id += 1

Starting run for sp and timelags = [0, 2, 7]
Load Predictand from GESLA
Applied one-hot-encoding with Percentile: 0.95
Get overlapping timeseries of ERA5 and GESLA
X: (903, 121, 141)
Y: (903, 1)
Data is prepared as follows
X.shape : (896, 51183)
y.shape : (896,)
Start Model Training
Do Train-Test-Split
Optimize Hyperparameters
Tested Hyperparameters: {'n_estimators': [0, 111, 222, 333, 444, 555, 666, 777, 888, 1000], 'max_depth': [5, 17, 30, 42, 55, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4]}
Optimize Hyperparameters using RandomSearchCV
Fitting 3 folds for each of 100 candidates, totalling 300 fits
