## 1. Execute full pipeline

In [1]:
import random
import time

from keras_tuner import RandomSearch

import numpy as np

import pandas as pd

import tensorflow as tf

from config.constants import (
    FORECAST_HORIZON, FORECASTER_MODEL, FORECASTER_OBJECTIVE,
    NB_TRIALS, OBSERVATION_WINDOW, SEED, TRAIN_PERC
)

from src.cut_point_detector import CutPointMethod, CutPointModel, get_cut_point_detector
from src.dataset import read_dataset, split_X_y, split_train_test
from src.forecaster import InternalForecaster, TimeSeriesHyperModel
from src.scaler import Scaler
from src.utils import get_error_results

# Seed NumPy's, Python's and TensorFlow's global RNGs with the shared project seed.
for seed_rng in (np.random.seed, random.seed, tf.random.set_seed):
    seed_rng(SEED)

# Only let TensorFlow's Python logger emit messages at ERROR level or above.
tf.get_logger().setLevel('ERROR')
# Global Keras dtype policy applied to layers created after this call.
tf.keras.mixed_precision.set_global_policy("mixed_float16")


In [2]:
# Run parameters for this validation run. They feed the execution id, the
# report and the dataset / cut point lookups in the cells below.
# (The `_argv` suffix suggests they mirror command-line arguments of a script
# version of this pipeline -- TODO confirm.)
dataset_domain_argv, dataset_argv = 'INMET', 'VITORIA_ES'
cut_point_model_argv, cut_point_method_argv = 'Window', 'Linear'
# Fixed label used in place of a real timestamp for this run.
timestamp = 'validate_pipeline'

In [3]:
# Run identifier: the run parameters followed by the seed, joined by underscores.
execution_id = "_".join(
    str(part)
    for part in (
        timestamp,
        dataset_domain_argv,
        dataset_argv,
        cut_point_model_argv,
        cut_point_method_argv,
        SEED,
    )
)

In [4]:
# Resolve the textual run parameters into the project's enum members.
print(f"Extracting cut point model enum ({cut_point_model_argv})")
cut_point_model = CutPointModel.from_str(cut_point_model_argv)

# Log message names the *method* enum so the two log lines are distinguishable.
print(f"Extracting cut point method enum ({cut_point_method_argv})")
cut_point_method = CutPointMethod.from_str(cut_point_method_argv)

# read_dataset returns the data frame together with the list of variable
# names; that list is reused below for scaling, reshaping and error metrics.
print(f"Reading dataset {dataset_argv} from {dataset_domain_argv}")
df, variables = read_dataset(dataset_domain_argv, dataset_argv)
print(f"Variables: {variables}")

Extracting cut point model enum (Window)
Extracting cut point model enum (Linear)
Reading dataset VITORIA_ES from INMET
Variables: ['P', 'PrA', 'T', 'UR', 'VV']


In [5]:
print("Splitting data into train and test")
train, test = split_train_test(df)

print("Initializing report")
# Human-readable label for the detector combination, e.g. "Window Linear".
cut_point_approach = " ".join(
    member.value.title() for member in (cut_point_model, cut_point_method)
)

# The report is filled section by section; later cells add timings and results.
report = {}

# -- run identification --
report.update({
    'execution_id': execution_id,
    'timestamp': timestamp,
    'cut_point_model': cut_point_model.value,
    'cut_point_method': cut_point_method.value,
    'cut_point_approach': cut_point_approach,
    'seed': SEED,
})

# -- forecaster / tuner configuration (project constants) --
report.update({
    'forecaster_model': FORECASTER_MODEL,
    'forecaster_objective': FORECASTER_OBJECTIVE,
    'observation_window': OBSERVATION_WINDOW,
    'train_perc': TRAIN_PERC,
    'nb_trials': NB_TRIALS,
})

# -- dataset description --
report.update({
    'dataset_domain': dataset_domain_argv,
    'dataset': dataset_argv,
    'variables': variables,
    'dataset_shape': df.shape,
    'train_shape': train.shape,
    'test_shape': test.shape,
})
report

Splitting data into train and test
Initializing report


{'execution_id': 'validate_pipeline_INMET_VITORIA_ES_Window_Linear_42',
 'timestamp': 'validate_pipeline',
 'cut_point_model': 'Window',
 'cut_point_method': 'Linear',
 'cut_point_approach': 'Window Linear',
 'seed': 42,
 'forecaster_model': 'LSTM',
 'forecaster_objective': 'val_loss',
 'observation_window': 14,
 'train_perc': 0.8,
 'nb_trials': 15,
 'dataset_domain': 'INMET',
 'dataset': 'VITORIA_ES',
 'variables': ['P', 'PrA', 'T', 'UR', 'VV'],
 'dataset_shape': (6575, 6),
 'train_shape': (5260, 6),
 'test_shape': (1315, 6)}

In [6]:
print(f"Started cut point for {cut_point_approach}")

# The timed region covers both building the detector and running the search.
start_time = time.time()
cut_point_detector = get_cut_point_detector(cut_point_model, cut_point_method)
cut_point, cut_point_perc = cut_point_detector.find_cut_point(train, variables)
end_time = time.time()
cut_duration = end_time - start_time

print(
    f"Cut point: {cut_point}, Cut point percentage: {cut_point_perc}",
    f"Finished cut point for {cut_point_approach}, duration: {cut_duration}",
    sep="\n",
)

# The cut point and its percentage are stored in the report as strings.
report['cut_duration'] = cut_duration
report['cut_point'] = str(cut_point)
report['cut_point_perc'] = str(cut_point_perc)
report

Started cut point for Window Linear
Cut point: 2290, Cut point percentage: 43.536121673003805
Finished cut point for Window Linear, duration: 0.036301612854003906


{'execution_id': 'validate_pipeline_INMET_VITORIA_ES_Window_Linear_42',
 'timestamp': 'validate_pipeline',
 'cut_point_model': 'Window',
 'cut_point_method': 'Linear',
 'cut_point_approach': 'Window Linear',
 'seed': 42,
 'forecaster_model': 'LSTM',
 'forecaster_objective': 'val_loss',
 'observation_window': 14,
 'train_perc': 0.8,
 'nb_trials': 15,
 'dataset_domain': 'INMET',
 'dataset': 'VITORIA_ES',
 'variables': ['P', 'PrA', 'T', 'UR', 'VV'],
 'dataset_shape': (6575, 6),
 'train_shape': (5260, 6),
 'test_shape': (1315, 6),
 'cut_duration': 0.036301612854003906,
 'cut_point': '2290',
 'cut_point_perc': '43.536121673003805'}

In [7]:
# Restrict the training data according to the detected cut point.
print("Applying subset to train based on cut point")
reduced_train = cut_point_detector.apply_cut_point(train, cut_point)

# The scaler is fitted on the reduced training data only; the test set is then
# transformed with that already-fitted scaler (no fitting on test data).
print("Training and applying scaler")
scaler = Scaler(variables)
scaled_reduced_train = scaler.fit_scale(reduced_train)
scaled_test = scaler.scale(test)

Applying subset to train based on cut point
Training and applying scaler


In [8]:
# Turn the scaled frames into (inputs, targets) pairs for the forecaster.
# presumably split_X_y builds sliding windows of OBSERVATION_WINDOW inputs and
# FORECAST_HORIZON targets -- TODO confirm in src.dataset.
print("Splitting into X and y")
X_reduced_scaled_train, y_reduced_scaled_train = split_X_y(scaled_reduced_train)
X_scaled_test, y_scaled_test = split_X_y(scaled_test)

Splitting into X and y


In [9]:
# Sanity check: show the first target window of the scaled, reduced training set.
y_reduced_scaled_train[0]

array([[0.02030948, 0.6347973 , 0.36263097, 0.5342772 , 0.39617486],
       [0.02030948, 0.61959459, 0.3806752 , 0.53502235, 0.28961749],
       [0.01547389, 0.60962838, 0.34284051, 0.59687034, 0.26502732],
       [0.        , 0.57601351, 0.27793946, 0.67213115, 0.24453552],
       [0.        , 0.49425676, 0.36699651, 0.65052161, 0.28551913],
       [0.00096712, 0.50608108, 0.36874272, 0.56631893, 0.31147541],
       [0.00870406, 0.49087838, 0.40570431, 0.65350224, 0.25      ]])

In [10]:
# Sanity check: show the second target window of the scaled, reduced training set.
y_reduced_scaled_train[1]

array([[0.02030948, 0.61959459, 0.3806752 , 0.53502235, 0.28961749],
       [0.01547389, 0.60962838, 0.34284051, 0.59687034, 0.26502732],
       [0.        , 0.57601351, 0.27793946, 0.67213115, 0.24453552],
       [0.        , 0.49425676, 0.36699651, 0.65052161, 0.28551913],
       [0.00096712, 0.50608108, 0.36874272, 0.56631893, 0.31147541],
       [0.00870406, 0.49087838, 0.40570431, 0.65350224, 0.25      ],
       [0.01160542, 0.44966216, 0.45750873, 0.64307004, 0.33743169]])

In [11]:
# Sanity check: show the third target window of the scaled, reduced training set.
y_reduced_scaled_train[2]

array([[0.01547389, 0.60962838, 0.34284051, 0.59687034, 0.26502732],
       [0.        , 0.57601351, 0.27793946, 0.67213115, 0.24453552],
       [0.        , 0.49425676, 0.36699651, 0.65052161, 0.28551913],
       [0.00096712, 0.50608108, 0.36874272, 0.56631893, 0.31147541],
       [0.00870406, 0.49087838, 0.40570431, 0.65350224, 0.25      ],
       [0.01160542, 0.44966216, 0.45750873, 0.64307004, 0.33743169],
       [0.        , 0.45793919, 0.44091967, 0.59985097, 0.23907104]])

In [12]:
print(f"Started running HPO and NAS for {cut_point_approach}")

# Hypermodel describing the search space for the configured forecaster type.
forecaster_hypermodel = TimeSeriesHyperModel(
    n_variables=len(variables),
    model_type=FORECASTER_MODEL,
)

# Random-search settings. Results are written under a per-execution directory,
# and overwrite=True starts a fresh search there instead of resuming one.
tuner_settings = {
    'objective': FORECASTER_OBJECTIVE,
    'max_trials': NB_TRIALS,
    'executions_per_trial': 1,
    'directory': f"outputs/tuner/{execution_id}",
    'project_name': f"{cut_point_model.value}_{cut_point_method.value}",
    'seed': SEED,
    'overwrite': True,
    'distribution_strategy': tf.distribute.MirroredStrategy(),
}
forecaster_tuner = RandomSearch(forecaster_hypermodel, **tuner_settings)

Started running HPO and NAS for Window Linear


2025-02-24 21:50:00.925437: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M3
2025-02-24 21:50:00.925453: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2025-02-24 21:50:00.925457: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
2025-02-24 21:50:00.925470: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2025-02-24 21:50:00.925479: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [13]:
# Time the whole hyperparameter search (wall-clock seconds).
start_time = time.time()
# The keyword arguments are forwarded by the tuner to the hypermodel's fit step
# (presumably Keras' Model.fit -- verify in TimeSeriesHyperModel).
# shuffle=False keeps the samples in their original order during training.
forecaster_tuner.search(
    X_reduced_scaled_train,
    y_reduced_scaled_train,
    validation_split=(1 - TRAIN_PERC),
    shuffle=False,
)
end_time = time.time()
tuner_duration = end_time - start_time

Trial 15 Complete [00h 00m 32s]
val_loss: 0.020462638661265374

Best val_loss So Far: 0.018879062205087394
Total elapsed time: 00h 08m 20s


In [14]:
# Best trial (for its metadata) and the corresponding best model from the tuner.
best_trial = forecaster_tuner.oracle.get_best_trials(num_trials=1)[0]
best_forecaster_model = forecaster_tuner.get_best_models(num_models=1)[0]
print(f"Finished running HPO and NAS for {cut_point_approach}, duration: {tuner_duration}")

# Summary of the winning trial, one item per line, closed by a separator rule.
print(
    f"Trial ID: {best_trial.trial_id}",
    f"Hyperparameters: {best_trial.hyperparameters.values}",
    f"Score: {best_trial.score}",
    "-" * 40,
    sep="\n",
)

Finished running HPO and NAS for Window Linear, duration: 500.31665992736816
Trial ID: 06
Hyperparameters: {'num_layers': 1, 'units_0': 128, 'learning_rate': 0.01, 'units_1': 64, 'units_2': 32, 'units_3': 32, 'batch_size': 16, 'epochs': 225}
Score: 0.018879062205087394
----------------------------------------


  saveable.load_own_variables(weights_store.get(inner_path))


In [15]:
print("Retrieving best model")
# `best_forecaster_model` is rebound from the tuned model to its
# InternalForecaster wrapper. The guard keeps this cell idempotent: re-running
# it without first re-running the previous cell no longer wraps the wrapper.
if not isinstance(best_forecaster_model, InternalForecaster):
    best_forecaster_model.summary()
    best_forecaster_model = InternalForecaster(best_forecaster_model, len(variables))

Retrieving best model


In [16]:
# Forecast on the scaled test inputs, then flatten targets and predictions to
# 2-D arrays with one column per variable so they can be compared row by row.
print("Running forecasting")
y_scaled_pred = best_forecaster_model.forecast(X_scaled_test)
y_scaled_test_flat = y_scaled_test.reshape(-1, len(variables))
y_scaled_pred_flat = y_scaled_pred.reshape(-1, len(variables))

Running forecasting
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step


2025-02-24 21:58:22.177937: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node MultiDeviceIteratorGetNextFromShard}}]]
2025-02-24 21:58:22.177957: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node MultiDeviceIteratorGetNextFromShard}}]]
	 [[RemoteCall]]


In [17]:
# Inspect the flattened, still-scaled test targets.
y_scaled_test_flat

array([[0.1344294 , 0.51947394, 0.27956095, 0.83627848, 0.13544106],
       [0.04255319, 0.39138514, 0.3685681 , 0.74217586, 0.2       ],
       [0.05222437, 0.47195946, 0.24126892, 0.72131148, 0.19535519],
       ...,
       [0.00483559, 0.28040541, 0.64639115, 0.48956781, 0.39071038],
       [0.0212766 , 0.24594595, 0.51746217, 0.73994039, 0.1489071 ],
       [0.00096712, 0.33209459, 0.54336438, 0.61624441, 0.15846995]])

In [18]:
# Inspect the flattened, still-scaled predictions.
y_scaled_pred_flat

array([[0.01907649, 0.51364243, 0.37583646, 0.6886463 , 0.1761322 ],
       [0.01483142, 0.54357165, 0.39333132, 0.6790521 , 0.20700176],
       [0.02576859, 0.5666574 , 0.38901597, 0.67641467, 0.2107338 ],
       ...,
       [0.05211055, 0.3794846 , 0.64484084, 0.5683391 , 0.33845842],
       [0.04598378, 0.3854839 , 0.6371707 , 0.55571747, 0.33944267],
       [0.04523553, 0.39909592, 0.6426749 , 0.56529564, 0.3240776 ]],
      dtype=float32)

In [19]:
# Wrap the flat arrays in DataFrames (one column per variable) and undo the
# scaling, so the error metrics are computed on descaled values.
print("Calculating error")
y_test = scaler.descale(pd.DataFrame(y_scaled_test_flat, columns=variables))
y_pred = scaler.descale(pd.DataFrame(y_scaled_pred_flat, columns=variables))

Calculating error


In [20]:
# Inspect the descaled test targets.
y_test

Unnamed: 0,P,PrA,T,UR,VV
0,27.8,1017.709524,21.585714,90.761905,0.938095
1,8.8,1014.550000,22.860000,85.500000,1.135000
2,10.8,1016.537500,21.037500,84.333333,1.120833
3,0.0,1019.604348,21.304348,72.565217,1.413043
4,0.0,1021.050000,21.220833,68.541667,1.520833
...,...,...,...,...,...
9060,0.0,1013.000000,28.216667,63.708333,2.375000
9061,0.0,1013.029167,27.704167,66.208333,1.854167
9062,1.0,1011.812500,26.837500,71.375000,1.716667
9063,4.4,1010.962500,24.991667,85.375000,0.979167


In [21]:
# Inspect the descaled predictions.
y_pred

Unnamed: 0,P,PrA,T,UR,VV
0,3.945019,1017.565674,22.964060,82.506805,1.062203
1,3.067138,1018.303955,23.214527,81.970329,1.156355
2,5.328946,1018.873413,23.152744,81.822853,1.167738
3,3.086632,1019.257507,23.019547,81.960541,1.157886
4,3.512660,1019.476379,23.044657,81.633499,1.179192
...,...,...,...,...,...
9060,13.233783,1013.809387,26.703806,76.550705,1.471514
9061,12.584108,1014.269287,26.847340,76.256027,1.525754
9062,10.776463,1014.256409,26.815306,75.779633,1.557298
9063,9.509446,1014.404419,26.705494,75.073868,1.560300


In [22]:
# Number of test samples fed to the forecaster.
len(X_scaled_test)

1295

In [23]:
# Samples times forecast steps; presumably this should equal the row count of
# the flattened target / prediction frames -- TODO confirm against y_test.shape.
len(X_scaled_test) * FORECAST_HORIZON

9065

In [24]:
# Error metrics of the tuned model on the descaled test data.
# NOTE(review): in the recorded output the MAPE of `P` is ~1e16, most likely
# because `P` contains exact zeros in y_test; MAPE-based entries (including
# Avg_MAPE) are therefore not meaningful for this run -- confirm in
# get_error_results and prefer MAE / WAPE when comparing.
error_results = get_error_results(y_test, y_pred, variables)
print(f"Obtained error results: {error_results}")

Obtained error results: {'Avg_MAPE': 3387538085996856.0, 'Avg_MAE': 3.6099546454942795, 'Avg_MSE': 38.20980586586211, 'Avg_RMSE': 4.857248540254888, 'Avg_R2': 0.24120370144399644, 'Avg_WAPE': 0.0161543879615712, 'P_MAPE': 1.693769042998428e+16, 'P_MAE': 6.946298873711991, 'P_MSE': 99.05320367036138, 'P_RMSE': 9.9525475969905, 'P_R2': -0.14791420831366908, 'P_WAPE': 2.221413931073139, 'PrA_MAPE': 0.0025388271182813322, 'PrA_MAE': 2.5789761775667426, 'PrA_MSE': 11.235440011549692, 'PrA_RMSE': 3.3519307885977736, 'PrA_R2': 0.318366137351483, 'PrA_WAPE': 0.0025382706428381235, 'T_MAPE': 0.056276842034056127, 'T_MAE': 1.3564156906963882, 'T_MSE': 3.222041192417913, 'T_RMSE': 1.7950045104171501, 'T_R2': 0.6103783756833381, 'T_WAPE': 0.05428892813152506, 'UR_MAPE': 0.10389859619163004, 'UR_MAE': 6.858571181646109, 'UR_MSE': 77.38639571348133, 'UR_RMSE': 8.796953774658665, 'UR_R2': 0.1982807684595561, 'UR_WAPE': 0.09579531704254357, 'VV_MAPE': 0.2146034220509999, 'VV_MAE': 0.30951130385017, 'V

In [25]:
# Add timings, error metrics and best-trial details to the in-memory report.
print("Writing report")
report.update({
    'tuner_duration': tuner_duration,
    # Sum of cut point detection and tuner search time only.
    'total_duration': cut_duration + tuner_duration,
    'error_results': error_results,
    'scaled_reduced_train_shape': scaled_reduced_train.shape,
    'best_trial_id': best_trial.trial_id,
    'best_trial_hyperparameters': best_trial.hyperparameters.values,
    'best_trial_score': best_trial.score,
    # NOTE(review): `best_forecaster_model` is the InternalForecaster wrapper
    # here. If its summary() behaves like Keras' Model.summary() (prints and
    # returns None), this entry stores None -- TODO confirm.
    'best_forecaster_model': best_forecaster_model.summary(),
})

Writing report


In [26]:
# Display the completed report.
report

{'execution_id': 'validate_pipeline_INMET_VITORIA_ES_Window_Linear_42',
 'timestamp': 'validate_pipeline',
 'cut_point_model': 'Window',
 'cut_point_method': 'Linear',
 'cut_point_approach': 'Window Linear',
 'seed': 42,
 'forecaster_model': 'LSTM',
 'forecaster_objective': 'val_loss',
 'observation_window': 14,
 'train_perc': 0.8,
 'nb_trials': 15,
 'dataset_domain': 'INMET',
 'dataset': 'VITORIA_ES',
 'variables': ['P', 'PrA', 'T', 'UR', 'VV'],
 'dataset_shape': (6575, 6),
 'train_shape': (5260, 6),
 'test_shape': (1315, 6),
 'cut_duration': 0.036301612854003906,
 'cut_point': '2290',
 'cut_point_perc': '43.536121673003805',
 'tuner_duration': 500.31665992736816,
 'total_duration': 500.35296154022217,
 'error_results': {'Avg_MAPE': 3387538085996856.0,
  'Avg_MAE': 3.6099546454942795,
  'Avg_MSE': 38.20980586586211,
  'Avg_RMSE': 4.857248540254888,
  'Avg_R2': 0.24120370144399644,
  'Avg_WAPE': 0.0161543879615712,
  'P_MAPE': 1.693769042998428e+16,
  'P_MAE': 6.946298873711991,
  'P_M

## 2. What would the error be if we always predicted each variable's training-set average (Dummy Forecaster)?

In [27]:
# Baseline data: split the unscaled train / test frames (the full `train`,
# not the cut-point-reduced subset).
# NOTE(review): this rebinds `y_test`, which the pipeline section bound to the
# descaled target DataFrame used for the model's error metrics. Re-running that
# metrics cell after this one would use a different object -- consider a
# distinct name for the raw targets.
X_train, y_train = split_X_y(train)
X_test, y_test = split_X_y(test)

In [28]:
# Flatten the training targets to one column per variable and take the
# per-variable mean; these averages are the dummy forecaster's constant output.
train_targets_flat = pd.DataFrame(y_train.reshape(-1, len(variables)), columns=variables)
avg_values = train_targets_flat.mean(axis=0).to_numpy()

In [29]:
# Repeat the average vector for every test sample and every forecast step,
# giving an array of shape (n_test, FORECAST_HORIZON, n_variables).
n_test = y_test.shape[0]
dummy_pred = np.tile(avg_values, (n_test, FORECAST_HORIZON, 1))

# Flatten predictions and targets to one column per variable for the metrics.
dummy_pred_flat = dummy_pred.reshape(-1, len(variables))
y_test_flat = pd.DataFrame(y_test.reshape(-1, len(variables)), columns=variables)

In [30]:
# Same metric function as for the trained model, applied to the constant
# average predictions, so the two result dicts share the same keys.
dummy_error_results = get_error_results(y_test_flat, dummy_pred_flat, variables)
print(f"Error metrics for Dummy Forecaster (predicting average values): \n{dummy_error_results}")

Error metrics for Dummy Forecaster (predicting average values): 
{'Avg_MAPE': 2261114385082150.5, 'Avg_MAE': 3.9238984111749913, 'Avg_MSE': 45.462017189560335, 'Avg_RMSE': 5.50774124980734, 'Avg_R2': -0.09497051544831821, 'Avg_WAPE': 0.017559272478680943, 'P_MAPE': 1.1305571925410234e+16, 'P_MAE': 5.371567440050315, 'P_MSE': 86.87492163980089, 'P_RMSE': 9.3206717375842, 'P_R2': -0.0067817415410229565, 'P_WAPE': 1.7178176407273023, 'PrA_MAPE': 0.0032553683276855356, 'PrA_MAE': 3.308651533836293, 'PrA_MSE': 16.483760913582557, 'PrA_RMSE': 4.060019816895301, 'PrA_R2': -4.001720891788629e-05, 'PrA_WAPE': 0.0032564290933628248, 'T_MAPE': 0.09793958285614805, 'T_MAE': 2.439719280732743, 'T_MSE': 8.636826647316436, 'T_RMSE': 2.938847843512222, 'T_R2': -0.04439832587726644, 'T_WAPE': 0.09764686858259194, 'UR_MAPE': 0.12709236016186135, 'UR_MAE': 8.080380657557988, 'UR_MSE': 115.07253217143328, 'UR_RMSE': 10.727186591620065, 'UR_R2': -0.1921457410352394, 'UR_WAPE': 0.11286062452579565, 'VV_MAPE

In [31]:
# Side-by-side table of every metric (rows) for the trained model and the
# dummy forecaster (columns), rounded to five decimals for display.
df_comparison = pd.DataFrame(
    {
        "Trained Model": pd.Series(error_results),
        "Dummy Forecaster": pd.Series(dummy_error_results),
    }
).round(5)
df_comparison

Unnamed: 0,Trained Model,Dummy Forecaster
Avg_MAPE,3387538000000000.0,2261114000000000.0
Avg_MAE,3.60995,3.9239
Avg_MSE,38.20981,45.46202
Avg_RMSE,4.85725,5.50774
Avg_R2,0.2412,-0.09497
Avg_WAPE,0.01615,0.01756
P_MAPE,1.693769e+16,1.130557e+16
P_MAE,6.9463,5.37157
P_MSE,99.0532,86.87492
P_RMSE,9.95255,9.32067
