In [9]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.preprocessing import StandardScaler
import json
from sklearn.metrics import root_mean_squared_error
import pandas as pd
from sklearn.pipeline import Pipeline
import optuna
import sys
sys.path.append('../')
from src.model_training_utils import create_objective
from src.visualization_utils import plot_results_compiration

In [10]:
# Read the two cleaned, tab-separated CSV exports into DataFrames, training file first.
df_train, df_test = (
    pd.read_csv(csv_path, sep='\t')
    for csv_path in (
        '../data/temp/training_cleaned.csv',
        '../data/temp/validation_cleaned.csv',
    )
)

# Hyperparameter tuning

This section tunes the hyperparameters of the selected models with the Optuna library. The cross-validation scheme used accounts for temporal dependencies in the data, and automatic feature selection is performed as well.


In [11]:
# Column the models are trained to predict.
TARGET = 'ActualTOW'

# Candidate input columns. Spellings (e.g. 'FLownPassengers') are kept exactly as
# written; presumably they match the CSV header - verify against the data file.
FEATURES = [
    'Route',
    'ActualTotalFuel',
    'FLownPassengers',
    'BagsCount',
    'FlightBagsWeight',
]

In [12]:
# Separate the model inputs from the prediction target.
X, y = df_train[FEATURES], df_train[TARGET]

# Positional 80/20 hold-out: the first 80% of rows (in their existing order, no
# shuffling) are used for training and the remaining rows for validation.
train_size = int(0.8 * len(df_train))
X_train, y_train = X.iloc[:train_size], y.iloc[:train_size]
X_val, y_val = X.iloc[train_size:], y.iloc[train_size:]

In [13]:
# Seed for Optuna's sampler. Without it the sampler is randomly initialised, so a
# fresh "Restart & Run All" can propose different trials and select different best
# parameters. NOTE(review): this only fixes the sampler; any randomness inside the
# objective itself (e.g. unseeded estimators) is not controlled here - confirm in
# src/model_training_utils.
OPTUNA_SEED = 42
# Number of trials evaluated per model.
N_TRIALS = 50

# Best parameters found for each model, keyed by the model class name.
optuna_best_results = {}
for model_class in [LinearRegression, Ridge, Lasso, RandomForestRegressor, XGBRegressor]:
    # Project helper that builds the Optuna objective for this model class;
    # presumably it returns the error to minimise for one trial - TODO confirm.
    objective = create_objective(X_train, y_train, model_class)

    # TPESampler is Optuna's default sampler for single-objective studies; it is
    # instantiated explicitly only so the seed can be passed. A fresh sampler per
    # study keeps each model's search independent of the others.
    study = optuna.create_study(
        direction="minimize",
        sampler=optuna.samplers.TPESampler(seed=OPTUNA_SEED),
    )
    study.optimize(objective, n_trials=N_TRIALS)

    optuna_best_results[model_class.__name__] = study.best_params

[I 2025-05-16 13:57:55,861] A new study created in memory with name: no-name-47ae628b-02a9-40dc-890b-9e7059a2e1e3
[I 2025-05-16 13:57:55,926] Trial 0 finished with value: 1163.537540000881 and parameters: {'n_features': 4}. Best is trial 0 with value: 1163.537540000881.
[I 2025-05-16 13:57:55,985] Trial 1 finished with value: 1341.369617548255 and parameters: {'n_features': 3}. Best is trial 0 with value: 1163.537540000881.
[I 2025-05-16 13:57:56,045] Trial 2 finished with value: 1163.537540000881 and parameters: {'n_features': 4}. Best is trial 0 with value: 1163.537540000881.
[I 2025-05-16 13:57:56,101] Trial 3 finished with value: 1163.5681006421742 and parameters: {'n_features': 5}. Best is trial 0 with value: 1163.537540000881.
[I 2025-05-16 13:57:56,153] Trial 4 finished with value: 1343.1838530029045 and parameters: {'n_features': 2}. Best is trial 0 with value: 1163.537540000881.
[I 2025-05-16 13:57:56,211] Trial 5 finished with value: 1163.537540000881 and parameters: {'n_feat

In [14]:
# Persist the tuned parameters of every model as JSON so they can be reloaded
# without repeating the search.
with open('../best_models_params.json', 'w') as params_file:
    json.dump(obj=optuna_best_results, fp=params_file)

# Training models with the best parameters and comparing the results

In [15]:
# Reload the tuned parameters from the JSON file on disk.
with open('../best_models_params.json', 'r') as f:
    best_results = json.loads(f.read())

# Refit each model with its tuned parameters on the training split and score it
# on the held-out validation split.
for model_class in [LinearRegression, Ridge, Lasso, RandomForestRegressor, XGBRegressor]:
    model_name = model_class.__name__
    best_params = best_results[model_name]

    # "n_features" configures the feature-selection step rather than the estimator,
    # so it is removed before the remaining entries go to the model constructor.
    n_features = best_params.pop("n_features")

    selector = SelectKBest(score_func=f_regression, k=n_features)
    regressor = model_class(**best_params)
    final_model = Pipeline(steps=[
        ("scaler", StandardScaler()),
        ("feature_selection", selector),
        ("regressor", regressor),
    ])

    # Pipeline.fit returns the fitted pipeline, so prediction can be chained.
    y_pred = final_model.fit(X_train, y_train).predict(X_val)
    rmse = root_mean_squared_error(y_val, y_pred)

    print(f'########## {model_class.__name__} ##########')
    print(f"RMSE on the validation set: {rmse:.2f}")
    plot_results_compiration(y_pred, y_val)

########## LinearRegression ##########
RMSE on the validation set: 1238.46


########## Ridge ##########
RMSE on the validation set: 1238.46


########## Lasso ##########
RMSE on the validation set: 1238.46


########## RandomForestRegressor ##########
RMSE on the validation set: 1172.13


########## XGBRegressor ##########
RMSE on the validation set: 1101.12
