In [23]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor
import xgboost as XGBRegressor

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, TimeSeriesSplit
from sklearn.metrics import root_mean_squared_error

import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_regression
import optuna

In [6]:
# Load both tab-separated CSV files with identical reader settings.
# Note that `df_test` is read from the *validation* file.
df_train, df_test = (
    pd.read_csv(csv_path, sep='\t')
    for csv_path in (
        '../data/temp/training_cleaned.csv',
        '../data/temp/validation_cleaned.csv',
    )
)

# Modele

- Regresja liniowa

- Regresja grzbietowa

- Regresja Lasso

- Random Forest Regression

- XGBoost

In [18]:
# Input columns fed to the models, in the order they are selected from the frame.
# NOTE(review): the 'FLownPassengers' capitalisation presumably mirrors the CSV
# header exactly — confirm before "fixing" it.
FEATURES = [
    'ActualTotalFuel',
    'FLownPassengers',
    'BagsCount',
    'FlightBagsWeight',
]

# Column the models are trained to predict.
TARGET = 'ActualTOW'

In [None]:
# Positional hold-out split: the first 80% of rows go to training and the
# remaining rows to validation. Rows are not shuffled.
# NOTE(review): presumably df_train is already ordered in time, which is what
# makes a positional split meaningful here — confirm.
train_size = int(0.8 * len(df_train))

X, y = df_train[FEATURES], df_train[TARGET]

X_train, y_train = X.iloc[:train_size], y.iloc[:train_size]
X_val, y_val = X.iloc[train_size:], y.iloc[train_size:]

def objective(trial):
    """Optuna objective: mean cross-validated RMSE of a random-forest pipeline.

    For the given trial this samples the number of features kept by
    ``SelectKBest`` and three ``RandomForestRegressor`` hyper-parameters,
    builds a scaler -> feature selection -> regressor pipeline and scores it
    on the module-level ``X_train`` / ``y_train`` with a 5-split
    ``TimeSeriesSplit``.

    Args:
        trial: Optuna trial object used to sample the hyper-parameters.

    Returns:
        The mean RMSE across the cross-validation folds (lower is better).
    """
    # Keep the suggestion order fixed: n_features first, then forest parameters.
    k_best = trial.suggest_int("n_features", 1, len(FEATURES))
    forest_kwargs = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 200),
        "max_depth": trial.suggest_int("max_depth", 3, 20),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 10),
    }

    steps = [
        ("scaler", StandardScaler()),
        ("feature_selection", SelectKBest(score_func=f_regression, k=k_best)),
        ("regressor", RandomForestRegressor(random_state=42, n_jobs=-1, **forest_kwargs)),
    ]

    fold_scores = cross_val_score(
        Pipeline(steps),
        X_train,
        y_train,
        scoring="neg_root_mean_squared_error",
        cv=TimeSeriesSplit(n_splits=5),
    )

    # The scorer yields negated RMSE values; flip the sign so the study minimises RMSE.
    return -fold_scores.mean()

# === Run the Optuna search ===
# Seed the sampler explicitly: without a seed every run of this cell explores
# different trials, so the best parameters (and every downstream number) change
# between Restart & Run All executions. TPESampler is what create_study uses by
# default, so only the reproducibility changes, not the search algorithm.
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=30)

print("Najlepsze parametry:", study.best_params)



[I 2025-05-15 18:20:26,588] A new study created in memory with name: no-name-796cbe49-00c9-4742-996f-f1503beb7e7b
[I 2025-05-15 18:20:32,735] Trial 0 finished with value: 1363.7247266904737 and parameters: {'n_features': 3, 'n_estimators': 195, 'max_depth': 13, 'min_samples_split': 8}. Best is trial 0 with value: 1363.7247266904737.
[I 2025-05-15 18:20:34,084] Trial 1 finished with value: 2155.131330757308 and parameters: {'n_features': 1, 'n_estimators': 114, 'max_depth': 18, 'min_samples_split': 7}. Best is trial 0 with value: 1363.7247266904737.
[I 2025-05-15 18:20:35,124] Trial 2 finished with value: 1193.1582428281429 and parameters: {'n_features': 4, 'n_estimators': 75, 'max_depth': 10, 'min_samples_split': 7}. Best is trial 2 with value: 1193.1582428281429.
[I 2025-05-15 18:20:36,174] Trial 3 finished with value: 2142.9483378612617 and parameters: {'n_features': 1, 'n_estimators': 107, 'max_depth': 15, 'min_samples_split': 6}. Best is trial 2 with value: 1193.1582428281429.
[I 2

Najlepsze parametry: {'n_features': 4, 'n_estimators': 149, 'max_depth': 8, 'min_samples_split': 2}


In [27]:
# Rebuild the tuned pipeline from the study's best hyper-parameters, fit it on
# the training part of the split and report RMSE on the held-out validation part.
best_params = study.best_params

final_model = Pipeline(steps=[
    ("scaler", StandardScaler()),
    (
        "feature_selection",
        SelectKBest(score_func=f_regression, k=best_params["n_features"]),
    ),
    (
        "regressor",
        RandomForestRegressor(
            random_state=42,
            n_jobs=-1,
            # Forward only the forest-specific entries of best_params.
            **{
                param: best_params[param]
                for param in ("n_estimators", "max_depth", "min_samples_split")
            },
        ),
    ),
])

# Pipeline.fit returns the fitted pipeline itself, so predict can be chained.
y_pred = final_model.fit(X_train, y_train).predict(X_val)
rmse = root_mean_squared_error(y_val, y_pred)
print(f"RMSE na zbiorze walidacyjnym: {rmse:.2f}")


RMSE na zbiorze walidacyjnym: 1205.27
