# Load Data

In [6]:
import pandas as pd
# Read the two processed query-result batches, then stack them row-wise.
# ignore_index=True renumbers the rows so the combined frame has a fresh
# 0..n-1 index instead of repeating each batch's own index.
df_batch1, df_batch2 = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/4.processed_data/Query4_results_test_processed.csv",
        "../data/4.processed_data/Query1503_results_conv_processed.csv",
    )
)
combined_df = pd.concat([df_batch1, df_batch2], ignore_index=True)

# Run tuning

In [4]:
import optuna
import numpy as np
from xgboost import XGBRegressor
from sklearn.model_selection import LeaveOneOut, cross_val_score
import pandas as pd


# Regression target: the per-route / per-country average savings column.
y = combined_df['average_savings_for_Journey_route_in_Detected_Country']

# Feature matrix: pd.get_dummies one-hot encodes the string/categorical
# columns in this selection and passes numeric columns through unchanged.
X = pd.get_dummies(
    combined_df[
        [
            'Journey_route',
            'days_until_departure',
            'Detected_Country',
            'Detected_Currency',
        ]
    ]
)


# Define the objective function for the Optuna study
def objective(trial):
    """Score one hyperparameter configuration for the Optuna study.

    Samples XGBoost hyperparameters from ``trial`` and evaluates an
    ``XGBRegressor`` built from them on the module-level ``X`` / ``y`` using
    leave-one-out cross-validation.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample ``max_depth``, ``eta``, ``subsample`` and
        ``colsample_bytree``.

    Returns
    -------
    float
        Mean of the per-fold ``neg_mean_absolute_error`` scores. The value is
        <= 0 and closer to zero is better, so the study that consumes this
        objective must be created with ``direction='maximize'``.
    """
    # Define the hyperparameter space
    params = {
        'max_depth': trial.suggest_int('max_depth', 3, 8),
        'eta': trial.suggest_float('eta', 0.01, 0.38),
        'subsample': trial.suggest_categorical('subsample', [0.8, 0.9]),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.7, 0.8, 0.9]),
        'objective': 'reg:squarederror',
        'eval_metric': 'mae',
        # subsample / colsample_bytree < 1 make training stochastic. A fixed
        # seed keeps the score a function of the hyperparameters only, so
        # trials are comparable and reruns reproduce the same values.
        'random_state': 42,
    }

    # Initialize the model with the current set of hyperparameters
    model = XGBRegressor(**params)

    # Use LeaveOneOut as the cross-validation strategy: one fit per sample.
    # This is expensive, but every row is used as a test point exactly once.
    loo = LeaveOneOut()

    # Calculate the cross-validated scores (one negative absolute error per fold)
    scores = cross_val_score(model, X, y, cv=loo, scoring='neg_mean_absolute_error')

    # scikit-learn reports MAE negated so that "greater is better"; return the
    # mean as-is and let the study maximize it (i.e. minimize the MAE).
    return np.mean(scores)

# Create an Optuna study and optimize the objective function.
# The objective returns the negative MAE, so the study maximizes it.
# TPESampler is Optuna's default sampler; passing it explicitly with a seed
# makes the sequence of suggested hyperparameters reproducible.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=20)  # Adjust n_trials to your preference

# Print the best parameters and the best score
print(f"Best parameters found: {study.best_params}")
# best_value is the negative MAE; negate the number itself. Prefixing a
# literal "-" to an already-negative value would print "--0.06...".
print(f"Best MAE score: {-study.best_value}")

[I 2024-03-17 13:45:19,295] A new study created in memory with name: no-name-022f0505-da72-476b-b721-38a0e34b74e9
[I 2024-03-17 13:48:57,112] Trial 0 finished with value: -0.9773205825228116 and parameters: {'max_depth': 4, 'eta': 0.1621228208753755, 'subsample': 0.8, 'colsample_bytree': 0.8}. Best is trial 0 with value: -0.9773205825228116.
[I 2024-03-17 13:53:56,724] Trial 1 finished with value: -0.4435390041125549 and parameters: {'max_depth': 6, 'eta': 0.14516580413734517, 'subsample': 0.8, 'colsample_bytree': 0.8}. Best is trial 1 with value: -0.4435390041125549.
[I 2024-03-17 13:59:52,559] Trial 2 finished with value: -0.0645182095263273 and parameters: {'max_depth': 8, 'eta': 0.2636706431797748, 'subsample': 0.8, 'colsample_bytree': 0.8}. Best is trial 2 with value: -0.0645182095263273.
Exception ignored on calling ctypes callback function: <bound method DataIter._next_wrapper of <xgboost.data.SingleBatchInternalIter object at 0x161bc4c80>>
Traceback (most recent call last):
  F