# Load Data

In [7]:
import pandas as pd
# Read both processed query-result batches, then stack them row-wise into a
# single frame with a fresh 0..n-1 index.
df_batch1, df_batch2 = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/4.processed_data/Query4_results_test_processed.csv",
        "../data/4.processed_data/Query1503_results_conv_processed.csv",
    )
)
combined_df = pd.concat([df_batch1, df_batch2], ignore_index=True)

# Run tuning

In [8]:
import optuna
import numpy as np
from xgboost import XGBRegressor
from sklearn.model_selection import LeaveOneOut, cross_val_score
import pandas as pd


# Feature matrix: the selected columns with object/categorical columns expanded
# into indicator columns by pandas.get_dummies (other columns pass through
# unchanged).
X = pd.get_dummies(
    combined_df.loc[
        :,
        [
            'Journey_route',
            'days_until_departure',
            'Detected_Country',
            'Detected_Currency',
        ],
    ]
)
# Regression target.
y = combined_df.loc[:, 'average_savings_for_Journey_route_in_Detected_Country']


# Define the objective function for the Optuna study
def objective(trial):
    """Score one Optuna trial with leave-one-out cross-validation.

    Samples XGBoost hyperparameters from ``trial``, evaluates an
    ``XGBRegressor`` on the module-level ``X`` / ``y`` with one fold per row,
    and returns the mean of scikit-learn's ``neg_mean_absolute_error`` scores.

    The returned value is the *negated* MAE (so it is <= 0, and closer to 0 is
    better); the study consuming this objective therefore has to maximize it.
    """
    # Hyperparameters sampled for this trial.
    sampled = {
        'max_depth': trial.suggest_int('max_depth', 3, 8),
        'eta': trial.suggest_float('eta', 0.01, 0.38),
        'subsample': trial.suggest_categorical('subsample', [0.8, 0.9]),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.7, 0.8, 0.9]),
    }
    # Settings held constant across all trials.
    fixed = {
        'objective': 'reg:squarederror',
        'eval_metric': 'mae',
    }
    regressor = XGBRegressor(**sampled, **fixed)

    # One fit per sample: each row is held out exactly once.
    fold_scores = cross_val_score(
        regressor,
        X,
        y,
        cv=LeaveOneOut(),
        scoring='neg_mean_absolute_error',
    )

    return np.mean(fold_scores)

# Create an Optuna study that maximizes the objective. The objective returns
# the negated MAE, so the highest value corresponds to the lowest error.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=20)  # Adjust n_trials to your preference

# Print the best parameters and the best score.
# study.best_value is the negated MAE (<= 0). Negate the number itself rather
# than prefixing a literal "-", which would print a malformed "--0.51...".
print(f"Best parameters found: {study.best_params}")
print(f"Best MAE score: {-study.best_value}")

[I 2024-03-17 14:34:10,073] A new study created in memory with name: no-name-13ccb72c-a616-45fd-8d3f-164fc916070a
[I 2024-03-17 14:37:33,521] Trial 0 finished with value: -1.8666428087360258 and parameters: {'max_depth': 3, 'eta': 0.0534291869972452, 'subsample': 0.9, 'colsample_bytree': 0.8}. Best is trial 0 with value: -1.8666428087360258.
[I 2024-03-17 14:41:30,688] Trial 1 finished with value: -0.5112922101791915 and parameters: {'max_depth': 4, 'eta': 0.3528541350362156, 'subsample': 0.8, 'colsample_bytree': 0.7}. Best is trial 1 with value: -0.5112922101791915.
Exception ignored on calling ctypes callback function: <bound method DataIter._next_wrapper of <xgboost.data.SingleBatchInternalIter object at 0x17c40dc10>>
Traceback (most recent call last):
  File "/Users/aghauss/Library/Caches/pypoetry/virtualenvs/playwright-trial-uIq0qThi-py3.12/lib/python3.12/site-packages/xgboost/core.py", line 641, in _next_wrapper
    return self._handle_exception(lambda: self.next(input_data), 0)
