### Classic Machine Learning
This notebook implements a five-fold cross-validation model selection pipeline for classic machine learning algorithms. Once the best model has been selected, it is passed to a hyperparameter selection algorithm using Optuna. The tuned best model is then trained on the entire dataset with five buoys withheld for validation and evaluation of prediction accuracy.

Package imports

In [None]:
# Core Libraries
import gc
import glob
import math
import os
import time
from datetime import datetime, timedelta

# Data Handling
import netCDF4 as nc
import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt

# Geospatial Calculations
from geopy import Point
from geopy.distance import great_circle
from haversine import haversine
from scipy.spatial import cKDTree

# Machine Learning Models
from sklearn.ensemble import (
    GradientBoostingRegressor,
    RandomForestRegressor,
    VotingRegressor
)
from sklearn.linear_model import (
    BayesianRidge,
    ElasticNet,
    Lasso,
    LinearRegression,
    Ridge
)
from sklearn.multioutput import MultiOutputRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
import lightgbm as lgb

# Model Evaluation and Optimization
import optuna
from optuna import create_study
from scipy.stats import randint, uniform
from sklearn.metrics import (
    mean_absolute_error,
    mean_squared_error,
    r2_score
)
from sklearn.model_selection import (
    GroupKFold,
    KFold,
    RandomizedSearchCV,
    cross_val_score,
    train_test_split
)

Function to pre-process ERA5 spatial references

In [None]:
# Precompute the KDTree and valid_time differences
def precompute_kdtree_and_time_diffs(uwnd_nc_file_path):
    """
    Builds the spatial KDTree and the datetime axis from one ERA5 NetCDF file.

    Args:
        uwnd_nc_file_path (str): Path to a NetCDF file exposing the variables
            'valid_time', 'latitude' and 'longitude'.

    Returns:
        tuple: (tree, valid_time_dt, latitudes, longitudes, lat_lon_pairs) where
            tree is a cKDTree over every (lat, lon) grid pair, valid_time_dt is a
            datetime64[ns] array, latitudes/longitudes are the coordinate arrays as
            read from the file, and lat_lon_pairs is the (n_lat * n_lon, 2) array
            the tree was built from (latitude-major order).

    Raises:
        Exception: Any error raised while reading or processing the file is
            printed and then re-raised unchanged.
    """
    try:
        print("Precomputing KDTree and time differences...")
        # The context manager releases the file handle even when a read fails;
        # slicing with [:] copies the values into memory first.
        with nc.Dataset(uwnd_nc_file_path) as ds:
            valid_time = ds.variables['valid_time'][:]  # Assuming 'valid_time' is the variable name for time
            latitudes = ds.variables['latitude'][:]
            longitudes = ds.variables['longitude'][:]

        # Convert valid_time from seconds since 1970-01-01 to datetime
        base_time = datetime(1970, 1, 1)
        valid_time_dt = np.array(
            [base_time + timedelta(seconds=int(ts)) for ts in valid_time],
            dtype='datetime64[ns]'
        )

        # Build every (lat, lon) combination without a Python-level double loop.
        # indexing='ij' keeps latitude as the slow axis, so a flat index i maps to
        # (i // n_lon, i % n_lon), the same ordering a nested lat/lon loop gives.
        lat_grid, lon_grid = np.meshgrid(
            np.asarray(latitudes), np.asarray(longitudes), indexing='ij'
        )
        lat_lon_pairs = np.column_stack((lat_grid.ravel(), lon_grid.ravel()))
        tree = cKDTree(lat_lon_pairs)

        print("KDTree and time differences precomputed successfully.")
        return tree, valid_time_dt, latitudes, longitudes, lat_lon_pairs
    except Exception as e:
        print(f"Error precomputing KDTree and time differences: {e}")
        raise

# ERA5 file whose grid and time axis are used as the shared spatial/temporal reference
uwnd_nc_file_path = '../data/raw/reanalyses/ERA5/era5_uwnd_2023_pressure1.nc'

# precompute_kdtree_and_time_diffs already prints the failure and re-raises it,
# so no extra try/except is needed here (it would only print the same message twice).
tree, valid_time_dt, latitudes, longitudes, lat_lon_pairs = precompute_kdtree_and_time_diffs(uwnd_nc_file_path)

Function to preload ERA5 data

In [None]:
import numpy as np
import netCDF4 as nc
from scipy.spatial import cKDTree

# Preload all ERA5 data into memory as arrays
def load_era5_data(era5_files):
    """
    Preloads ERA5 data into memory and creates KDTree for spatial lookup.

    Args:
        era5_files (dict): Dictionary where keys are variable names and values are file paths to ERA5 NetCDF files.

    Returns:
        dict: Maps each variable name to a dict with the keys 'data' (the value
            array; 4D arrays are reduced to their first level), 'time',
            'latitudes', 'longitudes' and 'tree' (cKDTree over the file's grid).

    Raises:
        KeyError: If a file has neither a 'time' nor a 'valid_time' variable, or
            lacks 'latitude'/'longitude'.
    """
    era5_data = {}
    for var_name, file_path in era5_files.items():
        with nc.Dataset(file_path) as ds:
            variables = ds.variables

            # NOTE(review): the data variable is taken to be the last one declared
            # in the file - confirm this holds for every ERA5 download in use.
            data_array = variables[list(variables.keys())[-1]][:]

            # ERA5 files name their time axis either 'time' or 'valid_time';
            # prefer 'time' (the original behaviour) and fall back otherwise.
            time_key = 'time' if 'time' in variables else 'valid_time'
            valid_time = variables[time_key][:]
            latitudes = variables['latitude'][:]
            longitudes = variables['longitude'][:]

            # Adjust for 4D arrays (time, level, lat, lon): use the first level
            if data_array.ndim == 4:
                data_array = data_array[:, 0, :, :]

            # All (lat, lon) grid pairs in latitude-major order, built without a
            # Python-level double loop.
            lat_grid, lon_grid = np.meshgrid(
                np.asarray(latitudes), np.asarray(longitudes), indexing='ij'
            )
            lat_lon_pairs = np.column_stack((lat_grid.ravel(), lon_grid.ravel()))
            tree = cKDTree(lat_lon_pairs)

            # Store data in dictionary
            era5_data[var_name] = {
                'data': data_array,
                'time': valid_time,
                'latitudes': latitudes,
                'longitudes': longitudes,
                'tree': tree
            }
    return era5_data

# Every 2023 ERA5 file lives in one directory and is named era5_<variable>_2023.nc,
# so the variable-name -> path mapping is derived from the list of variables.
_era5_dir = '../data/raw/reanalyses/ERA5'
_era5_variables = [
    '10m_u_component_of_wind',
    '10m_v_component_of_wind',
    'mean_wave_direction',
    'mean_wave_period',
    'significant_height_of_combined_wind_waves_and_swell',
    '100m_u_component_of_wind',
    '100m_v_component_of_wind',
    'model_bathymetry',
    'sea_ice_cover',
]
era5_files = {
    variable: f'{_era5_dir}/era5_{variable}_2023.nc'
    for variable in _era5_variables
}

# Read every file into memory once so later lookups are array accesses
era5_data = load_era5_data(era5_files)

Function to extract ERA5 data for a given latitude, longitude, and time

In [None]:
# Function to extract data from preloaded ERA5 arrays
def extract_era5_data(lat, lon, dt, tree, valid_time_dt, latitudes, longitudes, lat_lon_pairs, era5_data):
    """
    Extracts ERA5 variable values for a given lat, lon, and datetime.

    The nearest time step and the nearest grid point are located once and the
    same (time, lat, lon) index is then read from every preloaded array.

    Args:
        lat (float): Latitude of the point.
        lon (float): Longitude of the point.
        dt (datetime): Timestamp of the point; anything np.datetime64 accepts
            (datetime, pandas Timestamp, np.datetime64) works.
        tree (cKDTree): KDTree over the (lat, lon) grid pairs, latitude-major.
        valid_time_dt (sequence): ERA5 time axis (datetime-like values).
        latitudes (np.ndarray): Array of latitude values (unused; kept for interface compatibility).
        longitudes (np.ndarray): Array of longitude values; its length converts
            the flat grid index into (lat_index, lon_index).
        lat_lon_pairs (np.ndarray): Array of (lat, lon) pairs (unused; kept for interface compatibility).
        era5_data (dict): Dictionary with preloaded ERA5 data arrays under the 'data' key.

    Returns:
        dict: A dictionary with variable names as keys and extracted values as values.
            Every value is NaN when valid_time_dt is empty.
    """
    # Without any time step there is nothing to look up
    if len(valid_time_dt) == 0:
        return {var_name: np.nan for var_name in era5_data}

    # Nearest time step, computed as one vectorized datetime64 subtraction
    time_axis = np.asarray(valid_time_dt, dtype='datetime64[ns]')
    target_time = np.datetime64(dt, 'ns')
    closest_time_index = int(np.argmin(np.abs(time_axis - target_time)))

    # NOTE(review): the query is Euclidean in raw degree space - confirm that lon
    # uses the same longitude convention as the grid the tree was built from.
    _, closest_point_index = tree.query((lat, lon))

    # The tree was built latitude-major, so the flat index splits directly into
    # the two grid indices.
    lat_index, lon_index = divmod(int(closest_point_index), len(longitudes))

    # NOTE(review): assumes every array in era5_data is laid out on the same
    # grid and time axis as `tree` / `valid_time_dt` - verify against the loader.
    return {
        var_name: entry['data'][closest_time_index, lat_index, lon_index]
        for var_name, entry in era5_data.items()
    }

Function to calculate new position from current position, displacement, and heading

In [None]:
# Import the math module
import math

# Redefine the calculate_new_position function with wrapping logic
def calculate_new_position(current_position, displacement, heading):
    """
    Moves a point along a sphere by a given distance and heading.

    Args:
        current_position (sequence): (latitude, longitude) in degrees.
        displacement (float): Distance to travel, in meters.
        heading (float): Direction of travel in degrees; 0 moves toward
            increasing latitude and 90 toward increasing longitude.

    Returns:
        tuple: (new_latitude, new_longitude) in degrees. A longitude above 180
            or below -180 is shifted by one full turn (360 degrees).
    """
    earth_radius_m = 6371000  # Earth's radius in meters

    # Work in radians throughout
    start_lat = math.radians(current_position[0])
    start_lon = math.radians(current_position[1])
    bearing = math.radians(heading)

    # Angle subtended at the Earth's centre by the travelled distance
    angular_distance = displacement / earth_radius_m
    sin_dist = math.sin(angular_distance)
    cos_dist = math.cos(angular_distance)
    sin_lat = math.sin(start_lat)
    cos_lat = math.cos(start_lat)

    # Destination latitude
    end_lat = math.asin(sin_lat * cos_dist + cos_lat * sin_dist * math.cos(bearing))

    # Destination longitude
    lon_shift = math.atan2(
        math.sin(bearing) * sin_dist * cos_lat,
        cos_dist - sin_lat * math.sin(end_lat),
    )
    end_lon = start_lon + lon_shift

    new_lat = math.degrees(end_lat)
    new_lon = math.degrees(end_lon)

    # Bring the longitude back across the antimeridian if it overshot
    if new_lon > 180:
        new_lon -= 360
    elif new_lon < -180:
        new_lon += 360

    return new_lat, new_lon


Iterative predictor function

In [None]:
def iterative_prediction(val_data, model, tree, valid_times, latitudes, longitudes, lat_lon_pairs, era5_data):
    """
    Predicts buoy motion iteratively using a machine learning model and ERA5 data.

    For each buoy the first observed position seeds the track. Every later step
    feeds the previously *predicted* position (plus the ERA5 values extracted
    there) to the model, converts the predicted displacement/heading into a new
    position, and carries that position forward.

    Args:
        val_data (pd.DataFrame): Validation dataset with columns ['BuoyID', 'Latitude', 'Longitude', 'datetime'].
            The caller's frame is not modified.
        model: Trained model whose predict() returns (displacement, heading) per row.
        tree (cKDTree): KDTree for spatial lookup of ERA5 data.
        valid_times (list): List of valid datetime objects for ERA5 time.
        latitudes (np.ndarray): Array of ERA5 latitudes.
        longitudes (np.ndarray): Array of ERA5 longitudes.
        lat_lon_pairs (np.ndarray): Array of (latitude, longitude) pairs for spatial lookup.
        era5_data (dict): Dictionary with preloaded ERA5 data arrays.

    Returns:
        np.ndarray: Object array with one [latitude, longitude, datetime] row per
            input row, ordered by BuoyID and then datetime (not the input order).
    """
    # Sort a copy by buoy and time, and store the gap to the previous fix of the same buoy
    val_data = val_data.sort_values(by=['BuoyID', 'datetime']).reset_index(drop=True)
    val_data['time_to_next_position'] = val_data.groupby('BuoyID')['datetime'].diff().dt.total_seconds().fillna(0)

    all_predictions = []

    # The frame is already sorted by BuoyID, so sort=False keeps that order while
    # splitting it in a single pass instead of one boolean scan per buoy.
    for _, buoy_data in val_data.groupby('BuoyID', sort=False):
        first_row = buoy_data.iloc[0]

        # Initial conditions for the current buoy
        current_lat, current_lon = first_row[['Latitude', 'Longitude']]

        # One lookup returns every ERA5 variable for the starting point
        current_era5_values = extract_era5_data(
            current_lat,
            current_lon,
            first_row['datetime'],
            tree,
            valid_times,
            latitudes,
            longitudes,
            lat_lon_pairs,
            era5_data
        )

        # The observed start position is reported as the first "prediction"
        predictions = [[current_lat, current_lon, first_row['datetime']]]

        for i in range(1, len(buoy_data)):
            next_row = buoy_data.iloc[i]
            time_to_next_position = next_row['time_to_next_position']

            # NOTE(review): the model receives Latitude, Longitude,
            # time_to_next_position and one column per era5_data key; it must have
            # been trained on exactly these columns - verify against the caller.
            input_data = {
                "Latitude": [current_lat],
                "Longitude": [current_lon],
                "time_to_next_position": [time_to_next_position]
            }
            for var_name, value in current_era5_values.items():
                input_data[var_name] = [value]
            input_data = pd.DataFrame(input_data)

            # Make prediction for displacement and heading
            predicted_displacement, predicted_heading = model.predict(input_data)[0]

            # Calculate new position based on displacement and heading
            predicted_lat, predicted_lon = calculate_new_position(
                (current_lat, current_lon),
                predicted_displacement,
                predicted_heading
            )

            # Extract all ERA5 data for the predicted position and time
            predicted_era5_values = extract_era5_data(
                predicted_lat,
                predicted_lon,
                next_row['datetime'],
                tree,
                valid_times,
                latitudes,
                longitudes,
                lat_lon_pairs,
                era5_data
            )

            predictions.append([predicted_lat, predicted_lon, next_row['datetime']])

            # The prediction becomes the starting state of the next step
            current_lat, current_lon = predicted_lat, predicted_lon
            current_era5_values = predicted_era5_values

        all_predictions.extend(predictions)

    # dtype=object because each row mixes floats with a datetime
    all_predictions_array = np.array(all_predictions, dtype=object)
    return all_predictions_array

Training data setup

In [None]:
# Read the combined buoy observations and parse their timestamps
buoy_data = pd.read_csv('../data/ai_ready/buoydata/combined_buoy_data_IABP2023.csv')
buoy_data['datetime'] = pd.to_datetime(buoy_data['datetime'])

# Columns carried in X: position, ERA5 wind components, then the two
# bookkeeping columns (buoy identifier and timestamp)
_position_columns = ['Latitude', 'Longitude']
_wind_columns = ['era5_10m_uwnd', 'era5_10m_vwnd', 'era5_100m_uwnd', 'era5_100m_vwnd']
_bookkeeping_columns = ['BuoyID', 'datetime']
columns_for_X = _position_columns + _wind_columns + _bookkeeping_columns

# Features, targets, and the grouping key (one group per buoy)
groups = buoy_data['BuoyID']
y = buoy_data[['displacement', 'heading']]
X = buoy_data[columns_for_X]


Model selection

In [None]:
# LightGBM verbosity suppression; defined first because model_configs uses it
lgb_params = {'verbose': -1}

# Models to evaluate
model_configs = [
    ('ElasticNet', MultiOutputRegressor(ElasticNet(alpha=1.0, l1_ratio=0.5))),
    ('GradientBoosting', MultiOutputRegressor(GradientBoostingRegressor(n_estimators=100, max_depth=5))),
    ('RandomForest', RandomForestRegressor(n_estimators=100, max_depth=10)),
    ('XGBoost', MultiOutputRegressor(XGBRegressor(n_estimators=100, max_depth=6, objective='reg:squarederror'))),
    ('LightGBM', MultiOutputRegressor(lgb.LGBMRegressor(n_estimators=100, max_depth=6, **lgb_params)))
]

# GroupKFold keeps all rows of a buoy in the same fold
cv_folds = 5
group_kf = GroupKFold(n_splits=cv_folds)

# Ensure the predictions directory exists
predictions_dir = '../data/processed/predictions'
os.makedirs(predictions_dir, exist_ok=True)

# One summary dict per model
results = []

# Cross-validation
for model_name, model in model_configs:
    print(f"\nTesting model: {model_name}")
    model_scores = []  # To store RMSE for each fold
    fold_times = []  # To store time taken for each fold

    for fold_num, (train_index, val_index) in enumerate(group_kf.split(X, y, groups=groups)):
        print(f"\nFold {fold_num + 1}")
        start_time = time.time()

        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]

        # iterative_prediction returns rows ordered by BuoyID then datetime, so
        # the reference frame is sorted the same way to stay row-aligned with it.
        X_val_with_buoyid = X_val.sort_values(by=['BuoyID', 'datetime']).reset_index(drop=True)
        X_train = X_train.drop(columns=['BuoyID', 'datetime'])
        X_val = X_val.drop(columns=['BuoyID', 'datetime'])

        # Train the model
        model.fit(X_train, y_train)

        # Predict iteratively
        y_pred = iterative_prediction(
            val_data=X_val_with_buoyid,
            model=model,
            tree=tree,
            valid_times=valid_time_dt,
            latitudes=latitudes,
            longitudes=longitudes,
            lat_lon_pairs=lat_lon_pairs,
            era5_data=era5_data
        )

        # Convert predictions to a DataFrame for easier handling
        y_pred = pd.DataFrame(y_pred, columns=['Latitude', 'Longitude', 'datetime'])

        # Exclude the datetime column for RMSE calculation and ensure numeric dtype
        y_pred_numeric = np.array(y_pred[['Latitude', 'Longitude']].to_numpy(), dtype=np.float64)

        # The predictions are positions, so they are scored against the observed
        # positions (not against the displacement/heading training targets).
        y_true_numeric = X_val_with_buoyid[['Latitude', 'Longitude']].to_numpy(dtype=np.float64)

        # Calculate RMSE
        try:
            rmse = np.sqrt(mean_squared_error(y_true_numeric, y_pred_numeric))
            model_scores.append(rmse)
            print(f"Fold {fold_num + 1} RMSE: {rmse:.3f}")
        except ValueError as e:
            print(f"Error calculating RMSE: {e}")
            continue

        # Record time taken for the fold
        fold_time = time.time() - start_time
        fold_times.append(fold_time)
        print(f"Fold {fold_num + 1} time: {fold_time:.2f} seconds")

        # Save predictions and true values to CSV
        predictions_df = pd.DataFrame({
            'BuoyID': X_val_with_buoyid['BuoyID'].values,
            'True Latitude': X_val_with_buoyid['Latitude'].values,
            'True Longitude': X_val_with_buoyid['Longitude'].values,
            'Predicted Latitude': np.round(y_pred_numeric[:, 0], 4),  # rounded to 4 decimal places
            'Predicted Longitude': np.round(y_pred_numeric[:, 1], 4)  # rounded to 4 decimal places
        })
        predictions_file = os.path.join(predictions_dir, f"{model_name}_fold{fold_num + 1}_predictions.csv")
        predictions_df.to_csv(predictions_file, index=False)

    # Store results for this model; NaN marks a model for which every fold failed
    mean_rmse = np.mean(model_scores) if model_scores else np.nan
    std_rmse = np.std(model_scores) if model_scores else np.nan
    total_time = sum(fold_times)

    results.append({
        'Model': model_name,
        'Mean RMSE': mean_rmse,
        'RMSE StdDev': std_rmse,
        'Total Time (s)': total_time,
        'Mean Time per Fold (s)': np.mean(fold_times) if fold_times else np.nan
    })

    print(f"\nCompleted cross-validation for {model_name}. "
          f"Mean RMSE: {mean_rmse:.3f}, Std. Dev: {std_rmse:.3f}, Total Time: {total_time:.2f} seconds")

# Convert results to a DataFrame and save to CSV
results_df = pd.DataFrame(results)
results_df.to_csv('model_comparison_results.csv', index=False)

# Identify the best model based on mean RMSE (idxmin skips NaN entries)
best_model_index = results_df['Mean RMSE'].idxmin()
best_model_row = results_df.loc[best_model_index]
print(f"\n=== Best model selected: {best_model_row['Model']} ===")
print(f"Mean RMSE: {best_model_row['Mean RMSE']:.3f}, Total Time: {best_model_row['Total Time (s)']:.2f} seconds")

# results_df rows follow model_configs order, so the row label indexes the list
best_model = model_configs[best_model_index][1]

print(f"Best model: {best_model_row['Model']}")
print(f"Mean RMSE: {best_model_row['Mean RMSE']:.3f}")
print(f"Total Time: {best_model_row['Total Time (s)']:.2f} seconds")

Hyperparameter tuning on the best model with Optuna

In [None]:
import optuna
from optuna.pruners import MedianPruner
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
import lightgbm as lgb
import numpy as np
import gc

# Define the objective function for hyperparameter tuning
def objective(trial):
    """
    Optuna objective: mean cross-validated position RMSE for one parameter set.

    The model family is chosen by the module-level best_model_row['Model'];
    hyperparameters are sampled from the trial. Each fold trains on the
    displacement/heading targets, rolls the model forward with
    iterative_prediction, and scores the predicted latitude/longitude against
    the observed positions. The running mean RMSE is reported after every fold
    so the study's pruner can stop unpromising trials.

    Reads the module-level names best_model_row, group_kf, X, y, groups, tree,
    valid_time_dt, latitudes, longitudes, lat_lon_pairs and era5_data.

    Args:
        trial (optuna.trial.Trial): Trial used to sample hyperparameters.

    Returns:
        float: Mean RMSE over the cross-validation folds.

    Raises:
        ValueError: If best_model_row['Model'] is not a supported model name.
        optuna.TrialPruned: If the pruner decides to stop the trial early.
    """
    print(f"Starting trial {trial.number}...")  # Track the start of each trial

    model_name = best_model_row['Model']
    if model_name == 'ElasticNet':
        alpha = trial.suggest_float('alpha', 0.1, 10.0, log=True)
        l1_ratio = trial.suggest_float('l1_ratio', 0.0, 1.0)
        model = MultiOutputRegressor(ElasticNet(alpha=alpha, l1_ratio=l1_ratio))
    elif model_name == 'GradientBoosting':
        n_estimators = trial.suggest_int('n_estimators', 50, 200)
        max_depth = trial.suggest_int('max_depth', 3, 10)
        model = MultiOutputRegressor(GradientBoostingRegressor(n_estimators=n_estimators, max_depth=max_depth))
    elif model_name == 'RandomForest':
        n_estimators = trial.suggest_int('n_estimators', 50, 200)
        max_depth = trial.suggest_int('max_depth', 5, 15)
        model = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth)
    elif model_name == 'XGBoost':
        n_estimators = trial.suggest_int('n_estimators', 50, 200)
        max_depth = trial.suggest_int('max_depth', 3, 10)
        learning_rate = trial.suggest_float('learning_rate', 0.01, 0.3, log=True)
        model = MultiOutputRegressor(XGBRegressor(n_estimators=n_estimators, max_depth=max_depth, learning_rate=learning_rate, objective='reg:squarederror'))
    elif model_name == 'LightGBM':
        n_estimators = trial.suggest_int('n_estimators', 50, 200)
        max_depth = trial.suggest_int('max_depth', 3, 10)
        learning_rate = trial.suggest_float('learning_rate', 0.01, 0.3, log=True)
        # verbose=-1 silences LightGBM's per-fit logging
        model = MultiOutputRegressor(lgb.LGBMRegressor(n_estimators=n_estimators, max_depth=max_depth, learning_rate=learning_rate, verbose=-1))
    else:
        # Fail with a clear message instead of an unbound `model` further down
        raise ValueError(f"Unsupported model for tuning: {model_name}")

    # Cross-validation logic
    model_scores = []
    for fold, (train_index, val_index) in enumerate(group_kf.split(X, y, groups=groups)):
        print(f"  Processing fold {fold + 1}...")  # Track folds

        # Convert to numpy arrays and reduce precision
        X_train = X.iloc[train_index].drop(columns=['BuoyID', 'datetime']).to_numpy(dtype='float32')
        y_train = y.iloc[train_index].to_numpy(dtype='float32')

        # iterative_prediction returns rows ordered by BuoyID then datetime;
        # sort the validation frame the same way so truth and prediction align.
        val_frame = X.iloc[val_index].sort_values(by=['BuoyID', 'datetime']).reset_index(drop=True)

        # Train the model
        model.fit(X_train, y_train)

        # Iterative prediction
        y_pred = iterative_prediction(
            val_data=val_frame,
            model=model,
            tree=tree,
            valid_times=valid_time_dt,
            latitudes=latitudes,
            longitudes=longitudes,
            lat_lon_pairs=lat_lon_pairs,
            era5_data=era5_data
        )

        # Keep only Latitude and Longitude (drop the datetime column) and cast
        # the object array to floats so the metric can be computed
        y_pred_filtered = np.asarray(y_pred[:, :2], dtype=np.float64)
        y_true = val_frame[['Latitude', 'Longitude']].to_numpy(dtype=np.float64)

        # Calculate RMSE between observed and predicted positions
        rmse = np.sqrt(mean_squared_error(y_true, y_pred_filtered))
        model_scores.append(rmse)
        print(f"    Fold {fold + 1} RMSE: {rmse:.4f}")  # Track RMSE per fold

        # Free memory after each fold
        del X_train, y_train, val_frame, y_pred, y_pred_filtered, y_true
        gc.collect()

        # Report the running mean so the study's pruner has something to compare
        trial.report(float(np.mean(model_scores)), step=fold)
        if trial.should_prune():
            print(f"Trial {trial.number} pruned after fold {fold + 1}")
            raise optuna.TrialPruned()

    trial_score = np.mean(model_scores)
    print(f"Trial {trial.number} completed with mean RMSE: {trial_score:.4f}")  # Track trial completion
    return trial_score

# Minimization study; MedianPruner is the pruning strategy attached to it
study = optuna.create_study(pruner=MedianPruner(), direction='minimize')

# Run 10 trials; n_jobs=-1 lets Optuna run as many trials concurrently as there are CPU cores
print("Starting hyperparameter tuning...")
study.optimize(objective, n_jobs=-1, n_trials=10)

# Parameters of the best finished trial
best_params = study.best_params
print(f"Best parameters: {best_params}")

Making predictions with the best tuned model and saving the results for evaluation

In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error, median_absolute_error
from geopy.distance import geodesic
import time

# Randomly select 5 buoys for validation
print("Selecting 5 random buoys for validation...")
np.random.seed(42)  # Set seed for reproducibility
validation_buoys = np.random.choice(X['BuoyID'].unique(), size=5, replace=False)
print(f"Selected validation buoys: {validation_buoys}")

# Split the data into training and validation based on BuoyID
print("Splitting data into training and validation sets...")
train_data = X[~X['BuoyID'].isin(validation_buoys)].copy()
# iterative_prediction returns rows ordered by BuoyID then datetime; sorting the
# validation frame the same way keeps truth, metadata and predictions row-aligned.
# The original index is kept so y can still be looked up by label.
val_data = X[X['BuoyID'].isin(validation_buoys)].sort_values(by=['BuoyID', 'datetime']).copy()

# Ensure y is aligned with the indices of X
y_train = y.loc[train_data.index]
y_val = y.loc[val_data.index]

print(f"Training data size: {train_data.shape[0]} rows")
print(f"Validation data size: {val_data.shape[0]} rows")

# Drop 'BuoyID' and 'datetime' for training
print("Dropping unnecessary columns ('BuoyID', 'datetime') from training and validation sets...")
X_train_clean = train_data.drop(columns=['BuoyID', 'datetime'])
X_val_clean = val_data.drop(columns=['BuoyID', 'datetime'])

# Instantiate the model using best_params
print("Instantiating the model with the best parameters...")
if best_model_row['Model'] == 'ElasticNet':
    best_model = MultiOutputRegressor(ElasticNet(**best_params))
elif best_model_row['Model'] == 'GradientBoosting':
    best_model = MultiOutputRegressor(GradientBoostingRegressor(**best_params))
elif best_model_row['Model'] == 'RandomForest':
    best_model = RandomForestRegressor(**best_params)
elif best_model_row['Model'] == 'XGBoost':
    best_model = MultiOutputRegressor(XGBRegressor(**best_params, objective='reg:squarederror'))
elif best_model_row['Model'] == 'LightGBM':
    best_model = MultiOutputRegressor(lgb.LGBMRegressor(**best_params))

# Train the tuned model on the training data
print("Training the model on the training data...")
best_model.fit(X_train_clean, y_train)
print("Model training completed.")

# Use the iterative_prediction function for evaluation on validation buoys
print("Generating predictions for validation buoys...")
y_pred = iterative_prediction(
    val_data=val_data,
    model=best_model,
    tree=tree,
    valid_times=valid_time_dt,
    latitudes=latitudes,
    longitudes=longitudes,
    lat_lon_pairs=lat_lon_pairs,
    era5_data=era5_data
)
print("Predictions generated successfully.")

# Convert predictions to a DataFrame and include BuoyID and Datetime
print("Preparing predictions DataFrame...")
try:
    # Extract only latitude and longitude columns from y_pred
    pred_lat_lon = y_pred[:, :2]

    # Ensure the extracted data is a NumPy array of float type
    pred_lat_lon = np.array(pred_lat_lon, dtype=np.float64)

    # Round the latitude and longitude values to 3 decimal places
    pred_lat_lon = np.round(pred_lat_lon, 3)

    # Create the predictions DataFrame
    y_pred_df = pd.DataFrame(
        pred_lat_lon, columns=['Predicted Latitude', 'Predicted Longitude']
    )

    # Add metadata columns (BuoyID and Datetime)
    y_pred_df['BuoyID'] = val_data['BuoyID'].values
    y_pred_df['Datetime'] = val_data['datetime'].values

except Exception as e:
    print(f"Error during DataFrame creation: {e}")
    print(f"y_pred shape: {y_pred.shape}, y_pred content (first rows): {y_pred[:5]}")
    raise

# Calculate evaluation metrics
print("Calculating evaluation metrics...")

# Ensure valid arrays for true and predicted values
true_lat_lon = np.array(val_data[['Latitude', 'Longitude']].values, dtype=np.float64)
pred_lat_lon = np.array(y_pred_df[['Predicted Latitude', 'Predicted Longitude']].values, dtype=np.float64)

# Safeguard against scalar values
if true_lat_lon.ndim != 2 or pred_lat_lon.ndim != 2:
    raise ValueError(f"Expected 2D arrays for latitude/longitude, got shapes: "
                     f"true_lat_lon: {true_lat_lon.shape}, pred_lat_lon: {pred_lat_lon.shape}")

# Calculate metrics (in the units of the latitude/longitude columns)
lat_lon_rmse = np.sqrt(mean_squared_error(true_lat_lon, pred_lat_lon))
lat_lon_mae = mean_absolute_error(true_lat_lon, pred_lat_lon)
lat_lon_median_ae = median_absolute_error(true_lat_lon, pred_lat_lon)

# Per-row distance between observed and predicted position, in meters.
# NOTE: the values come from geopy's geodesic(), not from a haversine formula;
# the "Haversine" names and labels are kept so existing outputs stay compatible.
haversine_distances = [
    geodesic(true, pred).meters for true, pred in zip(true_lat_lon, pred_lat_lon)
]
mean_haversine_distance = np.mean(haversine_distances)
median_haversine_distance = np.median(haversine_distances)

print(f"Validation Latitude/Longitude RMSE: {lat_lon_rmse:.3f}")
print(f"Validation Latitude/Longitude MAE: {lat_lon_mae:.3f}")
print(f"Validation Latitude/Longitude Median AE: {lat_lon_median_ae:.3f}")
print(f"Mean Haversine Distance: {mean_haversine_distance:.3f} meters")
print(f"Median Haversine Distance: {median_haversine_distance:.3f} meters")

# Save predictions for analysis
print("Saving predictions to a CSV file...")
predictions_df = pd.DataFrame({
    'BuoyID': val_data['BuoyID'].values,
    'Datetime': val_data['datetime'].values,
    'True Latitude': np.round(val_data['Latitude'].values, 3),
    'True Longitude': np.round(val_data['Longitude'].values, 3),
    'Predicted Latitude': y_pred_df['Predicted Latitude'].values,
    'Predicted Longitude': y_pred_df['Predicted Longitude'].values,
    'Haversine Distance (m)': haversine_distances  # Per-row distance for each prediction
})

# Ensure the output directory exists
predictions_file = '../data/processed/predictions/bestmodel_predictions.csv'
os.makedirs(os.path.dirname(predictions_file), exist_ok=True)
predictions_df.to_csv(predictions_file, index=False)
print(f"Predictions saved successfully to: {predictions_file}")