# Training Gradient Boosted Trees

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
import sys
import math

# Locate the project directories relative to the notebook's working directory:
# parent_dir is the directory containing cwd, base_dir the one containing parent_dir.
cwd = os.getcwd()
parent_dir = os.path.split(cwd)[0]
base_dir = os.path.split(parent_dir)[0]

# Project sources live in <base_dir>/src; prepend that directory to sys.path
# so it is searched first when the project modules are imported.
src_dir = f"{base_dir}/src"
sys.path.insert(0, src_dir)


from calibration import DirectGBT
from data_analysis import plot_positions

In [None]:
###################################
########### Which eMNS? ###########
###################################

# Select the eMNS. The choice determines the directory the split dataset is
# read from (data_dir) and the directory the trained model is saved to
# (store_model_dir).
emns = "octomag" # "octomag" or "navion"
if emns == "octomag":
    data_dir = base_dir + "/data/octomag_data/split_dataset/"
    store_model_dir = cwd + "/trees/"
elif emns == "navion":
    data_dir = base_dir + "/data/navion_data/split_dataset/"
    store_model_dir = cwd + "/navion_trees/"
else:
    # Fail here with a clear message instead of a NameError on data_dir /
    # store_model_dir in a later cell.
    raise ValueError(f'Unknown eMNS {emns!r}; expected "octomag" or "navion"')

## Load data

In [None]:
# Any value other than 100 makes the loading cell read the
# "training_data_<value>.pkl" / "validation_data_<value>.pkl" files; 100 reads
# the unsuffixed "training_data.pkl" / "validation_data.pkl".
dataset_percentage = 100 # percent of the training/validation dataset to be used

In [None]:
# Subsampled splits are stored as "<split>_data_<percentage>.pkl"; the full
# splits (dataset_percentage == 100) carry no suffix.
split_suffix = "" if dataset_percentage == 100 else f"_{dataset_percentage}"
training_data_name = data_dir + f"training_data{split_suffix}.pkl"
validation_data_name = data_dir + f"validation_data{split_suffix}.pkl"

# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
training_data = pd.read_pickle(training_data_name)
validation_data = pd.read_pickle(validation_data_name)
# The test split is always the full, unsuffixed file.
test_data = pd.read_pickle(data_dir + "test_data.pkl")

# Columns whose name starts with "em_" (used below as the current inputs of the model).
em_cols = [col for col in training_data.columns if col.startswith("em_")]

### Visualize data

In [None]:
# Switches for the visualization cells below:
# plot_3d gates the three plot_positions calls (train / validation / test),
# plot_distributions gates the per-feature histogram grid.
plot_3d = True
plot_distributions = True

In [None]:
# Plot the positions of the training split (skipped when plot_3d is False).
if plot_3d:
    plot_positions(training_data, title="Training Data Positions Distribution")

In [None]:
# Plot the positions of the validation split (skipped when plot_3d is False).
if plot_3d:
    plot_positions(validation_data, title="Validation Data Positions Distribution")

In [None]:
# Plot the positions of the test split (skipped when plot_3d is False).
if plot_3d:
    plot_positions(test_data, title="Test Data Positions Distribution")

In [None]:
if plot_distributions:
    # One panel per feature, overlaying the three splits as density-normalized
    # histograms so that differently sized splits remain comparable.
    features_to_plot = ["Bx", "By", "Bz", "x", "y", "z"]

    ncols = 3
    n = len(features_to_plot)
    nrows = math.ceil(n / ncols)  # enough rows to hold every feature

    fig, axes = plt.subplots(
        nrows,
        ncols,
        figsize=(5*ncols, 3.5*nrows),
        squeeze=False,
    )
    axes = axes.ravel()

    splits = (
        ("Train", training_data),
        ("Val", validation_data),
        ("Test", test_data),
    )
    for ax, feature in zip(axes, features_to_plot):
        for split_label, split_df in splits:
            ax.hist(split_df[feature], bins=50, alpha=0.5, label=split_label, density=True)
        ax.set_title(feature)
        ax.set_xlabel(feature)
        ax.set_ylabel("Density")

    # Every panel carries the same three labels, so the first panel's handles
    # serve as a single figure-level legend.
    handles, labels = axes[0].get_legend_handles_labels()
    fig.legend(handles, labels, loc="upper right", frameon=True)

    fig.suptitle("Distributions (Train / Val / Test)", y=1.02)
    fig.tight_layout()
    plt.show()

## Parameters for training

In [None]:
# Normalization switches (passed to model.train as normalize_targets / normalize_position)
normalize_targets = True
normalize_position = False

# GBT hyperparameters (passed unchanged to model.train)
n_estimators = 5000
learning_rate = 0.1
num_leaves = 128
min_child_samples = 20
subsample = 1.0
colsample_bytree = 1.0
random_state = 2

# Early stopping (passed to model.train as early_stopping_patience;
# presumably the number of rounds without validation improvement - confirm in DirectGBT.train)
early_stopping_patience = 200


# Verbosity flag passed to model.train
verbose = True

## Create and train model

In [None]:
# Instantiate the model; its name records the dataset percentage and the leaf count.
model = DirectGBT(
    name=f"DirectGBT_{dataset_percentage}_{num_leaves}",
    current_names=em_cols,
)

# Train with the training split, handing over the validation split and the
# settings defined in the parameters cell.
model.train(
    # Data
    train_df=training_data,
    val_df=validation_data,
    # Normalization
    normalize_targets=normalize_targets,
    normalize_position=normalize_position,
    # Hyperparameters
    n_estimators=n_estimators,
    learning_rate=learning_rate,
    num_leaves=num_leaves,
    min_child_samples=min_child_samples,
    subsample=subsample,
    colsample_bytree=colsample_bytree,
    random_state=random_state,
    # Early stopping
    early_stopping_patience=early_stopping_patience,
    # Logging
    verbose=verbose,
)

## Test trained model

### Test dataset

In [None]:
# Evaluate the trained model on the test split.
position_test = test_data[["x", "y", "z"]].to_numpy()
currents_test = test_data[em_cols].to_numpy()
targets_test = test_data[["Bx", "By", "Bz"]].to_numpy()

predictions_test = model.predict_targets(
    position=position_test,
    currents=currents_test,
)

# Floor applied to every magnitude used as a divisor, so rows with a zero
# magnitude do not divide by zero.
eps = 1e-3

# Per-sample error vector and its Euclidean norm: one value per row, not a
# single norm over the whole matrix.
error = predictions_test - targets_test
rse = np.linalg.norm(error, axis=1)

# Per-sample magnitudes of the target and predicted vectors.
mag_t = np.linalg.norm(targets_test, axis=1)
mag_p = np.linalg.norm(predictions_test, axis=1)

# Per-sample relative magnitude error, measured against the (floored) target magnitude.
magnitude_rel_error = np.abs(mag_p - mag_t) / np.maximum(mag_t, eps)

# Per-sample angle between prediction and target. The cosine is clipped to
# [-1, 1] so rounding errors cannot push arccos out of its domain.
dot = np.sum(predictions_test * targets_test, axis=1)
cosang = dot / (np.maximum(mag_p, eps) * np.maximum(mag_t, eps))
angle_error = np.arccos(np.clip(cosang, -1.0, 1.0))  # radians
angle_error_deg = np.degrees(angle_error)

In [None]:
# Plot error distributions
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# Box plot for RSE, Magnitude Relative Error, Angle Error
axes[0].boxplot(rse, vert=True, showfliers=False)
axes[0].set_title("RSE Distribution")
axes[0].set_ylabel("RSE (T)")
axes[1].boxplot(magnitude_rel_error, vert=True, showfliers=False)
axes[1].set_title("Magnitude Relative Error Distribution")
axes[1].set_ylabel("Relative Error")
axes[2].boxplot(angle_error_deg, vert=True, showfliers=False)
axes[2].set_title("Angle Error Distribution")   
axes[2].set_ylabel("Angle Error (degrees)")
fig.suptitle("Test Set Error Distributions")
plt.show()

### Training dataset

In [None]:
# Test on test set
position_train = training_data[["x", "y", "z"]].to_numpy()
currents_train = training_data[em_cols].to_numpy()
targets_train  = training_data[["Bx", "By", "Bz"]].to_numpy()

predictions_train = model.predict_targets(position=position_train, currents=currents_train)

# Error matrix
error = predictions_train - targets_train

# Overall error norm (Frobenius)
rse = np.linalg.norm(error, axis=1)            # scalar

# Per-sample magnitudes
mag_t = np.linalg.norm(targets_train, axis=1)        # (N,)
mag_p = np.linalg.norm(predictions_train, axis=1)    # (N,)

# Relative magnitude error (per-sample), safe for mag_t == 0
eps = 1e-3
magnitude_rel_error = np.abs((mag_p - mag_t) / np.maximum(mag_t, eps))  # (N,)

# Angle error (per-sample), safe for zero magnitudes
dot = np.sum(predictions_train * targets_train, axis=1)                  # (N,)
cosang = dot / (np.maximum(mag_p, eps) * np.maximum(mag_t, eps))        # (N,)
angle_error = np.arccos(np.clip(cosang, -1.0, 1.0))                     # radians (N,)

# Optiondegrees
angle_error_deg = np.degrees(angle_error)

### Compare training set performance with test set performance

In [None]:
def eval_df(df, em_cols, model, eps=1e-9):
    """
    Evaluate `model` on one dataframe and collect its error metrics.

    Parameters
    ----------
    df : dataframe with the columns x, y, z, the columns listed in `em_cols`,
        and Bx, By, Bz.
    em_cols : names of the columns passed to the model as currents.
    model : object exposing `predict_targets(position=..., currents=...)`.
    eps : lower bound applied to the magnitudes used as divisors, so that
        zero-magnitude rows do not divide by zero.

    Returns
    -------
    dict with the keys
        "pred", "tgt", "err"  : predictions, targets and their difference,
        "rse"                 : per-sample Euclidean norm of the error,
        "mag_rel_err"         : per-sample relative magnitude error,
        "angle_err_deg"       : per-sample angle between prediction and
                                target, in degrees,
        "rmse"                : scalar, square root of the mean squared "rse".
    """
    positions = df[["x", "y", "z"]].to_numpy()
    currents = df[em_cols].to_numpy()
    targets = df[["Bx", "By", "Bz"]].to_numpy()

    predicted = model.predict_targets(position=positions, currents=currents)
    residual = predicted - targets
    sample_error = np.linalg.norm(residual, axis=1)

    target_norm = np.linalg.norm(targets, axis=1)
    predicted_norm = np.linalg.norm(predicted, axis=1)

    # Clamped magnitudes shared by both ratio-based metrics.
    safe_target_norm = np.maximum(target_norm, eps)
    safe_predicted_norm = np.maximum(predicted_norm, eps)

    # Clip the cosine into arccos' domain to absorb rounding errors.
    cos_angle = np.sum(predicted * targets, axis=1) / (safe_predicted_norm * safe_target_norm)
    angle_rad = np.arccos(np.clip(cos_angle, -1.0, 1.0))

    return {
        "pred": predicted,
        "tgt": targets,
        "err": residual,
        "rse": sample_error,
        "mag_rel_err": np.abs(predicted_norm - target_norm) / safe_target_norm,
        "angle_err_deg": np.degrees(angle_rad),
        "rmse": np.sqrt(np.mean(sample_error**2)),
    }

# --- Evaluate the model on the training and the test split with the same routine ---
print("Eval train data")
train_metrics = eval_df(training_data, em_cols, model, eps=1e-9)
print("Eval test data")
test_metrics  = eval_df(test_data,     em_cols, model, eps=1e-9)

print(f"Train RMSE: {train_metrics['rmse']:.6g}")
print(f"Test  RMSE: {test_metrics['rmse']:.6g}")

# --- Plot: boxplot of RSE (left) + RMSE bars (right) ---
fig, axes = plt.subplots(1, 2, figsize=(8.0, 3.2), constrained_layout=True)

split_names = ["Train", "Test"]

# Left: RSE distributions (outlier markers hidden).
# The tick labels are set on the axis afterwards rather than through boxplot's
# `labels` keyword, which Matplotlib 3.9 deprecated in favour of `tick_labels`;
# set_xticklabels does not depend on either keyword.
axes[0].boxplot(
    [train_metrics["rse"], test_metrics["rse"]],
    showfliers=False
)
axes[0].set_xticklabels(split_names)
axes[0].set_title("RSE distribution")
axes[0].set_ylabel("RSE (field units)")  # change to mT or T if you know for sure

# Right: RMSE bars
axes[1].bar(split_names, [train_metrics["rmse"], test_metrics["rmse"]])
axes[1].set_title("RMSE")
axes[1].set_ylabel("RMSE (field units)")

plt.show()

## Store the model

In [None]:
# Save the trained model to the directory chosen in the eMNS selection cell.
model.save(store_model_dir)