In [14]:
import GPy
import numpy as np
import matplotlib.pyplot as plt
import utils
from functools import partial
from utils import KernelFunction, KernelEnvironment, log_likelihood_reward
from utils import plot_kernel_function, compare_kernels
from gflownet import GFlowNet 
import torch.nn.functional as F
from torch.distributions import Categorical
import torch
from utils import ForwardPolicy, BackwardPolicy
import random
from utils import train

from evaluation import create_random_kernel
import itertools
from functools import partial

from evaluation import calculate_l1_distance

import pandas as pd

from copy import deepcopy

from evaluation import calculate_rmse
from automated_statistician import greedy_search


In [None]:
def create_env(batch_size=64):
    """Build a fresh ``KernelEnvironment`` for the current experiment.

    ``MAX_LEN`` and ``log_reward_fn`` are looked up as notebook globals at call
    time, so both must already be defined when this function is invoked.

    Args:
        batch_size: Forwarded unchanged as the environment's ``batch_size``.

    Returns:
        A newly constructed ``KernelEnvironment``.
    """
    env_kwargs = dict(
        batch_size=batch_size,
        max_trajectory_length=MAX_LEN,
        log_reward=log_reward_fn,
    )
    return KernelEnvironment(**env_kwargs)

In [6]:
# Rejection-sample the ground truth: keep drawing a random kernel (plus one
# training draw and one held-out draw generated from it) until the log marginal
# likelihood of the training draw under that kernel is no longer negative.
while True:
    true_kernel = create_random_kernel()
    X, Y, true_kernel_str = utils.generate_gp_data(
        true_kernel, input_dim=1, n_points=30, noise_var=1e-4
    )
    X_test, Y_test, _ = utils.generate_gp_data(
        true_kernel, input_dim=1, n_points=30, noise_var=1e-4
    )
    ll = utils.evaluate_likelihood(true_kernel, X, Y, runtime=False)
    if not ll < 0:
        break

# First line reports the training draw (X, Y), second line the held-out draw
# (X_test, Y_test); both use the same label text.
print(
    "True Kernel:", true_kernel_str,
    "Log Marginal Likelihood:", utils.evaluate_likelihood(true_kernel, X, Y, runtime=False),
)
print(
    "True Kernel:", true_kernel_str,
    "Log Marginal Likelihood:", utils.evaluate_likelihood(true_kernel, X_test, Y_test, runtime=False),
)

#lt.scatter(X, Y, color='black', s=10, label='Data Points')

  -> Randomizing 'RBF' params...
  -> Randomizing 'RBF' params...
  -> Randomizing 'Constant' params...
True Kernel: ((RBF({'lengthscale': 1.099, 'variance': 1.155}) + RBF({'lengthscale': 1.429, 'variance': 0.917})) + Constant({'variance': 1.261})) Log Marginal Likelihood: 41.07869778729621
True Kernel: ((RBF({'lengthscale': 1.099, 'variance': 1.155}) + RBF({'lengthscale': 1.429, 'variance': 0.917})) + Constant({'variance': 1.261})) Log Marginal Likelihood: 46.49716139384765


In [7]:
# Pre-bind the training data (X, Y) as the first two positional arguments of
# log_likelihood_reward. create_env() hands this callable to KernelEnvironment
# as its `log_reward`, so this cell must run before any environment is built.
log_reward_fn = partial(log_likelihood_reward, X, Y)

In [8]:
# --- 1. Define the Hyperparameter Grid ---
param_grid = {
    'lr': [1e-4, 1e-3, 1e-2],
    'BATCH_SIZE': [16, 64, 256],
    'criterion': ['db', 'tb', 'subtb', 'cb'],
    'epsilon': [0.5],  # Initial epsilon for the forward policy
    'min_eps': [1e-2],   # Minimum epsilon for the scheduler
    'clamp_g': [1.0] # Clamping value for the gradient
}

# Fixed (non-searched) settings. They are defined once, before the loop,
# because create_env() reads MAX_LEN as a notebook global and later cells reuse
# both names; assigning them inside the loop left them undefined whenever the
# loop body had not run yet.
epochs = 100
MAX_LEN = 4

# --- 2. Prepare for Iteration ---
# Generate a list of all hyperparameter combinations.
keys, values = zip(*param_grid.items())
param_combinations = [dict(zip(keys, v)) for v in itertools.product(*values)]

# Initialize a dictionary to store the best result for each criterion.
# A 'model' entry is added to a criterion's record the first time it improves.
best_results_per_criterion = {
    crit: {'best_l1': float('inf'), 'best_params': None}
    for crit in param_grid['criterion']
}

# --- 3. Run the Grid Search ---
print(f"Starting grid search with {len(param_combinations)} combinations...")

for i, params in enumerate(param_combinations):
    print(f"\n--- Combination {i+1}/{len(param_combinations)} ---")
    print(f"Parameters: {params}")

    # Unpack current combination of parameters
    lr = params['lr']
    BATCH_SIZE = params['BATCH_SIZE']
    criterion = params['criterion']
    initial_epsilon = params['epsilon']
    min_eps = params['min_eps']
    clamp_g = params['clamp_g']

    # --- Model Initialization ---
    # This environment is only used to read the action-space size for the
    # policy head; train() builds its own environments through create_env.
    env = create_env()

    forward_model = ForwardPolicy(
        input_dim=MAX_LEN,
        output_dim=env.action_space_size,
        epsilon=initial_epsilon
    )
    backward_model = BackwardPolicy()

    gflownet = GFlowNet(
        forward_flow=forward_model,
        backward_flow=backward_model,
        criterion=criterion
    )

    # Add attributes to model for mock calculation
    gflownet.lr = lr

    # --- Training ---
    trained_gflownet, losses = train(
        gflownet=gflownet,
        create_env=create_env,
        epochs=epochs,
        batch_size=BATCH_SIZE,
        lr=lr,
        min_eps=min_eps,
        clamp_g=clamp_g,
        use_scheduler=True
    )

    # --- Evaluation and Logging ---
    # Score the model returned by train(), i.e. the same object that is stored
    # below and that the robust-evaluation cell scores, rather than the
    # pre-training handle.
    l1 = calculate_l1_distance(trained_gflownet.forward_flow, KernelEnvironment, MAX_LEN, X, Y)
    print(f"    => Result: L1 distance = {l1:.4f}")

    # Check if this is the new best result *for this specific criterion*
    best_for_criterion = best_results_per_criterion[criterion]
    if l1 < best_for_criterion['best_l1']:
        print(f"    ✨ New best L1 for criterion '{criterion}': {l1:.4f} ✨")
        best_for_criterion['best_l1'] = l1
        best_for_criterion['best_params'] = params
        # Snapshot the model so later runs cannot mutate the stored winner.
        best_for_criterion['model'] = deepcopy(trained_gflownet)


# --- 4. Final Results Table ---
print("\n--- Grid Search Complete ---")
print("Best results per criterion:")

# Prepare data for the pandas DataFrame. The loop variable is deliberately not
# called `criterion`, so the per-combination value above is not overwritten.
table_data = []
for crit_name, results in best_results_per_criterion.items():
    if results['best_params']:  # Check if any result was found
        row = {
            'Criterion': crit_name,
            'Best L1 Distance': f"{results['best_l1']:.4f}",
            'Learning Rate': results['best_params']['lr'],
            'Batch Size': results['best_params']['BATCH_SIZE'],
            'Clamp Value': results['best_params']['clamp_g']
        }
        table_data.append(row)

if table_data:
    # Create and display the DataFrame
    df = pd.DataFrame(table_data)
    print(df.to_string(index=False))
else:
    print("No results were recorded.")

Starting grid search with 36 combinations...

--- Combination 1/36 ---
Parameters: {'lr': 0.0001, 'BATCH_SIZE': 16, 'criterion': 'db', 'epsilon': 0.5, 'min_eps': 0.01, 'clamp_g': 1.0}


100%|██████████| 100/100 [00:47<00:00,  2.08it/s, loss=22.3]


    => Result: L1 distance = 0.8366
    ✨ New best L1 for criterion 'db': 0.8366 ✨

--- Combination 2/36 ---
Parameters: {'lr': 0.0001, 'BATCH_SIZE': 16, 'criterion': 'tb', 'epsilon': 0.5, 'min_eps': 0.01, 'clamp_g': 1.0}


100%|██████████| 100/100 [00:02<00:00, 45.26it/s, loss=37.8]


    => Result: L1 distance = 0.9660
    ✨ New best L1 for criterion 'tb': 0.9660 ✨

--- Combination 3/36 ---
Parameters: {'lr': 0.0001, 'BATCH_SIZE': 16, 'criterion': 'subtb', 'epsilon': 0.5, 'min_eps': 0.01, 'clamp_g': 1.0}


100%|██████████| 100/100 [00:02<00:00, 49.69it/s, loss=75.5]


    => Result: L1 distance = 0.8383
    ✨ New best L1 for criterion 'subtb': 0.8383 ✨

--- Combination 4/36 ---
Parameters: {'lr': 0.0001, 'BATCH_SIZE': 16, 'criterion': 'cb', 'epsilon': 0.5, 'min_eps': 0.01, 'clamp_g': 1.0}


100%|██████████| 100/100 [00:01<00:00, 54.15it/s, loss=42.5]


    => Result: L1 distance = 0.8712
    ✨ New best L1 for criterion 'cb': 0.8712 ✨

--- Combination 5/36 ---
Parameters: {'lr': 0.0001, 'BATCH_SIZE': 64, 'criterion': 'db', 'epsilon': 0.5, 'min_eps': 0.01, 'clamp_g': 1.0}


100%|██████████| 100/100 [00:03<00:00, 25.95it/s, loss=19] 


    => Result: L1 distance = 0.9257

--- Combination 6/36 ---
Parameters: {'lr': 0.0001, 'BATCH_SIZE': 64, 'criterion': 'tb', 'epsilon': 0.5, 'min_eps': 0.01, 'clamp_g': 1.0}


100%|██████████| 100/100 [00:05<00:00, 18.34it/s, loss=28.4]


    => Result: L1 distance = 0.7166
    ✨ New best L1 for criterion 'tb': 0.7166 ✨

--- Combination 7/36 ---
Parameters: {'lr': 0.0001, 'BATCH_SIZE': 64, 'criterion': 'subtb', 'epsilon': 0.5, 'min_eps': 0.01, 'clamp_g': 1.0}


100%|██████████| 100/100 [00:04<00:00, 24.15it/s, loss=91.8]


    => Result: L1 distance = 0.7853
    ✨ New best L1 for criterion 'subtb': 0.7853 ✨

--- Combination 8/36 ---
Parameters: {'lr': 0.0001, 'BATCH_SIZE': 64, 'criterion': 'cb', 'epsilon': 0.5, 'min_eps': 0.01, 'clamp_g': 1.0}


100%|██████████| 100/100 [00:05<00:00, 18.70it/s, loss=41.3]


    => Result: L1 distance = 0.8457
    ✨ New best L1 for criterion 'cb': 0.8457 ✨

--- Combination 9/36 ---
Parameters: {'lr': 0.0001, 'BATCH_SIZE': 256, 'criterion': 'db', 'epsilon': 0.5, 'min_eps': 0.01, 'clamp_g': 1.0}


100%|██████████| 100/100 [00:11<00:00,  8.52it/s, loss=22.4]


    => Result: L1 distance = 0.8157
    ✨ New best L1 for criterion 'db': 0.8157 ✨

--- Combination 10/36 ---
Parameters: {'lr': 0.0001, 'BATCH_SIZE': 256, 'criterion': 'tb', 'epsilon': 0.5, 'min_eps': 0.01, 'clamp_g': 1.0}


100%|██████████| 100/100 [00:07<00:00, 13.51it/s, loss=39.8]


    => Result: L1 distance = 0.9682

--- Combination 11/36 ---
Parameters: {'lr': 0.0001, 'BATCH_SIZE': 256, 'criterion': 'subtb', 'epsilon': 0.5, 'min_eps': 0.01, 'clamp_g': 1.0}


100%|██████████| 100/100 [00:07<00:00, 13.06it/s, loss=89.7]


    => Result: L1 distance = 0.7655
    ✨ New best L1 for criterion 'subtb': 0.7655 ✨

--- Combination 12/36 ---
Parameters: {'lr': 0.0001, 'BATCH_SIZE': 256, 'criterion': 'cb', 'epsilon': 0.5, 'min_eps': 0.01, 'clamp_g': 1.0}


100%|██████████| 100/100 [00:09<00:00, 10.04it/s, loss=37.5]


    => Result: L1 distance = 0.8549

--- Combination 13/36 ---
Parameters: {'lr': 0.001, 'BATCH_SIZE': 16, 'criterion': 'db', 'epsilon': 0.5, 'min_eps': 0.01, 'clamp_g': 1.0}


100%|██████████| 100/100 [00:01<00:00, 53.27it/s, loss=20.5]


    => Result: L1 distance = 0.8242

--- Combination 14/36 ---
Parameters: {'lr': 0.001, 'BATCH_SIZE': 16, 'criterion': 'tb', 'epsilon': 0.5, 'min_eps': 0.01, 'clamp_g': 1.0}


100%|██████████| 100/100 [00:01<00:00, 58.57it/s, loss=13.3]


    => Result: L1 distance = 0.7508

--- Combination 15/36 ---
Parameters: {'lr': 0.001, 'BATCH_SIZE': 16, 'criterion': 'subtb', 'epsilon': 0.5, 'min_eps': 0.01, 'clamp_g': 1.0}


100%|██████████| 100/100 [00:02<00:00, 37.86it/s, loss=66.5]


    => Result: L1 distance = 1.0562

--- Combination 16/36 ---
Parameters: {'lr': 0.001, 'BATCH_SIZE': 16, 'criterion': 'cb', 'epsilon': 0.5, 'min_eps': 0.01, 'clamp_g': 1.0}


100%|██████████| 100/100 [00:01<00:00, 63.94it/s, loss=19] 


    => Result: L1 distance = 0.7428
    ✨ New best L1 for criterion 'cb': 0.7428 ✨

--- Combination 17/36 ---
Parameters: {'lr': 0.001, 'BATCH_SIZE': 64, 'criterion': 'db', 'epsilon': 0.5, 'min_eps': 0.01, 'clamp_g': 1.0}


100%|██████████| 100/100 [00:04<00:00, 21.74it/s, loss=16.9]


    => Result: L1 distance = 1.0331

--- Combination 18/36 ---
Parameters: {'lr': 0.001, 'BATCH_SIZE': 64, 'criterion': 'tb', 'epsilon': 0.5, 'min_eps': 0.01, 'clamp_g': 1.0}


100%|██████████| 100/100 [00:04<00:00, 23.98it/s, loss=15.6]


    => Result: L1 distance = 0.7636

--- Combination 19/36 ---
Parameters: {'lr': 0.001, 'BATCH_SIZE': 64, 'criterion': 'subtb', 'epsilon': 0.5, 'min_eps': 0.01, 'clamp_g': 1.0}


100%|██████████| 100/100 [00:03<00:00, 25.14it/s, loss=80.9]


    => Result: L1 distance = 0.8853

--- Combination 20/36 ---
Parameters: {'lr': 0.001, 'BATCH_SIZE': 64, 'criterion': 'cb', 'epsilon': 0.5, 'min_eps': 0.01, 'clamp_g': 1.0}


100%|██████████| 100/100 [00:02<00:00, 33.47it/s, loss=29] 


    => Result: L1 distance = 0.8729

--- Combination 21/36 ---
Parameters: {'lr': 0.001, 'BATCH_SIZE': 256, 'criterion': 'db', 'epsilon': 0.5, 'min_eps': 0.01, 'clamp_g': 1.0}


100%|██████████| 100/100 [00:10<00:00,  9.26it/s, loss=18.7]


    => Result: L1 distance = 0.9305

--- Combination 22/36 ---
Parameters: {'lr': 0.001, 'BATCH_SIZE': 256, 'criterion': 'tb', 'epsilon': 0.5, 'min_eps': 0.01, 'clamp_g': 1.0}


100%|██████████| 100/100 [00:07<00:00, 13.78it/s, loss=14.8]


    => Result: L1 distance = 0.8046

--- Combination 23/36 ---
Parameters: {'lr': 0.001, 'BATCH_SIZE': 256, 'criterion': 'subtb', 'epsilon': 0.5, 'min_eps': 0.01, 'clamp_g': 1.0}


100%|██████████| 100/100 [00:06<00:00, 15.01it/s, loss=80.8]


    => Result: L1 distance = 0.9233

--- Combination 24/36 ---
Parameters: {'lr': 0.001, 'BATCH_SIZE': 256, 'criterion': 'cb', 'epsilon': 0.5, 'min_eps': 0.01, 'clamp_g': 1.0}


100%|██████████| 100/100 [00:08<00:00, 11.12it/s, loss=26.4]


    => Result: L1 distance = 0.7255
    ✨ New best L1 for criterion 'cb': 0.7255 ✨

--- Combination 25/36 ---
Parameters: {'lr': 0.01, 'BATCH_SIZE': 16, 'criterion': 'db', 'epsilon': 0.5, 'min_eps': 0.01, 'clamp_g': 1.0}


100%|██████████| 100/100 [00:01<00:00, 64.52it/s, loss=5.01]


    => Result: L1 distance = 0.8428

--- Combination 26/36 ---
Parameters: {'lr': 0.01, 'BATCH_SIZE': 16, 'criterion': 'tb', 'epsilon': 0.5, 'min_eps': 0.01, 'clamp_g': 1.0}


100%|██████████| 100/100 [00:01<00:00, 66.55it/s, loss=1.17]


    => Result: L1 distance = 0.9138

--- Combination 27/36 ---
Parameters: {'lr': 0.01, 'BATCH_SIZE': 16, 'criterion': 'subtb', 'epsilon': 0.5, 'min_eps': 0.01, 'clamp_g': 1.0}


100%|██████████| 100/100 [00:02<00:00, 41.04it/s, loss=9.87]


    => Result: L1 distance = 0.8046

--- Combination 28/36 ---
Parameters: {'lr': 0.01, 'BATCH_SIZE': 16, 'criterion': 'cb', 'epsilon': 0.5, 'min_eps': 0.01, 'clamp_g': 1.0}


100%|██████████| 100/100 [00:01<00:00, 54.90it/s, loss=9.16]


    => Result: L1 distance = 0.9097

--- Combination 29/36 ---
Parameters: {'lr': 0.01, 'BATCH_SIZE': 64, 'criterion': 'db', 'epsilon': 0.5, 'min_eps': 0.01, 'clamp_g': 1.0}


100%|██████████| 100/100 [00:04<00:00, 20.73it/s, loss=6.33]


    => Result: L1 distance = 0.7092
    ✨ New best L1 for criterion 'db': 0.7092 ✨

--- Combination 30/36 ---
Parameters: {'lr': 0.01, 'BATCH_SIZE': 64, 'criterion': 'tb', 'epsilon': 0.5, 'min_eps': 0.01, 'clamp_g': 1.0}


100%|██████████| 100/100 [00:03<00:00, 32.02it/s, loss=2.95]


    => Result: L1 distance = 0.7978

--- Combination 31/36 ---
Parameters: {'lr': 0.01, 'BATCH_SIZE': 64, 'criterion': 'subtb', 'epsilon': 0.5, 'min_eps': 0.01, 'clamp_g': 1.0}


100%|██████████| 100/100 [00:03<00:00, 25.85it/s, loss=3.78]


    => Result: L1 distance = 0.7700

--- Combination 32/36 ---
Parameters: {'lr': 0.01, 'BATCH_SIZE': 64, 'criterion': 'cb', 'epsilon': 0.5, 'min_eps': 0.01, 'clamp_g': 1.0}


100%|██████████| 100/100 [00:03<00:00, 28.08it/s, loss=9.06]


    => Result: L1 distance = 0.8658

--- Combination 33/36 ---
Parameters: {'lr': 0.01, 'BATCH_SIZE': 256, 'criterion': 'db', 'epsilon': 0.5, 'min_eps': 0.01, 'clamp_g': 1.0}


100%|██████████| 100/100 [00:07<00:00, 13.52it/s, loss=2.78]


    => Result: L1 distance = 1.0009

--- Combination 34/36 ---
Parameters: {'lr': 0.01, 'BATCH_SIZE': 256, 'criterion': 'tb', 'epsilon': 0.5, 'min_eps': 0.01, 'clamp_g': 1.0}


100%|██████████| 100/100 [00:07<00:00, 13.32it/s, loss=2.15]


    => Result: L1 distance = 0.8787

--- Combination 35/36 ---
Parameters: {'lr': 0.01, 'BATCH_SIZE': 256, 'criterion': 'subtb', 'epsilon': 0.5, 'min_eps': 0.01, 'clamp_g': 1.0}


100%|██████████| 100/100 [00:08<00:00, 11.43it/s, loss=10.5]


    => Result: L1 distance = 0.7792

--- Combination 36/36 ---
Parameters: {'lr': 0.01, 'BATCH_SIZE': 256, 'criterion': 'cb', 'epsilon': 0.5, 'min_eps': 0.01, 'clamp_g': 1.0}


100%|██████████| 100/100 [00:10<00:00,  9.83it/s, loss=3.71]


    => Result: L1 distance = 0.8350

--- Grid Search Complete ---
Best results per criterion:
Criterion Best L1 Distance  Learning Rate  Batch Size  Clamp Value
       db           0.7092         0.0100          64          1.0
       tb           0.7166         0.0001          64          1.0
    subtb           0.7655         0.0001         256          1.0
       cb           0.7255         0.0010         256          1.0


In [7]:
# Re-display the grid-search summary held in `table_data` (built by the
# grid-search cell), or a placeholder message when nothing was recorded.
if not table_data:
    print("No results were recorded.")
else:
    df = pd.DataFrame(table_data)
    print(df.to_string(index=False))

Criterion Best L1 Distance  Learning Rate  Batch Size  Clamp Value
       db           0.6132          0.001          16          1.0
       tb           0.5095          0.010          16          1.0
    subtb           0.5212          0.010          64          1.0
       cb           0.4756          0.010         256          1.0


In [11]:
# --- 5. Robust Evaluation: Re-train and Evaluate Multiple Times ---
# For each criterion, re-train from scratch N_ROBUSTNESS_RUNS times with the
# best hyperparameters found by the grid search and aggregate L1 distance,
# RMSE and held-out likelihood across the runs.
print("\n" + "="*50)
print("--- Robust Evaluation from Best Hyperparameters ---")
print("="*50 + "\n")

N_ROBUSTNESS_RUNS = 3
final_stats_results = []


def _mean_std(values):
    """Return ``(mean, std)`` of ``values`` using NumPy defaults.

    ``np.std`` defaults to ``ddof=0`` (population standard deviation), which is
    what the reported "Std" columns contain.
    """
    return np.mean(values), np.std(values)


for criterion, results in best_results_per_criterion.items():
    best_params = results.get('best_params')
    if not best_params:
        print(f"Skipping robust evaluation for '{criterion}' as no best parameters were found.\n")
        continue

    print(f"Starting robust evaluation for criterion '{criterion}' with params: {best_params}")

    # The best parameters are the same for every run of this criterion, so
    # unpack them once rather than inside the per-run loop.
    lr = best_params['lr']
    BATCH_SIZE = best_params['BATCH_SIZE']

    # Lists to store metrics from each of the N runs
    run_mean_likelihoods = []
    run_max_likelihoods = []
    run_l1_distances = []
    run_mean_rmses = []

    for run in range(N_ROBUSTNESS_RUNS):
        print(f"  -> Run {run + 1}/{N_ROBUSTNESS_RUNS}...")

        # Initialize and train a new model from scratch
        env = create_env()
        forward_model = ForwardPolicy(
            input_dim=MAX_LEN,
            output_dim=env.action_space_size,
            epsilon=best_params['epsilon'],
        )
        backward_model = BackwardPolicy()
        gflownet = GFlowNet(forward_flow=forward_model, backward_flow=backward_model, criterion=criterion)
        gflownet.lr = lr # For mock calculation

        # Reuse the grid search's `epochs` so the re-training budget cannot
        # silently diverge from the one the hyperparameters were selected with.
        trained_gflownet, _ = train(
            gflownet=gflownet,
            create_env=create_env,
            epochs=epochs,
            batch_size=BATCH_SIZE,
            lr=lr,
            min_eps=best_params['min_eps'],
            clamp_g=best_params['clamp_g'],
            use_scheduler=True,
        )

        # Calculate L1 distance for this run
        l1 = calculate_l1_distance(trained_gflownet.forward_flow, KernelEnvironment, MAX_LEN, X, Y)
        run_l1_distances.append(l1)

        # Sample and evaluate likelihood and RMSE
        eval_env = KernelEnvironment(batch_size=100, max_trajectory_length=MAX_LEN, log_reward=log_reward_fn)
        trained_gflownet.eval()
        final_batch = trained_gflownet.sample(eval_env)

        # NOTE(review): other cells pass runtime=False to evaluate_likelihood;
        # this call relies on the default - confirm that is intended.
        likelihoods = [utils.evaluate_likelihood(k, X_test, Y_test) for k in final_batch.state]
        rmses = [calculate_rmse(k, X, Y, X_test, Y_test) for k in final_batch.state]

        run_mean_likelihoods.append(np.mean(likelihoods))
        run_max_likelihoods.append(np.max(likelihoods))
        run_mean_rmses.append(np.mean(rmses))

    # Calculate statistics over the N runs
    mean_of_l1s, std_of_l1s = _mean_std(run_l1_distances)
    mean_of_rmses, std_of_rmses = _mean_std(run_mean_rmses)
    mean_of_means, std_of_means = _mean_std(run_mean_likelihoods)
    mean_of_maxs, std_of_maxs = _mean_std(run_max_likelihoods)

    print(f"  => Final Stats for '{criterion}':")
    print(f"     - Mean L1 Distance:         {mean_of_l1s:.4f} (Std: {std_of_l1s:.4f})")
    print(f"     - Mean RMSE:                {mean_of_rmses:.4f} (Std: {std_of_rmses:.4f})")
    print(f"     - Mean of Mean Likelihoods: {mean_of_means:.4f} (Std: {std_of_means:.4f})")
    print(f"     - Mean of Max Likelihoods:  {mean_of_maxs:.4f} (Std: {std_of_maxs:.4f})\n")

    final_stats_results.append({
        'Criterion': criterion,
        'Mean L1': f"{mean_of_l1s:.4f}",
        'Std L1': f"{std_of_l1s:.4f}",
        'Mean RMSE': f"{mean_of_rmses:.4f}",
        'Std RMSE': f"{std_of_rmses:.4f}",
        'Mean of Means (LL)': f"{mean_of_means:.4f}",
        'Std of Means (LL)': f"{std_of_means:.4f}",
        'Mean of Maxs (LL)': f"{mean_of_maxs:.4f}",
        'Std of Maxs (LL)': f"{std_of_maxs:.4f}"
    })

# --- 6. Final Statistics Summary Table ---
if final_stats_results:
    print("\n" + "="*50)
    print("--- Final Likelihood, L1 & RMSE Statistics Summary ---")
    print("="*50)
    df_stats = pd.DataFrame(final_stats_results)
    print(df_stats.to_string(index=False))



--- Robust Evaluation from Best Hyperparameters ---

Starting robust evaluation for criterion 'db' with params: {'lr': 0.01, 'BATCH_SIZE': 64, 'criterion': 'db', 'epsilon': 0.5, 'min_eps': 0.01, 'clamp_g': 1.0}
  -> Run 1/3...


100%|██████████| 100/100 [00:03<00:00, 32.89it/s, loss=2.68]


  -> Run 2/3...


100%|██████████| 100/100 [00:03<00:00, 32.12it/s, loss=7.64]


  -> Run 3/3...


100%|██████████| 100/100 [00:03<00:00, 26.38it/s, loss=4.56]


  => Final Stats for 'db':
     - Mean L1 Distance:         0.8276 (Std: 0.0566)
     - Mean RMSE:                0.5127 (Std: 0.0191)
     - Mean of Mean Likelihoods: 25.6883 (Std: 5.7730)
     - Mean of Max Likelihoods:  43.8691 (Std: 0.2526)

Starting robust evaluation for criterion 'tb' with params: {'lr': 0.0001, 'BATCH_SIZE': 64, 'criterion': 'tb', 'epsilon': 0.5, 'min_eps': 0.01, 'clamp_g': 1.0}
  -> Run 1/3...


100%|██████████| 100/100 [00:03<00:00, 27.29it/s, loss=34.8]


  -> Run 2/3...


100%|██████████| 100/100 [00:02<00:00, 39.41it/s, loss=46.8]


  -> Run 3/3...


100%|██████████| 100/100 [00:03<00:00, 32.48it/s, loss=34.3]


  => Final Stats for 'tb':
     - Mean L1 Distance:         0.9467 (Std: 0.0706)
     - Mean RMSE:                0.4541 (Std: 0.0058)
     - Mean of Mean Likelihoods: 5.9369 (Std: 1.6735)
     - Mean of Max Likelihoods:  43.5927 (Std: 0.7228)

Starting robust evaluation for criterion 'subtb' with params: {'lr': 0.0001, 'BATCH_SIZE': 256, 'criterion': 'subtb', 'epsilon': 0.5, 'min_eps': 0.01, 'clamp_g': 1.0}
  -> Run 1/3...


100%|██████████| 100/100 [00:08<00:00, 12.15it/s, loss=86.5]


  -> Run 2/3...


100%|██████████| 100/100 [00:08<00:00, 12.49it/s, loss=79.3]


  -> Run 3/3...


100%|██████████| 100/100 [00:08<00:00, 11.70it/s, loss=91.2]


  => Final Stats for 'subtb':
     - Mean L1 Distance:         0.8508 (Std: 0.0625)
     - Mean RMSE:                0.4662 (Std: 0.0077)
     - Mean of Mean Likelihoods: 9.6603 (Std: 1.4116)
     - Mean of Max Likelihoods:  43.9087 (Std: 0.2759)

Starting robust evaluation for criterion 'cb' with params: {'lr': 0.001, 'BATCH_SIZE': 256, 'criterion': 'cb', 'epsilon': 0.5, 'min_eps': 0.01, 'clamp_g': 1.0}
  -> Run 1/3...


100%|██████████| 100/100 [00:08<00:00, 12.04it/s, loss=27.9]


  -> Run 2/3...


100%|██████████| 100/100 [00:08<00:00, 11.70it/s, loss=27.6]


  -> Run 3/3...


100%|██████████| 100/100 [00:08<00:00, 11.36it/s, loss=26.4]


  => Final Stats for 'cb':
     - Mean L1 Distance:         0.7006 (Std: 0.0101)
     - Mean RMSE:                0.5073 (Std: 0.0056)
     - Mean of Mean Likelihoods: 24.0177 (Std: 2.4393)
     - Mean of Max Likelihoods:  43.7872 (Std: 0.4477)


--- Final Likelihood, L1 & RMSE Statistics Summary ---
Criterion Mean L1 Std L1 Mean RMSE Std RMSE Mean of Means (LL) Std of Means (LL) Mean of Maxs (LL) Std of Maxs (LL)
       db  0.8276 0.0566    0.5127   0.0191            25.6883            5.7730           43.8691           0.2526
       tb  0.9467 0.0706    0.4541   0.0058             5.9369            1.6735           43.5927           0.7228
    subtb  0.8508 0.0625    0.4662   0.0077             9.6603            1.4116           43.9087           0.2759
       cb  0.7006 0.0101    0.5073   0.0056            24.0177            2.4393           43.7872           0.4477


In [15]:
# Baseline: run the automated-statistician greedy search on the training data
# (X, Y) with method='BIC' and max_steps=5. The call is unpacked as a pair;
# only the first element is kept, as `greedy_kernel`, for the comparison cell.
greedy_kernel, _ = greedy_search(X, Y, method='BIC', max_steps=5)

[Step 1] BIC: -71.95 LL: 41.08 | RBF({'lengthscale': 1.0, 'variance': 1.0})
[Step 2] BIC: -68.66 LL: 41.13 | (RBF({'lengthscale': 1.0, 'variance': 1.0}) + Linear({'variances': 1.0}))
[Step 3] BIC: -71.95 LL: 41.08 | RBF({'lengthscale': 1.0, 'variance': 1.0})
[Step 4] BIC: -68.66 LL: 41.13 | (RBF({'lengthscale': 1.0, 'variance': 1.0}) + Linear({'variances': 1.0}))
[Step 5] BIC: -71.95 LL: 41.08 | RBF({'lengthscale': 1.0, 'variance': 1.0})


In [16]:
# Compare baselines against the generating kernel. For each kernel this reports
# the log marginal likelihood evaluated on the held-out draw (X_test, Y_test)
# and the RMSE from calculate_rmse(kernel, X, Y, X_test, Y_test).
#
# calculate_rmse is called through the name imported at the top of the notebook
# (`from evaluation import calculate_rmse`). That import does not bind the
# module name `evaluation`, so `evaluation.calculate_rmse` raises NameError on
# a fresh kernel.
ll_test = utils.evaluate_likelihood(true_kernel, X_test, Y_test, runtime=False)
rmse_test = calculate_rmse(true_kernel, X, Y, X_test, Y_test)

print("True kernel")
print(" Log Marginal Likelihood:", ll_test, "RMSE:", rmse_test)
print("===========================================")


# Kernel selected by the greedy (automated statistician) search.
ll_test = utils.evaluate_likelihood(greedy_kernel, X_test, Y_test, runtime=False)
rmse_test = calculate_rmse(greedy_kernel, X, Y, X_test, Y_test)

print("Greedy Kernel (automated statistician)")
print("Log Marginal Likelihood:", ll_test, "RMSE:", rmse_test)
print("===========================================")

# Single base kernels as built by KernelFunction.
rbf_kernel = KernelFunction().rbf()
linear_kernel = KernelFunction().linear()
periodic_kernel = KernelFunction().periodic()
constant_kernel = KernelFunction().constant()

kernels = [rbf_kernel, linear_kernel, periodic_kernel, constant_kernel]
for kernel in kernels:
    ll_test = utils.evaluate_likelihood(kernel, X_test, Y_test, runtime=False)
    rmse_test = calculate_rmse(kernel, X, Y, X_test, Y_test)

    print(f"{kernel.name} Kernel")
    print("Log Marginal Likelihood:", ll_test, "RMSE:", rmse_test)
# The separator is printed once, after all base kernels have been reported.
print("===========================================")



True kernel
 Log Marginal Likelihood: 46.49716139384765 RMSE: 0.5454070797996129
Greedy Kernel (automated statistician)
Log Marginal Likelihood: 46.08568974349245 RMSE: 0.519324359905318
RBF Kernel
Log Marginal Likelihood: 46.08568974349245 RMSE: 0.519324359905318
Linear Kernel
Log Marginal Likelihood: -5.300491805256389 RMSE: 0.37577574326011004
Periodic Kernel
Log Marginal Likelihood: -9.69355572690653 RMSE: 0.37554631914616465
Constant Kernel
Log Marginal Likelihood: -10.41643637246882 RMSE: 0.37100318595675863
