In [None]:
import warnings; warnings.simplefilter('ignore')
import tensorflow as tf
tf.logging.set_verbosity(tf.logging.ERROR)

import matplotlib.pyplot as plt
import numpy as np

from scipy.optimize import Bounds

from crbo import ConfidenceRegionBayesianOptimization
from crbo.objectives import CartPoleObjective
from crbo import util

# Seed NumPy's global random number generator so that NumPy draws repeat across runs.
# NOTE(review): this does not seed TensorFlow; confirm whether crbo relies on TF randomness.
np.random.seed(42)


In [None]:

# Cart-pole objective; see the Appendix for suitable policy domain sizes
objective = CartPoleObjective(policy_domain_size=10.0)
bounds = Bounds(lb=objective.lb, ub=objective.ub)

# Initial design: a single point at the center of the domain ...
x_init = np.atleast_2d((objective.lb + objective.ub) / 2)

# ... plus a design size that grows with the square root of the dimensionality
n_init = int(np.ceil(np.sqrt(objective.dim)))

# Reasonable starting values for the GP hyperparameters
init_hypers = {
    "lengthscale": 0.1,
    "variance": 1.0 ** 2,
    "noise_var": 0.05 ** 2,
}

# Collect the optimizer settings in one place, then build Conf Region BO from them
crbo_settings = dict(
    fun=objective,
    gamma=0.6,
    acq_type="lcb",
    max_iter=100 - n_init,  # iteration count is reduced by the initial design size
    bounds=bounds,
    x_init=x_init,
    n_init=n_init,
    verbosity=True,
    optimize_hypers=True,
    init_hypers=init_hypers,
    normalizer_type="optimistic",
)
bo = ConfidenceRegionBayesianOptimization(**crbo_settings)


In [None]:
# Run the optimization; the call yields the model and the results record
# that the following cells read from
run_output = bo.run()
model, results = run_output

In [None]:
# Collect the evaluated locations and their observed outcomes.
# CRBO minimizes the objective, so outcomes are negated for visualization.
x_eval = np.array(results["x_eval"])
y_eval = -1.0 * np.array(results["y_eval"])

# GP mean prediction at the evaluated locations, mapped back to the original output scale
x_eval_normalized = util.normalize_input(x_eval, bounds)
gp_prediction = model.predict(x_eval_normalized)
mu_eval = util.denormalize_output(gp_prediction[0], bo.norm_mu, bo.norm_std)
mu_eval *= -1.0  # same sign flip as for the observed outcomes

# The best policy is the evaluated location with the highest predictive mean
best_idx = np.argmax(mu_eval)
x_opt = x_eval[best_idx]

# Roll out the final and the initial policy (in that order), several times each
n_rollouts = 20
y_opt, y_init = [
    -1 * objective(np.tile(policy, [n_rollouts, 1]))
    for policy in (x_opt, x_init)
]

In [None]:
# Visualize some of the results in a three-panel figure.
# Uses the explicit figure/axes interface so each panel is addressed by name.
fig, (ax_returns, ax_average, ax_hist) = plt.subplots(1, 3, figsize=(15, 5))

# Left panel: observed return of every evaluated policy and the GP mean at the same locations
ax_returns.set_title("Outcome of evaluated policies")
ax_returns.plot(y_eval, label="Observed returns")
ax_returns.plot(mu_eval, label="GP mean at observed locations")
ax_returns.set_xlabel("# Episodes")
ax_returns.set_ylabel("Episodic return")
ax_returns.legend()

# Middle panel: running average of the observed returns over the episodes so far
avg_reward = np.cumsum(y_eval).squeeze() / (np.arange(y_eval.shape[0]) + 1)
ax_average.set_title("Running average of observed returns")
ax_average.plot(avg_reward, label="CRBO")
ax_average.set_xlabel("# Episodes")
ax_average.set_ylabel("Average Episodic return")
ax_average.legend()

# Right panel: distribution of returns for the initial and the optimized policy.
# NOTE(review): the bins cover [0, 1] only, so returns outside that range are not drawn;
# this presumably relies on the objective producing returns in [0, 1] -- confirm.
bins = np.linspace(0.0, 1.0, 20)
# Flatten so each policy is a single data set regardless of the array layout, and draw
# semi-transparent bars so the second histogram does not hide the first where they overlap.
ax_hist.set_title("Returns of initial vs. optimized policy")
ax_hist.hist(np.ravel(y_init), bins=bins, alpha=0.6, label="Initial policy")
ax_hist.hist(np.ravel(y_opt), bins=bins, alpha=0.6, label="Optimized policy")
ax_hist.set_xlabel("Episodic return")
ax_hist.set_ylabel("Count")
ax_hist.legend()

# Prevent the labels of neighbouring panels from overlapping, and render the figure
# without echoing the repr of the last call in the notebook output.
fig.tight_layout()
plt.show()
