# Safe Learning for a Cart Pole System

In [26]:
from __future__ import division, print_function

import numpy as np
import tensorflow as tf
import gpflow
import safe_learning
import matplotlib.pyplot as plt
import time
import os
import math

from matplotlib.colors import ListedColormap
from mpl_toolkits.mplot3d import Axes3D
from scipy.linalg import block_diag
from utilities import CartPole, compute_closedloop_response, get_parameter_change, find_nearest, reward_rollout, compute_roa, binary_cmap

# Nice progress bars
try:
    from tqdm import tqdm
except ImportError:
    tqdm = lambda x: x

## TensorFlow Session

Customize the TensorFlow session for the current device.

In [27]:
class Options(object):
    def __init__(self, **kwargs):
        super(Options, self).__init__()
        self.__dict__.update(kwargs)

OPTIONS = Options(np_dtype              = safe_learning.config.np_dtype,
                  tf_dtype              = safe_learning.config.dtype,
                  saturate              = True,                            # apply saturation constraints to the control input
                  eps                   = 1e-8,                            # numerical tolerance
                  use_linear_dynamics   = False,                           # use the linearized form of the dynamics as the true dynamics (for testing)
                  dpi                   = 200,
                  num_cores             = 4,
                  num_sockets           = 1)

# Open a new session (close old one if exists)
try:
    session.close()
except NameError:
    pass

session = tf.InteractiveSession()
session.run(tf.global_variables_initializer())

## Define underlying dynamic system and costs/rewards
Define the dynamics of the true and false system


In [42]:
# Constants
dt = 0.01   # sampling time
g = 9.81    # gravity

# True system parameters
m = 0.2    # pendulum mass
M = 1.0    # cart mass
L = 0.5    # pole length
b = 0.2     # rotational friction

# 'Wrong' model parameters
mw = 0.2    # pendulum mass
Mw = 1.0    # cart mass
Lw = 0.5    # pole length
bw = 0.0    # rotational friction

# State and action normalizers
x_max         = 0.5                                 # linear position [m]
theta_max     = np.deg2rad(120)                      # angular position [rad]
x_dot_max     = 2                                   # linear velocity [m/s]
theta_dot_max = np.deg2rad(30)                      # angular velocity [rad/s]
u_max         = (m + M) * (x_dot_max ** 2) / x_max  # linear force [N], control action

state_norm = (x_max, theta_max, x_dot_max, theta_dot_max)
action_norm = (u_max,)

# Dimensions and domains
state_dim     = 4
action_dim    = 1
state_limits  = np.array([[-math.pi, math.pi]] * state_dim)
action_limits = np.array([[-2., 2.]] * action_dim)
print(action_limits)
# Initialize system class and its linearization
true_dynamics = CartPole(m, M, L, b, dt, [state_norm, action_norm])
wrong_dynamics = CartPole(mw, Mw, Lw, bw, dt, [state_norm, action_norm])
    
# LQR cost matrices
Q = 0.1 * np.identity(state_dim).astype(OPTIONS.np_dtype)     # state cost matrix
R = 0.1 * np.identity(action_dim).astype(OPTIONS.np_dtype)    # action cost matrix

# Quadratic reward/cost function
reward_function = safe_learning.QuadraticFunction(block_diag(- Q, - R), name='reward_function')

### Quadratic (LQR) reward function

#reward_function = safe_learning.QuadraticFunction(linalg.block_diag(-q, -r))

[[-2.  2.]]


In [62]:
num_states = [51, 51, 51, 51]

# State grid
state_limits = np.array([[-1., 1.], ] * state_dim)

safety_disc = safe_learning.GridWorld(state_limits, num_states)
policy_disc = safe_learning.GridWorld(state_limits, [50, 50, 50, 50])

# Discretization constant
tau = np.min(safety_disc.unit_maxes)

pivot_state = np.asarray([0., 0., 0., 0.], dtype=OPTIONS.np_dtype)

# Snap pivot_state to the closest grid point
pivot_index = np.zeros_like(pivot_state, dtype=int)
for d in range(safety_disc.ndim):
    pivot_index[d], pivot_state[d] = find_nearest(safety_disc.discrete_points[d], pivot_state[d])

# Get 2d-planes of the discretization (x vs. v, theta vs. omega) according to pivot_state
planes = [[1, 3], [0, 2]]
safety_slices = []
for p in planes:
    safety_slices.append(np.logical_and(safety_disc.all_points[:, p[0]] == pivot_state[p[0]], 
                                      safety_disc.all_points[:, p[1]] == pivot_state[p[1]]).ravel())
print(safety_disc)
print('Grid size: {0}'.format(safety_disc.nindex))

# Snap pivot_state to the closest grid point
pivot_index = np.zeros_like(pivot_state, dtype=int)
for d in range(policy_disc.ndim):
    pivot_index[d], pivot_state[d] = find_nearest(policy_disc.discrete_points[d], pivot_state[d])

# Get 2d-planes of the discretization (x vs. v, theta vs. omega) according to pivot_state
planes = [[1, 3], [0, 2]]
policy_slices = []
for p in planes:
    policy_slices.append(np.logical_and(policy_disc.all_points[:, p[0]] == pivot_state[p[0]], 
                                      policy_disc.all_points[:, p[1]] == pivot_state[p[1]]).ravel())
print(shape(policy_disc))
print('Grid size: {0}'.format(policy_disc.nindex))

<safe_learning.functions.GridWorld object at 0x7f0d4d1749d0>
Grid size: 6765201


NameError: name 'shape' is not defined

### LQR init policy

## Define the GP dynamics model

We use a combination of kernels to model the errors in the dynamics

In [67]:
A, B = wrong_dynamics.linearize()
lipschitz_dynamics = 1

noise_var = 0.001 ** 2

m_true = np.hstack((true_dynamics.linearize()))
m = np.hstack((A, B))

variances = (m_true - m) ** 2

# Make sure things remain 
np.clip(variances, 1e-5, None, out=variances)

# Kernels
kernel1 = (gpflow.kernels.Linear(5, variance=variances[0, :], ARD=True)
           + gpflow.kernels.Matern32(1, lengthscales=1, active_dims=[0])
           * gpflow.kernels.Linear(1, variance=variances[0, 3]))

kernel2 = (gpflow.kernels.Linear(5, variance=variances[1, :], ARD=True)
           + gpflow.kernels.Matern32(1, lengthscales=1, active_dims=[0])
           * gpflow.kernels.Linear(1, variance=variances[1, 3]))

kernel3 = (gpflow.kernels.Linear(5, variance=variances[2, :], ARD=True)
           + gpflow.kernels.Matern32(1, lengthscales=1, active_dims=[0])
           * gpflow.kernels.Linear(1, variance=variances[2, 3]))

kernel4 = (gpflow.kernels.Linear(5, variance=variances[3, :], ARD=True)
           + gpflow.kernels.Matern32(1, lengthscales=1, active_dims=[0])
           * gpflow.kernels.Linear(1, variance=variances[3, 3]))

# Mean dynamics

mean_dynamics = safe_learning.LinearSystem((A, B), name='mean_dynamics')
mean_function1 = safe_learning.LinearSystem((A[[0], :], B[[0], :]), name='mean_dynamics_1')
mean_function2 = safe_learning.LinearSystem((A[[1], :], B[[1], :]), name='mean_dynamics_2')
mean_function3 = safe_learning.LinearSystem((A[[2], :], B[[2], :]), name='mean_dynamics_3')
mean_function4 = safe_learning.LinearSystem((A[[3], :], B[[3], :]), name='mean_dynamics_4')

# Define a GP model over the dynamics
gp1 = gpflow.gpr.GPR(np.empty((0, 3), dtype=safe_learning.config.np_dtype),
                    np.empty((0, 1), dtype=safe_learning.config.np_dtype),
                    kernel1,
                    mean_function=mean_function1)
gp1.likelihood.variance = noise_var

gp2 = gpflow.gpr.GPR(np.empty((0, 3), dtype=safe_learning.config.np_dtype),
                    np.empty((0, 1), dtype=safe_learning.config.np_dtype),
                    kernel2,
                    mean_function=mean_function2)
gp2.likelihood.variance = noise_var

gp3 = gpflow.gpr.GPR(np.empty((0,3), dtype=safe_learning.config.np_dtype),
                    np.empty((0,1), dtype=safe_learning.config.np_dtype),
                    kernel3,
                    mean_function=mean_function3)
gp3.likelihood.variance = noise_var

gp4 = gpflow.gpr.GPR(np.empty((0, 3), dtype=safe_learning.config.np_dtype),
                    np.empty((0, 1), dtype=safe_learning.config.np_dtype),
                    kernel4,
                    mean_function=mean_function4)
gp4.likelihood.variance = noise_var


gp1_fun = safe_learning.GaussianProcess(gp1)
gp2_fun = safe_learning.GaussianProcess(gp2)
gp3_fun = safe_learning.GaussianProcess(gp3)
gp4_fun = safe_learning.GaussianProcess(gp4)

dynamics = safe_learning.FunctionStack((gp1_fun, gp2_fun, gp3_fun, gp4_fun))

In [68]:
# Compute the optimal policy for the linear (and wrong) mean dynamics
k, s = safe_learning.utilities.dlqr(A, B, Q, R)
init_policy = safe_learning.LinearSystem((-k), name='initial_policy')
init_policy = safe_learning.Saturation(init_policy, -1, 1)

# Define the Lyapunov function corresponding to the initial policy
init_lyapunov = safe_learning.QuadraticFunction(s)

## Set up the dynamic programming problem

In [69]:
# Define a neural network policy
relu = tf.nn.relu
policy = safe_learning.NeuralNetwork(layers=[32, 32, 1],
                                     nonlinearities=[relu, relu, tf.nn.tanh],
                                     output_scale=action_limits[0, 1])

# Define value function approximation
value_function = safe_learning.Triangulation(policy_disc,
                                             -init_lyapunov(policy_disc.all_points).eval(),
                                             project=True)
gamma = 0.98
# Define policy optimization problem
rl = safe_learning.PolicyIteration(
    policy,
    dynamics,
    reward_function,
    value_function,
    gamma=gamma)
    

with tf.name_scope('rl_mean_optimization'):
    rl_opt_value_function = rl.optimize_value_function()
    
    # Placeholder for states
    tf_states_mean = tf.placeholder(safe_learning.config.dtype, [None, state_dim])
    
    # Optimize for expected gain
    values = rl.future_values(tf_states_mean)
    policy_loss = -tf.reduce_mean(values)
    
    optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.1)
    adapt_policy_mean = optimizer.minimize(policy_loss, var_list=rl.policy.parameters)


In [70]:
# Start the session
session.run(tf.global_variables_initializer())

### Run initial dynamic programming for the mean dynamics

In [71]:
for i in tqdm(range(3000)):
    
    # select random training batches
    rl.feed_dict[tf_states_mean] = policy_disc.sample_continuous(1000)

    session.run(adapt_policy_mean, feed_dict=rl.feed_dict)


  0%|          | 0/3000 [00:00<?, ?it/s][A

InvalidArgumentError: assertion failed: [] [Condition x == y did not hold element-wise:] [x (gaussian_process_20/evaluate_1/name.kern.K/name.kern.linear.K/strided_slice_2:0) = ] [3] [y (gaussian_process_20/evaluate_1/name.kern.K/name.kern.linear.K/Const:0) = ] [5]
	 [[node gaussian_process_20/evaluate_1/name.kern.K/name.kern.linear.K/assert_equal/Assert/Assert (defined at /home/petar/anaconda2/lib/python2.7/site-packages/gpflow/kernels.py:85)  = Assert[T=[DT_STRING, DT_STRING, DT_STRING, DT_INT32, DT_STRING, DT_INT32], summarize=3, _device="/job:localhost/replica:0/task:0/device:CPU:0"](gaussian_process_20/evaluate_1/name.kern.K/name.kern.linear.K/assert_equal/All, gaussian_process_20/evaluate_1/name.kern.K/name.kern.linear.K/assert_equal/Assert/Assert/data_0, gaussian_process_20/evaluate_1/name.kern.K/name.kern.linear.K/assert_equal/Assert/Assert/data_1, gaussian_process_20/evaluate_1/name.kern.K/name.kern.linear.K/assert_equal/Assert/Assert/data_2, gaussian_process_20/evaluate_1/name.kern.K/name.kern.linear.K/strided_slice_2, gaussian_process_20/evaluate_1/name.kern.K/name.kern.linear.K/assert_equal/Assert/Assert/data_4, gaussian_process_20/evaluate_1/name.kern.K/name.kern.linear.K/Const)]]

Caused by op u'gaussian_process_20/evaluate_1/name.kern.K/name.kern.linear.K/assert_equal/Assert/Assert', defined at:
  File "/home/petar/anaconda2/lib/python2.7/runpy.py", line 174, in _run_module_as_main
    "__main__", fname, loader, pkg_name)
  File "/home/petar/anaconda2/lib/python2.7/runpy.py", line 72, in _run_code
    exec code in run_globals
  File "/home/petar/anaconda2/lib/python2.7/site-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/home/petar/anaconda2/lib/python2.7/site-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/home/petar/anaconda2/lib/python2.7/site-packages/ipykernel/kernelapp.py", line 497, in start
    self.io_loop.start()
  File "/home/petar/anaconda2/lib/python2.7/site-packages/tornado/ioloop.py", line 1073, in start
    handler_func(fd_obj, events)
  File "/home/petar/anaconda2/lib/python2.7/site-packages/tornado/stack_context.py", line 300, in null_wrapper
    return fn(*args, **kwargs)
  File "/home/petar/anaconda2/lib/python2.7/site-packages/zmq/eventloop/zmqstream.py", line 450, in _handle_events
    self._handle_recv()
  File "/home/petar/anaconda2/lib/python2.7/site-packages/zmq/eventloop/zmqstream.py", line 480, in _handle_recv
    self._run_callback(callback, msg)
  File "/home/petar/anaconda2/lib/python2.7/site-packages/zmq/eventloop/zmqstream.py", line 432, in _run_callback
    callback(*args, **kwargs)
  File "/home/petar/anaconda2/lib/python2.7/site-packages/tornado/stack_context.py", line 300, in null_wrapper
    return fn(*args, **kwargs)
  File "/home/petar/anaconda2/lib/python2.7/site-packages/ipykernel/kernelbase.py", line 283, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "/home/petar/anaconda2/lib/python2.7/site-packages/ipykernel/kernelbase.py", line 233, in dispatch_shell
    handler(stream, idents, msg)
  File "/home/petar/anaconda2/lib/python2.7/site-packages/ipykernel/kernelbase.py", line 399, in execute_request
    user_expressions, allow_stdin)
  File "/home/petar/anaconda2/lib/python2.7/site-packages/ipykernel/ipkernel.py", line 208, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/home/petar/anaconda2/lib/python2.7/site-packages/ipykernel/zmqshell.py", line 537, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/home/petar/anaconda2/lib/python2.7/site-packages/IPython/core/interactiveshell.py", line 2714, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "/home/petar/anaconda2/lib/python2.7/site-packages/IPython/core/interactiveshell.py", line 2818, in run_ast_nodes
    if self.run_code(code, result):
  File "/home/petar/anaconda2/lib/python2.7/site-packages/IPython/core/interactiveshell.py", line 2878, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-69-a8c6d29c48d7>", line 28, in <module>
    values = rl.future_values(tf_states_mean)
  File "/home/petar/anaconda2/lib/python2.7/site-packages/safe_learning/utilities.py", line 109, in wrapped_function
    return function(*args, **kwargs)
  File "/home/petar/anaconda2/lib/python2.7/site-packages/safe_learning/reinforcement_learning.py", line 94, in future_values
    next_states = self.dynamics(states, actions)
  File "/home/petar/anaconda2/lib/python2.7/site-packages/safe_learning/utilities.py", line 119, in wrapped_function
    return function(self, *args, **kwargs)
  File "/home/petar/anaconda2/lib/python2.7/site-packages/safe_learning/functions.py", line 81, in __call__
    outputs = self._template(*args, **kwargs)
  File "/home/petar/anaconda2/lib/python2.7/site-packages/tensorflow/python/ops/template.py", line 360, in __call__
    return self._call_func(args, kwargs)
  File "/home/petar/anaconda2/lib/python2.7/site-packages/tensorflow/python/ops/template.py", line 306, in _call_func
    result = self._func(*args, **kwargs)
  File "/home/petar/anaconda2/lib/python2.7/site-packages/safe_learning/utilities.py", line 144, in wrapped_function
    return function(*args, **kwargs)
  File "/home/petar/anaconda2/lib/python2.7/site-packages/safe_learning/functions.py", line 284, in build_evaluation
    mean, error = fun(points)
  File "/home/petar/anaconda2/lib/python2.7/site-packages/safe_learning/utilities.py", line 119, in wrapped_function
    return function(self, *args, **kwargs)
  File "/home/petar/anaconda2/lib/python2.7/site-packages/safe_learning/functions.py", line 81, in __call__
    outputs = self._template(*args, **kwargs)
  File "/home/petar/anaconda2/lib/python2.7/site-packages/tensorflow/python/ops/template.py", line 360, in __call__
    return self._call_func(args, kwargs)
  File "/home/petar/anaconda2/lib/python2.7/site-packages/tensorflow/python/ops/template.py", line 306, in _call_func
    result = self._func(*args, **kwargs)
  File "/home/petar/anaconda2/lib/python2.7/site-packages/safe_learning/utilities.py", line 141, in wrapped_function
    return function(*args, **kwargs)
  File "/home/petar/anaconda2/lib/python2.7/site-packages/safe_learning/functions.py", line 512, in build_evaluation
    mean, var = self.gaussian_process.build_predict(points)
  File "/home/petar/anaconda2/lib/python2.7/site-packages/gpflow/gpr.py", line 76, in build_predict
    Kx = self.kern.K(self.X, Xnew)
  File "/home/petar/anaconda2/lib/python2.7/site-packages/gpflow/scoping.py", line 43, in runnable
    return f(*args, **kwargs)
  File "/home/petar/anaconda2/lib/python2.7/site-packages/gpflow/kernels.py", line 762, in K
    return reduce(tf.add, [k.K(X, X2) for k in self.kern_list])
  File "/home/petar/anaconda2/lib/python2.7/site-packages/gpflow/scoping.py", line 43, in runnable
    return f(*args, **kwargs)
  File "/home/petar/anaconda2/lib/python2.7/site-packages/gpflow/kernels.py", line 386, in K
    X, X2 = self._slice(X, X2)
  File "/home/petar/anaconda2/lib/python2.7/site-packages/gpflow/kernels.py", line 85, in _slice
    tf.assert_equal(tf.shape(X)[1], tf.constant(self.input_dim, dtype=settings.dtypes.int_type))
  File "/home/petar/anaconda2/lib/python2.7/site-packages/tensorflow/python/ops/check_ops.py", line 390, in assert_equal
    return control_flow_ops.Assert(condition, data, summarize=summarize)
  File "/home/petar/anaconda2/lib/python2.7/site-packages/tensorflow/python/util/tf_should_use.py", line 189, in wrapped
    return _add_should_use_warning(fn(*args, **kwargs))
  File "/home/petar/anaconda2/lib/python2.7/site-packages/tensorflow/python/ops/control_flow_ops.py", line 159, in Assert
    return gen_logging_ops._assert(condition, data, summarize, name="Assert")
  File "/home/petar/anaconda2/lib/python2.7/site-packages/tensorflow/python/ops/gen_logging_ops.py", line 52, in _assert
    name=name)
  File "/home/petar/anaconda2/lib/python2.7/site-packages/tensorflow/python/framework/op_def_library.py", line 787, in _apply_op_helper
    op_def=op_def)
  File "/home/petar/anaconda2/lib/python2.7/site-packages/tensorflow/python/util/deprecation.py", line 488, in new_func
    return func(*args, **kwargs)
  File "/home/petar/anaconda2/lib/python2.7/site-packages/tensorflow/python/framework/ops.py", line 3274, in create_op
    op_def=op_def)
  File "/home/petar/anaconda2/lib/python2.7/site-packages/tensorflow/python/framework/ops.py", line 1770, in __init__
    self._traceback = tf_stack.extract_stack()

InvalidArgumentError (see above for traceback): assertion failed: [] [Condition x == y did not hold element-wise:] [x (gaussian_process_20/evaluate_1/name.kern.K/name.kern.linear.K/strided_slice_2:0) = ] [3] [y (gaussian_process_20/evaluate_1/name.kern.K/name.kern.linear.K/Const:0) = ] [5]
	 [[node gaussian_process_20/evaluate_1/name.kern.K/name.kern.linear.K/assert_equal/Assert/Assert (defined at /home/petar/anaconda2/lib/python2.7/site-packages/gpflow/kernels.py:85)  = Assert[T=[DT_STRING, DT_STRING, DT_STRING, DT_INT32, DT_STRING, DT_INT32], summarize=3, _device="/job:localhost/replica:0/task:0/device:CPU:0"](gaussian_process_20/evaluate_1/name.kern.K/name.kern.linear.K/assert_equal/All, gaussian_process_20/evaluate_1/name.kern.K/name.kern.linear.K/assert_equal/Assert/Assert/data_0, gaussian_process_20/evaluate_1/name.kern.K/name.kern.linear.K/assert_equal/Assert/Assert/data_1, gaussian_process_20/evaluate_1/name.kern.K/name.kern.linear.K/assert_equal/Assert/Assert/data_2, gaussian_process_20/evaluate_1/name.kern.K/name.kern.linear.K/strided_slice_2, gaussian_process_20/evaluate_1/name.kern.K/name.kern.linear.K/assert_equal/Assert/Assert/data_4, gaussian_process_20/evaluate_1/name.kern.K/name.kern.linear.K/Const)]]


## Define the Lyapunov function

Here we use the fact that the optimal value function is a Lyapunov function for the optimal policy if the dynamics are deterministic. As uncertainty about the dynamics decreases, the value function for the mean dynamics will thus converge to a Lyapunov function.

### Set up a discretization for safety verification

In [None]:
lyapunov_function = -rl.value_function
lipschitz_lyapunov = lambda x: tf.reduce_max(tf.abs(rl.value_function.gradient(x)),
                                             axis=1, keepdims=True)

lipschitz_policy = lambda x: policy.lipschitz() 

a_true, b_true = true_dynamics.linearize()
lipschitz_dynamics = lambda x: np.max(np.abs(a_true)) + np.max(np.abs(b_true)) * lipschitz_policy(x)

# Lyapunov function definitial
lyapunov = safe_learning.Lyapunov(safety_disc,
                                  lyapunov_function,
                                  dynamics,
                                  lipschitz_dynamics,
                                  lipschitz_lyapunov,
                                  tau,
                                  policy=rl.policy,
                                  initial_set=None)

# Set initial safe set (level set) based on initial Lyapunov candidate
values = init_lyapunov(safety_disc.all_points).eval()
cutoff = np.max(values) * 0.005

lyapunov.initial_safe_set = np.squeeze(values, axis=1) <= cutoff

In [None]:
def plot_safe_set(lyapunov, show=True):
    """Plot the safe set for a given Lyapunov function."""
    plt.imshow(lyapunov.safe_set.reshape(num_states).T,
               origin='lower',
               extent=lyapunov.discretization.limits.ravel(),
               vmin=0,
               vmax=1)
    
    if isinstance(lyapunov.dynamics, safe_learning.UncertainFunction):
        X = lyapunov.dynamics.functions[0].X
        plt.plot(X[:, 0], X[:, 1], 'rx')
    
    plt.title('safe set')
    plt.colorbar()
    if show:
        plt.show()
    
lyapunov.update_safe_set()
plot_safe_set(lyapunov)

## Safe policy update

We do dynamic programming, but enfore the decrease condition on the Lyapunov function using a Lagrange multiplier

In [None]:
with tf.name_scope('policy_optimization'):
    
    # Placeholder for states
    tf_states = tf.placeholder(safe_learning.config.dtype, [None, 2])
    
    # Add Lyapunov uncertainty (but only if safety-relevant)
    values = rl.future_values(tf_states, lyapunov=lyapunov)
    
    policy_loss = -tf.reduce_mean(values)
    

    optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.01)
    adapt_policy = optimizer.minimize(policy_loss, var_list=rl.policy.parameters)
    
    
def rl_optimize_policy(num_iter):
    # Optimize value function
    session.run(rl_opt_value_function, feed_dict=rl.feed_dict)

    # select random training batches
    for i in tqdm(range(num_iter)):
        rl.feed_dict[tf_states] = lyapunov.discretization.sample_continuous(1000)

        session.run(adapt_policy, feed_dict=rl.feed_dict)

# Exploration

We explore close to the current policy by sampling the most uncertain state that does not leave the current level set

In [None]:
action_variation = np.array([[-0.02], [0.], [0.02]], dtype=safe_learning.config.np_dtype)


with tf.name_scope('add_new_measurement'):
        action_dim = lyapunov.policy.output_dim
        tf_max_state_action = tf.placeholder(safe_learning.config.dtype,
                                             shape=[1, safety_disc.ndim + action_dim])
        tf_measurement = true_dynamics(tf_max_state_action)
        
def update_gp():
    """Update the GP model based on an actively selected data point."""
    # Get a new sample location
    max_state_action, _ = safe_learning.get_safe_sample(lyapunov,
                                                        action_variation,
                                                        action_limits,
                                                        num_samples=1000)

    # Obtain a measurement of the true dynamics
    lyapunov.feed_dict[tf_max_state_action] = max_state_action
    measurement = tf_measurement.eval(feed_dict=lyapunov.feed_dict)

    # Add the measurement to our GP dynamics
    lyapunov.dynamics.add_data_point(max_state_action, measurement)
    

# Run the optimization

In [None]:
# lyapunov.update_safe_set()
rl_optimize_policy(num_iter=500)
rl_optimize_policy(num_iter=500)

lyapunov.update_safe_set()
plot_safe_set(lyapunov)

In [None]:
for i in range(5):
    print('iteration {} with c_max: {}'.format(i, lyapunov.feed_dict[lyapunov.c_max]))
    for i in tqdm(range(10)):
        update_gp()
    
    rl_optimize_policy(num_iter=200)
    lyapunov.update_values()
    
    # Update safe set and plot
    lyapunov.update_safe_set()
    plot_safe_set(lyapunov)    

# Plot trajectories and analyse improvement

In [None]:
x0 = np.array([[math.pi*0.5, -0.0]])

states_new, actions_new = safe_learning.utilities.compute_trajectory(true_dynamics, rl.policy, x0, 500)
states_old, actions_old = safe_learning.utilities.compute_trajectory(true_dynamics, init_policy, x0, 500)

t = np.arange(len(states_new)) * true_dynamics.dt

In [None]:
plt.plot(t, states_new[:, 0], label='new')
plt.plot(t, states_old[:, 0], label='old')
plt.xlabel('time [s]')
plt.ylabel('angle [rad]')
plt.legend()
plt.show()

plt.plot(t, states_new[:, 1], label='new')
plt.plot(t, states_old[:, 1], label='old')
plt.xlabel('time [s]')
plt.ylabel('angular velocity [rad/s]')
plt.legend()
plt.show()

In [None]:
plt.plot(t[:-1], actions_new, label='new')
plt.plot(t[:-1], actions_old, label='old')
plt.xlabel('time [s]')
plt.ylabel('actions')
plt.legend()

In [None]:
print('reward old:', tf.reduce_sum(rl.reward_function(states_old[:-1], actions_old)).eval(feed_dict=rl.feed_dict))
print('reward new:', tf.reduce_sum(rl.reward_function(states_new[:-1], actions_new)).eval(feed_dict=rl.feed_dict))