# Safe Learning for a Cart Pole System

In [1]:
from __future__ import division, print_function

import numpy as np
import tensorflow as tf
import gpflow
import safe_learning
import matplotlib.pyplot as plt
import time
import os
import math

from matplotlib.colors import ListedColormap
from mpl_toolkits.mplot3d import Axes3D
from scipy.linalg import block_diag
from utilities import CartPole, compute_closedloop_response, get_parameter_change, find_nearest, reward_rollout, compute_roa, binary_cmap
%matplotlib inline
# Nice progress bars
try:
    from tqdm import tqdm
except ImportError:
    tqdm = lambda x: x

  from ._conv import register_converters as _register_converters


In [2]:
class Options(object):
    """Lightweight attribute bag: every keyword argument becomes an attribute."""

    def __init__(self, **kwargs):
        super(Options, self).__init__()
        # Expose each keyword as an instance attribute, e.g. ``Options(a=1).a``.
        for key, value in kwargs.items():
            setattr(self, key, value)

# Global notebook settings, gathered in one place so they are easy to tune.
OPTIONS = Options(
    # Numeric types taken from the safe_learning configuration
    np_dtype=safe_learning.config.np_dtype,
    tf_dtype=safe_learning.config.dtype,
    # Apply saturation constraints to the control input
    saturate=True,
    # Numerical tolerance
    eps=1e-8,
    # Use the linearized form of the dynamics as the true dynamics (for testing)
    use_linear_dynamics=False,
    dpi=200,
    # Thread counts used when configuring the TensorFlow session
    num_cores=4,
    num_sockets=1,
)


## TensorFlow Session

Customize the TensorFlow session for the current device.

In [3]:
# (Re)start the TensorFlow session: threading environment first, then a fresh
# InteractiveSession (a session left over from a previous run is closed).

# KMP_* / OMP_* threading settings; environment values must be strings.
os.environ.update({
    "KMP_BLOCKTIME": "0",
    "KMP_SETTINGS": "1",
    "KMP_AFFINITY": "granularity=fine,noverbose,compact,1,0",
    "OMP_NUM_THREADS": str(OPTIONS.num_cores),
})

config = tf.ConfigProto(
    intra_op_parallelism_threads=OPTIONS.num_cores,
    inter_op_parallelism_threads=OPTIONS.num_sockets,
    allow_soft_placement=False,
    device_count={'CPU': OPTIONS.num_cores},
)

# On a re-run `session` already exists and gets closed; on a fresh kernel the
# name is undefined and there is nothing to close.
try:
    session.close()
except NameError:
    pass

session = tf.InteractiveSession(config=config)
session.run(tf.global_variables_initializer())

## Define underlying dynamic system and costs/rewards
Define the dynamics of the true system and of the deliberately mis-specified ("wrong") model that is used as the prior.


In [4]:
# Constants
dt = 0.01   # sampling time
g = 9.81    # gravity

# True system parameters
m = 0.2    # pendulum mass
M = 1.0    # cart mass
L = 0.5    # pole length
b = 0.2     # rotational friction

# 'Wrong' model parameters: same values as the true system except for the
# rotational friction, which is set to zero.
mw = 0.2    # pendulum mass
Mw = 1.0    # cart mass
Lw = 0.5    # pole length
bw = 0.0    # rotational friction

# State and action normalizers
x_max         = 0.5                                 # linear position [m]
theta_max     = np.deg2rad(120)                      # angular position [rad]
x_dot_max     = 2                                   # linear velocity [m/s]
theta_dot_max = np.deg2rad(30)                      # angular velocity [rad/s]
u_max         = (m + M) * (x_dot_max ** 2) / x_max  # linear force [N], control action

# The tuple order fixes the state ordering: (x, theta, x_dot, theta_dot).
state_norm = (x_max, theta_max, x_dot_max, theta_dot_max)
action_norm = (u_max,)

# Dimensions and domains
state_dim     = 4
action_dim    = 1
# NOTE(review): this +/- pi domain is not used in this cell, and `state_limits`
# is reassigned to [-1, 1] per dimension before the grids are built. Confirm
# which domain is intended.
state_limits  = np.array([[-math.pi, math.pi]] * state_dim)
action_limits = np.array([[-2., 2.]] * action_dim)
print(action_limits)
# Initialize system class and its linearization
# Both systems share the sampling time and the normalizers; only the physical
# parameters differ.
true_dynamics = CartPole(m, M, L, b, dt, [state_norm, action_norm])
wrong_dynamics = CartPole(mw, Mw, Lw, bw, dt, [state_norm, action_norm])

# LQR cost matrices
Q = 0.1 * np.identity(state_dim).astype(OPTIONS.np_dtype)     # state cost matrix
R = 0.1 * np.identity(action_dim).astype(OPTIONS.np_dtype)    # action cost matrix

# Quadratic reward/cost function
# The block-diagonal matrix diag(-Q, -R) acts on the stacked (state, action)
# vector, so the reward is the negated LQR cost.
reward_function = safe_learning.QuadraticFunction(block_diag(- Q, - R), name='reward_function')

### Quadratic (LQR) reward function

#reward_function = safe_learning.QuadraticFunction(linalg.block_diag(-q, -r))

[[-2.  2.]]


In [5]:
num_states = [51, 51, 51, 51]

# State grid
state_limits = np.array([[-1., 1.], ] * state_dim)

safety_disc = safe_learning.GridWorld(state_limits, num_states)
policy_disc = safe_learning.GridWorld(state_limits, [20, 20, 20, 20])

# Discretization constant
tau = np.min(safety_disc.unit_maxes)

# Each entry lists the two state dimensions that are held fixed at the pivot;
# the two remaining dimensions span the resulting 2d-plane
# (x vs. v, theta vs. omega).
planes = [[1, 3], [0, 2]]


def snap_pivot_and_slice(disc, pivot, planes):
    """Snap ``pivot`` to the grid ``disc`` and select 2d-planes through it.

    Parameters
    ----------
    disc : discretization
        Must provide ``ndim``, ``discrete_points`` and ``all_points``.
    pivot : array_like
        Requested pivot state. It is copied, never modified in place.
    planes : sequence of pairs of int
        For each plane, the two state dimensions fixed at the pivot value.

    Returns
    -------
    index : ndarray of int
        Per-dimension grid index of the snapped pivot.
    snapped : ndarray
        Pivot state moved to the nearest grid point in every dimension.
    masks : list of ndarray of bool
        One mask over ``disc.all_points`` per plane, true where both fixed
        dimensions equal the snapped pivot.
    """
    snapped = np.array(pivot, dtype=OPTIONS.np_dtype)  # copy: caller's pivot stays intact
    index = np.zeros_like(snapped, dtype=int)
    for d in range(disc.ndim):
        index[d], snapped[d] = find_nearest(disc.discrete_points[d], snapped[d])

    # Exact float comparison is intended: `snapped` holds values returned by
    # `find_nearest` for this grid's own axis points.
    masks = [np.logical_and(disc.all_points[:, fixed_a] == snapped[fixed_a],
                            disc.all_points[:, fixed_b] == snapped[fixed_b]).ravel()
             for fixed_a, fixed_b in planes]
    return index, snapped, masks


requested_pivot = np.asarray([0., 0., 0., 0.], dtype=OPTIONS.np_dtype)

# Safety grid
safety_pivot_index, safety_pivot_state, safety_slices = snap_pivot_and_slice(
    safety_disc, requested_pivot, planes)
print(safety_disc)
print('Grid size: {0}'.format(safety_disc.nindex))

# Policy grid. Snapping starts again from the requested pivot rather than from
# the point already snapped to the safety grid.
policy_pivot_index, policy_pivot_state, policy_slices = snap_pivot_and_slice(
    policy_disc, requested_pivot, planes)
print(policy_disc)
print('Grid size: {0}'.format(policy_disc.nindex))

# Backward-compatible names: as before, these end up holding the pivot of the
# policy grid.
pivot_index, pivot_state = policy_pivot_index, policy_pivot_state

<safe_learning.functions.GridWorld object at 0x0000020D5A2F4080>
Grid size: 6765201
<safe_learning.functions.GridWorld object at 0x0000020D5A2F4048>
Grid size: 160000


### LQR init policy

## Define the GP dynamics model

We model the errors in the dynamics with one GP per state dimension; for now each GP uses a purely linear kernel.

In [6]:
# Linearization of the (wrong) prior model; it serves as the GP mean.
A, B = wrong_dynamics.linearize()
print(np.shape(A))
print(np.shape(B))
noise_var = 0.001 ** 2

# Stacked [A, B] matrices of the true system and of the prior model. The prior
# stack is deliberately NOT called `m`, which is the pendulum mass defined with
# the system parameters.
m_true = np.hstack((true_dynamics.linearize()))
m_wrong = np.hstack((A, B))

# Squared model error per matrix entry, used as prior variance of the linear kernels
variances = (m_true - m_wrong) ** 2
print(variances)
# Input to GP is of the form (x, u) = (state, action)
full_dim = state_dim + action_dim
# Keep every prior variance at or above 1e-5 (the printed matrix contains
# entries that are exactly zero).
np.clip(variances, 1e-5, None, out=variances)

# Kernels (just linear for now): one ARD linear kernel per state dimension,
# parameterised by the matching row of `variances`. An additional
# Matern32 * Linear term on input dimension 0 was sketched here previously but
# is not used.
kernels = [gpflow.kernels.Linear(full_dim, variance=variances[row, :], ARD=True)
           for row in range(state_dim)]
kernel1, kernel2, kernel3, kernel4 = kernels

# Mean dynamics: the full linear prior plus one single-row system per output
mean_dynamics = safe_learning.LinearSystem((A, B), name='mean_dynamics')
mean_functions = [
    safe_learning.LinearSystem((A[[row], :], B[[row], :]),
                               name='mean_dynamics_{}'.format(row + 1))
    for row in range(state_dim)
]
mean_function1, mean_function2, mean_function3, mean_function4 = mean_functions

# Define a GP model over the dynamics
# Each GP starts from a single all-zero data point.
X_init   = np.zeros((1, full_dim), dtype=OPTIONS.np_dtype)
Y_init   = np.zeros((1, 1), dtype=OPTIONS.np_dtype)

gps = []
for kernel, mean_function in zip(kernels, mean_functions):
    gp = gpflow.gpr.GPR(X_init,
                        Y_init,
                        kernel,
                        mean_function=mean_function)
    gp.likelihood.variance = noise_var
    gps.append(gp)
gp1, gp2, gp3, gp4 = gps

gp_funs = [safe_learning.GaussianProcess(gp) for gp in gps]
gp1_fun, gp2_fun, gp3_fun, gp4_fun = gp_funs

# Stack the four scalar GPs into one vector-valued dynamics model
dynamics = safe_learning.FunctionStack(tuple(gp_funs))

(4, 4)
(4, 1)
[[0.00000000e+00 4.22254267e-11 0.00000000e+00 4.25222878e-10
  6.40175101e-12]
 [0.00000000e+00 3.46544747e-10 0.00000000e+00 3.48981091e-09
  5.25392722e-11]
 [0.00000000e+00 2.35709519e-07 0.00000000e+00 1.04708054e-06
  3.57356638e-08]
 [0.00000000e+00 4.95224771e-04 0.00000000e+00 2.19991208e-03
  7.50804889e-05]]


In [7]:
# LQR design on the linear (and wrong) mean dynamics. `k` is used as feedback
# gain and `s` defines the quadratic Lyapunov candidate below.
k, s = safe_learning.utilities.dlqr(A, B, Q, R)

# Initial policy: linear system built from -k, wrapped in a saturation to [-1, 1]
init_policy = safe_learning.Saturation(
    safe_learning.LinearSystem((-k), name='initial_policy'),
    -1, 1)

# Quadratic Lyapunov function corresponding to the initial policy
init_lyapunov = safe_learning.QuadraticFunction(s)

## Set up the dynamic programming problem

In [8]:
# Define a neural network policy
# Two hidden ReLU layers of 32 units and a tanh output scaled by the upper
# action limit (action_limits[0, 1]).
relu = tf.nn.relu
policy = safe_learning.NeuralNetwork(layers=[32, 32, 1],
                                     nonlinearities=[relu, relu, tf.nn.tanh],
                                     output_scale=action_limits[0, 1])

# Define value function approximation
# Triangulation over the policy grid, initialised with the negated values of
# the initial Lyapunov function at every grid point.
value_function = safe_learning.Triangulation(policy_disc,
                                             -init_lyapunov(policy_disc.all_points).eval(),
                                             project=True)
gamma = 0.98
# Define policy optimization problem
rl = safe_learning.PolicyIteration(
    policy,
    dynamics,
    reward_function,
    value_function,
    gamma=gamma)


with tf.name_scope('rl_mean_optimization'):
    # NOTE(review): the recorded run of this notebook raised a MemoryError
    # inside this op (run_cvx_optimization / cvxpy) when it was executed.
    # Presumably the policy grid is too large for the solver; confirm and
    # consider a coarser `policy_disc`.
    rl_opt_value_function = rl.optimize_value_function()

    # Placeholder for states
    tf_states_mean = tf.placeholder(OPTIONS.tf_dtype, [None, state_dim])

    # Optimize for expected gain
    # The loss is the negated mean future value, so gradient descent on it
    # increases the expected value.
    values = rl.future_values(tf_states_mean)
    policy_loss = -tf.reduce_mean(values)

    # Only the policy parameters are updated by this optimizer step.
    optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.1)
    adapt_policy_mean = optimizer.minimize(policy_loss, var_list=rl.policy.parameters)


In [9]:
# Initialize the TensorFlow global variables (the session itself was created earlier)
session.run(tf.global_variables_initializer())

### Run initial dynamic programming for the mean dynamics

In [10]:
# Warm-up of the policy on the mean dynamics: 1000 gradient steps, each on a
# fresh batch of 100 states drawn with `policy_disc.sample_continuous`.
for step in tqdm(range(1000)):
    batch = policy_disc.sample_continuous(100)
    rl.feed_dict[tf_states_mean] = batch
    session.run(adapt_policy_mean, feed_dict=rl.feed_dict)

## Define the Lyapunov function

Here we use the fact that the optimal value function is a Lyapunov function for the optimal policy if the dynamics are deterministic. As uncertainty about the dynamics decreases, the value function for the mean dynamics will thus converge to a Lyapunov function.

### Set up a discretization for safety verification

In [11]:
# The Lyapunov candidate is the negated value function
lyapunov_function = -rl.value_function


def lipschitz_lyapunov(x):
    """Largest absolute entry of the value-function gradient, per row of ``x``."""
    gradient = rl.value_function.gradient(x)
    return tf.reduce_max(tf.abs(gradient), axis=1, keepdims=True)


def lipschitz_policy(x):
    """Return ``policy.lipschitz()``; the argument ``x`` is not used."""
    return policy.lipschitz()


a_true, b_true = true_dynamics.linearize()


def lipschitz_dynamics(x):
    """Closed-loop bound: max|a_true| + max|b_true| * lipschitz_policy(x)."""
    return np.max(np.abs(a_true)) + np.max(np.abs(b_true)) * lipschitz_policy(x)


# Lyapunov function definition
lyapunov = safe_learning.Lyapunov(
    safety_disc,
    lyapunov_function,
    dynamics,
    lipschitz_dynamics,
    lipschitz_lyapunov,
    tau,
    policy=rl.policy,
    initial_set=None)

# Initial safe set: the level set of the initial Lyapunov candidate that lies
# below 0.5% of its maximum value on the safety grid
values = init_lyapunov(safety_disc.all_points).eval()
cutoff = np.max(values) * 0.005

lyapunov.initial_safe_set = np.squeeze(values, axis=1) <= cutoff

In [12]:
def plot_safe_set(lyapunov, show=True, plane=(0, 1)):
    """Plot a 2-D view of the safe set of a Lyapunov function.

    The grid shape is read from ``lyapunov.discretization`` rather than from a
    notebook-level variable, so the function works for grids of any dimension.
    For a 2-D grid with the default ``plane`` the image is the full safe set.
    For higher-dimensional grids a section is shown: the two dimensions in
    ``plane`` vary, every other dimension is fixed at its middle grid index.

    Parameters
    ----------
    lyapunov : object
        Must provide ``safe_set``, ``discretization`` (with ``discrete_points``
        and ``limits``) and ``dynamics``.
    show : bool, optional
        Call ``plt.show()`` after drawing.
    plane : pair of int, optional
        State dimensions on the horizontal and vertical axis, respectively.

    Raises
    ------
    ValueError
        If both entries of ``plane`` name the same dimension.
    """
    first, second = plane
    if first == second:
        raise ValueError('`plane` must name two different state dimensions.')

    disc = lyapunov.discretization
    grid_shape = tuple(len(axis_points) for axis_points in disc.discrete_points)
    safe_set = np.asarray(lyapunov.safe_set).reshape(grid_shape)

    # Keep the two plotted dimensions, fix all others at the middle grid index.
    index = [size // 2 for size in grid_shape]
    index[first] = slice(None)
    index[second] = slice(None)
    section = safe_set[tuple(index)]
    if first > second:
        # Slicing keeps the axes in increasing order; put `first` on axis 0.
        section = section.T

    limits = np.asarray(disc.limits)
    # imshow draws rows along the vertical axis, hence the transpose.
    plt.imshow(section.T,
               origin='lower',
               extent=limits[[first, second]].ravel(),
               vmin=0,
               vmax=1)

    if isinstance(lyapunov.dynamics, safe_learning.UncertainFunction):
        # Overlay the inputs stored in the first component of the dynamics.
        X = lyapunov.dynamics.functions[0].X
        plt.plot(X[:, first], X[:, second], 'rx')

    plt.title('safe set')
    plt.xlabel('state dimension {}'.format(first))
    plt.ylabel('state dimension {}'.format(second))
    plt.colorbar()
    if show:
        plt.show()

lyapunov.update_safe_set()
#plot_safe_set(lyapunov)

## Safe policy update

We do dynamic programming, but enforce the decrease condition on the Lyapunov function using a Lagrange multiplier.

In [13]:
# Graph for the safe policy update. It has the same structure as the mean
# optimization, but the future values are computed with the Lyapunov function
# passed in, and a smaller learning rate is used.
with tf.name_scope('policy_optimization'):

    # Placeholder for states
    tf_states = tf.placeholder(safe_learning.config.dtype, [None, state_dim])

    # Add Lyapunov uncertainty (but only if safety-relevant)
    values = rl.future_values(tf_states, lyapunov=lyapunov)

    # Negated mean value: minimising the loss maximises the expected value.
    policy_loss = -tf.reduce_mean(values)


    # Only the policy parameters are updated by this optimizer step.
    optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.01)
    adapt_policy = optimizer.minimize(policy_loss, var_list=rl.policy.parameters)
    
    
def rl_optimize_policy(num_iter):
    """Optimize the value function once, then run ``num_iter`` policy updates.

    Every policy update uses a fresh batch of 1000 states drawn with
    ``lyapunov.discretization.sample_continuous``.
    """
    # Value-function step (once per call)
    session.run(rl_opt_value_function, feed_dict=rl.feed_dict)

    # Policy steps on random training batches
    for _step in tqdm(range(num_iter)):
        batch = lyapunov.discretization.sample_continuous(1000)
        rl.feed_dict[tf_states] = batch
        session.run(adapt_policy, feed_dict=rl.feed_dict)

# Exploration

We explore close to the current policy by sampling the most uncertain state that does not leave the current level set

In [14]:
# Action offsets handed to `get_safe_sample` when selecting a new data point
action_variation = np.array([[-0.02], [0.], [0.02]], dtype=OPTIONS.np_dtype)


# Graph that evaluates the true dynamics at a single (state, action) row
with tf.name_scope('add_new_measurement'):
    tf_max_state_action = tf.placeholder(OPTIONS.tf_dtype, shape=[1, full_dim])
    tf_measurement = true_dynamics(tf_max_state_action)
        
def update_gp():
    """Add one actively selected measurement of the true dynamics to the GP model."""
    # Choose the next (state, action) pair to evaluate
    state_action, _ = safe_learning.get_safe_sample(
        lyapunov, action_variation, action_limits, num_samples=1000)

    # Evaluate the true dynamics at that point
    lyapunov.feed_dict[tf_max_state_action] = state_action
    observation = tf_measurement.eval(feed_dict=lyapunov.feed_dict)

    # Store the new data point in the GP dynamics
    lyapunov.dynamics.add_data_point(state_action, observation)
    

# Run the optimization

In [15]:
# Two rounds of optimization. Each call runs one value-function optimization
# followed by 100 policy updates.
# NOTE(review): the recorded output of this cell is a MemoryError raised while
# running the value-function (cvx) optimization, so as saved this cell does not
# complete on the recording machine.
# lyapunov.update_safe_set()
rl_optimize_policy(num_iter=100)
rl_optimize_policy(num_iter=100)

# Recompute the safe set for the updated policy
lyapunov.update_safe_set()
#plot_safe_set(lyapunov)

  if self.max_big_small_squared < big*small**2:
  self.max_big_small_squared = big*small**2


ResourceExhaustedError: MemoryError: 
Traceback (most recent call last):

  File "C:\Users\petar\Anaconda3\lib\site-packages\tensorflow\python\ops\script_ops.py", line 206, in __call__
    ret = func(*args)

  File "C:\Users\petar\Anaconda3\lib\site-packages\safe_learning\reinforcement_learning.py", line 171, in _run_cvx_optimization
    prob.solve(**solver_options)

  File "C:\Users\petar\Anaconda3\lib\site-packages\cvxpy\problems\problem.py", line 268, in solve
    return solve_func(self, *args, **kwargs)

  File "C:\Users\petar\Anaconda3\lib\site-packages\cvxpy\problems\problem.py", line 381, in _solve
    data, inverse_data = self._solving_chain.apply(self)

  File "C:\Users\petar\Anaconda3\lib\site-packages\cvxpy\reductions\chain.py", line 65, in apply
    problem, inv = r.apply(problem)

  File "C:\Users\petar\Anaconda3\lib\site-packages\cvxpy\reductions\qp2quad_form\qp2symbolic_qp.py", line 51, in apply
    return super(Qp2SymbolicQp, self).apply(problem)

  File "C:\Users\petar\Anaconda3\lib\site-packages\cvxpy\reductions\canonicalization.py", line 44, in apply
    constraint)

  File "C:\Users\petar\Anaconda3\lib\site-packages\cvxpy\reductions\canonicalization.py", line 74, in canonicalize_tree
    canon_arg, c = self.canonicalize_tree(arg)

  File "C:\Users\petar\Anaconda3\lib\site-packages\cvxpy\reductions\canonicalization.py", line 74, in canonicalize_tree
    canon_arg, c = self.canonicalize_tree(arg)

  File "C:\Users\petar\Anaconda3\lib\site-packages\cvxpy\reductions\canonicalization.py", line 74, in canonicalize_tree
    canon_arg, c = self.canonicalize_tree(arg)

  File "C:\Users\petar\Anaconda3\lib\site-packages\cvxpy\reductions\canonicalization.py", line 77, in canonicalize_tree
    canon_expr, c = self.canonicalize_expr(expr, canon_args)

  File "C:\Users\petar\Anaconda3\lib\site-packages\cvxpy\reductions\canonicalization.py", line 90, in canonicalize_expr
    return Constant(expr.value), []

  File "C:\Users\petar\Anaconda3\lib\site-packages\cvxpy\atoms\atom.py", line 266, in value
    return self._value_impl()

  File "C:\Users\petar\Anaconda3\lib\site-packages\cvxpy\atoms\atom.py", line 289, in _value_impl
    result = self.numeric(arg_values)

  File "C:\Users\petar\Anaconda3\lib\site-packages\cvxpy\atoms\atom.py", line 376, in new_numeric
    result = numeric_func(self, values)

  File "C:\Users\petar\Anaconda3\lib\site-packages\cvxpy\atoms\affine\promote.py", line 67, in numeric
    return np.ones(self.promoted_shape) * values[0]

  File "C:\Users\petar\Anaconda3\lib\site-packages\numpy\core\numeric.py", line 188, in ones
    a = empty(shape, dtype, order)

MemoryError


	 [[node rl_mean_optimization/optimize_value_function/run_cvx_optimization (defined at C:\Users\petar\Anaconda3\lib\site-packages\safe_learning\utilities.py:65)  = PyFunc[Tin=[DT_DOUBLE, DT_DOUBLE], Tout=[DT_DOUBLE], token="pyfunc_0", _device="/job:localhost/replica:0/task:0/device:CPU:0"](function_stack/evaluate/stacked_mean, reward_function/evaluate/Sum)]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.


Caused by op 'rl_mean_optimization/optimize_value_function/run_cvx_optimization', defined at:
  File "C:\Users\petar\Anaconda3\lib\runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "C:\Users\petar\Anaconda3\lib\runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "C:\Users\petar\Anaconda3\lib\site-packages\ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "C:\Users\petar\Anaconda3\lib\site-packages\traitlets\config\application.py", line 658, in launch_instance
    app.start()
  File "C:\Users\petar\Anaconda3\lib\site-packages\ipykernel\kernelapp.py", line 478, in start
    self.io_loop.start()
  File "C:\Users\petar\Anaconda3\lib\site-packages\zmq\eventloop\ioloop.py", line 177, in start
    super(ZMQIOLoop, self).start()
  File "C:\Users\petar\Anaconda3\lib\site-packages\tornado\ioloop.py", line 888, in start
    handler_func(fd_obj, events)
  File "C:\Users\petar\Anaconda3\lib\site-packages\tornado\stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "C:\Users\petar\Anaconda3\lib\site-packages\zmq\eventloop\zmqstream.py", line 440, in _handle_events
    self._handle_recv()
  File "C:\Users\petar\Anaconda3\lib\site-packages\zmq\eventloop\zmqstream.py", line 472, in _handle_recv
    self._run_callback(callback, msg)
  File "C:\Users\petar\Anaconda3\lib\site-packages\zmq\eventloop\zmqstream.py", line 414, in _run_callback
    callback(*args, **kwargs)
  File "C:\Users\petar\Anaconda3\lib\site-packages\tornado\stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "C:\Users\petar\Anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 283, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "C:\Users\petar\Anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 233, in dispatch_shell
    handler(stream, idents, msg)
  File "C:\Users\petar\Anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 399, in execute_request
    user_expressions, allow_stdin)
  File "C:\Users\petar\Anaconda3\lib\site-packages\ipykernel\ipkernel.py", line 208, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "C:\Users\petar\Anaconda3\lib\site-packages\ipykernel\zmqshell.py", line 537, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "C:\Users\petar\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2728, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "C:\Users\petar\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2850, in run_ast_nodes
    if self.run_code(code, result):
  File "C:\Users\petar\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2910, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-8-78b7cfcb4847>", line 22, in <module>
    rl_opt_value_function = rl.optimize_value_function()
  File "C:\Users\petar\Anaconda3\lib\site-packages\safe_learning\utilities.py", line 109, in wrapped_function
    return function(*args, **kwargs)
  File "C:\Users\petar\Anaconda3\lib\site-packages\safe_learning\reinforcement_learning.py", line 209, in optimize_value_function
    **solver_options)
  File "C:\Users\petar\Anaconda3\lib\site-packages\safe_learning\utilities.py", line 65, in wrapped_function
    stateful=stateful, name=name)
  File "C:\Users\petar\Anaconda3\lib\site-packages\tensorflow\python\ops\script_ops.py", line 457, in py_func
    func=func, inp=inp, Tout=Tout, stateful=stateful, eager=False, name=name)
  File "C:\Users\petar\Anaconda3\lib\site-packages\tensorflow\python\ops\script_ops.py", line 281, in _internal_py_func
    input=inp, token=token, Tout=Tout, name=name)
  File "C:\Users\petar\Anaconda3\lib\site-packages\tensorflow\python\ops\gen_script_ops.py", line 131, in py_func
    "PyFunc", input=input, token=token, Tout=Tout, name=name)
  File "C:\Users\petar\Anaconda3\lib\site-packages\tensorflow\python\framework\op_def_library.py", line 787, in _apply_op_helper
    op_def=op_def)
  File "C:\Users\petar\Anaconda3\lib\site-packages\tensorflow\python\util\deprecation.py", line 488, in new_func
    return func(*args, **kwargs)
  File "C:\Users\petar\Anaconda3\lib\site-packages\tensorflow\python\framework\ops.py", line 3274, in create_op
    op_def=op_def)
  File "C:\Users\petar\Anaconda3\lib\site-packages\tensorflow\python\framework\ops.py", line 1770, in __init__
    self._traceback = tf_stack.extract_stack()

ResourceExhaustedError (see above for traceback): MemoryError: 
Traceback (most recent call last):

  File "C:\Users\petar\Anaconda3\lib\site-packages\tensorflow\python\ops\script_ops.py", line 206, in __call__
    ret = func(*args)

  File "C:\Users\petar\Anaconda3\lib\site-packages\safe_learning\reinforcement_learning.py", line 171, in _run_cvx_optimization
    prob.solve(**solver_options)

  File "C:\Users\petar\Anaconda3\lib\site-packages\cvxpy\problems\problem.py", line 268, in solve
    return solve_func(self, *args, **kwargs)

  File "C:\Users\petar\Anaconda3\lib\site-packages\cvxpy\problems\problem.py", line 381, in _solve
    data, inverse_data = self._solving_chain.apply(self)

  File "C:\Users\petar\Anaconda3\lib\site-packages\cvxpy\reductions\chain.py", line 65, in apply
    problem, inv = r.apply(problem)

  File "C:\Users\petar\Anaconda3\lib\site-packages\cvxpy\reductions\qp2quad_form\qp2symbolic_qp.py", line 51, in apply
    return super(Qp2SymbolicQp, self).apply(problem)

  File "C:\Users\petar\Anaconda3\lib\site-packages\cvxpy\reductions\canonicalization.py", line 44, in apply
    constraint)

  File "C:\Users\petar\Anaconda3\lib\site-packages\cvxpy\reductions\canonicalization.py", line 74, in canonicalize_tree
    canon_arg, c = self.canonicalize_tree(arg)

  File "C:\Users\petar\Anaconda3\lib\site-packages\cvxpy\reductions\canonicalization.py", line 74, in canonicalize_tree
    canon_arg, c = self.canonicalize_tree(arg)

  File "C:\Users\petar\Anaconda3\lib\site-packages\cvxpy\reductions\canonicalization.py", line 74, in canonicalize_tree
    canon_arg, c = self.canonicalize_tree(arg)

  File "C:\Users\petar\Anaconda3\lib\site-packages\cvxpy\reductions\canonicalization.py", line 77, in canonicalize_tree
    canon_expr, c = self.canonicalize_expr(expr, canon_args)

  File "C:\Users\petar\Anaconda3\lib\site-packages\cvxpy\reductions\canonicalization.py", line 90, in canonicalize_expr
    return Constant(expr.value), []

  File "C:\Users\petar\Anaconda3\lib\site-packages\cvxpy\atoms\atom.py", line 266, in value
    return self._value_impl()

  File "C:\Users\petar\Anaconda3\lib\site-packages\cvxpy\atoms\atom.py", line 289, in _value_impl
    result = self.numeric(arg_values)

  File "C:\Users\petar\Anaconda3\lib\site-packages\cvxpy\atoms\atom.py", line 376, in new_numeric
    result = numeric_func(self, values)

  File "C:\Users\petar\Anaconda3\lib\site-packages\cvxpy\atoms\affine\promote.py", line 67, in numeric
    return np.ones(self.promoted_shape) * values[0]

  File "C:\Users\petar\Anaconda3\lib\site-packages\numpy\core\numeric.py", line 188, in ones
    a = empty(shape, dtype, order)

MemoryError


	 [[node rl_mean_optimization/optimize_value_function/run_cvx_optimization (defined at C:\Users\petar\Anaconda3\lib\site-packages\safe_learning\utilities.py:65)  = PyFunc[Tin=[DT_DOUBLE, DT_DOUBLE], Tout=[DT_DOUBLE], token="pyfunc_0", _device="/job:localhost/replica:0/task:0/device:CPU:0"](function_stack/evaluate/stacked_mean, reward_function/evaluate/Sum)]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.



In [None]:
# Alternate between collecting data and improving the policy. The inner loop
# uses its own variable so that it no longer overwrites the outer iteration
# counter.
for iteration in range(5):
    print('iteration {} with c_max: {}'.format(iteration, lyapunov.feed_dict[lyapunov.c_max]))

    # Add 10 actively selected measurements to the GP model
    for _sample in tqdm(range(10)):
        update_gp()

    rl_optimize_policy(num_iter=200)
    lyapunov.update_values()

    # Update safe set and plot
    lyapunov.update_safe_set()
    #plot_safe_set(lyapunov)

# Plot trajectories and analyse improvement

In [None]:
# Initial state for the comparison roll-outs.
# NOTE(review): with the state order (x, theta, x_dot, theta_dot) given by
# `state_norm`, the first entry (pi/2) is the cart position and lies outside
# the [-1, 1] domain used for the grids. Confirm this start state is intended.
x0 = np.array([[math.pi*0.5, -0.0, 0.5, 0.0]])

# Roll out the true dynamics from x0 under the learned policy ("new") and the
# initial saturated LQR policy ("old"); 500 is presumably the number of steps.
states_new, actions_new = safe_learning.utilities.compute_trajectory(true_dynamics, rl.policy, x0, 500)
states_old, actions_old = safe_learning.utilities.compute_trajectory(true_dynamics, init_policy, x0, 500)

# Time axis: one entry per state sample, spaced by the sampling time
t = np.arange(len(states_new)) * true_dynamics.dt

In [None]:
# The state order follows `state_norm`: (x, theta, x_dot, theta_dot). Column 0
# is therefore the cart position and column 1 the pole angle.
# Units are omitted on purpose: the dynamics were built with normalizers, so
# whether these states are physical or normalized should be confirmed.
for column, ylabel in ((0, 'cart position'), (1, 'pole angle')):
    plt.plot(t, states_new[:, column], label='new')
    plt.plot(t, states_old[:, column], label='old')
    plt.xlabel('time [s]')
    plt.ylabel(ylabel)
    plt.legend()
    plt.show()

In [None]:
# The action sequences are plotted against t[:-1], i.e. they have one entry
# fewer than the state trajectories.
plt.plot(t[:-1], actions_new, label='new')
plt.plot(t[:-1], actions_old, label='old')
plt.xlabel('time [s]')
plt.ylabel('actions')
plt.legend()
# Render explicitly, as the state plots do, instead of leaving the Legend
# object as the cell output.
plt.show()

In [None]:
# Total reward collected along each roll-out. The last state is dropped so
# that states and actions have the same length.
for caption, states, actions in (('reward old:', states_old, actions_old),
                                 ('reward new:', states_new, actions_new)):
    total_reward = tf.reduce_sum(rl.reward_function(states[:-1], actions))
    print(caption, total_reward.eval(feed_dict=rl.feed_dict))