In [None]:
from __future__ import division, print_function

import GPy
import numpy as np
import matplotlib.pyplot as plt
from functools import partial
%matplotlib inline

# Helper modules used by this notebook: `utilities` provides the
# import fallback used below, `plotting` the 1D Lyapunov plot.
import utilities
from plotting import plot_lyapunov_1d

# If library not installed, import it from '../'
safe_learning = utilities.import_from_directory('safe_learning', '../')

### Goal:

Optimize over the policy such that the safe set does not shrink.

We start by defining a discretization of the space $[-1, 1]$ with discretization constant $\tau$.

In [None]:
# State space limits (x_min, x_max) and number of grid points
extent = np.array([-1., 1.])
state_limits = extent
num_states = 1000

# Action space: `num_actions` evenly spaced inputs, stored as a column vector
action_limits = np.array([-1., 1.])
num_actions = 51
action_space = np.linspace(action_limits[0],
                           action_limits[1],
                           num_actions).reshape(-1, 1)

# Grid over the state space; `grid` and `state_space` name the same array
discretization = safe_learning.GridWorld([extent], num_states)
grid = state_space = discretization.all_points

# Discretization constant
tau = discretization.unit_maxes.squeeze()

# Initial policy: the middle entry of the action grid at every state.
# With symmetric limits and an odd number of actions this is the zero action.
middle_action = action_space[num_actions // 2]
policy = middle_action * np.ones((len(discretization), 1))

print('Grid size: {0}'.format(len(grid)))

We define the GP model using one particular sample of the GP, in addition to a stable, closed-loop, linear model:
$$x_{k+1} = 0.8 x_k + \pi(x_k) + g_\pi(x_k),$$
where $\pi$ is the policy and $g_\pi$ is the GP sample evaluated under that policy.

The prior dynamics are locally asymptotically stable. Moreover, in the one-dimensional case, the dynamics are stable as long as $|x_{k+1}| \leq |x_{k}|$.

In [None]:
# Prior over the unknown part of the dynamics; inputs are (state, action) pairs
matern_part = GPy.kern.Matern32(2, lengthscale=0.2, variance=0.4**2)
linear_part = GPy.kern.Linear(2, variances=1)
kernel = matern_part * linear_part
# Alternative prior that was tried:
# kernel = GPy.kern.Linear(2, variances=0.004**2)

# Observation noise (variance)
noise_var = 0.01 ** 2


def prior_mean(x):
    """Prior mean of the dynamics: 0.8 * (column 0) + (column 1) of `x`."""
    return 0.8 * x[:, [0]] + x[:, [1]]


def skip_gradient_update(a, b):
    """No-op gradient hook for the mean mapping; always returns None."""
    return None


# Mean dynamics, wrapped as a GPy mapping from 2 inputs to 1 output
mf = GPy.core.Mapping(2, 1)
mf.f = prior_mean
mf.update_gradients = skip_gradient_update

# Define one sample as the true dynamics
true_dynamics = safe_learning.sample_gp_function(
    kernel,
    [state_limits, action_limits],
    num_samples=20,
    noise_var=noise_var,
    interpolation='kernel',
    mean_function=mf.f)

# GP model over the dynamics, initialised with a single observation:
# input (0, 0) maps to output 0
gp = GPy.models.GPRegression(
    np.array([[0, 0]]),
    np.array([[0]]),
    kernel,
    noise_var=noise_var,
    mean_function=mf)

Additionally, we define a Lyapunov function. Unlike for multiple dimensions, in the one-dimensional case all radially increasing functions are equivalent. Here we pick the piecewise-linear function
$$V(x) = |x|$$
The previous GP model defines a GP model over $\dot{V}(x) = \frac{\partial V(x)}{\partial x} f(x)$. In the following, we only consider the 2-$\sigma$ upper confidence bound of this model. Since the dynamics are Lipschitz continuous, $\dot{V}$ is Lipschitz continuous as well.

In particular, we use Lemma 5 to determine an appropriate Lipschitz constant. For the sample path of the GP, we use the high-probability Lipschitz constant encoded by the kernel.

In [None]:
# Alternative Lyapunov candidates that were tried:
# lyapunov_function = safe_learning.QuadraticFunction(np.array([[1]]))
# n_linear = 10
# lyapunov_function = safe_learning.Triangulation(extent[None, :], n_linear)
# points = lyapunov_function.index_to_state(np.arange(lyapunov_function.nindex))
# lyapunov_function.vertex_values = points.squeeze() ** 2

# Piecewise-linear Lyapunov candidate with three vertices and values (1, 0, 1)
lyapunov_function = safe_learning.Triangulation(
    state_limits[None, :], 3, vertex_values=[1, 0, 1])

# Dynamics model built on the GP; beta=2 scales the confidence interval
dynamics = safe_learning.GPyGaussianProcess(gp, beta=2.)

# Lipschitz constant of the dynamics: slope of the linear part plus a bound
# derived from the Matern kernel hyperparameters.
# NOTE(review): the linear slope used here is 0.25, while the prior mean
# defined for the GP uses 0.8 * x -- confirm which value is intended.
L_dyn = (0.25
         + dynamics.beta(0) * np.sqrt(gp.kern.Mat32.variance)
         / gp.kern.Mat32.lengthscale * np.max(np.abs(extent)))

# Lipschitz constant of V: the largest gradient *magnitude* on the grid.
# Taking the absolute value matters whenever the steepest slope is negative.
L_V = np.max(np.abs(lyapunov_function.gradient(grid)))

lyapunov = safe_learning.LyapunovDiscrete(
    grid, lyapunov_function, dynamics, L_dyn, L_V, tau,
    initial_set=None, policy=policy)

# Specify the desired accuracy (relative to the largest Lyapunov value)
accuracy = np.max(lyapunov.V) / 1e10

### RL

In [None]:
# Model handed to policy iteration below, obtained via `to_mean_function()`
# (presumably the GP's mean prediction without uncertainty -- confirm).
mean_dynamics = dynamics.to_mean_function()

def reward_function(state, action, next_state):
    """Quadratic state penalty that vanishes close to the origin.

    Parameters
    ----------
    state : 2-D array with one state per row.
    action : unused.
    next_state : unused.

    Returns
    -------
    Array with the shape of `state` holding ``-state ** 2``, except that
    rows whose first column satisfies ``|x| <= 0.1`` are set to zero.
    """
    # Column mask so that a whole row is zeroed when its first entry is small
    near_origin = np.abs(state[:, [0]]) <= 0.1
    return np.where(near_origin, 0, -np.square(state))

# Function approximator for the value function, initialised to one at
# every vertex
value_function = safe_learning.Triangulation(
    state_limits[None, :],
    num_states,
    project=True)
value_function.vertex_values = np.ones(value_function.nindex)

# States within 0.1 of the origin are passed as terminal states
terminal_states = np.abs(state_space[:, 0]) <= 0.1

rl = safe_learning.PolicyIteration(
    state_space,
    action_space,
    mean_dynamics,
    reward_function,
    function_approximator=value_function,
    gamma=0.9,
    terminal_states=terminal_states)

In [None]:
# Ten rounds of policy iteration: value update followed by policy update
num_sweeps = 10
for sweep in range(num_sweeps):
    rl.update_value_function()
    rl.update_policy()

In [None]:
# Each figure gets axis labels and a title so that it can be read on its own.

# Values computed by policy iteration
plt.plot(rl.state_space, rl.values)
plt.xlabel('state')
plt.ylabel('value')
plt.title('Value function')
plt.show()

# Policy computed by policy iteration
plt.plot(rl.state_space, rl.policy)
plt.xlabel('state')
plt.ylabel('action')
plt.title('Policy')
plt.show()

# Output of the mean dynamics when that policy is applied
plt.plot(rl.state_space, mean_dynamics.evaluate(rl.state_space, rl.policy))
plt.xlabel('state')
plt.ylabel('mean dynamics output')
plt.title('Mean dynamics under the policy')
plt.show()

## Safety based on GP model

Let's start by plotting the prior over the dynamics and the associated prior over $\dot{V}(x)$.

In [None]:
# Initial safe set: grid points strictly closer than 0.1 to the origin
distance_to_origin = np.abs(lyapunov.discretization.squeeze())
lyapunov.initial_safe_set = distance_to_origin < 0.1

# Compute the safe set under the prior model and visualise it
lyapunov.update_safe_set(accuracy=accuracy)
plot_lyapunov_1d(lyapunov, true_dynamics, legend=True)

## Online learning
As we sample within this initial safe set, we gain more knowledge about the system. In particular, we iteratively select the state within the safe set, $\mathcal{S}_n$, where the dynamics are the most uncertain (highest variance).

In [None]:
def update_gp():
    """Update the GP model based on an actively selected data point.

    Recomputes the safe set, picks the safe grid point at which the
    dynamics model reports the largest uncertainty under the policy stored
    in ``lyapunov``, queries ``true_dynamics`` there (with noise) and adds
    the resulting observation to ``lyapunov.dynamics``.

    Returns
    -------
    safe_set : ``lyapunov.safe_set`` as used to select the sample.

    Raises
    ------
    ValueError : raised by ``np.argmax`` if the safe set is empty.
    """
    lyapunov.update_safe_set(accuracy=accuracy)
    safe_set = lyapunov.safe_set

    # Read the policy from `lyapunov` instead of the module-level `policy`,
    # so the sampled input always matches the policy used for the safe set,
    # even after `lyapunov.policy` has been modified.
    current_policy = lyapunov.policy

    # Maximum uncertainty in safe set (the predicted mean is not needed)
    _, dynamics_std = lyapunov.dynamics(grid, current_policy)
    max_id = np.argmax(dynamics_std[safe_set])
    max_state = grid[safe_set][[max_id], :]
    max_input = current_policy[safe_set][[max_id], :]

    # Add newly obtained data point to the GP; the GP input is (state, action)
    measurement = true_dynamics(max_state, max_input, noise=True)[:, [0]]
    gp_input = np.hstack((max_state, max_input))
    lyapunov.dynamics.add_data_point(gp_input, measurement)
    return safe_set

In [None]:
# Collect a first handful of measurements: four active-learning steps
num_initial_samples = 4
for sample_index in range(num_initial_samples):
    update_gp()

In [None]:
# Recompute the safe set with the current GP model and plot it
lyapunov.update_safe_set(accuracy=accuracy)
plot_lyapunov_1d(lyapunov, true_dynamics, legend=True)

We continue to sample like this until we find the maximum safe set.

In [None]:
# Twenty further active-learning steps
num_extra_samples = 20
for sample_index in range(num_extra_samples):
    update_gp()

# Recompute and plot the safe set with all collected measurements
lyapunov.update_safe_set(accuracy=accuracy)
plot_lyapunov_1d(lyapunov, true_dynamics, legend=False)

In [None]:
# Five rounds of policy iteration in which the policy update is restricted
# by the Lyapunov safety constraint.
for i in range(5):
    rl.optimize_value_function()
    rl.update_policy(constraint=lyapunov.safety_constraint)

# Update the policy wherever safe.
# `safe` is used as a boolean mask over the grid points; states belonging to
# the initial safe set are excluded, so the policy there stays unchanged.
safe = lyapunov.safety_constraint(rl.policy)
safe &= ~lyapunov.initial_safe_set
lyapunov.policy[safe] = rl.policy[safe]

# Recompute the safe set under the updated policy and plot it
lyapunov.update_safe_set(accuracy=accuracy)
plot_lyapunov_1d(lyapunov, true_dynamics, legend=False)