In [None]:
from __future__ import division, print_function

import GPy
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import colors
%matplotlib inline

# Helper modules that live next to this notebook. `utilities` is used below to
# load safe_learning from the parent folder if it is not installed.
import utilities
from plotting import plot_lyapunov_1d

# If library not installed, import it from '../'
safe_learning = utilities.import_from_directory('safe_learning', '../')

### Goal:

Optimize over the policy such that the safe set does not shrink.

We start by defining a discretization of the space $[-1, 1]$ with discretization constant $\tau$.

In [None]:
# Limits of the 1-D state and action spaces (one [min, max] row per dimension)
# and the number of grid points used for each of them.
state_limits = np.array([[-1., 1.]])
action_limits = np.array([[-1., 1.]])
num_states = 1000
num_actions = 101

# Joint grid over (state, action) pairs, plus separate state-only and
# action-only grids built from the same limits and point counts.
discretization = safe_learning.GridWorld(np.vstack((state_limits, action_limits)),
                                         [num_states, num_actions])
action_space = safe_learning.GridWorld(action_limits, num_actions).all_points
state_space = safe_learning.GridWorld(state_limits, num_states).all_points

# Discretization constant: first entry of the joint grid's `unit_maxes`
# (presumably the grid spacing along the state axis -- TODO confirm in GridWorld).
tau = discretization.unit_maxes.squeeze()[0]

# Initial policy: All zeros
# Repeats the middle point of the action grid once per state, giving a column
# with len(state_space) rows. NOTE(review): that middle point is zero only if
# GridWorld places its 101 points symmetrically on [-1, 1] -- confirm.
policy = action_space[num_actions // 2] * np.ones((len(state_space), 1))

print('Grid size: {0}'.format(len(state_space)))

### Define GP dynamics model

In [None]:
# GP prior over the dynamics: product of a Matern-3/2 kernel and an ARD linear
# kernel, both defined on the 2-D input (state, action).
kernel = (GPy.kern.Matern32(2, lengthscale=1, active_dims=[0, 1]) *
          GPy.kern.Linear(2, variances=[1.2, 1], ARD=True))
# kernel = GPy.kern.Linear(2, variances=0.004**2)
# Observation-noise variance; used both for the GP model and for the noise that
# true_dynamics adds to simulated measurements.
noise_var = 0.01 ** 2
# Scaling factor passed to GPyGaussianProcess and used in gp_lipschitz below.
beta = 2.

# Mean dynamics
# Prior mean mapping 2 inputs to 1 output: it returns the first input column
# (the state, since inputs are stacked as (state, action)).
mf = GPy.core.Mapping(2, 1)
mf.f = lambda x: x[:, [0]]
# update_gradients is stubbed out as a no-op.
mf.update_gradients = lambda a, b: None

# Lipschitz constant assumed for the mean part of the dynamics.
mean_lipschitz = 0.8
# Lipschitz estimate for the GP part: beta * sqrt(Matern variance) / lengthscale,
# scaled by the largest absolute state limit.
gp_lipschitz = beta * np.sqrt(kernel.Mat32.variance) / kernel.Mat32.lengthscale * np.max(np.abs(state_limits))
# Overall Lipschitz constant handed to LyapunovDiscrete.
lipschitz_dynamics = mean_lipschitz + gp_lipschitz

# Define one sample as the true dynamics
# true_dynamics = safe_learning.sample_gp_function(
#     kernel,
#     np.vstack([state_limits, action_limits]),
#     num_samples=20,
#     noise_var=noise_var,
#     mean_function=mf.f)

def true_dynamics(x, u, noise=False):
    """Return the next state of the true system, ``1.2 * x + u``.

    Parameters
    ----------
    x : ndarray or float
        Current state(s).
    u : ndarray or float
        Applied action(s); must broadcast against ``x``.
    noise : bool, optional
        If True, zero-mean Gaussian noise with variance ``noise_var``
        (module-level constant) is added to the returned next state.

    Returns
    -------
    x_next : ndarray or float
        The (optionally noisy) next state.
    """
    x_next = 1.2 * x + u
    if noise:
        # Draw the noise with the shape of the result so that broadcast inputs
        # get independent noise per entry, then return the noisy value (it used
        # to be computed and silently discarded).
        x_next = x_next + np.sqrt(noise_var) * np.random.randn(*np.shape(x_next))
    return x_next

# Define a GP model over the dynamics
# The model is seeded with a single observation: input (0, 0) maps to output 0.
# It uses the kernel, noise variance and mean function defined above.
gp = GPy.models.GPRegression(np.array([[0, 0]]),
                             np.array([[0]]),
                             kernel,
                             noise_var=noise_var,
                             mean_function=mf)

# Wrap the GPy model for safe_learning; beta is forwarded to the wrapper
# (presumably to scale its confidence intervals -- TODO confirm).
dynamics = safe_learning.GPyGaussianProcess(gp, beta=beta)

### Define Lyapunov function

In [None]:
# lyapunov_function = safe_learning.QuadraticFunction(np.array([[1]]))
# Lyapunov candidate: a triangulation over the state limits with 3 vertices and
# vertex values [1, 0, 1] (presumably |x| on [-1, 1] if the vertices sit at
# -1, 0 and 1 -- TODO confirm).
lyapunov_function = safe_learning.Triangulation(state_limits, 3, vertex_values=[1, 0, 1])
# Largest absolute gradient over the state grid, used as the Lipschitz constant
# of the Lyapunov function.
lypschitz_lyapunov = np.max(np.abs(lyapunov_function.gradient(state_space)))

# Discrete Lyapunov analysis over the state grid for the current policy; no
# initial safe set is given here.
lyapunov = safe_learning.LyapunovDiscrete(state_space, lyapunov_function, dynamics,
                                          lipschitz_dynamics, lypschitz_lyapunov, tau,
                                          initial_set=None, policy=policy)

# Specify the desired accuracy
# Set to the largest Lyapunov value on the grid divided by 1e10.
accuracy = np.max(lyapunov.V) / 1e10

### Initial safe set 

In [None]:
# Mark every grid state with absolute value below 0.05 as initially safe.
lyapunov.initial_safe_set = np.abs(lyapunov.discretization.squeeze()) < 0.05

# Recompute the safe set to the requested accuracy, then plot it.
lyapunov.update_safe_set(accuracy=accuracy)
plot_lyapunov_1d(lyapunov, true_dynamics, legend=True)

### Reinforcement learning for the mean dynamics

In [None]:
# Mean-function view of the GP dynamics; used as the model for policy iteration below.
mean_dynamics = dynamics.to_mean_function()

def reward(state, action, next_state):
    """Quadratic state penalty: the negative squared state.

    ``action`` and ``next_state`` are accepted for the reward-function
    signature but are not used.
    """
    return -(state ** 2)

def reward_derivative(state, action):
    """Derivative of ``reward`` with respect to the state.

    The reward is ``-(state ** 2)``, so its state derivative is
    ``-2 * state``. ``action`` is unused because the reward does not depend
    on it.

    NOTE(review): only the state derivative is returned. If callers expect
    one column per input (state and action), a zero action column would
    have to be appended -- confirm against DeterministicFunction.
    """
    return -2 * state

# Bundle the reward and its derivative into a function object for the RL solver.
reward_function = safe_learning.DeterministicFunction.from_callable(
    reward, reward_derivative)

# Value-function approximator: a triangulation over the state limits with
# num_states points (project=True), with every vertex value initialised to one.
value_function = safe_learning.Triangulation(state_limits, num_states, project=True)
value_function.vertex_values = np.ones(value_function.nindex)

# Policy iteration on the mean dynamics with discount factor 0.9; states with
# |x| <= 0.05 are flagged as terminal.
rl = safe_learning.PolicyIteration(
    state_space,
    action_space,
    mean_dynamics,
    reward_function,
    function_approximator=value_function,
    gamma=0.9,
    terminal_states=np.abs(state_space[:, 0]) <= 0.05)

### Plot the dynamics

Note that the initial policy is just all zeros!!!

In [None]:
# inputs: lyapunov

def get_safe_actions(lyapunov):
    """Compute safe state-action pairs on a grid.

    Every action in the module-level ``action_space`` is applied as a
    constant policy to all states, and ``lyapunov.safety_constraint`` is
    evaluated for it.

    Parameters
    ----------
    lyapunov : object
        Must provide ``safety_constraint(policy)``. It is called with a
        ``(num_states, 1)`` array and its result is stored as one column
        of the output.

    Returns
    -------
    safe : ndarray
        Boolean array of shape ``(num_states, num_actions)``. Each row
        represents one state, each column one action.
    """
    # Use the builtin types: the np.bool / np.float aliases were deprecated
    # and are no longer available in NumPy >= 1.24.
    safe = np.empty((num_states, num_actions), dtype=bool)
    constant_policy = np.array([0], dtype=float)
    # Read-only broadcast *view* of constant_policy: writing a new action into
    # constant_policy below changes every row of policy_array without copying.
    policy_array = np.broadcast_to(constant_policy, (num_states, 1))

    for i, u in enumerate(action_space):
        constant_policy[:] = u
        safe[:, i] = lyapunov.safety_constraint(policy_array)

    return safe


def plot_things():
    """Plot the dynamics bound, safe actions, policy and Lyapunov function.

    Top panel (state vs. action): an image of the second output of
    ``lyapunov.dynamics`` over the joint state-action grid, the inputs of the
    GP's data points (crosses), a semi-transparent white overlay on the
    state-action pairs reported by ``get_safe_actions``, and the current
    policy. Bottom panel: the Lyapunov values, red inside the level set
    ``V <= cmax`` and blue outside. Vertical red lines mark the smallest and
    largest state inside that level set.

    Reads the module-level ``lyapunov``, ``discretization``, ``state_limits``,
    ``action_limits``, ``state_space`` and ``accuracy``.

    Side effect: overwrites ``lyapunov.v_dot_negative``.
    """
    # 2x2 grid: wide left column for the plots, narrow right column for colorbars.
    fig, axes = plt.subplots(2, 2, gridspec_kw={'width_ratios': [30, 1]})

    # Hide fake cax
    # Only the top-right axis is used (as colorbar axis); the bottom-right one is hidden.
    cax, cax1 = axes[:, 1]
    cax1.set_visible(False)

    ax0, ax1 = axes[:, 0]
    ax0.set_ylabel('action')
    ax1.set_xlabel('state')
    ax1.set_ylabel('$v(x)$')

    ax1.set_ylim(0, np.max(lyapunov.V))
    ax1.set_xlim(state_limits.squeeze())
    ax0.set_xlim(state_limits.squeeze())
    ax0.set_ylim(action_limits.squeeze())
    ax0.set_xticks([])

    # Hide x-ticks of ax0
    plt.setp(ax0.get_xticklabels(), visible=False)

    # width between cax and main axis
    plt.subplots_adjust(wspace=.05)

    # Plot the dynamics
    # Evaluate the model on the joint (state, action) grid; only `bound` is drawn.
    mean, bound = lyapunov.dynamics(discretization.all_points)
    # Reshape to the grid's point counts and transpose for imshow
    # (presumably so that states run along the x-axis -- TODO confirm).
    img = ax0.imshow(bound.reshape(discretization.num_points).T,
                     origin='lower',
                     extent=discretization.limits.ravel(),
                     aspect='auto')
    # Mark the (state, action) inputs of the data already in the GP.
    ax0.plot(lyapunov.dynamics.gaussian_process.X[:, 0],
             lyapunov.dynamics.gaussian_process.X[:, 1], 'x')
    plt.colorbar(img, cax=cax)

    # Mask out the unsafe pairs so that only safe ones are painted by the overlay.
    safe = get_safe_actions(lyapunov)    
    safe_mask = np.ma.masked_where(~safe, safe)
    cmap = colors.ListedColormap(['white'])

    # Overlay the safety feature
    img = ax0.imshow(safe_mask.T,
                     origin='lower',
                     extent=discretization.limits.ravel(),
                     alpha=0.5, cmap=cmap,
                     aspect='auto')

    # A state counts as decreasing if at least one action is safe for it.
    # NOTE: this mutates the shared lyapunov object before computing the level set.
    lyapunov.v_dot_negative = np.any(safe, axis=1)
    cmax = lyapunov.max_safe_levelset(accuracy)
    is_safe = lyapunov.V <= cmax

    # Plot the Lyapunov function
    # Two complementary masked copies so the safe and unsafe parts get different colors.
    lyap_safe = np.ma.masked_where(~is_safe, lyapunov.V)
    lyap_unsafe = np.ma.masked_where(is_safe, lyapunov.V)

    # Plot lines for the boundary of the safety feature
    x_min_safe = np.min(lyapunov.discretization[is_safe])
    x_max_safe = np.max(lyapunov.discretization[is_safe])

    ax1.plot(lyapunov.discretization, lyap_safe, 'r')
    ax1.plot(lyapunov.discretization, lyap_unsafe, 'b')

    kw_axv = {'color': 'red',
              'alpha': 0.5}
    # ymin/ymax are axes fractions; ymin=-0.2 with clip_on=False lets the line in
    # the top panel extend below its axes toward the bottom panel.
    ax0.axvline(x=x_min_safe, ymin=-0.2, ymax=1, clip_on=False, **kw_axv)
    ax1.axvline(x=x_min_safe, ymin=0, ymax=1, clip_on=False, **kw_axv)

    ax0.axvline(x=x_max_safe, ymin=-0.2, ymax=1, clip_on=False, **kw_axv)
    ax1.axvline(x=x_max_safe, ymin=0, ymax=1, clip_on=False, **kw_axv)
    
    # Plot the current policy
    ax0.plot(state_space, lyapunov.policy, label='policy')

    ax0.legend()
    plt.show()
    
# Draw the figure for the current state of `lyapunov` (initial policy and GP data).
plot_things()

## Online learning
As we sample within this initial safe set, we gain more knowledge about the system. In particular, we iteratively select the state-action pair within the safe set, $\mathcal{S}_n$, where the dynamics are the most uncertain (highest variance).

In [None]:
def update_gp():
    """Add one actively chosen measurement to the GP dynamics model.

    Among all state-action pairs that are reported safe and whose state lies
    inside the largest safe level set, the pair with the largest second
    output of ``lyapunov.dynamics`` is evaluated on ``true_dynamics`` (with
    noise) and appended to the GP data. Also overwrites
    ``lyapunov.v_dot_negative``.
    """
    safe_pairs = get_safe_actions(lyapunov)
    lyapunov.v_dot_negative = safe_pairs.any(axis=1)

    # Drop every pair whose state is outside the largest safe level set.
    level = lyapunov.max_safe_levelset(accuracy)
    safe_pairs[lyapunov.V > level, :] = False

    # Candidate (state, action) pairs: one row per remaining safe entry.
    state_idx, action_idx = np.nonzero(safe_pairs)
    candidate_states = state_space[state_idx, :]
    candidate_actions = action_space[action_idx, :]

    # Pick the candidate with the largest uncertainty.
    _, uncertainty = lyapunov.dynamics(candidate_states, candidate_actions)
    best = np.argmax(uncertainty)
    chosen_state = candidate_states[[best], :]
    chosen_action = candidate_actions[[best], :]

    # Query the true system and store the new (input, output) pair in the GP.
    observation = true_dynamics(chosen_state, chosen_action, noise=True)[:, [0]]
    gp_input = np.concatenate((chosen_state, chosen_action), axis=1)
    lyapunov.dynamics.add_data_point(gp_input, observation)

In [None]:
# Update the GP model a couple of times
# Each call to update_gp() adds one new measurement, so this adds 25 data points.
for i in range(25):
    update_gp()
    
# Redraw the figure with the enlarged data set (policy unchanged).
plot_things()

### With the initial policy things still look bad

In [None]:
# Plot the new safe set
# Recompute the safe set with the updated GP model before plotting it.
lyapunov.update_safe_set(accuracy=accuracy)
plot_lyapunov_1d(lyapunov, true_dynamics, legend=True)

### So let's update that policy!

In [None]:
# for i in range(5):
#     rl.optimize_value_function()
#     rl.update_policy(constraint=lyapunov.safety_constraint)

# # Update the policy wherever safe
# safe = lyapunov.safety_constraint(rl.policy)
# safe &= ~lyapunov.initial_safe_set
# lyapunov.policy[safe] = rl.policy[safe]

# lyapunov.update_safe_set(accuracy=accuracy)
# plot_lyapunov_1d(lyapunov, true_dynamics, legend=False)

In [None]:
# plot_things()

In [None]:
# Display the gradient of the RL dynamics model for the arguments (0, 0)
# (presumably state 0 and action 0 -- TODO confirm); nothing is stored.
rl.dynamics.gradient(0, 0)

In [None]:
# Parametric policy: a triangulation over the state limits with 5 points, with
# all parameters initialised to zero.
# NOTE: this rebinds the name `policy`, which earlier in the notebook held the
# initial policy array.
policy = safe_learning.Triangulation(state_limits, 5)
policy.parameters = np.zeros_like(policy.all_points)

### TODO: Only optimize for performance when safe!

In [None]:
# def fun(param, x):
#     x = rl.state_space
#     policy.parameters = param
#     u = policy(x)
    
#     reward = rl.reward_function(x, u, x)
#     res = np.sum(reward + rl.value_function(rl.dynamics(x, u)))
#     return res
    
# def gradient(param):
#     x = rl.state_space
#     policy.parameters = param
    
#     u = policy(x)
    
#     v_grad = rl.value_function.gradient(rl.dynamics(x, u))
#     d_grad = rl.dynamics.gradient(x, u)[:, 1:]
#     p_grad = policy.parameter_derivative(x).toarray()
    
#     r_grad = rl.reward_function.gradient(x, u)[:, 1:]
        
#     gradient = (r_grad + v_grad * d_grad) * p_grad
#     res = np.sum(gradient, axis=0)
#     return res

# def constraint(param):
#     policy.parameters = param
    
#     x = lyapunov.discretization
#     u = policy(x)
#     prediction = lyapunov.dynamics(x, u)

#     if lyapunov.uncertain_dynamics:
#         v_dot, v_dot_error = lyapunov.v_decrease_confidence(*prediction)
#         # Upper bound on V_dot
#         v_dot_bound = v_dot + v_dot_error
#     else:
#         v_dot_bound, _ = lyapunov.v_decrease_confidence(prediction)

#     return np.sum(v_dot_bound)
    

def safety_value(param):
    """Objective for the policy search: summed bound on Delta_V.

    Writes ``param`` into the module-level ``policy``, evaluates that policy
    on every state of ``lyapunov.discretization`` and returns the sum of the
    resulting decrease values. With uncertain dynamics the error term is
    added to the decrease, giving an upper bound.
    """
    policy.parameters = param

    states = lyapunov.discretization
    predicted = lyapunov.dynamics(states, policy(states))

    if not lyapunov.uncertain_dynamics:
        # Single prediction: the returned decrease is used directly.
        decrease, _ = lyapunov.v_decrease_confidence(predicted)
        return np.sum(decrease)

    # The prediction is unpacked into the arguments of v_decrease_confidence.
    # Upper bound on V_dot = nominal decrease + error.
    nominal, error = lyapunov.v_decrease_confidence(*predicted)
    return np.sum(nominal + error)

from scipy.optimize import minimize

In [None]:
# Minimize the summed Delta_V bound over the policy parameters.
# The starting point is flattened because scipy's minimize works on 1-D
# parameter vectors (newer SciPy releases reject x0 with more dimensions).
result = minimize(safety_value, policy.parameters.copy().ravel())

# safety_value overwrites policy.parameters on every evaluation, so after the
# optimizer returns they hold the last probed point, which need not be the
# optimum. Store the optimizer's solution explicitly.
policy.parameters = result.x

# Display the optimization result as the cell output.
result

In [None]:
# Evaluate the parametric policy on the Lyapunov grid and use it as the new policy.
lyapunov.policy = policy(lyapunov.discretization)

In [None]:
# Redraw the figure for the updated policy.
plot_things()