In [None]:
from __future__ import division, print_function

import tensorflow as tf
import GPflow
import numpy as np
from scipy import linalg
import matplotlib.pyplot as plt
from matplotlib import colors
%matplotlib inline

import safe_learning
import plotting
# Fix NumPy's global random seed so repeated runs draw the same random numbers.
np.random.seed(0)

# Close the session left over from a previous run of this cell, if any.
# On a fresh kernel `session` is not defined yet, hence the NameError guard.
try:
    session.close()
except NameError:
    pass

# Open a new session; later cells use it through `session.run(...)` and through
# `.eval(feed_dict=...)` calls that do not pass a session explicitly.
# NOTE(review): the initializer runs before any later cell adds to the graph;
# confirm that variables created afterwards are initialized elsewhere.
session = tf.InteractiveSession()
session.run(tf.global_variables_initializer())

### Goal:

Optimize over the policy such that the safe set does not shrink

We start by defining a discretization of the state space $[-1, 1]$ with discretization constant $\tau$.

In [None]:
# Limits ([min, max] per dimension) and number of grid points for states and actions
state_limits = np.array([[-1., 1.]])
action_limits = np.array([[-.5, .5]])
num_states = 1000
num_actions = 101

# Joint grid over (state, action), plus separate grids over states and over actions
discretization = safe_learning.GridWorld(np.vstack((state_limits, action_limits)),
                                         [num_states, num_actions])
state_disc = safe_learning.GridWorld(state_limits, num_states)
action_disc = safe_learning.GridWorld(action_limits, num_actions)

# action_space = action_disc.all_points
# state_space = state_disc.all_points

# Discretization constant
tau = np.max(state_disc.unit_maxes)

# Initial policy: All zeros
# NOTE(review): the policy is defined on `action_disc` (limits [-0.5, 0.5]), yet
# later cells evaluate it at states (e.g. `lyapunov.policy(states)`), whose limits
# are [-1, 1]. Confirm whether `state_disc` (or a dedicated state grid) was intended.
policy = safe_learning.Triangulation(action_disc, np.zeros(len(action_disc)), name='policy')

# initial_policy_id = (np.abs(-0.5 * state_space.squeeze() - action_space)).argmin(axis=0)
# initial_policy = action_space[initial_policy_id]

print('Grid size: {0}'.format(len(state_disc)))

### Define GP dynamics model

In [None]:
# Prior covariance over the 2-D input (state, action): product of a Matern-3/2
# kernel and an ARD linear kernel.
kernel = (GPflow.kernels.Matern32(2, lengthscales=1, active_dims=[0, 1]) *
          GPflow.kernels.Linear(2, variance=[0.2, 1], ARD=True))

# Observation noise variance; assigned to the GP likelihood below
noise_var = 0.01 ** 2

# Mean dynamics
# Linear prior model with coefficients [1, 0.1]; note that these differ from the
# true-system coefficients (a, b) defined further down.
mean_function = safe_learning.LinearSystem(([1, 0.1]), name='prior_dynamics')

# Hand-picked Lipschitz constants; their sum is passed to the Lyapunov object
mean_lipschitz = 0.8
gp_lipschitz = 0.5 # beta * np.sqrt(kernel.Mat32.variance) / kernel.Mat32.lengthscale * np.max(np.abs(state_limits))
lipschitz_dynamics = mean_lipschitz + gp_lipschitz

# Define one sample as the true dynamics
# NOTE(review): the disabled snippet below references `mf`, which is not defined
# anywhere in this notebook.
# true_dynamics = safe_learning.sample_gp_function(
#     kernel,
#     np.vstack([state_limits, action_limits]),
#     num_samples=20,
#     noise_var=noise_var,
#     mean_function=mf.f)

# Parameters of the true linear system (a, b) and of the quadratic cost (q, r).
# q and r are also used to build the reward in the reinforcement-learning cell.
a = 1.2
b = 1.
q = 1.
r = 0.5

true_dynamics = safe_learning.LinearSystem((a, b), name='true_dynamics')

# Define a GP model over the dynamics
# The GP starts without any data: zero rows of 2-D inputs and of 1-D outputs.
gp = GPflow.gpr.GPR(np.empty((0, 2), dtype=safe_learning.config.np_dtype),
                    np.empty((0, 1), dtype=safe_learning.config.np_dtype),
                    kernel,
                    mean_function=mean_function)
gp.likelihood.variance = noise_var

dynamics = safe_learning.GaussianProcess(gp, name='gp_dynamics')

# LQR solution for the true system; within this notebook it is only referenced
# from commented-out plotting code.
# NOTE(review): presumably k_opt is the feedback gain and s_opt the cost matrix -- confirm.
k_opt, s_opt = safe_learning.utilities.dlqr(a, b, q, r)

### Define Lyapunov function

In [None]:
# lyapunov_function = safe_learning.QuadraticFunction(np.array([[1]]))
# Lyapunov candidate: a triangulation on a 3-point grid over the state limits
# with vertex values [1, 0, 1].
# NOTE(review): presumably this yields V(x) = |x| on [-1, 1] -- confirm vertex placement.
lyapfun_disc = safe_learning.GridWorld(state_limits, 3)
lyapunov_function = safe_learning.Triangulation(lyapfun_disc, vertex_values=[1, 0, 1],
                                                name='lyapunov_function')

# lipschitz_lyapunov = np.max(np.abs(lyapunov_function.gradient(state_space)))
# Hard-coded Lipschitz constant of the Lyapunov function
lipschitz_lyapunov = 1.

# Bundle the state grid, Lyapunov function, GP dynamics, Lipschitz constants,
# discretization constant and policy into the Lyapunov analysis object.
lyapunov = safe_learning.Lyapunov(state_disc, lyapunov_function, dynamics,
                                  lipschitz_dynamics, lipschitz_lyapunov, tau,
                                  policy)

### Initial safe set 

In [None]:
# Declare every grid state with |x| < 0.05 as initially safe
lyapunov.initial_safe_set = np.abs(lyapunov.discretization.all_points.squeeze()) < 0.05

lyapunov.update_safe_set()
# Adapter with an (x, u, noise) signature for the plotting helper; the `noise`
# argument is ignored and the true dynamics are evaluated directly.
noisy_dynamics = lambda x, u, noise: true_dynamics(x, u)
plotting.plot_lyapunov_1d(lyapunov, noisy_dynamics, legend=True)

### Reinforcement learning for the mean dynamics

In [None]:
# mean_dynamics = dynamics.to_mean_function()

# Quadratic reward with block-diagonal weights (-q, -r) on (state, action)
reward = safe_learning.QuadraticFunction(linalg.block_diag(-q, -r), name='reward_function')

# Value function: a triangulation on a 21-point state grid, initialized to zero.
# NOTE(review): `project=True` presumably maps out-of-range states back onto the
# grid domain -- confirm against the Triangulation implementation.
rl_disc = safe_learning.GridWorld(state_limits, 21)
value_function = safe_learning.Triangulation(rl_disc,
                                             np.zeros(len(rl_disc)),
                                             project=True,
                                             name='value_function')

# Policy iteration operating on the shared `policy` and the GP dynamics model
rl = safe_learning.PolicyIteration(policy, dynamics, reward, value_function)

### Plot the dynamics

Note that the initial policy is identically zero.

In [None]:
# Cache for TensorFlow graph nodes; accessed through
# safe_learning.utilities.get_storage / set_storage by the helpers below.
_STORAGE = {}

@safe_learning.utilities.with_scope('get_safe_set')
def get_safe_sets(lyapunov, positive=True):
    """Classify every (safe state, action) pair on the joint grid.

    Each state currently in ``lyapunov.safe_set`` is combined with every action
    in ``action_disc``. For every pair, two conditions are evaluated:

    * whether the upper bound on the Lyapunov value of the next state
      (value at the predicted mean plus ``lipschitz_lyapunov * bound``) is
      below the largest Lyapunov value found in the safe set, and
    * whether ``lyapunov.v_decrease_bound`` is below ``lyapunov.threshold``.

    The graph nodes are built on the first call and cached in ``_STORAGE``.
    The placeholder values are written into ``lyapunov.feed_dict``.

    Parameters
    ----------
    lyapunov : safe_learning.Lyapunov
        The Lyapunov object that holds the safe set, dynamics and policy.
    positive : bool, optional
        If False, a pair additionally only counts as mapping inside when the
        grid index of its predicted mean next state is in the safe set.

    Returns
    -------
    maps_inside_total : ndarray of bool
        Array shaped like ``discretization.num_points``; True where the pair
        maps back into the level set. Rows of unsafe states stay False.
    decreases_total : ndarray of bool
        Same shape; True where the decrease condition holds.
    """
    safe_states = state_disc.index_to_state(np.where(lyapunov.safe_set))
    safe_actions = action_disc.all_points
    feed_dict = lyapunov.feed_dict

    # All combinations of safe states and actions, one pair per row
    state_actions = np.column_stack([arr.ravel() for arr in
                                     np.meshgrid(safe_states, safe_actions, indexing='ij')])
    safe_set = lyapunov.safe_set.reshape(state_disc.num_points)

    # Largest Lyapunov value inside the current safe set
    c_max = np.max(lyapunov.values[lyapunov.safe_set])

    # NOTE: keep the storage calls inside this function; the storage helpers
    # may key the cache on the calling function.
    storage = safe_learning.utilities.get_storage(_STORAGE, index=lyapunov)

    if storage is None:
        tf_state_actions = tf.placeholder(safe_learning.config.dtype,
                                          shape=[None, state_actions.shape[1]])
        tf_c_max = tf.placeholder(safe_learning.config.dtype, shape=())

        mean, bound = lyapunov.dynamics(tf_state_actions)
        values = lyapunov.lyapunov_function(mean) + lyapunov.lipschitz_lyapunov * bound
        maps_inside = tf.less(values, tf_c_max, name='maps_inside_levelset')

        dec = lyapunov.v_decrease_bound(tf_state_actions[:, :safe_states.shape[1]],
                                        (mean, bound))
        decreases = tf.less(dec, lyapunov.threshold)

        storage = [('tf_c_max', tf_c_max),
                   ('tf_state_actions', tf_state_actions),
                   ('maps_inside', maps_inside),
                   ('mean', mean),
                   ('decreases', decreases)]
        safe_learning.utilities.set_storage(_STORAGE, storage, index=lyapunov)
    else:
        tf_c_max, tf_state_actions, maps_inside, mean, decreases = storage.values()

    # Put placeholder values inside feed_dict
    feed_dict[tf_state_actions] = state_actions
    feed_dict[tf_c_max] = c_max

    # Evaluate
    maps_inside, mean, decreases = session.run([maps_inside, mean, decreases],
                                               feed_dict=feed_dict)

    # Add the mean safe set on top
    if not positive:
        # Flatten first so the in-place `&=` combines one flag per pair, as
        # get_safe_samples does before the same operation.
        maps_inside = maps_inside.ravel()
        next_state_index = lyapunov.discretization.state_to_index(mean)
        safe_in_expectation = lyapunov.safe_set[next_state_index]
        maps_inside &= safe_in_expectation

    # `np.bool` was only an alias of the builtin and has been removed from NumPy
    maps_inside_total = np.zeros(discretization.nindex, dtype=bool)
    maps_inside_total = maps_inside_total.reshape(discretization.num_points)
    decreases_total = np.zeros_like(maps_inside_total)

    # Scatter the per-pair results into the rows that belong to safe states
    maps_inside_total[safe_set, :] = maps_inside.reshape(len(safe_states), len(safe_actions))
    decreases_total[safe_set, :] = decreases.reshape(len(safe_states), len(safe_actions))

    return maps_inside_total, decreases_total


@safe_learning.utilities.with_scope('plot_lyapunov_2d')
def plot_things():
    """Plot the current safe set, model uncertainty, policy and Lyapunov function.

    The upper panel shows, over the joint state-action grid, the uncertainty
    bound returned by ``lyapunov.dynamics`` as a background image, the GP
    inputs collected so far, the two masks returned by ``get_safe_sets`` and
    the current policy. The lower panel shows the Lyapunov values, red inside
    the safe set and blue outside. Vertical red lines mark the smallest and
    largest safe state in both panels.

    The function reads the module-level ``lyapunov``, ``discretization``,
    ``state_limits``, ``action_limits`` and ``session`` and returns nothing.
    """
    fig, axes = plt.subplots(2, 2, figsize=(10, 10), gridspec_kw={'width_ratios': [30, 1]})

    # The right column only hosts the colorbar of the upper panel; hide the
    # unused axis next to the lower panel.
    cax, cax1 = axes[:, 1]
    cax1.set_visible(False)
    # Raw strings keep the LaTeX backslashes without invalid escape sequences
    cax.set_ylabel(r'Standard deviation $\sigma$')

    ax0, ax1 = axes[:, 0]
    ax0.set_ylabel('action')
    ax1.set_xlabel('state')
    ax1.set_ylabel(r'$v(\mathbf{x})$')

    ax1.set_ylim(0, np.max(lyapunov.values))
    ax1.set_xlim(state_limits.squeeze())
    ax0.set_xlim(state_limits.squeeze())
    ax0.set_ylim(action_limits.squeeze())
    ax0.set_xticks([])

    # Hide x-ticks of ax0
    plt.setp(ax0.get_xticklabels(), visible=False)

    # width between cax and main axis
    plt.subplots_adjust(wspace=.05)
    feed_dict = lyapunov.feed_dict

    states = lyapunov.discretization.all_points
    state_actions = discretization.all_points

    # Build the graph nodes once and reuse them on later calls.
    # NOTE: keep the storage calls inside this function; the storage helpers
    # may key the cache on the calling function.
    storage = safe_learning.utilities.get_storage(_STORAGE, index=lyapunov)
    if storage is None:
        actions = lyapunov.policy(states)
        next_states = lyapunov.dynamics(state_actions)

        storage = [('actions', actions),
                   ('next_states', next_states)]

        safe_learning.utilities.set_storage(_STORAGE, storage, index=lyapunov)
    else:
        actions, next_states = storage.values()

    mean, bound = session.run(next_states, feed_dict=feed_dict)

    # Background: uncertainty bound of the dynamics model over the joint grid
    img = ax0.imshow(bound.reshape(discretization.num_points).T,
                     origin='lower',
                     extent=discretization.limits.ravel(),
                     aspect='auto')

    # Mark the state-action inputs stored in the dynamics model
    ax0.plot(lyapunov.dynamics.X[:, 0],
             lyapunov.dynamics.X[:, 1], 'x')
    plt.colorbar(img, cax=cax)

    safe, safe_expanders = get_safe_sets(lyapunov)
    safe = safe.reshape(discretization.num_points)
    v_dec = safe_expanders.reshape(discretization.num_points)

    # Mask out the False entries so that only True cells are drawn
    safe_mask = np.ma.masked_where(~safe, safe)
    v_dec_mask = np.ma.masked_where(~v_dec, v_dec)

    # Overlay (white): pairs that map back into the level set
    ax0.imshow(safe_mask.T,
               origin='lower',
               extent=discretization.limits.ravel(),
               alpha=0.2,
               cmap=colors.ListedColormap(['white']),
               aspect='auto',
               vmin=0,
               vmax=1)

    # Overlay (red): pairs that satisfy the decrease condition
    ax0.imshow(v_dec_mask.T,
               origin='lower',
               extent=discretization.limits.ravel(),
               alpha=0.5,
               cmap=colors.ListedColormap(['red']),
               aspect='auto',
               vmin=0,
               vmax=1)

    is_safe = lyapunov.safe_set
    # Plot the Lyapunov function
    lyap_safe = np.ma.masked_where(~is_safe, lyapunov.values)
    lyap_unsafe = np.ma.masked_where(is_safe, lyapunov.values)

    # Plot lines for the boundary of the safety feature
    x_min_safe = np.min(states[is_safe])
    x_max_safe = np.max(states[is_safe])

    ax1.plot(states, lyap_safe, 'r')
    ax1.plot(states, lyap_unsafe, 'b')

    kw_axv = {'color': 'red',
              'alpha': 0.5}
    # ymin=-0.2 together with clip_on=False lets the line extend below the upper panel
    ax0.axvline(x=x_min_safe, ymin=-0.2, ymax=1, clip_on=False, **kw_axv)
    ax1.axvline(x=x_min_safe, ymin=0, ymax=1, clip_on=False, **kw_axv)

    ax0.axvline(x=x_max_safe, ymin=-0.2, ymax=1, clip_on=False, **kw_axv)
    ax1.axvline(x=x_max_safe, ymin=0, ymax=1, clip_on=False, **kw_axv)

    # Plot the current policy (evaluated into a separate name so the tensor
    # `actions` is not shadowed by its value)
    policy_actions = actions.eval(feed_dict=feed_dict)
    ax0.step(states, policy_actions, label='safe policy', alpha=0.5)

    ax0.legend()
    plt.show()

# optimize_safe_policy(lyapunov)
lyapunov.update_safe_set()
plot_things()

## Online learning
As we sample within this initial safe set, we gain more knowledge about the system. In particular, we iteratively select the state within the safe set, $\mathcal{S}_n$, where the dynamics are the most uncertain (highest variance).

In [None]:
# Candidate actions for the discrete policy optimization
action_space = action_disc.all_points
# Grid over one tenth of the action limits; get_safe_samples adds these values
# to the current policy's actions as local variations.
action_variation = safe_learning.GridWorld(np.array(action_limits) / 10, num_actions).all_points

# Build the value-function optimization op once, then alternate three rounds of
# value-function optimization and discrete policy optimization.
rl_opt_value_function = rl.optimize_value_function()
for i in range(3):
    rl_opt_value_function.eval(feed_dict=lyapunov.feed_dict)
    rl.discrete_policy_optimization(action_space)
    

@safe_learning.utilities.with_scope('sample_new_safe_point')
def get_safe_samples(lyapunov, positive=False):
    """Return the state-action pairs that are safe to sample, with their bounds.

    Every state in ``lyapunov.safe_set`` is paired with the current policy's
    action plus each offset in ``action_variation``, clipped to
    ``action_limits``. A pair is kept when the upper bound on the Lyapunov
    value of its next state is below the largest Lyapunov value in the safe
    set. Unless ``positive`` is True, the grid index of its predicted mean next
    state must also lie in the safe set.

    Graph nodes are created on the first call and cached in ``_STORAGE``.
    Placeholder values are written into ``lyapunov.feed_dict``.

    Parameters
    ----------
    lyapunov : safe_learning.Lyapunov
    positive : bool, optional
        If True, skip the check on the mean next state.

    Returns
    -------
    state_actions : ndarray
        Rows of (state, action) pairs that passed the checks.
    bound : ndarray
        The dynamics uncertainty bound for the returned rows.
    """
    level = np.max(lyapunov.values[lyapunov.safe_set])

    state_dim = state_disc.ndim
    action_dim = action_disc.ndim

    # NOTE: keep the storage calls inside this function; the storage helpers
    # may key the cache on the calling function.
    cached = safe_learning.utilities.get_storage(_STORAGE, index=lyapunov)

    if cached is not None:
        (ph_safe_states, policy_actions, ph_state_actions,
         ph_level, tf_mean, tf_bound, tf_inside) = cached.values()
    else:
        # Placeholders for the candidate pairs, the safe states and the level
        ph_state_actions = tf.placeholder(safe_learning.config.dtype,
                                          shape=[None, state_dim + action_dim])
        ph_safe_states = tf.placeholder(safe_learning.config.dtype,
                                        shape=[None, state_dim])
        ph_level = tf.placeholder(safe_learning.config.dtype, shape=())

        policy_actions = lyapunov.policy(ph_safe_states)
        tf_mean, tf_bound = lyapunov.dynamics(ph_state_actions)
        upper_values = (lyapunov.lyapunov_function(tf_mean) +
                        lyapunov.lipschitz_lyapunov * tf_bound)
        tf_inside = tf.less(upper_values, ph_level, name='maps_inside_levelset')

        safe_learning.utilities.set_storage(
            _STORAGE,
            [('tf_safe_states', ph_safe_states),
             ('tf_actions', policy_actions),
             ('tf_state_actions', ph_state_actions),
             ('tf_c_max', ph_level),
             ('mean', tf_mean),
             ('bound', tf_bound),
             ('maps_inside', tf_inside)],
            index=lyapunov)

    # States currently inside the safe set
    safe_states = state_disc.index_to_state(np.where(lyapunov.safe_set))

    feeds = lyapunov.feed_dict
    feeds[ph_safe_states] = safe_states

    # Actions the current policy takes at the safe states
    current_actions = policy_actions.eval(feed_dict=feeds)

    num_safe, _ = current_actions.shape
    num_offsets = len(action_variation)

    # Each safe state once per offset, paired with the correspondingly shifted action
    tiled_states = np.repeat(safe_states, num_offsets, axis=0)
    candidate_actions = (np.repeat(current_actions, num_offsets, axis=0) +
                         np.tile(action_variation, (num_safe, 1)))

    # Keep the shifted actions within the action limits (in place)
    np.clip(candidate_actions, action_limits[:, 0], action_limits[:, 1],
            out=candidate_actions)

    state_actions = np.column_stack((tiled_states, candidate_actions))

    lyapunov.feed_dict[ph_state_actions] = state_actions
    lyapunov.feed_dict[ph_level] = level

    inside, next_mean, next_bound = session.run([tf_inside, tf_mean, tf_bound],
                                                feed_dict=lyapunov.feed_dict)
    inside = inside.squeeze(-1)

    # Optionally require the mean next state to land in the safe set as well
    if not positive:
        next_index = lyapunov.discretization.state_to_index(next_mean)
        inside &= lyapunov.safe_set[next_index]

    return state_actions[inside, :], next_bound[inside]


# Graph nodes used to query the true dynamics at a single state-action pair
with tf.variable_scope('add_new_measurement'):
    tf_max_state_action = tf.placeholder(safe_learning.config.dtype,
                                         shape=[1, state_disc.ndim + action_disc.ndim])
    tf_measurement = true_dynamics(tf_max_state_action)


def update_gp():
    """Update the GP model based on an actively selected data point.

    One step consists of: a value-function and policy optimization round, an
    update of the safe set, selection of the safe state-action pair with the
    largest dynamics uncertainty, evaluation of the true dynamics at that
    pair, and insertion of the resulting data point into the GP.

    Raises
    ------
    ValueError
        If ``get_safe_samples`` returns no safe state-action pair.
    """
    rl_opt_value_function.eval(feed_dict=lyapunov.feed_dict)
    rl.discrete_policy_optimization(action_space)

    lyapunov.update_safe_set()
    safe_state_actions, dynamics_std = get_safe_samples(lyapunov)

    # np.argmax on an empty array raises an unspecific ValueError; fail with
    # the same exception type but a message that names the actual problem.
    if np.size(dynamics_std) == 0:
        raise ValueError('No safe state-action pair available for sampling.')

    # Select most uncertain state-action pair
    max_id = np.argmax(dynamics_std)
    max_state_action = safe_state_actions[[max_id], :]

    # Add newly obtained data point to the GP
    lyapunov.feed_dict[tf_max_state_action] = max_state_action
    measurement = tf_measurement.eval(feed_dict=lyapunov.feed_dict)

    lyapunov.dynamics.add_data_point(max_state_action, measurement)

    # Optimize the policy for plotting
#     optimize_safe_policy(lyapunov)

update_gp()
plot_things()

In [None]:
# Collect 20 more actively selected data points
for i in range(20):
    update_gp()
    
# Recompute the safe set with the data gathered above before plotting
lyapunov.update_safe_set()
plot_things()

In [None]:
# Display the default TensorFlow graph built by the cells above
plotting.show_graph(tf.get_default_graph())

### TODO: Only optimize for performance when safe!

In [None]:
# def safety_value(param):
#     """Try to minimize the bound on Delta_V."""
#     policy.parameters = param
    
#     x = lyapunov.discretization
#     u = policy(x)
#     prediction = lyapunov.dynamics(x, u)

# #     if lyapunov.uncertain_dynamics:
# #         v_dot, v_dot_error = lyapunov.v_decrease_confidence(x, prediction)
# #         # Upper bound on V_dot
# #         v_dot_bound = v_dot + v_dot_error
# #     else:
# #         v_dot_bound, _ = lyapunov.v_decrease_confidence(x, prediction)
#     v_dot_bound = lyapunov.v_decrease_bound(x, prediction)

#     return np.sum(v_dot_bound)

# from scipy.optimize import minimize

In [None]:
# res = minimize(safety_value, policy.parameters.copy())
# policy.parameters = res.x
# # print(res)

In [None]:
# lyapunov.policy = policy(lyapunov.discretization)

In [None]:
# plot_things()

# Now that we have the best 'safe' policy, let's find the highest performing one 

In [None]:
# rl.policy = policy(rl.state_space)
# rl.optimize_value_function()
# plt.plot(rl.values)

# def get_safe(policy):    
#     x = lyapunov.discretization
#     u = policy(x)
#     prediction = lyapunov.dynamics(x, u)
#     v_dot_bound = lyapunov.v_decrease_bound(lyapunov.discretization, prediction)

#     return v_dot_bound < lyapunov.threshold - 1e-5

# # current safe states
# safe_states = lyapunov.discretization[get_safe(policy), :]

# def constraint(param):
#     policy.parameters = param
    
#     x = safe_states
#     u = policy(x)
#     prediction = lyapunov.dynamics(x, u)        
#     v_dot_bound = lyapunov.v_decrease_bound(x, prediction)

#     return np.min(-v_dot_bound + lyapunov.threshold)
    
# def fun(param):
#     x = rl.state_space
#     policy.parameters = param
#     u = policy(x)
    
#     reward = rl.reward_function(x, u, x)
#     res = np.sum(reward + rl.value_function(rl.dynamics(x, u)))
#     return -res
    
# def gradient(param):
#     x = rl.state_space
#     policy.parameters = param
    
#     u = policy(x)
    
#     v_grad = rl.value_function.gradient(rl.dynamics(x, u))
#     d_grad = rl.dynamics.gradient(x, u)[:, 1:]
#     p_grad = policy.parameter_derivative(x).toarray()
    
#     r_grad = rl.reward_function.gradient(x, u)[:, 1:]
        
#     gradient = (r_grad + v_grad * d_grad) * p_grad
#     res = np.sum(gradient, axis=0)
#     return -res

# constraint(policy.parameters)
# res = minimize(fun, policy.parameters, jac=gradient, constraints=({'type': 'ineq', 'fun': constraint}), options={'maxiter': 500})

# policy.parameters = res.x

In [None]:
# lyapunov.policy = policy(lyapunov.discretization)
# plot_things()

In [None]:
# x = policy.all_points

# plt.plot(x, policy(x), label='current')
# plt.plot(x, -k_opt * x, label='opt')
# plt.legend()
# plt.show()

In [None]:
import numpy as np

In [None]:
# Scratch example: a small state-action array built by hand.
# The builtin `float` replaces the `np.float` alias, which was removed from NumPy.
states = np.arange(5)[:, None].astype(float)

actions = 10 * np.linspace(0.1, 1, 5, dtype=float).reshape(5, 1)

# Pair each state with the action in the same row. Without this definition
# `state_actions` does not exist at module scope and the print below fails
# with a NameError on a fresh kernel.
state_actions = np.column_stack((states, actions))

print(state_actions)

In [None]:
# View every row of `state_actions` as one opaque item of
# (itemsize * number of columns) bytes, so np.unique works on whole rows.
# The view gets its own name instead of `b`, which already holds the input
# gain of the true dynamics defined earlier in the notebook.
row_view = np.ascontiguousarray(state_actions).view(
    np.dtype((np.void, state_actions.dtype.itemsize * state_actions.shape[1])))
_, idx = np.unique(row_view, return_index=True)

# Rows of `state_actions` at the first occurrence of each distinct row
state_actions[idx]

In [None]:
# Interactive IPython help query (not plain Python); leftover from exploration
np.void?

In [None]:
# Lower action bounds. The bare name `limits` is not defined at module scope;
# the clipping code in get_safe_samples uses `action_limits` for this lookup.
action_limits[:, 0]