# Stability Verification for a Cart-Pole System

In [None]:
from __future__ import division, print_function

import numpy as np
import tensorflow as tf
import gpflow
from scipy.linalg import block_diag
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
%matplotlib inline

import safe_learning
from utilities import sample_box, compute_closedloop_response, CartPole, get_max_parameter_change

# Progress bars are optional: without tqdm, iterate over the plain iterable.
try:
    from tqdm import tqdm
except ImportError:
    def tqdm(iterable):
        """Return `iterable` unchanged (stand-in when tqdm is not installed)."""
        return iterable

# Numeric types taken from the safe_learning configuration
np_dtype = safe_learning.config.np_dtype
tf_dtype = safe_learning.config.dtype

# When this cell is re-run, close the session left over from the previous run.
# A NameError simply means that no session has been created yet.
try:
    session.close()
except NameError:
    pass
session = tf.InteractiveSession()
initialized = False

## Dynamics

In [None]:
# Parameters of the true system
m = 0.1     # pendulum mass
M = 0.5     # cart mass
L = 0.3     # pole length

# Constants
dt = 0.01   # sampling time
g = 9.81    # gravity

# State and action normalizers
# NOTE: u_max is computed here, i.e. from the masses of the true system.
x_max = 5.
theta_max = np.deg2rad(20)
x_dot_max = 2.
theta_dot_max = np.deg2rad(5)
u_max = (m + M)*x_dot_max / (10*dt)

state_norm = (x_max, theta_max, x_dot_max, theta_dot_max)
action_norm = (u_max,)

# Define true system and dynamics
true_cart_pole = CartPole(m, M, L, dt, [state_norm, action_norm])
A_true, B_true = true_cart_pole.linearize()
true_dynamics = safe_learning.LinearSystem((A_true, B_true), name='true_dynamics')

# "Wrong" system: deliberately mis-specified parameters that serve as the
# prior (mean) model. The names m, M and L are rebound from here on.
m = 0.2     # pendulum mass
M = 0.8     # cart mass
L = 0.25    # pole length
cart_pole = CartPole(m, M, L, dt, [state_norm, action_norm])
# Linearize the mis-specified model itself. Linearizing `true_cart_pole` here
# would make (A, B) identical to (A_true, B_true), so the prior mean would
# already be the true system and the model mismatch would vanish.
A, B = cart_pole.linearize()
mean_dynamics = safe_learning.LinearSystem((A, B), name='mean_dynamics')

## Rewards

In [None]:
# State cost: unit weight on each of the four state components
Q = np.diag(np.ones(4))

# Action cost: unit weight on every action component
R = np.eye(cart_pole.action_dim)

# Quadratic reward, i.e. the negated cost, built from the blocks -Q and -R
reward_function = safe_learning.QuadraticFunction(block_diag(-Q, -R), name='reward_function')

## Discretization

In [None]:
# Normalized box [-1, 1] along every state and action dimension
state_limits = np.tile([-1., 1.], (cart_pole.state_dim, 1))
action_limits = np.tile([-1., 1.], (cart_pole.action_dim, 1))

# Uniform grid with 51 points per state dimension
num_states = [51] * cart_pole.state_dim
state_discretization = safe_learning.GridWorld(state_limits, num_states)

# Discretization constant: smallest entry of the grid's `unit_maxes`
tau = np.min(state_discretization.unit_maxes)

print('Grid size: {}'.format(state_discretization.nindex))
print('Discretization constant: {}'.format(tau))

# tau = 0.002 for inverted pendulum!

## GP Model

In [None]:
# Stack [A, B] so that row i holds the linear model of state component i.
# NOTE: this rebinds `m`, which held the pendulum mass in the dynamics cell.
m_true = np.hstack((A_true, B_true))
m = np.hstack((A, B))

# Prior variance per coefficient: squared mismatch between the two models,
# floored at 1e-3 so that some non-zero prior variance is always maintained.
variances = np.clip((m_true - m) ** 2, 1e-3, None)

noise_var = 0.001 ** 2
full_dim = cart_pole.state_dim + cart_pole.action_dim


def _velocity_kernel(prior):
    """Build the kernel shared by the x_dot and theta_dot outputs.

    `prior` is the row of `variances` that belongs to the output. The result
    is a linear ARD kernel over all inputs plus the sum of a linear term in
    theta (input 1), a degree-2 polynomial term in theta_dot (input 3) and a
    linear term in u (input 4), multiplied by a Matern32 kernel on theta.
    """
    linear_part = gpflow.kernels.Linear(full_dim, variance=prior, ARD=True)
    theta_term = gpflow.kernels.Linear(1, variance=prior[1], active_dims=[1])
    theta_dot_term = gpflow.kernels.Polynomial(1, degree=2, variance=prior[3], active_dims=[3])
    u_term = gpflow.kernels.Linear(1, variance=prior[4], active_dims=[4])
    modulation = gpflow.kernels.Matern32(1, lengthscales=1, active_dims=[1])
    return linear_part + (theta_term + theta_dot_term + u_term) * modulation


def _row_mean_function(row, name):
    """Return the prior mean of one output: row `row` of (A, B) as a LinearSystem."""
    return safe_learning.LinearSystem((A[[row], :], B[[row], :]), name=name)


def _empty_gp(kernel, mean_function):
    """Create a cached GP regression model that holds no data points yet."""
    no_inputs = np.empty((0, full_dim), dtype=np_dtype)
    no_targets = np.empty((0, 1), dtype=np_dtype)
    return safe_learning.GPRCached(no_inputs, no_targets, kernel, mean_function=mean_function)


# Kernels
# TODO choose variances
# TODO add non-linearities to x and theta kernels?
kernel_x = gpflow.kernels.Linear(full_dim, variance=variances[0, :], ARD=True)
kernel_theta = gpflow.kernels.Linear(full_dim, variance=variances[1, :], ARD=True)
kernel_x_dot = _velocity_kernel(variances[2, :])
kernel_theta_dot = _velocity_kernel(variances[3, :])

# Mean dynamics, one single-output linear system per state component
mean_function_x = _row_mean_function(0, 'mean_dynamics_x')
mean_function_theta = _row_mean_function(1, 'mean_dynamics_theta')
mean_function_x_dot = _row_mean_function(2, 'mean_dynamics_x_dot')
mean_function_theta_dot = _row_mean_function(3, 'mean_dynamics_theta_dot')

# Define a GP model over the dynamics, one independent GP per state component
gp_x = _empty_gp(kernel_x, mean_function_x)
gp_theta = _empty_gp(kernel_theta, mean_function_theta)
gp_x_dot = _empty_gp(kernel_x_dot, mean_function_x_dot)
gp_theta_dot = _empty_gp(kernel_theta_dot, mean_function_theta_dot)

# All outputs share the same measurement noise variance
gp_x.likelihood.variance = noise_var
gp_theta.likelihood.variance = noise_var
gp_x_dot.likelihood.variance = noise_var
gp_theta_dot.likelihood.variance = noise_var

# TODO how to compute beta?
beta = 2.
gp_x_fun = safe_learning.GaussianProcess(gp_x, beta)
gp_theta_fun = safe_learning.GaussianProcess(gp_theta, beta)
gp_x_dot_fun = safe_learning.GaussianProcess(gp_x_dot, beta)
gp_theta_dot_fun = safe_learning.GaussianProcess(gp_theta_dot, beta)

# Stack GP functions => block-diagonal kernel matrix
dynamics = safe_learning.FunctionStack((gp_x_fun, gp_theta_fun, gp_x_dot_fun, gp_theta_dot_fun))

## Lyapunov Function

In [None]:
# Fix policy to the LQR solution for the "wrong" system: the controller is
# designed on the nominal model (A, B), not on the true dynamics.
K, P = safe_learning.utilities.dlqr(A, B, Q, R)
policy = safe_learning.LinearSystem(-K, name='policy')
policy = safe_learning.Saturation(policy, -1, 1)

# Define the Lyapunov function corresponding to the known policy.
# With V(x) = x^T P x (QuadraticFunction is used the same way for the reward
# matrix), the gradient is 2 P x. The sign does not affect L_V below, which
# takes absolute values.
lyapunov_function = safe_learning.QuadraticFunction(P)
grad_lyapunov_function = safe_learning.LinearSystem((2*P,))


# Lipschitz constants
def L_pol(s):
    """Return the induced 1-norm of the feedback gain as a constant tensor.

    The argument `s` is accepted for interface compatibility and is unused.
    """
    return tf.constant(np.linalg.norm(-K, 1), dtype=tf_dtype)


def L_dyn(s):
    """Return ||A_true||_1 + ||B_true||_1 * L_pol(s).

    NOTE(review): this uses the true system matrices, as in the inverted
    pendulum example; confirm that this is intended.
    """
    return np.linalg.norm(A_true, 1) + np.linalg.norm(B_true, 1)*L_pol(s)


# Approximate Lipschitz constant for value function over [-1, 1]**d
# L_V = 1.01*np.max(np.abs(session.run(lyapunov_function(states), {states: state_discretization.all_points})))
# L_V = lambda s: tf.norm(grad_lyapunov_function(s), ord=1, axis=1, keep_dims=True)
def L_V(s):
    """Return, per row of `s`, the largest absolute gradient component of V."""
    return tf.reduce_max(tf.abs(grad_lyapunov_function(s)), axis=1, keep_dims=True)


lyapunov = safe_learning.Lyapunov(state_discretization, lyapunov_function, dynamics, L_dyn, L_V, tau, policy)

In [None]:
# Placeholder for batches of states and the policy action evaluated on them
tf_states = tf.placeholder(tf_dtype, shape=[None, cart_pole.state_dim], name='states')
tf_actions = policy(tf_states)

n_points = [51, 51]
x_fix, theta_fix, x_dot_fix, theta_dot_fix = [0, 0, 0, 0]

fig = plt.figure(figsize=(12, 5), dpi=100)
fig.subplots_adjust(wspace=0.1, hspace=0.2)
# An imaginary step tells mgrid to produce that many points, end points
# included. Use the builtin `complex`: the `np.complex` alias was deprecated
# in NumPy 1.20 and removed in NumPy 1.24.
xx, yy = np.mgrid[-1:1:complex(0, n_points[0]), -1:1:complex(0, n_points[1])]

# Fix x_dot and theta_dot, plot the control over x and theta
grid = np.column_stack((xx.ravel(), yy.ravel(),
                        x_dot_fix*np.ones_like(xx.ravel()), theta_dot_fix*np.ones_like(yy.ravel())))
control = session.run(tf_actions, feed_dict={tf_states: grid}).reshape(n_points)
ax = fig.add_subplot(1, 2, 1, projection='3d')
ax.plot_surface(xx, yy, control, alpha=0.75)
ax.set_title(r'Control, $\dot{x} = %.3g,\ \dot{\theta} = %.3g$' % (x_dot_fix, theta_dot_fix), fontsize=16)
ax.set_xlabel(r'$x$', fontsize=14)
ax.set_ylabel(r'$\theta$', fontsize=14)
ax.set_zlabel(r'$u$', fontsize=14)
# ax.view_init(elev=20., azim=15.)

# Fix x and theta, plot the control over x_dot and theta_dot
grid = np.column_stack((x_fix*np.ones_like(xx.ravel()), theta_fix*np.ones_like(yy.ravel()),
                        xx.ravel(), yy.ravel()))
control = session.run(tf_actions, feed_dict={tf_states: grid}).reshape(n_points)
ax = fig.add_subplot(1, 2, 2, projection='3d')
ax.plot_surface(xx, yy, control, alpha=0.75)
ax.set_title(r'Control, $x = %.3g,\ \theta = %.3g$' % (x_fix, theta_fix), fontsize=16)
ax.set_xlabel(r'$\dot{x}$', fontsize=14)
ax.set_ylabel(r'$\dot{\theta}$', fontsize=14)
ax.set_zlabel(r'$u$', fontsize=14)
# ax.view_init(elev=20., azim=100.)

plt.show()

## Initial Safe Set

In [None]:
# Set initial safe set (level set) based on Lyapunov function:
# evaluate the Lyapunov function on every grid point and keep the points whose
# value is at most 1% of the largest value found on the grid.
values = session.run(lyapunov_function(tf_states), {tf_states: state_discretization.all_points})
cutoff = 0.01 * np.max(values)

# Boolean mask over the grid points; `values` has one column, hence the squeeze
lyapunov.initial_safe_set = np.squeeze(values, axis=1) <= cutoff
lyapunov.update_safe_set()

# Read back the level c_max stored in the feed dict and remember the size of
# the initial safe set so that later cells can report its growth.
c_max = lyapunov.feed_dict[lyapunov.c_max]
safe_set_size_init = np.sum(lyapunov.safe_set)

print('Cutoff: {}'.format(cutoff))
print('c_max: {}'.format(c_max))
print('Safe set size: {}'.format(safe_set_size_init))

# TODO debugging
# Manual recomputation of a decrease-type check on the safe states only.
safe_states = state_discretization.all_points[lyapunov.safe_set, :]
# NOTE(review): the second output is named "var" and is summed over axis 1 as an
# error bound; whether it is a variance or a scaled confidence width is
# not visible here -- confirm against safe_learning.
tf_mean, tf_var = lyapunov.dynamics(tf_states, tf_actions)
tf_bound = tf.reduce_sum(tf_var, axis=1, keep_dims=True)

# Lipschitz terms: dynamics at the current states, Lyapunov function at the
# predicted mean next states
tf_lf = lyapunov.lipschitz_dynamics(tf_states)
tf_lv = lyapunov.lipschitz_lyapunov(tf_mean)
tf_error = tf_lv * tf_lf * tf_bound
# Lyapunov value at the mean prediction plus the error term above
tf_mean_vals = lyapunov.lyapunov_function(tf_mean) 
tf_values = tf_mean_vals + tf_error

# True where that sum stays strictly below c_max
tf_maps_inside = tf.less(tf_values, lyapunov.c_max)

# Evaluate everything on the safe states; the results are inspected in the next cell
lyapunov.feed_dict.update({tf_states: safe_states})
mean, var, bound, lf, lv, error, mean_vals, vals, maps_inside = session.run([tf_mean, tf_var, tf_bound, tf_lf, tf_lv, tf_error, 
                                                                  tf_mean_vals, tf_values, tf_maps_inside], 
                                                                 lyapunov.feed_dict)

In [None]:
# Flatten the single-column mask of safe states that pass the check above
idx = np.squeeze(maps_inside, axis=1)
bound_safe = bound[idx]
print(idx)
print(idx.sum())
print(bound.shape)

print('Policy Lipschitz constant (L_pi): {}'.format(L_pol(0).eval()))
print('Dynamics Lipschitz constant (L_f): {}'.format(L_dyn(0).eval()))

# Gradient of the Lyapunov function and its Lipschitz bound on the safe states
# (this rebinds `lv` from the previous cell)
dV = session.run(grad_lyapunov_function(tf_states), feed_dict={tf_states: safe_states})
lv = session.run(L_V(tf_states), feed_dict={tf_states: safe_states})

print(mean)
print(error)
print(c_max)
print(np.max(mean_vals))
print(vals)

# From inverted pendulum:
# a_true, b_true = true_dynamics.linearize()
# lipschitz_dynamics = lambda x: np.max(np.abs(a_true)) + np.max(np.abs(b_true)) * lipschitz_policy(x)


In [None]:
def plot_safe_set(lyapunov, show=True):
    """Plot two 2D slices of the safe set for a given Lyapunov function.

    The left panel shows the (x, theta) slice at zero velocities, the right
    panel the (x_dot, theta_dot) slice at zero positions.

    Parameters
    ----------
    lyapunov : object
        Must provide `safe_set` (boolean mask over the grid points) and
        `discretization` (grid exposing `all_points` and `limits`).
    show : bool, optional
        If True (default), call `plt.show()` once both panels are drawn.

    Notes
    -----
    The slice shapes are taken from the notebook-level `num_states`. The grid
    needs a point at zero along every axis; otherwise the slices are empty
    and the reshape fails.
    """
    grid = lyapunov.discretization
    # Fetch the grid points once instead of once per mask.
    points = grid.all_points

    # Grid coordinates are floats, so test for zero with a tolerance instead
    # of relying on exact equality.
    zero_x = np.isclose(points[:, 0], 0.)
    zero_theta = np.isclose(points[:, 1], 0.)
    zero_x_dot = np.isclose(points[:, 2], 0.)
    zero_theta_dot = np.isclose(points[:, 3], 0.)

    pos_set = np.logical_and(zero_x_dot, zero_theta_dot)
    vel_set = np.logical_and(zero_x, zero_theta)
    safe_pos = lyapunov.safe_set[pos_set].reshape(num_states[:2])
    safe_vel = lyapunov.safe_set[vel_set].reshape(num_states[2:])

    fig = plt.figure(figsize=(10, 5), dpi=100)
    fig.suptitle('Safe Set')
    fig.subplots_adjust(wspace=0.5, hspace=0.5)

    # (subplot position, slice, title, x label, y label, axis limits)
    panels = (
        # Safe positions set, with zero velocities
        (121, safe_pos, r'$\dot{x} = \dot{\theta} = 0$',
         r'$x$', r'$\theta$', grid.limits[:2, :]),
        # Safe velocities set, with zero positions
        (122, safe_vel, r'$x = \theta = 0$',
         r'$\dot{x}$', r'$\dot{\theta}$', grid.limits[2:, :]),
    )
    for position, safe_slice, title, xlabel, ylabel, limits in panels:
        ax = fig.add_subplot(position)
        ax.set_title(title)
        ax.set_xlabel(xlabel)
        ax.set_ylabel(ylabel)
        # Transpose so that the first state of the slice runs along the x-axis
        im = ax.imshow(safe_slice.T,
                       origin='lower',
                       extent=limits.ravel(),
                       vmin=0,
                       vmax=1)
        fig.colorbar(im)

#     if isinstance(lyapunov.dynamics, safe_learning.UncertainFunction):
#         X = lyapunov.dynamics.functions[0].X
#         plt.plot(X[:, 0], X[:, 1], 'rx')

    if show:
        plt.show()

# Show the safe set before any measurement has been added to the GP model
plot_safe_set(lyapunov)

## Online Learning and Exploration

In [None]:
# Action offsets handed to the safe sampling routine
# (presumably perturbations around the policy action -- verify in safe_learning)
action_variation = np.array([[-0.01], [0.], [0.01]], dtype=np_dtype)
# action_variation = np.array([[0.]], dtype=np_dtype)

# Graph nodes that evaluate the true dynamics at a single state-action pair
with tf.name_scope('add_new_measurement'):
    full_dim = cart_pole.state_dim + cart_pole.action_dim
    tf_max_state_action = tf.placeholder(tf_dtype, shape=(1, full_dim))
    tf_measurement = true_dynamics(tf_max_state_action)

# History of the sampled state-action pairs, one row per sample (empty at first)
accepted_samples = np.empty((0, full_dim))
        
def update_gp(accepted_samples):
    """Add one actively selected measurement to the GP dynamics model.

    A state-action pair is requested from `safe_learning.get_safe_sample`,
    the true dynamics are evaluated at it, and the resulting data point is
    added to `lyapunov.dynamics`.

    Parameters
    ----------
    accepted_samples : ndarray
        Previously sampled state-action pairs, one per row.

    Returns
    -------
    ndarray
        `accepted_samples` with the new state-action pair appended as a row.
    """
    # Pick the next state-action pair to evaluate
    sample, _ = safe_learning.get_safe_sample(lyapunov,
                                              action_variation,
                                              action_limits,
                                              positive=True,
                                              num_samples=1000)

    # Evaluate the true dynamics at that pair
    lyapunov.feed_dict[tf_max_state_action] = sample
    observation = tf_measurement.eval(feed_dict=lyapunov.feed_dict)

    history = np.concatenate((accepted_samples, sample))

    # Condition the GP dynamics on the new observation
    lyapunov.dynamics.add_data_point(sample, observation)

    return history

In [None]:
# Collect a fixed number of measurements; the safe set is refreshed only once
# after all of them have been added.
max_iters = 10
for i in tqdm(range(max_iters)):
    accepted_samples = update_gp(accepted_samples)

# Recompute the safe set with the updated model and report how much it grew
lyapunov.update_safe_set()
print('Safe set size: {}'.format(lyapunov.safe_set.sum()))
print('Growth: {}'.format(lyapunov.safe_set.sum() - safe_set_size_init))

In [None]:
# Sampled state-action pairs, followed by the unsaturated linear feedback -K x
# evaluated at the sampled states (all columns except the last)
print(accepted_samples)
print(np.dot(accepted_samples[:, :-1], -K.T))

# Plot new safe set
plot_safe_set(lyapunov)

In [None]:
# Graph for the Lyapunov decrease bound and its threshold under the policy
tf_next_states = lyapunov.dynamics(tf_states, tf_actions)
tf_decrease = lyapunov.v_decrease_bound(tf_states, tf_next_states)
tf_threshold = lyapunov.threshold(tf_states)
tf_negative = tf.squeeze(tf.less(tf_decrease, tf_threshold), axis=1)

# Evaluate at the states of the collected samples (drop the action column)
lyapunov.feed_dict[tf_states] = accepted_samples[:, :-1]
decrease, threshold, negative = session.run([tf_decrease, tf_threshold, tf_negative],
                                            feed_dict=lyapunov.feed_dict)

print(decrease)
print(threshold)
print(negative)