# Stability Verification for a Cart-Pole System

In [None]:
%%javascript
// Replace the output area's scroll check with a function that always
// returns false; presumably this keeps long cell outputs fully expanded
// instead of collapsing them into a scrollable box.
IPython.OutputArea.prototype._should_scroll = function(lines) {
    return false;
}

In [None]:
from __future__ import division, print_function

import numpy as np
import tensorflow as tf
import gpflow
from scipy.linalg import block_diag
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
%matplotlib inline

import safe_learning
from utilities import CartPole, debug, visuals

# tqdm is optional: fall back to an identity wrapper when it is missing
try:
    from tqdm import tqdm
except ImportError:
    tqdm = lambda x: x
    
# Numerical types taken from the safe_learning configuration; np_dtype is
# used for numpy arrays and tf_dtype for TensorFlow placeholders below
np_dtype = safe_learning.config.np_dtype
tf_dtype = safe_learning.config.dtype

# Close the session left over from an earlier run of this cell, if any,
# before opening a new one. On the first run `session` does not exist yet
# and the resulting NameError is ignored on purpose.
try:
    session.close()
except NameError:
    pass
session = tf.InteractiveSession()

# TODO debug flags ***************************************#

# Display settings for pandas/numpy output; numpy warnings for division by
# zero and invalid floating-point operations are silenced.
import pandas
pandas.options.display.float_format = '{:,.4f}'.format
pandas.set_option('expand_frame_repr', False)
np.set_printoptions(precision=4)
np.seterr(divide='ignore', invalid='ignore')

# Saturate the action so that it lies in [-1, 1]
SATURATE = True

# Use the true physical parameters in the GP model
USE_TRUE_PARAMETERS = True

# Use the linearized discrete-time model as the true underlying dynamics
USE_LINEAR_DYNAMICS = True

# Use a threshold of zero when checking for stability
USE_ZERO_THRESHOLD = True

# Use purely linear (ARD) kernels for the GPs; otherwise each kernel is a
# linear kernel plus a product of linear/Matern32 terms
USE_LINEAR_KERNELS = True

# Use the element-wise absolute value of the Lyapunov gradient as L_v (one
# value per state dimension); otherwise use the 1-norm of the gradient
USE_LIPSCHITZ_SCALING = True

# Scaling factor for GP confidence intervals
BETA = 2.

# Optimize the GP hyperparameters on sampled data before safe learning
TRAIN_HYPERPARAMETERS = False

# Value passed as the `scaling` argument of every GPRCached model
GP_SCALING = 1e3

# Measurement noise variance, assigned to each GP's likelihood variance
NOISE_VAR = 0.0001 ** 2

#******************************************************#


## Dynamics

In [None]:
# Constants
dt = 0.01   # sampling time
g = 9.81    # gravity

# Alternative parameter set, kept for reference (disabled)
# m = 0.1
# M = 1.5
# L = 0.5
# b = 0.0
# x_max = 0.25
# theta_max = np.deg2rad(20)
# v_max = 0.5
# omega_max = np.deg2rad(10)
# u_max = (m + M)*v_max / (5*dt)

# True system parameters
m = 0.175    # pendulum mass
M = 1.732    # cart mass
L = 0.28     # pole length
b = 0.0      # rotational friction

# State and action normalizers
# (omega_max uses the true pole length L assigned just above)
x_max = 0.5
theta_max = np.deg2rad(30)
v_max = 1. # 5.
omega_max = 0.25*np.sqrt(g / L)
u_max = 190

state_norm = (x_max, theta_max, v_max, omega_max)
action_norm = (u_max, )

# Constraints for initial 'safe' states
# NOTE(review): in the visible cells these are only referenced by a
# commented-out definition of `safe_norm` in the Lyapunov section.
x_safe = x_max
theta_safe = np.deg2rad(8)
v_safe = 0.1
omega_safe = np.deg2rad(6)

# Dimensions and domains
# (states and actions are handled in normalized coordinates, [-1, 1] each)
state_dim = 4
action_dim = 1
state_limits = np.array([[-1., 1.]]*state_dim)
action_limits = np.array([[-1., 1.]]*action_dim)

# True system
true_cartpole = CartPole(m, M, L, b, dt, [state_norm, action_norm])
A_true, B_true = true_cartpole.linearize()

# NOTE(review): LinearSystem is reached via `safe_learning.functions` here
# but via `safe_learning` directly further down; presumably the same class
# -- confirm and use one spelling.
if USE_LINEAR_DYNAMICS:
    true_dynamics = safe_learning.functions.LinearSystem((A_true, B_true), name='true_dynamics')
else:
    true_dynamics = true_cartpole.__call__

# "Wrong" system
# NOTE: m, M, L and b are rebound here, so from this point on they hold the
# parameters of the "wrong" model, not those of the true system.
m = 0.2     # pendulum mass
M = 1.5     # cart mass
L = 0.25    # pole length
b = 0.0     # rotational friction
cartpole = CartPole(m, M, L, b, dt, [state_norm, action_norm])
A, B = cartpole.linearize()

# With USE_TRUE_PARAMETERS the prior mean model equals the true linearization
if USE_TRUE_PARAMETERS:
    A = A_true
    B = B_true
mean_dynamics = safe_learning.LinearSystem((A, B), name='mean_dynamics')

In [None]:
# Show the normalization constants defined for states and actions
print(state_norm)
print(action_norm)

## GP Model

In [None]:
# Stack [A, B] for the true model and for the prior mean model; the squared
# element-wise mismatch is used below as the `variance` of the kernels
# (row i belongs to output dimension i).
# NOTE: `m` held the pendulum mass before and is rebound to a matrix here.
m_true = np.hstack((A_true, B_true))
m = np.hstack((A, B))
variances = (m_true - m) ** 2

# Make sure at least some non-zero prior variance is maintained
# (in-place clip with a lower bound of 1e-3 and no upper bound)
np.clip(variances, 1e-3, None, out=variances)

# Measurement noise
noise_var = NOISE_VAR

# Input to GP is of the form (x,u)
full_dim = state_dim + action_dim

# Kernels
# One kernel per output dimension (x, theta, v, omega). Row i of `variances`
# provides the variances of the kernel for output i. In the non-linear
# variant the extra term is (Linear(theta) + Matern32(omega) + Linear(u))
# multiplied by a Matern32 kernel on theta; input columns 1, 3 and 4 are
# theta, omega and u as labelled on the individual lines.
if USE_LINEAR_KERNELS:
    kernel_x = gpflow.kernels.Linear(full_dim, variance=variances[0, :], ARD=True)
    
    kernel_theta = gpflow.kernels.Linear(full_dim, variance=variances[1, :], ARD=True)

    kernel_v = gpflow.kernels.Linear(full_dim, variance=variances[2, :], ARD=True)

    kernel_omega = gpflow.kernels.Linear(full_dim, variance=variances[3, :], ARD=True)

else:
    kernel_x = (gpflow.kernels.Linear(full_dim, variance=variances[0, :], ARD=True)
                + (  gpflow.kernels.Linear(1, variance=variances[0, 1], active_dims=[1])                   # theta
                   + gpflow.kernels.Matern32(1, variance=variances[0, 3], lengthscales=1, active_dims=[3]) # omega
                   + gpflow.kernels.Linear(1, variance=variances[0, 4], active_dims=[4])                   # u
                 ) * gpflow.kernels.Matern32(1, lengthscales=1, active_dims=[1]) 
    )

    kernel_theta = (gpflow.kernels.Linear(full_dim, variance=variances[1, :], ARD=True)
                    + (  gpflow.kernels.Linear(1, variance=variances[1, 1], active_dims=[1])                   # theta
                       + gpflow.kernels.Matern32(1, variance=variances[1, 3], lengthscales=1, active_dims=[3]) # omega
                       + gpflow.kernels.Linear(1, variance=variances[1, 4], active_dims=[4])                   # u
                     ) * gpflow.kernels.Matern32(1, lengthscales=1, active_dims=[1])
    )

    kernel_v = (gpflow.kernels.Linear(full_dim, variance=variances[2, :], ARD=True) 
                + (  gpflow.kernels.Linear(1, variance=variances[2, 1], active_dims=[1])                   # theta
                   + gpflow.kernels.Matern32(1, variance=variances[2, 3], lengthscales=1, active_dims=[3]) # omega
                   + gpflow.kernels.Linear(1, variance=variances[2, 4], active_dims=[4])                   # u
                 ) * gpflow.kernels.Matern32(1, lengthscales=1, active_dims=[1]) 
    )

    kernel_omega = (gpflow.kernels.Linear(full_dim, variance=variances[3, :], ARD=True)
                    + (  gpflow.kernels.Linear(1, variance=variances[3, 1], active_dims=[1])                   # theta
                       + gpflow.kernels.Matern32(1, variance=variances[3, 3], lengthscales=1, active_dims=[3]) # omega
                       + gpflow.kernels.Linear(1, variance=variances[3, 4], active_dims=[4])                   # u
                     ) * gpflow.kernels.Matern32(1, lengthscales=1, active_dims=[1])
    )

# Mean dynamics
# Row i of (A, B) is the prior mean of output i; indexing with [[i], :]
# keeps the row two-dimensional.
mean_function_x = safe_learning.LinearSystem((A[[0], :], B[[0], :]), name='mean_dynamics_x')
mean_function_theta = safe_learning.LinearSystem((A[[1], :], B[[1], :]), name='mean_dynamics_theta')
mean_function_v = safe_learning.LinearSystem((A[[2], :], B[[2], :]), name='mean_dynamics_v')
mean_function_omega = safe_learning.LinearSystem((A[[3], :], B[[3], :]), name='mean_dynamics_omega')

# Define a GP model over the dynamics
# (one scalar-output GP per state dimension, each starting with empty data)
gp_x = safe_learning.GPRCached(np.empty((0, full_dim), dtype=np_dtype),    # collected input data, X
                               np.empty((0, 1), dtype=np_dtype),           # collected output data, Y
                               kernel_x,                                   # kernel function, k(x,x')
                               mean_function_x,                            # mean function, mu(x)
                               scaling=GP_SCALING)

gp_theta = safe_learning.GPRCached(np.empty((0, full_dim), dtype=np_dtype), 
                                   np.empty((0, 1), dtype=np_dtype),
                                   kernel_theta, 
                                   mean_function_theta,
                                   scaling=GP_SCALING)

gp_v = safe_learning.GPRCached(np.empty((0, full_dim), dtype=np_dtype), 
                               np.empty((0, 1), dtype=np_dtype),
                               kernel_v, 
                               mean_function_v,
                               scaling=GP_SCALING)

gp_omega = safe_learning.GPRCached(np.empty((0, full_dim), dtype=np_dtype), 
                                   np.empty((0, 1), dtype=np_dtype),
                                   kernel_omega, 
                                   mean_function_omega,
                                   scaling=GP_SCALING)

# Use the same measurement noise variance for all four models
gp_x.likelihood.variance = noise_var
gp_theta.likelihood.variance = noise_var
gp_v.likelihood.variance = noise_var
gp_omega.likelihood.variance = noise_var

if TRAIN_HYPERPARAMETERS:
    # Sample state-actions (x, u) and observations from the dynamics.
    # NOTE(review): `sample_box` is not imported in any visible cell; it has
    # to be defined or imported before this branch is enabled.
    N = int(1e3)  # sample count must be an integer (1e3 alone is a float)
    states = sample_box(state_limits, N)
    actions = sample_box(action_limits, N)
    X = np.concatenate((states, actions), axis=1)
    Y = true_dynamics(states, actions).eval()

    # enumerate() supplies the column index i of Y that belongs to each GP;
    # iterating over the bare tuple cannot be unpacked into (i, gp).
    for i, gp in enumerate((gp_x, gp_theta, gp_v, gp_omega)):

        # Show the hyperparameters before optimization
        print('Original parameters:', gp, '\n')

        # Optimize the parameters of each GP via MLE
        print('Optimizing ...\n')
        gp.X = X
        gp.Y = Y[:, i].reshape((-1, 1))
        gp.optimize()

        # Reset GP data matrices for safe learning
        gp.X = np.empty((0, full_dim), dtype=np_dtype)
        gp.Y = np.empty((0, 1), dtype=np_dtype)
        gp.update_cache()

        # Show the hyperparameters after optimization
        print('New parameters:', gp, '\n')

# Wrap each GP model as a safe_learning function; BETA is the scaling
# factor for the GP confidence intervals
gp_x_fun = safe_learning.GaussianProcess(gp_x, BETA)
gp_theta_fun = safe_learning.GaussianProcess(gp_theta, BETA)
gp_v_fun = safe_learning.GaussianProcess(gp_v, BETA)
gp_omega_fun = safe_learning.GaussianProcess(gp_omega, BETA)

# Stack GP functions => block-diagonal kernel matrix
# (order matches the state ordering: x, theta, v, omega)
dynamics = safe_learning.FunctionStack((gp_x_fun, gp_theta_fun, gp_v_fun, gp_omega_fun))

## State Discretization

In [None]:
# Grid resolution: 51 points along every state dimension
num_states = state_dim * [51]

# Grid over the normalized box [-1, 1] in each of the four dimensions
grid_limits = np.array([[-1., 1.]] * 4)
state_discretization = safe_learning.GridWorld(grid_limits, num_states)

# Discretization constant: zero when the zero-threshold flag is set,
# otherwise the smallest entry of the grid's `unit_maxes`
tau = 0.0 if USE_ZERO_THRESHOLD else np.min(state_discretization.unit_maxes)

print('Grid size: {}'.format(state_discretization.nindex))
print('Discretization constant: {}'.format(tau))

## Cost Function

In [None]:
# Diagonal state cost and identity action cost, both in the configured dtype
Q = np.diag([0.1, 0.1, 1., 1.]).astype(np_dtype)
R = np.identity(action_dim).astype(np_dtype)

# Rescale both matrices by their largest entry overall
cost_norm = np.max([Q.max(), R.max()])
Q, R = Q / cost_norm, R / cost_norm

# Quadratic cost over the joint (state, action) vector: blkdiag(Q, R)
cost_matrix = block_diag(Q, R)
cost_function = safe_learning.QuadraticFunction(cost_matrix, name='cost_function')

## Lyapunov Function

In [None]:
# Fix policy to the LQR solution for the true linearized system
# (the commented-out line uses the prior model (A, B) instead)
# K, P = safe_learning.utilities.dlqr(A, B, Q, R)
K, P = safe_learning.utilities.dlqr(A_true, B_true, Q, R)
policy = safe_learning.LinearSystem(-K, name='policy')

# Optionally clip the policy output to [-1, 1]
if SATURATE:
    policy = safe_learning.Saturation(policy, -1, 1)
    
# TensorFlow variables
tf_states = tf.placeholder(tf_dtype, shape=[None, state_dim], name='states')
tf_actions = policy(tf_states)

# Define the Lyapunov function corresponding to the known policy
# (quadratic form built from P, with 2*P as the matrix of its gradient)
lyapunov_function = safe_learning.QuadraticFunction(P)
grad_lyapunov_function = safe_learning.LinearSystem((2*P,))

# Lipschitz constants
# (matrix 1-norms of the feedback gain and of the true linearized model;
# both callables ignore their argument s)
L_pol = lambda s: tf.constant(np.linalg.norm(-K, 1), dtype=tf_dtype)
L_dyn = lambda s: np.linalg.norm(A_true, 1) + np.linalg.norm(B_true, 1)*L_pol(s)

# L_v: element-wise |gradient| per state dimension, or the 1-norm of the
# gradient as a single column
if USE_LIPSCHITZ_SCALING:
    L_v = lambda s: tf.abs(grad_lyapunov_function(s))
else:
    L_v = lambda s: tf.norm(grad_lyapunov_function(s), ord=1, axis=1, keep_dims=True)

# val_func = lambda X: np.sum(X.dot(P) * X, axis=1, keepdims=True)
# np_func = lambda s: approx_local_lipschitz_grid(val_func, s, tau, 1e3)
# L_v = lambda s: tf.py_func(np_func, [s], tf_dtype, name='approx_lipschitz')
    
# Set initial safe set as a level set of the Lyapunov function
# values = session.run(lyapunov_function(tf_states), {tf_states: state_discretization.all_points})
# cutoff = 3e-3 * np.max(values)
# initial_safe_set = np.squeeze(values, axis=1) <= cutoff

# Set initial safe set as a hypercube in the state space
# (a grid point is initially safe when every normalized coordinate lies
# within +/- the matching entry of safe_norm)
# safe_norm = np.array([[x_safe / x_max, theta_safe / theta_max, v_safe / v_max, omega_safe / omega_max]])
safe_norm = np.array([[0.75, 0.5, 0.2, 0.25]])
norm_states = state_discretization.all_points / safe_norm
initial_safe_set = np.all(np.logical_and(norm_states >= -1, norm_states <= 1), axis=1, keepdims=False)

# Initialize class
lyapunov = safe_learning.Lyapunov(state_discretization, lyapunov_function, dynamics, 
                                  L_dyn, L_v, tau, policy, initial_safe_set)

In [None]:
def plot_policy(lyapunov, tf_states, fixed_state, state_norm=None):
    """Plot two 2D slices of the policy through a fixed state.

    The left panel shows the policy over (x, v) with theta and omega held at
    their fixed values; the right panel shows it over (theta, omega) with x
    and v held fixed. Every entry of `fixed_state` is first snapped to the
    nearest grid value of the discretization. The figure is shown with
    `plt.show()`; nothing is returned. Uses the module-level `session`.

    Parameters
    ----------
    lyapunov : object
        Must provide `policy` (callable on `tf_states`) and `discretization`
        with `discrete_points`, `all_points`, `limits` and `num_points`.
    tf_states : placeholder
        Fed with the grid points of each slice when evaluating the policy.
    fixed_state : sequence of 4 numbers
        (x, theta, v, omega) in grid coordinates. It is copied, so the
        caller's object is never modified.
    state_norm : tuple of 4 numbers, optional
        (x_max, theta_max, v_max, omega_max) used to rescale axis extents and
        title values; the two angular entries are converted from radians to
        degrees. If None, grid coordinates are shown unscaled (the axis
        labels still name physical units).
    """
    import copy

    fig, ax = plt.subplots(1, 2, figsize=(12, 5), dpi=100)
    fig.subplots_adjust(wspace=0.4, hspace=0.2)
    # Customize a copy: set_under/set_over on the object returned by
    # get_cmap would otherwise alter the registered 'viridis' colormap.
    cmap = copy.copy(plt.get_cmap('viridis'))
    cmap.set_under('indigo')
    cmap.set_over('gold')
    ticks = np.linspace(-1., 1., 9)
    # Values beyond +/- cutoff are drawn in the under/over colors
    cutoff = 1. - 1e-5

    discretization = lyapunov.discretization
    all_points = discretization.all_points
    num_points = discretization.num_points

    # np.array (not np.asarray) guarantees a copy, so snapping to the grid
    # below cannot write into an array owned by the caller.
    fixed_state = np.array(fixed_state, dtype=np_dtype)
    for i in range(4):
        dist = np.square(discretization.discrete_points[i] - fixed_state[i])
        idx = np.argmin(dist)
        fixed_state[i] = discretization.discrete_points[i][idx]
    x_fix, theta_fix, v_fix, omega_fix = fixed_state
    # Exact comparison is fine here: the fixed values were taken from the
    # grid's own coordinate vectors just above.
    pos_set = np.logical_and(all_points[:, 1] == theta_fix,
                             all_points[:, 3] == omega_fix)
    vel_set = np.logical_and(all_points[:, 0] == x_fix,
                             all_points[:, 2] == v_fix)

    if state_norm is not None:
        x_max, theta_max, v_max, omega_max = state_norm
        scale = np.array([x_max, np.rad2deg(theta_max),
                          v_max, np.rad2deg(omega_max)]).reshape((-1, 1))
        limits = scale * discretization.limits
        x_fix, theta_fix, v_fix, omega_fix = fixed_state * scale.ravel()
    else:
        limits = discretization.limits

    # Fix theta and omega, plot policy over x and v
    grid = all_points[pos_set, :]
    z = session.run(lyapunov.policy(tf_states), feed_dict={tf_states: grid})
    # The free dimensions of this slice are 0 (x) and 2 (v)
    z = z.reshape((num_points[0], num_points[2]))
    im = ax[0].imshow(z.T,
                      origin='lower',
                      extent=limits[(0, 2), :].ravel(),
                      aspect=limits[0, 0] / limits[2, 0],
                      cmap=cmap,
                      vmin=-cutoff,
                      vmax=cutoff)
    cbar = fig.colorbar(im, ax=ax[0], label=r'$u = \pi(x)$', ticks=ticks)
    ax[0].set_title(r'$\theta = %.3g$ deg, $\omega = %.3g$ deg/s' % (theta_fix, omega_fix))
    ax[0].set_xlabel(r'$x$ [m]')
    ax[0].set_ylabel(r'$v$ [m/s]')

    # Fix x and v, plot policy over theta and omega
    grid = all_points[vel_set, :]
    z = session.run(lyapunov.policy(tf_states), feed_dict={tf_states: grid})
    # The free dimensions of this slice are 1 (theta) and 3 (omega)
    z = z.reshape((num_points[1], num_points[3]))
    im = ax[1].imshow(z.T,
                      origin='lower',
                      extent=limits[(1, 3), :].ravel(),
                      aspect=limits[1, 0] / limits[3, 0],
                      cmap=cmap,
                      vmin=-cutoff,
                      vmax=cutoff)
    cbar = fig.colorbar(im, ax=ax[1], label=r'$u = \pi(x)$', ticks=ticks)
    # v is a linear velocity, so its unit is m/s (not degrees)
    ax[1].set_title(r'$x = %.3g$ m, $v = %.3g$ m/s' % (x_fix, v_fix))
    ax[1].set_xlabel(r'$\theta$ [deg]')
    ax[1].set_ylabel(r'$\omega$ [deg/s]')

    plt.show()

# Visualize policy
fixed_state = [0., 0., 0., 0.]
plot_policy(lyapunov, tf_states, fixed_state, state_norm)

## Initial Safe Set Visualization

In [None]:
# Compare safe set before and after checking the decrease condition for the first time
c_max = lyapunov.feed_dict[lyapunov.c_max]
init_safe_set_size = np.sum(lyapunov.safe_set)

print('Before update ...')
print('c_max: {}'.format(c_max))
print('Safe set size: {}\n'.format(init_safe_set_size))

# Keep a copy of the safe set as it was before the update
old_safe_set = np.copy(lyapunov.safe_set)
lyapunov.update_safe_set()

# NOTE: init_safe_set_size is overwritten with the size *after* this first
# update; the 'Growth' printout in the learning loop is relative to it.
c_max = lyapunov.feed_dict[lyapunov.c_max]
init_safe_set_size = np.sum(lyapunov.safe_set)

print('After update ...')
print('c_max: {}'.format(c_max))
print('Safe set size: {}'.format(init_safe_set_size))

visuals(lyapunov, tf_states, state_norm, plot='cartpole')

In [None]:
debug(lyapunov, true_dynamics, tf_states, newly_safe_only=True)

## Online Learning and Exploration

In [None]:
# Alternative with several action offsets (disabled)
# action_variation = np.array([-0.1, -0.01, -0.001, 0., 0.001, 0.01, 0.1], 
#                             dtype=np_dtype).reshape((-1, 1))

# Single zero entry as a column vector; passed to get_safe_sample in
# update_gp (presumably offsets applied to the policy action -- confirm
# against safe_learning.get_safe_sample)
action_variation = np.array([0.], dtype=np_dtype).reshape((-1, 1))

# Graph nodes for evaluating the true dynamics at one (state, action) row
with tf.name_scope('add_new_measurement'):
    full_dim = state_dim + action_dim 
    tf_max_state_action = tf.placeholder(tf_dtype, shape=[1, full_dim])
    tf_measurement = true_dynamics(tf_max_state_action)
    
def update_gp():
    """Add one actively selected measurement of the true dynamics to the GP.

    Works on the module-level `lyapunov` object: a state-action pair is
    requested from `safe_learning.get_safe_sample`, the true dynamics are
    evaluated there, and the result is appended to `lyapunov.dynamics`.
    """
    # Where to measure next (the second return value is not needed)
    query_point, _ = safe_learning.get_safe_sample(
        lyapunov,
        action_variation,
        action_limits,
        positive=True,
        num_samples=1000,
    )

    # The query is stored in the shared feed dict, which is then used to
    # evaluate the measurement tensor
    feed = lyapunov.feed_dict
    feed[tf_max_state_action] = query_point
    observation = tf_measurement.eval(feed_dict=feed)

    # Extend the GP data set with the new (input, output) pair
    lyapunov.dynamics.add_data_point(query_point, observation)

In [None]:
# Number of measurements gathered between two safe set updates, and number
# of safe set updates performed by one execution of this cell
data_per_update = 10
safe_set_updates = 1

# 60 to 80 data points

# The history arrays live in the notebook's global namespace so that running
# this cell again continues the experiment: on the first run they are
# created (entry 0 holds the values before any data was collected), on later
# runs they are extended and `e` is the index of the last entry filled so far.
if 'level' not in globals():
    level = np.zeros(safe_set_updates + 1)
    safe_size = np.zeros(safe_set_updates + 1)
    level[0] = lyapunov.feed_dict[lyapunov.c_max]
    safe_size[0] = np.sum(lyapunov.safe_set)
    num_data = data_per_update*np.arange(safe_set_updates + 1)
    e = 0
else:
    e = len(level) - 1
    level = np.concatenate((level, np.zeros(safe_set_updates)))
    safe_size = np.concatenate((safe_size, np.zeros(safe_set_updates)))
    temp = data_per_update*np.arange(1, safe_set_updates + 1) + num_data[-1]
    num_data = np.concatenate((num_data, temp))

for i in range(safe_set_updates):
    
    print('Iteration {} with c_max: {}'.format(e + i + 1, lyapunov.feed_dict[lyapunov.c_max]))
    # Copy of the safe set before this iteration (not used again in this cell)
    old_safe_set = np.copy(lyapunov.safe_set)

    # Collect a batch of measurements, then re-evaluate the safe set once
    for _ in range(data_per_update):
#     for _ in tqdm(range(data_per_update)):
        update_gp()

    lyapunov.update_safe_set()
    level[e + i + 1] = lyapunov.feed_dict[lyapunov.c_max]
    safe_size[e + i + 1] = np.sum(lyapunov.safe_set)
    
    # Growth is measured against init_safe_set_size from the earlier cell
    current_safe_set_size = np.sum(lyapunov.safe_set)
    data = lyapunov.dynamics.functions[0].X
    print('Discretization size: {}'.format(state_discretization.all_points.shape[0]))
    print('Safe set size: {}'.format(current_safe_set_size))
    print('Growth: {}'.format(current_safe_set_size - init_safe_set_size))
    print("Data points collected: {}".format(data.shape[0]))
    print("New c_max: {}".format(lyapunov.feed_dict[lyapunov.c_max]))
    
    fixed_state=(0., 0., 0., 0.)
    visuals(lyapunov, tf_states, state_norm, plot='cartpole', fixed_state=fixed_state)

In [None]:
# Index of the first history entry with more than 200 data points
# (np.argmax returns 0 when there is none)
idx = np.argmax(num_data > 200)
# NOTE(review): the next line discards the index just computed, so the full
# history is always plotted; looks like a debugging override -- confirm.
idx = 0
if idx == 0:
    idx = len(num_data)
print(level, '\n')
print(safe_size)
    
fig, ax = plt.subplots(1, 2, sharex=False, figsize=(10, 3), dpi=200)
fig.subplots_adjust(wspace=0.3, hspace=0.2)

# Left: c_max against the number of collected data points
ax[0].step(num_data[:idx], level[:idx], 'o--', where='post')
ax[0].set_xlabel(r'Number of data points collected', fontsize=12)
ax[0].set_ylabel(r'$c_{max}$', fontsize=12)

# Right: safe set size against the number of collected data points
ax[1].step(num_data[:idx], safe_size[:idx], 'o--', where='post')
ax[1].set_xlabel(r'Number of data points collected', fontsize=12)
ax[1].set_ylabel(r'Safe set size', fontsize=12)

plt.show()

In [None]:
np.set_printoptions(precision=4, suppress=True)
states = lyapunov.dynamics.functions[0].X[:, :4]
# scale = np.array([x_max, np.rad2deg(theta_max), v_max, np.rad2deg(omega_max)]).ravel()
# print(scale * states)
print(states)

In [None]:
# NOTE(review): the first assignment is immediately overwritten by the
# hard-coded state on the next line, so the first sampled state is not used.
fixed_state = states[0, :]
fixed_state = (0.5, 0., 0.5, 0.)
visuals(lyapunov, tf_states, state_norm, plot='cartpole', fixed_state=fixed_state)

# X = data
# print(lyapunov.dynamics.functions[0].gaussian_process.predict_f(X))
# print(gp_x.likelihood.variance.value)