In [None]:
from __future__ import division, print_function

import GPflow
import tensorflow as tf
import numpy as np
from scipy import signal
import matplotlib.pyplot as plt
from matplotlib import colors
from scipy import linalg
%matplotlib inline
import tensorflow as tf
from scipy.optimize import minimize

try:
    from tqdm import tqdm
except ImportError:
    tqdm = lambda x: x

import safe_learning
import plotting
from utilities import InvertedPendulum

# Close a session left over from a previous run of this cell. On a fresh
# kernel the name `session` does not exist yet, so the NameError is ignored.
try:
    session.close()
except NameError:
    pass

# Open a new interactive session and run the global-variables initializer.
# NOTE(review): no variables have been created by this notebook at this point;
# confirm that objects built in later cells get initialized elsewhere.
session = tf.InteractiveSession()
session.run(tf.global_variables_initializer())

### Goal:

Collect data points without leaving the safe region of attraction of the current policy. At the same time, optimize the policy to ensure maximum performance.

In [None]:
# presumably the state dimension (n) and action dimension (m) — TODO confirm.
# Note that `m` is rebound to a matrix in the GP-model cell further down.
n = 2
m = 1

# 'Wrong' model parameters
mass = 0.1
friction = 0.
length = 0.5
gravity = 9.81
inertia = mass * length ** 2

# True model parameters
true_mass = 0.15
true_friction = 0.1
true_length = length
true_inertia = true_mass * true_length ** 2

# Input saturation
# x_max is 30 degrees expressed in radians; u_max is derived from it using
# the *true* mass and length.
x_max = np.deg2rad(30)
u_max = gravity * true_mass * true_length * np.sin(x_max)

# Normalization constants handed to both pendulum models below
norm_state = np.array([x_max, np.sqrt(gravity / length)])
norm_action = np.array([u_max])


In [None]:
# Pendulum built from the true parameters; used later to generate measurements
# and trajectories.
true_dynamics = InvertedPendulum(mass=true_mass, length=true_length, friction=true_friction,
                                 normalization=(norm_state, norm_action))

# Pendulum built from the 'wrong' parameters; its linearization serves as the
# prior mean of the GP model.
wrong_pendulum = InvertedPendulum(mass=mass, length=length, friction=friction,
                                  normalization=(norm_state, norm_action))

# LQR cost matrices
# q weights the two state components, r weights the single action.
q = 1 * np.diag([1., 2.])
r = 1.2 * np.array([[1]], dtype=safe_learning.config.np_dtype)

In [None]:
# Limits as [min, max] pairs and the number of discretization points per
# state dimension
state_limits = np.array([[-2., 2.], [-1.5, 1.5]])
action_limits = np.array([[-1, 1]])
num_states = [2001, 1501]

# Fine grid passed to the Lyapunov object below; coarse 55 x 55 grid used for
# the value function and for sampling policy training batches.
safety_disc = safe_learning.GridWorld(state_limits, num_states)
policy_disc = safe_learning.GridWorld(state_limits, [55, 55])

# Discretization constant
tau = np.min(safety_disc.unit_maxes)

print('Grid size: {0}'.format(safety_disc.nindex))

### Define GP dynamics model

In [None]:
# Linearization of the 'wrong' pendulum: used as the prior mean of the GPs
A, B = wrong_pendulum.linearize()
# NOTE(review): this value is rebound to a lambda in the Lyapunov cell before
# it is passed anywhere in this notebook — it looks unused as written.
lipschitz_dynamics = 1

# Observation noise variance assigned to both GP likelihoods below
noise_var = 0.001 ** 2

# Stack the linearization outputs horizontally for the true and the wrong model.
# NOTE: `m` overwrites the scalar `m = 1` from the parameter cell.
m_true = np.hstack((true_dynamics.linearize()))
m = np.hstack((A, B))

# Squared element-wise gap between the true and the prior linear models
variances = (m_true - m) ** 2

# Make sure the variances remain bounded away from zero (floor of 1e-5, in place)
np.clip(variances, 1e-5, None, out=variances)

# Kernels
# One kernel per output row: an ARD linear kernel over all 3 inputs
# (presumably 2 states + 1 action — TODO confirm) plus a Matern32 * linear product.
kernel1 = (GPflow.kernels.Linear(3, variance=variances[0, :], ARD=True)
           + GPflow.kernels.Matern32(1, lengthscales=1, active_dims=[0])
           * GPflow.kernels.Linear(1, variance=variances[0, 1]))

kernel2 = (GPflow.kernels.Linear(3, variance=variances[1, :], ARD=True)
           + GPflow.kernels.Matern32(1, lengthscales=1, active_dims=[0])
           * GPflow.kernels.Linear(1, variance=variances[1, 1]))

# Mean dynamics
# Full linear model plus one single-row linear model per GP output

mean_dynamics = safe_learning.LinearSystem((A, B), name='mean_dynamics')
mean_function1 = safe_learning.LinearSystem((A[[0], :], B[[0], :]), name='mean_dynamics_1')
mean_function2 = safe_learning.LinearSystem((A[[1], :], B[[1], :]), name='mean_dynamics_2')

# Define a GP model over the dynamics
# Both GPs start without data: inputs have 0 rows x 3 columns, targets 0 rows x 1 column.
gp1 = GPflow.gpr.GPR(np.empty((0, 3), dtype=safe_learning.config.np_dtype),
                    np.empty((0, 1), dtype=safe_learning.config.np_dtype),
                    kernel1,
                    mean_function=mean_function1)
gp1.likelihood.variance = noise_var

gp2 = GPflow.gpr.GPR(np.empty((0, 3), dtype=safe_learning.config.np_dtype),
                    np.empty((0, 1), dtype=safe_learning.config.np_dtype),
                    kernel2,
                    mean_function=mean_function2)
gp2.likelihood.variance = noise_var

# Wrap the GPflow models and stack them into one dynamics function
gp1_fun = safe_learning.GaussianProcess(gp1)
gp2_fun = safe_learning.GaussianProcess(gp2)

dynamics = safe_learning.FunctionStack((gp1_fun, gp2_fun))


# Get the optimal control gain
# Discrete LQR on the *true* linearization; the resulting policy is saturated to [-1, 1].
a_true, b_true = true_dynamics.linearize()
k_opt, s_opt = safe_learning.utilities.dlqr(a_true, b_true, q, r)
optimal_policy = safe_learning.LinearSystem((-k_opt))
optimal_policy = safe_learning.Saturation(optimal_policy, -1, 1)

### Reinforcement learning for the mean dynamics

In [None]:
# Define initial policy
# Network with layer sizes 2-4-4-1; ReLU on the first two layers, none on the last.
relu = tf.nn.relu
policy = safe_learning.NeuralNetwork(layers=[2,4,4,1],
                                     nonlinearities=[relu, relu, None])
# Saturate policy to action limits
policy = safe_learning.Saturation(policy, action_limits[:, 0], action_limits[:, 1])

# Define value function approximation
# Triangulation over the coarse policy grid, initialized with zeros.
gamma = 0.98
value_function = safe_learning.Triangulation(policy_disc,
                                             np.zeros(policy_disc.nindex, dtype=safe_learning.config.np_dtype),
                                             project=True)

# Quadratic (LQR) reward function
# Built from the negated cost matrices, i.e. block_diag(-q, -r).
reward_function = safe_learning.QuadraticFunction(linalg.block_diag(-q, -r))

# Define policy optimization problem
# NOTE(review): the GP stack `dynamics` is passed here; the `mean_dynamics`
# object defined in the GP cell is not used anywhere visible — confirm intended.
rl = safe_learning.PolicyIteration(
    policy,
    dynamics,
    reward_function,
    value_function,
    gamma=gamma)
    
    
with tf.variable_scope('rl_mean_optimization'):
    # Op that updates the value function; run by rl_optimize_policy_mean and rl_optimize_policy
    rl_opt_value_function = rl.optimize_value_function()
    
    # Placeholder for states
    tf_states_mean = tf.placeholder(safe_learning.config.dtype, [None, 2])
    
    # Optimize for expected gain
    # The loss is the negated batch mean of the future values.
    values = rl.future_values(tf_states_mean)
    policy_loss = -tf.reduce_mean(values)
    
    # Plain gradient descent restricted to the policy parameters
    optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.01)
    adapt_policy_mean = optimizer.minimize(policy_loss, var_list=rl.policy.parameters)
    
    
def rl_optimize_policy_mean(num_iter):
    """Update the value function once, then take `num_iter` policy steps.

    Every policy step draws a new batch of 1000 states from `policy_disc`,
    stores it in `rl.feed_dict` under `tf_states_mean` and runs the
    `adapt_policy_mean` op.

    Parameters
    ----------
    num_iter : int
        Number of policy gradient steps to run.
    """
    # One value-function update with the feed dict as it currently stands
    session.run(rl_opt_value_function, feed_dict=rl.feed_dict)

    # Policy steps, each on its own random batch
    progress = tqdm(range(num_iter))
    for _ in progress:
        batch = policy_disc.sample_continuous(1000)
        rl.feed_dict[tf_states_mean] = batch
        session.run(adapt_policy_mean, feed_dict=rl.feed_dict)


In [None]:
# 15 rounds; each round runs one value-function update followed by
# 200 policy gradient steps (see rl_optimize_policy_mean).
for i in range(15):
    rl_optimize_policy_mean(num_iter=200)

In [None]:
def plot_policy():
    """Show the output of `rl.policy` on the `policy_disc` grid as an image.

    The policy is evaluated at all grid points, reshaped to the grid's
    `num_points` and transposed before plotting; a colorbar is attached.
    """
    plt.figure(figsize=(10, 10))

    # Evaluate on every grid point, then fold the flat result back onto the grid
    flat_actions = rl.policy(policy_disc.all_points).eval()
    action_grid = flat_actions.reshape(policy_disc.num_points).T

    image = plt.imshow(action_grid,
                       origin='lower',
                       extent=policy_disc.limits.ravel())
    plt.colorbar(image)
    
# Display the policy obtained from the training rounds above
plot_policy()

In [None]:
def plot_values():
    """Show the first parameter tensor of `rl.value_function` as an image.

    The evaluated parameters are reshaped to `policy_disc.num_points` and
    transposed; the image extent is taken from the value function's own
    discretization limits. A colorbar is attached.
    """
    plt.figure(figsize=(10, 10))

    # Evaluate the parameters, then fold the flat result back onto the grid
    flat_values = rl.value_function.parameters[0].eval()
    value_grid = flat_values.reshape(policy_disc.num_points).T

    image = plt.imshow(value_grid,
                       origin='lower',
                       extent=rl.value_function.discretization.limits.ravel())
    plt.colorbar(image)
# Display the value function parameters after the training rounds above
plot_values()

### Define Lyapunov function

In [None]:

# The negated value function serves as the Lyapunov candidate
lyapunov_function = -rl.value_function
# Per-row maximum absolute gradient component of the value function
lipschitz_lyapunov = lambda x: tf.reduce_max(tf.abs(rl.value_function.gradient(x)),
                                             axis=1, keep_dims=True)

# NOTE(review): `lipschitz` is read as an attribute here, not called, and the
# argument `x` is ignored — confirm it is not a method on `policy.fun`.
lipschitz_policy = lambda x: policy.fun.lipschitz 
# Combines the largest absolute entries of the *true* linearization with the
# policy constant; this rebinds the scalar `lipschitz_dynamics` set earlier.
lipschitz_dynamics = lambda x: np.max(np.abs(a_true)) + np.max(np.abs(b_true)) * lipschitz_policy(x)

# Lyapunov object over the fine safety grid; the initial safe set is assigned below
lyapunov = safe_learning.Lyapunov(safety_disc,
                                  lyapunov_function,
                                  dynamics,
                                  lipschitz_dynamics,
                                  lipschitz_lyapunov,
                                  tau,
                                  policy=rl.policy,
                                  initial_set=None)

# Set initial safe set (level set)
# Evaluate the quadratic form given by the LQR matrix s_opt on every grid point
# and keep the points whose value is at most 0.5% of the maximum.
# NOTE: `values` rebinds the tensor of the same name from the RL cell.
values = safe_learning.QuadraticFunction(s_opt)(safety_disc.all_points).eval()
cutoff = np.max(values) * 0.005

lyapunov.initial_safe_set = np.squeeze(values, axis=1) <= cutoff

In [None]:
def plot_things(lyapunov, show=True):
    """Plot the safe set of `lyapunov` over its own discretization.

    Parameters
    ----------
    lyapunov : object
        Must provide `safe_set` (flat array with one entry per grid point),
        `discretization` (with `num_points` and `limits`) and `dynamics`.
    show : bool, optional
        If True (default), `plt.show()` is called after drawing.

    Notes
    -----
    When `lyapunov.dynamics` is a `safe_learning.UncertainFunction`, the
    first two columns of the data `X` of its first sub-function are drawn
    on top as red crosses.
    """
    grid = lyapunov.discretization

    # Take the grid shape from the object being plotted rather than from the
    # notebook-level `num_states`, so any Lyapunov object can be passed in.
    plt.imshow(lyapunov.safe_set.reshape(grid.num_points).T,
               origin='lower',
               extent=grid.limits.ravel(),
               vmin=0,
               vmax=1)

    if isinstance(lyapunov.dynamics, safe_learning.UncertainFunction):
        X = lyapunov.dynamics.functions[0].X
        plt.plot(X[:, 0], X[:, 1], 'rx')

    plt.title('safe set')
    plt.colorbar()
    if show:
        plt.show()
    
# Recompute the safe set from the initial level set assigned above and display it
lyapunov.update_safe_set()
plot_things(lyapunov)

In [None]:
# Keep a copy of the policy before online learning; it produces the 'old'
# trajectories in the comparison at the end of the notebook.
init_policy = rl.policy.copy()

## Online learning
As we sample within this initial safe set, we gain more knowledge about the system. In particular, we iteratively select the state within the safe set, $\mathcal{S}_n$, where the dynamics are the most uncertain (highest variance).

In [None]:
# Second policy-optimization graph, this time with the Lyapunov object passed
# to the future-value computation. `values`, `policy_loss` and `optimizer`
# rebind the names used by the mean-dynamics graph above.
with tf.variable_scope('opt2'):
    
    # Placeholder for states
    tf_states = tf.placeholder(safe_learning.config.dtype, [None, 2])
    
    # Add Lyapunov uncertainty (but only if safety-relevant)
    values = rl.future_values(tf_states, lyapunov=lyapunov)
    
    # The loss is the negated batch mean of the future values
    policy_loss = -tf.reduce_mean(values)
    

    # Plain gradient descent restricted to the policy parameters
    optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.01)
    adapt_policy = optimizer.minimize(policy_loss, var_list=rl.policy.parameters)
    
    
def rl_optimize_policy(num_iter):
    """Update the value function once, then take `num_iter` policy steps.

    Every policy step draws a new batch of 1000 states from
    `lyapunov.discretization`, stores it in `rl.feed_dict` under `tf_states`
    and runs the `adapt_policy` op.

    Parameters
    ----------
    num_iter : int
        Number of policy gradient steps to run.
    """
    # One value-function update with the feed dict as it currently stands
    session.run(rl_opt_value_function, feed_dict=rl.feed_dict)

    # Policy steps, each on its own random batch
    progress = tqdm(range(num_iter))
    for _ in progress:
        batch = lyapunov.discretization.sample_continuous(1000)
        rl.feed_dict[tf_states] = batch
        session.run(adapt_policy, feed_dict=rl.feed_dict)

In [None]:
# Candidate action perturbations handed to get_safe_sample in update_gp
action_variation = np.array([[-0.02], [0.], [0.02]], dtype=safe_learning.config.np_dtype)


# Graph pieces used by update_gp to query the true system for one data point
with tf.variable_scope('add_new_measurement'):
        action_dim = lyapunov.policy.output_dim
        # Single row holding a state followed by an action
        tf_max_state_action = tf.placeholder(safe_learning.config.dtype,
                                             shape=[1, safety_disc.ndim + action_dim])
        # Output of the true pendulum model for that state-action pair
        tf_measurement = true_dynamics(tf_max_state_action)
        
def update_gp():
    """Add one actively selected measurement of the true system to the GP model.

    A state-action pair is requested from `safe_learning.get_safe_sample`,
    fed through `tf_measurement` to obtain the output of the true dynamics,
    and appended to `lyapunov.dynamics` as a new data point.
    """
    # Where to sample next (the second return value is not needed here)
    selection = safe_learning.get_safe_sample(lyapunov,
                                              action_variation,
                                              action_limits,
                                              num_samples=1000)
    state_action, _ = selection

    # Query the true dynamics at that location
    lyapunov.feed_dict[tf_max_state_action] = state_action
    observed = tf_measurement.eval(feed_dict=lyapunov.feed_dict)

    # Extend the GP dynamics model with the new observation
    lyapunov.dynamics.add_data_point(state_action, observed)
    

In [None]:
# lyapunov.update_safe_set()
# Two rounds of (value-function update + 200 policy steps) on the
# Lyapunov-aware objective
rl_optimize_policy(num_iter=200)
rl_optimize_policy(num_iter=200)

# Recompute the safe set for the adapted policy and display it
lyapunov.update_safe_set()
plot_things(lyapunov)

In [None]:
# Online learning: five rounds of data collection, policy optimization and
# safe-set recomputation.
for i in range(5):
    print('iteration {} with c_max: {}'.format(i, lyapunov.feed_dict[lyapunov.c_max]))

    # Collect ten actively selected measurements. The inner loop has its own
    # variable so that the outer iteration counter `i` is not overwritten.
    for sample_index in tqdm(range(10)):
        update_gp()

    rl_optimize_policy(num_iter=200)
    lyapunov.update_values()

    # Update safe set and plot
    lyapunov.update_safe_set()
    plot_things(lyapunov)

## Plot trajectories

In [None]:
# Initial state (a single row) shared by both roll-outs
x0 = np.array([[0.5, 0.]])

# Roll out the true dynamics under the adapted policy ('new') and under the
# copy taken before online learning ('old'), with 100 passed as the last argument
# (presumably the number of steps — TODO confirm).
states_new, actions_new = safe_learning.utilities.compute_trajectory(true_dynamics, rl.policy, x0, 100)
states_old, actions_old = safe_learning.utilities.compute_trajectory(true_dynamics, init_policy, x0, 100)

In [None]:
# First state component along both trajectories
plt.plot(states_new[:, 0], label='new')
plt.plot(states_old[:, 0], label='old')
plt.legend()
plt.show()

# Second state component along both trajectories
plt.plot(states_new[:, 1], label='new')
plt.plot(states_old[:, 1], label='old')
plt.legend()
plt.show()

In [None]:
# Actions applied along both trajectories
plt.plot(actions_new, label='new')
plt.plot(actions_old, label='old')
plt.legend()

In [None]:
# Sum the reward function over each trajectory (the last state is dropped so
# that states and actions are passed with matching slices).
# NOTE(review): the reward function was built from block_diag(-q, -r), so the
# printed numbers are summed rewards, not costs as the labels suggest —
# confirm the intended sign.
print('cost old:', tf.reduce_sum(rl.reward_function(states_old[:-1], actions_old)).eval(feed_dict=rl.feed_dict))
print('cost new:', tf.reduce_sum(rl.reward_function(states_new[:-1], actions_new)).eval(feed_dict=rl.feed_dict))