# Reinforcement Learning for an Inverted Pendulum

In [None]:
from __future__ import division, print_function

import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import os
import time

from scipy.linalg import block_diag
from matplotlib.font_manager import FontProperties
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.colors import ListedColormap

%matplotlib inline

import safe_learning
import plotting
from utilities import InvertedPendulum, compute_closedloop_response, get_parameter_change

try:
    from tqdm import tqdm
except ImportError:
    tqdm = lambda x: x

# TODO testing ****************************************#

class Options(object):
    """Simple attribute container: every keyword argument becomes an attribute."""

    def __init__(self, **kwargs):
        super(Options, self).__init__()
        # Store the keyword arguments directly in the instance namespace
        vars(self).update(kwargs)

# Global notebook settings, read elsewhere as attributes (e.g. OPTIONS.dpi).
# `use_linear_dynamics` and `saturate` select the dynamics model and whether
# policies are saturated; `save_figs`/`fig_path` control figure export.
OPTIONS = Options(np_dtype              = safe_learning.config.np_dtype,
                  tf_dtype              = safe_learning.config.dtype,
                  use_linear_dynamics   = False,
                  saturate              = True,
                  eps                   = 1e-8,
                  dpi                   = 200,
                  fontproperties        = FontProperties(size=10),
                  save_figs             = False,
                  fig_path              = 'figures/pendulum_rl/')

# Continuous colormap with explicit colours for out-of-range values
HEAT_MAP = plt.get_cmap('inferno', lut=None)
HEAT_MAP.set_over('white')
HEAT_MAP.set_under('black')

# Colormap quantized to 21 levels, again with explicit out-of-range colours
LEVEL_MAP = plt.get_cmap('viridis', lut=21)
LEVEL_MAP.set_over('gold')
LEVEL_MAP.set_under('white')

# Two-colour map used for region-of-attraction masks:
# fully transparent white, then semi-transparent green
BINARY_MAP = ListedColormap([(1., 1., 1., 0.), (0., 1., 0., 0.65)])

#******************************************************#


## TensorFlow Session

In [None]:
# NOTE(review): os.cpu_count() can return None, which would then be passed
# as the CPU device count below -- confirm this cannot happen on the target machines
MAX_CPU_COUNT = os.cpu_count()
NUM_CORES = 8
NUM_SOCKETS = 2

# Threading-related environment variables (the KMP_* names look like
# Intel OpenMP runtime settings -- TODO confirm they take effect for this TF build)
os.environ["KMP_BLOCKTIME"]    = str(0)
os.environ["KMP_SETTINGS"]     = str(1)
os.environ["KMP_AFFINITY"]     = 'granularity=fine,noverbose,compact,1,0'
os.environ["OMP_NUM_THREADS"]  = str(NUM_CORES)

# Session configuration: thread-pool sizes and the number of CPU devices
config = tf.ConfigProto(intra_op_parallelism_threads  = NUM_CORES,
                        inter_op_parallelism_threads  = NUM_SOCKETS,
                        allow_soft_placement          = False,
#                         log_device_placement          = True,
                        device_count                  = {'CPU': MAX_CPU_COUNT})

# TODO manually for CPU-only?
# Set the global JIT level of the graph optimizer
config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1

# Close the session left over from a previous run of this cell, if any;
# NameError simply means no session has been created yet
try:
    session.close()
except NameError:
    pass
session = tf.InteractiveSession(config=config)

# print('Found MAX_CPU_COUNT =', MAX_CPU_COUNT)
# for dev in session.list_devices():
#     print(dev)

In [None]:
def gridify(norms, maxes=None, num_points=25):
    """Build a `safe_learning.GridWorld` over normalized, symmetric state limits.

    Parameters
    ----------
    norms : array_like
        Normalization constant of each state dimension.
    maxes : array_like, optional
        Largest unnormalized magnitude of each state dimension. Defaults to
        `norms`, which yields the limits [-1, 1] in every dimension.
    num_points : int or sequence of int
        Number of grid points per dimension. A single integer (Python or
        NumPy) is used for every dimension.

    Returns
    -------
    grid : safe_learning.GridWorld
        Discretization with limits `[-maxes / norms, maxes / norms]`.
    """
    norms = np.asarray(norms).ravel()
    maxes = norms if maxes is None else np.asarray(maxes).ravel()
    limits = np.column_stack((- maxes / norms, maxes / norms))

    # NumPy integer scalars (e.g. counts computed from arrays) are not `int`
    # instances, so accept them explicitly when broadcasting a scalar count
    if isinstance(num_points, (int, np.integer)):
        num_points = [num_points, ] * len(norms)
    return safe_learning.GridWorld(limits, num_points)


def compute_roa(grid, closed_loop_dynamics, horizon=250, tol=1e-3, equilibrium=None):
    """Approximate the region of attraction by forward simulation from every grid point.

    Parameters
    ----------
    grid : object
        Discretization providing `nindex`, `ndim`, `all_points` and `num_points`.
    closed_loop_dynamics : callable
        Maps an array of states (one per row) to the array of successor states.
    horizon : int
        Number of stored time steps, including the initial state.
    tol : float
        Largest final 2-norm distance to `equilibrium` that still counts as converged.
    equilibrium : ndarray, optional
        Target state; the origin if omitted.

    Returns
    -------
    trajectories : ndarray
        Array of shape `(grid.nindex, grid.ndim, horizon)`.
    roa : ndarray of bool
        Convergence mask reshaped to `grid.num_points`.
    dist : ndarray
        Final distances to the equilibrium, reshaped to `grid.num_points`.
    """
    rollouts = np.empty((grid.nindex, grid.ndim, horizon))
    for step in range(horizon):
        if step:
            # Propagate the previously stored slice one step forward
            rollouts[:, :, step] = closed_loop_dynamics(rollouts[:, :, step - 1])
        else:
            # The first slice holds the initial states themselves
            rollouts[:, :, step] = grid.all_points

    target = np.zeros((1, grid.ndim)) if equilibrium is None else equilibrium

    # A state belongs to the estimate if its final state ends within `tol` of the target
    final_dist = np.linalg.norm(rollouts[:, :, -1] - target, ord=2, axis=1, keepdims=True)
    converged = (final_dist <= tol)
    shape = grid.num_points
    return rollouts, converged.reshape(shape), final_dist.reshape(shape)


def estimate_cost(grid, closed_loop_dynamics, cost_function, discount, horizon=250, tol=1e-3):
    """Estimate the discounted cost of every grid point with a finite-horizon rollout.

    The rollout stops early once the largest absolute discounted stage cost
    drops below `tol`; a message reporting the outcome is printed either way.

    Parameters
    ----------
    grid : object
        Discretization providing `nindex`, `all_points` and `num_points`.
    closed_loop_dynamics : callable
        Maps an array of states to the array of successor states.
    cost_function : callable
        Maps an array of states to their stage costs.
    discount : float
        Discount factor applied per time step.
    horizon : int
        Maximum number of time steps to accumulate.
    tol : float
        Convergence threshold on the discounted stage cost.

    Returns
    -------
    rollout : ndarray
        Accumulated discounted cost, reshaped to `grid.num_points`.
    """
    total = np.zeros(grid.nindex)
    x = grid.all_points
    for step in range(horizon):
        stage = (discount ** step) * cost_function(x).ravel()
        total += stage
        if np.max(np.abs(stage)) < tol:
            print('Cost converged after {} steps!'.format(step + 1))
            break
        x = closed_loop_dynamics(x)
    else:
        # The loop ran to completion without meeting the tolerance
        print('Cost did not converge!')
    return total.reshape(grid.num_points)


def plot_function_3D(tf_function_A, tf_function_B, grid, norms, colors=('r', 'b'), title=None, show=True, view=(None, None)):
    """Plot two scalar functions of the 2-D state as overlaid 3-D surfaces.

    Parameters
    ----------
    tf_function_A, tf_function_B : ndarray or callable
        Either an array of function values on the grid, or a callable that is
        applied to `grid.all_points` and whose result provides `.eval()`.
        Evaluated callables are negated before plotting (presumably to turn
        values into costs -- confirm against the callers).
    grid : object
        Discretization providing `discrete_points`, `all_points` and `num_points`.
    norms : sequence of float
        Per-dimension normalizers in radians; the axes are shown in degrees.
    colors : sequence of two colours
        Surface colours for function A and function B, respectively.
    title : str, optional
        Axes title.
    show : bool
        Whether to call `plt.show()`.
    view : tuple
        Elevation and azimuth passed to `ax.view_init`.
    """
    # Local import: only `ListedColormap` is imported from this module at file level
    from matplotlib.colors import to_rgba

    def surface_values(func):
        # Values on the grid, shaped like the grid itself
        if isinstance(func, np.ndarray):
            return func.reshape(grid.num_points)
        return - func(grid.all_points).eval().reshape(grid.num_points)

    scaled_discrete_points = [np.rad2deg(norm) * points for norm, points in zip(norms, grid.discrete_points)]
    # Arrays reshaped to `grid.num_points` are treated by this notebook as
    # indexed [first state, second state] (the 2-D plots display their
    # transpose), so the coordinate matrices must use 'ij' indexing. The
    # default 'xy' indexing would transpose the surfaces.
    xx, yy = np.meshgrid(*scaled_discrete_points, indexing='ij')
    f_vals_A = surface_values(tf_function_A)
    f_vals_B = surface_values(tf_function_B)

    alpha = 0.65
    fig = plt.figure(figsize=(6, 5), dpi=OPTIONS.dpi)
    ax = fig.add_subplot(111, projection='3d')
    surf_B = ax.plot_surface(xx, yy, f_vals_B, color=colors[1], alpha=alpha)
    surf_A = ax.plot_surface(xx, yy, f_vals_A, color=colors[0], alpha=alpha)

    # Copy the 3-D colours onto the 2-D attributes (presumably a legend workaround)
    for surf in (surf_A, surf_B):
        surf._facecolors2d = surf._facecolors3d
        surf._edgecolors2d = surf._edgecolors3d

    if title is not None:
        ax.set_title(title, fontproperties=OPTIONS.fontproperties)
    ax.set_xlabel(r'$\phi$ [deg]', labelpad=10, fontproperties=OPTIONS.fontproperties)
    ax.set_ylabel(r'$\dot{\phi}$ [deg/s]', labelpad=10, fontproperties=OPTIONS.fontproperties)
    for label in (ax.get_xticklabels() + ax.get_yticklabels() + ax.get_zticklabels()):
        label.set_fontproperties(OPTIONS.fontproperties)

    # Legend patches follow the requested surface colours (B first, then A)
    # instead of assuming the default blue/red pair
    proxy = [plt.Rectangle((0, 0), 1, 1, fc=to_rgba(c, alpha)) for c in (colors[1], colors[0])]
    ax.legend(proxy, [r'$J_{\pi}({\bf x})$', r'$J_{\bf \theta}({\bf x})$'], prop=OPTIONS.fontproperties)

    ax.view_init(view[0], view[1])
    fig.tight_layout()
    if show:
        plt.show()


def plot_closedloop_response(dynamics, policy1, policy2, steps, dt, reference='zero', const=1.0, ic=None,
                             labels=('Policy 1', 'Policy 2'), colors=('r', 'b'), show=True):
    """Plot the closed-loop state and action trajectories of two policies side by side.

    Both policies are simulated with `compute_closedloop_response` using the
    same dynamics, reference signal and initial condition.

    Parameters
    ----------
    dynamics, policy1, policy2 :
        Passed through to `compute_closedloop_response`.
    steps, dt :
        Number of simulation steps and sampling time, passed through.
    reference : str
        Reference signal name; 'zero', 'impulse' and 'step' get dedicated titles.
    const : float
        Reference magnitude, shown in the title for a step reference.
    ic : sequence of two floats, optional
        Initial state; shown as (0, 0) in the title when omitted.
    labels, colors : sequence of two
        Legend labels and line colours for the two policies.
    show : bool
        Whether to call `plt.show()`.
    """
    state_dim = 2
    state_traj1, action_traj1, t, _ = compute_closedloop_response(dynamics, policy1, state_dim, steps, dt, reference, const, ic)
    state_traj2, action_traj2, _, _ = compute_closedloop_response(dynamics, policy2, state_dim, steps, dt, reference, const, ic)

    fig = plt.figure(figsize=(16, 3), dpi=100)
    fig.subplots_adjust(wspace=0.2, hspace=0.5)

    if reference == 'zero':
        title_string = r'Zero-Input Response'
    elif reference == 'impulse':
        title_string = r'Impulse Response'
    elif reference == 'step':
        title_string = r'Step Response, $r = %.1g$' % const
    else:
        # Any other reference accepted by the simulation still needs a title;
        # previously this case failed with an unbound local variable
        title_string = r'Closed-Loop Response'

    ic_tuple = (ic[0], ic[1]) if ic is not None else (0, 0)
    title_string = title_string + r', $s_0 = (%.1g, %.1g)$' % ic_tuple
    fig.suptitle(title_string, fontsize=18)

    # One panel per state component, followed by one panel for the action
    state_names = [r'$\theta$', r'$\omega$']
    for i, state_name in enumerate(state_names):
        ax = fig.add_subplot(1, 3, i + 1)
        ax.plot(t, state_traj1[:, i], colors[0])
        ax.plot(t, state_traj2[:, i], colors[1])
        ax.set_xlabel(r'$t$', fontsize=14)
        ax.set_ylabel(state_name, fontsize=14, rotation=0)
    ax = fig.add_subplot(1, 3, 3)
    plot1 = ax.plot(t, action_traj1, color=colors[0])
    plot2 = ax.plot(t, action_traj2, color=colors[1])
    ax.set_xlabel(r'$t$', fontsize=14)
    ax.set_ylabel(r'$u$', fontsize=14, rotation=0)
    fig.legend((plot1[0], plot2[0]), (labels[0], labels[1]), loc=(0.9, 0.2), fontsize=14)

    if show:
        plt.show()

## True Dynamics

In [None]:
# Constants
dt = 0.01   # sampling time
g = 9.81    # gravity

# System parameters
m = 0.15    # pendulum mass [kg], 0.5
L = 0.5     # pole length [m], 1.
b = 0.1     # rotational friction [kg * (m ** 2) / s], 0.

# State and action normalizers
theta_max = np.deg2rad(30)                # 30 degrees, in radians
omega_max = np.sqrt(g / L)
u_max = m * g * L * np.sin(theta_max)     # gravitational torque m*g*L*sin(theta) at theta_max

state_norm = (theta_max, omega_max)
action_norm = (u_max, )

# Dimensions and domains
state_dim = 2
action_dim = 1
# state_limits = np.array([[-1., 1.]] * state_dim)
# action_limits = np.array([[-1., 1.]] * action_dim)

# Define dynamic system
# The normalizers are handed to the model together with the physical
# parameters (presumably so that it works on normalized states and actions -- confirm)
pendulum = InvertedPendulum(m, L, b, dt, [state_norm, action_norm])
A, B = pendulum.linearize()

# Use either the linearized model (A, B) or the pendulum object's own call
if OPTIONS.use_linear_dynamics:
    dynamics = safe_learning.functions.LinearSystem((A, B), name='dynamics')
else:
    dynamics = pendulum.__call__

## Reward Function

In [None]:
# State cost matrix
Q = np.diag([0.1, 0.1])

# Action cost matrix
R = np.diag([0.1])

# Normalize cost matrices
# max_state = np.ones((1, state_dim))
# max_action = np.ones((1, action_dim))
# cost_norm = np.linalg.multi_dot((max_state, Q, max_state.T)) + np.linalg.multi_dot((max_action, R, max_action.T))
# # cost_norm = np.max([Q.max(), R.max()])
# Q = Q / cost_norm
# R = R / cost_norm

# Quadratic reward (-cost) function
# The block-diagonal matrix diag(-Q, -R) is later applied to (states, actions)
reward_function = safe_learning.QuadraticFunction(block_diag(- Q, - R), name='reward_function')

## Parametric Policy and Value Function

In [None]:
# Policy
# Two hidden ReLU layers of width 64 and an output of size action_dim
layer_dims = [64, 64, action_dim]
activations = [tf.nn.relu, tf.nn.relu, None]
if OPTIONS.saturate:
    # tanh output layer keeps the policy output within (-1, 1)
    activations[-1] = tf.nn.tanh
policy = safe_learning.functions.NeuralNetwork(layer_dims, activations, name='policy', use_bias=False)

# Value function
# Same hidden layout, scalar output without an output activation; biases enabled
layer_dims = [64, 64, 1]
activations = [tf.nn.relu, tf.nn.relu, None]
value_function = safe_learning.functions.NeuralNetwork(layer_dims, activations, name='value_function', use_bias=True)

---
## Approximate Policy Evaluation

Fix the policy, and learn the corresponding value function.

### Fixed Policy

Set the policy to the LQR solution, possibly with saturation constraints.

In [None]:
# Solve the discrete-time LQR problem for the linearized model (A, B) with
# cost matrices (Q, R); K is used as the feedback gain below
K, P = safe_learning.utilities.dlqr(A, B, Q, R)
# Linear state feedback built from the negated gain -K
policy_lqr = safe_learning.functions.LinearSystem((-K, ), name='policy_lqr')
if OPTIONS.saturate:
    # Wrap the policy with the bounds -1 and 1 (presumably clipping the
    # normalized action to [-1, 1] -- confirm against safe_learning.Saturation)
    policy_lqr = safe_learning.Saturation(policy_lqr, -1, 1)

### TensorFlow Graph and Initialization

In [None]:
# Clear graph
# tf.reset_default_graph()

# print(session is tf.get_default_session())
# print(session.graph is tf.get_default_graph())

# print(len([n.name for n in session.graph.as_graph_def().node]))
# print(len([n.name for n in tf.get_default_graph().as_graph_def().node]))

In [None]:
# Define new graph
# One-step rollout under the fixed LQR policy: s -> a -> (r, s')
states = tf.placeholder(OPTIONS.tf_dtype, shape=[None, state_dim], name='states')
actions = policy_lqr(states)
rewards = reward_function(states, actions)
future_states = dynamics(states, actions)

# Use the parametric value function
values = value_function(states)
future_values = value_function(future_states)

with tf.name_scope('approximate_policy_evaluation'):
    # Discount factor and scaling
    gamma = tf.placeholder(OPTIONS.tf_dtype, shape=[], name='discount_factor')
    # r_max is the quadratic cost evaluated at the all-ones state and action
    max_state = np.ones((1, state_dim))
    max_action = np.ones((1, action_dim))
    r_max = np.linalg.multi_dot((max_state, Q, max_state.T)) + np.linalg.multi_dot((max_action, R, max_action.T))
#     scaling = (1 - gamma) / r_max.ravel()
    scaling = 1 / r_max.ravel()
    
    # Objective function
    # The one-step target r + gamma * V(s') is held constant under differentiation;
    # the objective is the scaled mean absolute difference between V(s) and that target
    target = tf.stop_gradient(rewards + gamma * future_values, name='target')
    objective = scaling * tf.reduce_mean(tf.abs(values - target), name='objective')
    
    # Optimizer settings
    learning_rate = tf.placeholder(OPTIONS.tf_dtype, shape=[], name='learning_rate')
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
#     training_update = optimizer.minimize(objective, var_list=value_function.parameters)
    # Gradients are computed and applied in two steps so that `grads_and_vars`
    # can be evaluated on its own later
    grads_and_vars = optimizer.compute_gradients(objective, value_function.parameters)
    training_update = optimizer.apply_gradients(grads_and_vars)
    
with tf.name_scope('state_sampler'):
    # Uniform random states in [-1, 1]^state_dim; the batch size is fed at run time
    batch_size = tf.placeholder(tf.int32, shape=[], name='batch_size')
    batch = tf.random_uniform([batch_size, state_dim], -1, 1, dtype=OPTIONS.tf_dtype, name='batch')

In [None]:
# Initialize parameters
session.run(tf.variables_initializer(value_function.parameters))
# Drop the objective history of a previous run; the training cell recreates
# both `obj` and `param_changes` whenever `obj` is missing
try:
    del obj
except NameError:
    pass

### Training

In [None]:
# Feed dict with hyperparameters
# The `states` entry is a dummy value that is overwritten before every run
feed_dict = {
    states:         np.zeros((1, state_dim)), # placeholder
    gamma:          0.95,
    learning_rate:  0.005,
    batch_size:     100,   
}
max_iters = 500
test_size = 1e3

# Uniformly-distributed test set
grid_length = np.power(test_size, 1 / state_dim)        # test_size = N^d, solve for N
grid_length = int(2 * np.floor(grid_length / 2) + 1)    # round N to the nearest odd integer to include 0 in grid
state_limits = np.array([[-1., 1.]] * state_dim)        # states are normalized to [-1, 1]^d
num_points = [grid_length, ] * state_dim
test_set = safe_learning.GridWorld(state_limits, num_points).all_points

In [None]:
# Keep track of objective value and parameter changes during training
# The histories are only created when `obj` is missing, so re-running this
# cell without re-initializing appends to the existing histories
if 'obj' not in locals():
    obj = []
    param_changes = []
    
old_params = session.run(value_function.parameters)

for i in tqdm(range(max_iters)):
    # One gradient step on a freshly sampled batch of states
    feed_dict[states] = batch.eval(feed_dict)
    session.run(training_update, feed_dict)
    
    # Record the parameter change of this step ('inf' selects the norm used
    # by get_parameter_change)
    new_params = session.run(value_function.parameters)
    param_changes.append(get_parameter_change(old_params, new_params, 'inf'))
    old_params = list(new_params)
    
    # Evaluate the objective on the fixed test set
    feed_dict[states] = test_set
    obj.append(objective.eval(feed_dict))

### Results

In [None]:
# Training curves: test objective (top) and per-step parameter change (bottom)
fig = plt.figure(figsize=(5, 5), dpi=OPTIONS.dpi)
fig.subplots_adjust(wspace=0.5, hspace=0.35)

ax = fig.add_subplot(211)
ax.plot(obj, '.-r')
ax.set_xlabel(r'SGD iteration $k$', fontproperties=OPTIONS.fontproperties)
ax.set_ylabel(r'$G(\mathcal{X}_v, {\bf \theta}_k)$', fontproperties=OPTIONS.fontproperties)
for label in (ax.get_xticklabels() + ax.get_yticklabels()):
    label.set_fontproperties(OPTIONS.fontproperties)
# ax.set_ylim([0, 0.025])

ax = fig.add_subplot(212)
ax.plot(param_changes, '.-r')
ax.set_xlabel(r'SGD iteration $k$', fontproperties=OPTIONS.fontproperties)
ax.set_ylabel(r'$||{\bf \theta}_k - {\bf \theta}_{k-1}||_\infty$', fontproperties=OPTIONS.fontproperties)
for label in (ax.get_xticklabels() + ax.get_yticklabels()):
    label.set_fontproperties(OPTIONS.fontproperties)
# ax.set_ylim([0, 0.03])

plt.show()

# Only two option combinations are saved: saturated nonlinear dynamics and
# unsaturated linear dynamics; any other combination saves nothing
if OPTIONS.save_figs:
    if OPTIONS.saturate and not OPTIONS.use_linear_dynamics:
        # Drop the first two characters of the discount factor's string form
        # (e.g. '0.95' -> '95') for use in the file name
        gamma_string = str(feed_dict[gamma])[2:]
        fig.savefig(OPTIONS.fig_path + 'pendulum_policyeval_training_gamma' + gamma_string + '.pdf', bbox_inches='tight')
    elif not OPTIONS.saturate and OPTIONS.use_linear_dynamics:
        fig.savefig(OPTIONS.fig_path + 'pendulum_policyeval_training_linear_.pdf', bbox_inches='tight')

In [None]:
# Gradient diagnostics at the current parameters, using whatever `states`
# value the training loop left in feed_dict (the test set after its last iteration)
print(obj[-1])
temp = session.run(grads_and_vars, feed_dict)

# Flatten the gradient of every parameter array into one vector
grad_list = []
for tup in temp:
    grad_list.append(tup[0].ravel())
#     print(tup[0].shape)

grad = np.concatenate(grad_list)
print(grad.shape)
print(np.linalg.norm(grad))
print(np.abs(grad).max())
# Largest single-parameter change one gradient step would make
print(np.abs(grad).max() * feed_dict[learning_rate])

### True Dynamics: Trajectories, ROA, and Value Function

In [None]:
# NumPy-in/NumPy-out wrappers around the graph: one closed-loop step and the
# stage cost (negated reward). The tensors are looked up when the lambdas are called.
closed_loop_dynamics = lambda x: future_states.eval({states: x})
cost_function = lambda x: - rewards.eval({states: x})
discount = feed_dict[gamma]

roa_horizon = 500
cost_horizon = 1000
tol = 1e-2
N = 101

# State normalizers in degrees; with maxes == norms the grid limits are [-1, 1]
norms = np.rad2deg(state_norm)
maxes = np.copy(norms)
grid = gridify(norms, maxes, N)

trajectories, roa, _ = compute_roa(grid, closed_loop_dynamics, roa_horizon, tol)
true_cost = estimate_cost(grid, closed_loop_dynamics, cost_function, discount, cost_horizon, tol)

# Plot coordinates in degrees. Grid-shaped arrays such as `roa` and `true_cost`
# are indexed [first state, second state] (the 2-D plots display `roa.T`), so
# the coordinate matrices need 'ij' indexing; the default 'xy' indexing would
# transpose every 3-D surface drawn from them.
scaled_discrete_points = [norm * points for norm, points in zip(norms, grid.discrete_points)]
xx, yy = np.meshgrid(*scaled_discrete_points, indexing='ij')

### Plotting

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(5, 5), dpi=OPTIONS.dpi)
plot_limits = np.column_stack((- maxes, maxes))
ax.set_aspect(maxes[0] / maxes[1])
ax.set_xlim(plot_limits[0])
ax.set_ylim(plot_limits[1])
ax.set_xlabel(r'$\phi$ [deg]', fontproperties=OPTIONS.fontproperties)
ax.set_ylabel(r'$\dot{\phi}$ [deg/s]', fontproperties=OPTIONS.fontproperties)
for label in (ax.get_xticklabels() + ax.get_yticklabels()):
    label.set_fontproperties(OPTIONS.fontproperties)
    
# ROA
# Transposed so that the first state dimension runs along the x-axis. The
# colour range is pinned to [0, 1]: with automatic scaling a mask that is
# True everywhere would be drawn in the map's first, fully transparent colour.
im = ax.imshow(roa.T, origin='lower', extent=plot_limits.ravel(), aspect=maxes[0] / maxes[1], cmap=BINARY_MAP,
               vmin=0, vmax=1)

# Sub-sample discretization for faster and clearer plotting
N_traj = 13
skip = int(N / N_traj)
sub_idx = np.arange(grid.nindex).reshape(grid.num_points)
sub_idx = sub_idx[::skip, ::skip].ravel()
sub_trajectories = trajectories[sub_idx, :, :]

# Trajectories
# Normalized states are scaled back to degrees for display
for n in range(sub_trajectories.shape[0]):
    theta = sub_trajectories[n, 0, :] * norms[0]
    omega = sub_trajectories[n, 1, :] * norms[1]
    ax.plot(theta, omega, 'k--', linewidth=0.6)
sub_states = grid.all_points[sub_idx]
# Unit-length arrows along the one-step displacement at each sub-sampled state
# NOTE(review): a state with zero displacement would give 0/0 here
dx_dt = (future_states.eval({states: sub_states}) - sub_states) / dt
dx_dt = dx_dt / np.linalg.norm(dx_dt, ord=2, axis=1, keepdims=True)
ax.quiver(sub_states[:, 0] * norms[0], sub_states[:, 1] * norms[1], dx_dt[:, 0], dx_dt[:, 1], scale=None, pivot='mid', headwidth=4, headlength=8, color='k')

plt.show()

if OPTIONS.save_figs:
    fig.savefig(OPTIONS.fig_path + 'pendulum_lqr_roa.pdf', bbox_inches='tight')

In [None]:
# 3-D view of the rollout cost estimate, with the ROA mask drawn at z = 0
fig = plt.figure(figsize=(6, 5), dpi=OPTIONS.dpi)
ax = fig.add_subplot(111, projection='3d')
ax.set_xlabel(r'$\phi$ [deg]', labelpad=10, fontproperties=OPTIONS.fontproperties)
ax.set_ylabel(r'$\dot{\phi}$ [deg/s]', labelpad=10, fontproperties=OPTIONS.fontproperties)
for label in (ax.get_xticklabels() + ax.get_yticklabels() + ax.get_zticklabels()):
    label.set_fontproperties(OPTIONS.fontproperties)
    
# ROA
ax.contourf(xx, yy, roa, cmap=BINARY_MAP, zdir='z', offset=0)

# Cost function
surf = ax.plot_surface(xx, yy, true_cost, color='b', alpha=0.65)
# Copy the 3-D colours onto the 2-D attributes (presumably a legend workaround)
surf._facecolors2d = surf._facecolors3d
surf._edgecolors2d = surf._edgecolors3d

# Legend
# Proxy rectangles in the surface colour (blue) and the ROA colour (green)
proxy = [plt.Rectangle((0,0), 1, 1, fc=c) for c in [(0., 0., 1., 0.65), (0., 1., 0., 0.65)]]    
ax.legend(proxy, [r'$J_{\pi}({\bf x})$', 'ROA'], prop=OPTIONS.fontproperties, loc=(0.85, 0.85))

ax.view_init(None, -45)
fig.tight_layout()

plt.show()

# Saved only for saturated policies on the nonlinear dynamics
if OPTIONS.save_figs:
    if OPTIONS.saturate and not OPTIONS.use_linear_dynamics:
        # '0.95' -> '95' for the file name
        gamma_string = str(feed_dict[gamma])[2:]
        fig.savefig(OPTIONS.fig_path + 'pendulum_lqr_costfunc_gamma' + gamma_string + '.pdf', bbox_inches='tight')

In [None]:
# 3-D comparison of the rollout cost estimate with the learned value function
fig = plt.figure(figsize=(6, 5), dpi=OPTIONS.dpi)
ax = fig.add_subplot(111, projection='3d')
ax.set_xlabel(r'$\phi$ [deg]', labelpad=10, fontproperties=OPTIONS.fontproperties)
ax.set_ylabel(r'$\dot{\phi}$ [deg/s]', labelpad=10, fontproperties=OPTIONS.fontproperties)
for label in (ax.get_xticklabels() + ax.get_yticklabels() + ax.get_zticklabels()):
    label.set_fontproperties(OPTIONS.fontproperties)

# ROA
# ax.contourf(xx, yy, roa, cmap=BINARY_MAP, zdir='z', offset=0)

# Cost functions
# The value network output is negated to compare it with the cost
approx_cost = - values.eval({states: grid.all_points}).reshape(grid.num_points)
surf_true = ax.plot_surface(xx, yy, true_cost, color='b', alpha=0.65)
surf_approx = ax.plot_surface(xx, yy, approx_cost, color='r', alpha=0.65)
# Copy the 3-D colours onto the 2-D attributes (presumably a legend workaround)
for surf in (surf_true, surf_approx):
    surf._facecolors2d = surf._facecolors3d
    surf._edgecolors2d = surf._edgecolors3d

# Legend
# Blue proxy for the rollout estimate, red proxy for the learned approximation
proxy = [plt.Rectangle((0,0), 1, 1, fc=c) for c in [(0., 0., 1., 0.65), (1., 0., 0., 0.65)]]#, (0., 1., 0., 0.65)]]    
ax.legend(proxy, [r'$J_{\pi}({\bf x})$', r'$J_{\bf \theta}({\bf x})$'], prop=OPTIONS.fontproperties, loc=(0.85, 0.85))

ax.view_init(None, -45)
fig.tight_layout()

plt.show()

if OPTIONS.save_figs:
    # '0.95' -> '95' for the file name
    gamma_string = str(feed_dict[gamma])[2:]
    fig.savefig(OPTIONS.fig_path + 'pendulum_policyeval_costfunc_gamma' + gamma_string + '.pdf', bbox_inches='tight')

---
## Approximate Policy Improvement

Fix the value function, and learn the corresponding policy.

### Fixed Value Function

Use supervised learning to closely approximate a fixed value function with a neural network, and use it to learn the LQR policy, possibly with saturation constraints.

In [None]:
# Closed-loop step and stage cost (negated reward) as NumPy callables
closed_loop_dynamics = lambda x: future_states.eval({states: x})
cost_function = lambda x: - rewards.eval({states: x})

# Re-estimate the rollout cost on the existing grid with a new discount factor
discount = 0.965
cost_horizon = 1000
tol = 1e-2

true_cost = estimate_cost(grid, closed_loop_dynamics, cost_function, discount, cost_horizon, tol)

In [None]:
# Regress the value network onto given target values with a mean absolute error.
# This rebinds `objective`, `learning_rate`, `optimizer` and `training_update`.
with tf.name_scope('supervised_value_function_learning'):
    true_values = tf.placeholder(OPTIONS.tf_dtype, shape=[None, 1], name='true_values')
    loss = tf.abs(values - true_values)
    objective = tf.reduce_mean(loss, name='objective')
    
    learning_rate = tf.placeholder(OPTIONS.tf_dtype, shape=[], name='learning_rate')
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    training_update = optimizer.minimize(objective, var_list=value_function.parameters)

# Drop the objective history of the previous experiment so that the
# training cell starts new histories
try:
    del obj
except NameError:
    pass

In [None]:
# Feed dict with hyperparameters
# `states` and `true_values` hold dummy values that are overwritten before every run
feed_dict = {
    states:         np.zeros((1, state_dim)), # placeholder
    true_values:    np.zeros((1, 1)),         # placeholder
    learning_rate:  0.005
}
max_iters  = 1000
# NOTE: rebinds `batch_size` from the sampler placeholder to a plain integer
batch_size = 1000

# Test on the full grid; targets are the negated rollout costs as a column vector
test_set = grid.all_points
test_values = - true_cost.reshape((-1, 1))

In [None]:
# Histories are created only when `obj` is missing (it is deleted when the
# supervised objective is defined)
if 'obj' not in locals():
    obj = []
    param_changes = []

# Column-vector view of the cost so that rows can be indexed per grid point
true_cost  = true_cost.reshape((-1, 1))
old_params = session.run(value_function.parameters)

for i in tqdm(range(max_iters)):
    # One gradient step on a random mini-batch of grid points (drawn with replacement)
    batch_idx = np.random.randint(grid.nindex, size=batch_size)
    feed_dict[states] = grid.all_points[batch_idx]
    feed_dict[true_values] = - true_cost[batch_idx]
    session.run(training_update, feed_dict)
    
    # Record the parameter change of this step
    new_params = session.run(value_function.parameters)
    param_changes.append(get_parameter_change(old_params, new_params, 'inf'))
    old_params = list(new_params)
    
    # Evaluate the objective on the full grid
    feed_dict[states] = test_set
    feed_dict[true_values] = test_values
    obj.append(objective.eval(feed_dict))

In [None]:
# Back to grid shape for surface plotting; the network output is negated to give a cost
true_cost = true_cost.reshape(grid.num_points)
approx_cost = - values.eval({states: grid.all_points}).reshape(grid.num_points)

# Left: objective history. Right: rollout cost (blue) against the learned approximation (red)
fig = plt.figure(figsize=(14, 5), dpi=OPTIONS.dpi)
ax = fig.add_subplot(121)
ax.plot(obj, 'r')

ax = fig.add_subplot(122, projection='3d')
ax.plot_surface(xx, yy, true_cost, color='b', alpha=0.65)
ax.plot_surface(xx, yy, approx_cost, color='r', alpha=0.65)

plt.show()

### TensorFlow Graph and Initialization

In [None]:
# Use the parametric policy
# Rebinds `states`, `actions`, `rewards` and `future_states`; `values` is not
# rebuilt here and still refers to the earlier `states` placeholder
states = tf.placeholder(OPTIONS.tf_dtype, shape=[None, state_dim], name='states')
actions = policy(states)
rewards = reward_function(states, actions)
future_states = dynamics(states, actions)

# Future values according to parametric policy and true value function
future_values = value_function(future_states)

# NOTE(review): the scope name below is the policy-evaluation name although
# this graph optimizes the policy -- looks like a copy-paste leftover
with tf.name_scope('approximate_policy_evaluation'):
    # Discount factor and scaling
    gamma = tf.placeholder(OPTIONS.tf_dtype, shape=[], name='discount_factor')
    # r_max is the quadratic cost evaluated at the all-ones state and action
    max_state = np.ones((1, state_dim))
    max_action = np.ones((1, action_dim))
    r_max = np.linalg.multi_dot((max_state, Q, max_state.T)) + np.linalg.multi_dot((max_action, R, max_action.T))
    scaling = (1 - gamma) / r_max.ravel()
    
    # Objective function    
    # Negated, scaled mean of r + gamma * V(s'); minimizing it maximizes that quantity
    objective = - scaling * tf.reduce_mean(rewards + gamma * future_values, name='objective')
    
    # Optimizer settings
    # Only the policy parameters are updated; the value function stays fixed
    learning_rate = tf.placeholder(OPTIONS.tf_dtype, shape=[], name='learning_rate')
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    training_update = optimizer.minimize(objective, var_list=policy.parameters)
    
with tf.name_scope('state_sampler'):
    # Uniform random states in [-1, 1]^state_dim; the batch size is fed at run time
    batch_size = tf.placeholder(tf.int32, shape=[], name='batch_size')
    batch = tf.random_uniform([batch_size, state_dim], -1, 1, dtype=OPTIONS.tf_dtype, name='batch')

In [None]:
# Initialize parameters
# Only the policy is initialized here; the value function keeps its trained parameters
session.run(tf.variables_initializer(policy.parameters))
# Drop the previous objective history so that the training cell starts new ones
try:
    del obj
except NameError:
    pass

### Training

In [None]:
# Feed dict with hyperparameters
# The `states` entry is a dummy value that is overwritten before every run
feed_dict = {
    states:         np.zeros((1, state_dim)), # placeholder
    gamma:          discount,                 # use the same discount factor from the value function!
    learning_rate:  0.6,
    batch_size:     100,   
}
max_iters = 1000
test_size = 1000

# Uniformly-distributed test set
grid_length = np.power(test_size, 1 / state_dim)        # test_size = N^d, solve for N
grid_length = int(2 * np.floor(grid_length / 2) + 1)    # round N to the nearest odd integer to include 0 in grid
state_limits = np.array([[-1., 1.]] * state_dim)        # states are normalized to [-1, 1]^d
num_points = [grid_length, ] * state_dim
test_set = safe_learning.GridWorld(state_limits, num_points).all_points

In [None]:
# Keep track of objective value and parameter changes during training
# The histories are only created when `obj` is missing, so re-running this
# cell without re-initializing appends to the existing histories
if 'obj' not in locals():
    obj = []
    param_changes = []
    
old_params = session.run(policy.parameters)

for i in tqdm(range(max_iters)):
    # One gradient step on a freshly sampled batch of states
    feed_dict[states] = batch.eval(feed_dict)
    session.run(training_update, feed_dict)
    
    # Record the parameter change of this step
    new_params = session.run(policy.parameters)
    param_changes.append(get_parameter_change(old_params, new_params, 'inf'))
    old_params = list(new_params)
    
    # Evaluate the objective on the fixed test set
    feed_dict[states] = test_set
    obj.append(objective.eval(feed_dict))

### Results

In [None]:
# Training curves: test objective (top) and per-step parameter change (bottom)
fig = plt.figure(figsize=(5, 5), dpi=OPTIONS.dpi)
fig.subplots_adjust(wspace=0.5, hspace=0.35)

ax = fig.add_subplot(211)
ax.plot(obj, '.-r')
ax.set_xlabel(r'SGD iteration $k$', fontproperties=OPTIONS.fontproperties)
ax.set_ylabel(r'$H(\mathcal{X}_v, {\bf \delta}_k)$', fontproperties=OPTIONS.fontproperties)
for label in (ax.get_xticklabels() + ax.get_yticklabels()):
    label.set_fontproperties(OPTIONS.fontproperties)
# ax.set_ylim([0.62, 0.635])

ax = fig.add_subplot(212)
ax.plot(param_changes, '.-r')
ax.set_xlabel(r'SGD iteration $k$', fontproperties=OPTIONS.fontproperties)
ax.set_ylabel(r'$||{\bf \delta}_k - {\bf \delta}_{k-1}||_\infty$', fontproperties=OPTIONS.fontproperties)
for label in (ax.get_xticklabels() + ax.get_yticklabels()):
    label.set_fontproperties(OPTIONS.fontproperties)
# ax.set_ylim([0, 0.002])

plt.show()

# Only two option combinations are saved: saturated nonlinear dynamics and
# unsaturated linear dynamics; any other combination saves nothing
if OPTIONS.save_figs:
    if OPTIONS.saturate and not OPTIONS.use_linear_dynamics:
        # Drop the first two characters of the discount factor's string form
        # (e.g. '0.965' -> '965') for use in the file name
        gamma_string = str(feed_dict[gamma])[2:]
        fig.savefig(OPTIONS.fig_path + 'pendulum_policyimpv_training_gamma' + gamma_string + '.pdf', bbox_inches='tight')
    elif not OPTIONS.saturate and OPTIONS.use_linear_dynamics:
        fig.savefig(OPTIONS.fig_path + 'pendulum_policyimpv_training_linear_.pdf', bbox_inches='tight')

### Plotting

In [None]:
# 3-D comparison of the learned policy with the LQR policy
fig = plt.figure(figsize=(6, 5), dpi=OPTIONS.dpi)
ax = fig.add_subplot(111, projection='3d')
ax.set_xlabel(r'$\phi$ [deg]', labelpad=10, fontproperties=OPTIONS.fontproperties)
ax.set_ylabel(r'$\dot{\phi}$ [deg/s]', labelpad=10, fontproperties=OPTIONS.fontproperties)
for label in (ax.get_xticklabels() + ax.get_yticklabels() + ax.get_zticklabels()):
    label.set_fontproperties(OPTIONS.fontproperties)
    
# Policies
# Policy outputs are multiplied by the action normalizer u_max before plotting
approx = u_max * actions.eval({states: grid.all_points}).reshape(grid.num_points)
truth = u_max * policy_lqr(grid.all_points).eval().reshape(grid.num_points)
# Learned policy in red, LQR policy in blue
for zz, c in zip([approx, truth], ['r', 'b']):
    surf = ax.plot_surface(xx, yy, zz, color=c, alpha=0.65)
    # Copy the 3-D colours onto the 2-D attributes (presumably a legend workaround)
    surf._facecolors2d = surf._facecolors3d
    surf._edgecolors2d = surf._edgecolors3d

# Legend
# Blue proxy for the LQR policy, red proxy for the learned policy
proxy = [plt.Rectangle((0,0), 1, 1, fc=c) for c in [(0., 0., 1., 0.65), (1., 0., 0., 0.65)]]    
ax.legend(proxy, [r'$\pi({\bf x})$ [N]', r'$\pi_{\bf \delta}({\bf x})$ [N]'], prop=OPTIONS.fontproperties, loc=(0.85, 0.85))

ax.set_zlim([-0.4, 0.4])
ax.view_init(None, -45)
fig.tight_layout()

plt.show()

if OPTIONS.save_figs:
    # '0.965' -> '965' for the file name
    gamma_string = str(feed_dict[gamma])[2:]
    fig.savefig(OPTIONS.fig_path + 'pendulum_policyimpv_policy_gamma' + gamma_string + '.pdf', bbox_inches='tight')

### New ROA

In [None]:
# Closed-loop step under the current graph, i.e. with the learned policy
closed_loop_dynamics = lambda x: future_states.eval({states: x})
# NOTE: this horizon and tolerance differ from the ones used for the LQR ROA
roa_horizon = 600
tol = 0.1
new_trajectories, new_roa, new_dists = compute_roa(grid, closed_loop_dynamics, roa_horizon, tol, equilibrium=None)

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(5, 5), dpi=OPTIONS.dpi)
plot_limits = np.column_stack((- maxes, maxes))
ax.set_aspect(maxes[0] / maxes[1])
ax.set_xlim(plot_limits[0])
ax.set_ylim(plot_limits[1])
ax.set_xlabel(r'$\phi$ [deg]', fontproperties=OPTIONS.fontproperties)
ax.set_ylabel(r'$\dot{\phi}$ [deg/s]', fontproperties=OPTIONS.fontproperties)
for label in (ax.get_xticklabels() + ax.get_yticklabels()):
    label.set_fontproperties(OPTIONS.fontproperties)
    
# Compare ROAs
# Colour codes: 0 = in neither ROA (transparent), 1 = LQR ROA only (green),
# 2 = ROA of the learned policy (red, drawn over the LQR ROA where they overlap).
# Summing the two masks instead would colour states that lie only in the new
# ROA green, contradicting the legend.
cmap = ListedColormap([(1., 1., 1., 0.), (0., 1., 0., 0.65), (1., 0., 0., 0.65)])
z = np.where(new_roa, 2, roa.astype(int))
# The colour range is pinned to [0, 2] so that each code keeps its colour even
# when one of the codes does not occur in the data
im = ax.imshow(z.T, origin='lower', extent=plot_limits.ravel(), aspect=maxes[0] / maxes[1], cmap=cmap,
               vmin=0, vmax=2)

# Sub-sample discretization for faster and clearer plotting
N_traj = 13
skip = int(N / N_traj)
sub_idx = np.arange(grid.nindex).reshape(grid.num_points)
sub_idx = sub_idx[::skip, ::skip].ravel()
sub_trajectories = new_trajectories[sub_idx, :, :]

# Trajectories
# Normalized states are scaled back to degrees for display
for n in range(sub_trajectories.shape[0]):
    theta = sub_trajectories[n, 0, :] * norms[0]
    omega = sub_trajectories[n, 1, :] * norms[1]
    ax.plot(theta, omega, 'k--', linewidth=0.6)
sub_states = grid.all_points[sub_idx]
# Unit-length arrows along the one-step displacement at each sub-sampled state
# NOTE(review): a state with zero displacement would give 0/0 here
dx_dt = (future_states.eval({states: sub_states}) - sub_states) / dt
dx_dt = dx_dt / np.linalg.norm(dx_dt, ord=2, axis=1, keepdims=True)
ax.quiver(sub_states[:, 0] * norms[0], sub_states[:, 1] * norms[1], dx_dt[:, 0], dx_dt[:, 1], scale=None, pivot='mid', headwidth=4, headlength=8, color='k')

# Legend
proxy = [plt.Rectangle((0,0), 1, 1, fc=c) for c in [(0., 1., 0., 0.65), (1., 0., 0., 0.65)]]    
legend = ax.legend(proxy, [r'ROA for $\pi$', r'ROA for $\pi_{\bf \delta}$'], prop=OPTIONS.fontproperties, loc='upper right')
legend.get_frame().set_alpha(1.)

plt.show()

if OPTIONS.save_figs:
    # '0.965' -> '965' for the file name
    gamma_string = str(feed_dict[gamma])[2:]
    fig.savefig(OPTIONS.fig_path + 'pendulum_policyimpv_roa_gamma' + gamma_string + '.pdf', bbox_inches='tight')

---
## Approximate Policy Iteration

Train a policy and value function in tandem with approximate policy iteration.

### TensorFlow Graph and Initialization

In [None]:
# Use parametric policy and value function
# One-step rollout under the parametric policy: s -> a -> (r, s'), plus V(s) and V(s')
states = tf.placeholder(OPTIONS.tf_dtype, shape=[None, state_dim], name='states')
actions = policy(states)
rewards = reward_function(states, actions)
values = value_function(states)
future_states = dynamics(states, actions)
future_values = value_function(future_states)

# Compare with LQR solution, possibly with saturation constraints
actions_lqr = policy_lqr(states)
rewards_lqr = reward_function(states, actions_lqr)
future_states_lqr = dynamics(states, actions_lqr)

# Discount factor and scaling
# r_max is the quadratic cost evaluated at the all-ones state and action;
# the two objectives below use different scalings
max_state = np.ones((1, state_dim))
max_action = np.ones((1, action_dim))
r_max = np.linalg.multi_dot((max_state, Q, max_state.T)) + np.linalg.multi_dot((max_action, R, max_action.T))
gamma = tf.placeholder(OPTIONS.tf_dtype, shape=[], name='discount_factor')
eval_scaling = 1 / r_max.ravel()
impv_scaling = (1 - gamma) / r_max.ravel()

# Policy evaluation
# Scaled mean absolute difference between V(s) and the one-step target, which
# is held constant under differentiation; only the value parameters are updated
with tf.name_scope('value_optimization'):
    value_learning_rate = tf.placeholder(OPTIONS.tf_dtype, shape=[], name='learning_rate')
    target = tf.stop_gradient(rewards + gamma * future_values, name='target')
    value_objective = eval_scaling * tf.reduce_mean(tf.abs(values - target), name='objective')
    optimizer = tf.train.GradientDescentOptimizer(value_learning_rate)
    value_update = optimizer.minimize(value_objective, var_list=value_function.parameters)

# Policy improvement
# Negated, scaled mean of r + gamma * V(s'); only the policy parameters are updated
with tf.name_scope('policy_optimization'):
    policy_learning_rate = tf.placeholder(OPTIONS.tf_dtype, shape=[], name='learning_rate')
    policy_objective = - impv_scaling * tf.reduce_mean(rewards + gamma * future_values, name='objective')
    optimizer = tf.train.GradientDescentOptimizer(policy_learning_rate)
    policy_update = optimizer.minimize(policy_objective, var_list=policy.parameters)

# Sampling    
# Uniform random states in [-1, 1]^state_dim; the batch size is fed at run time
with tf.name_scope('state_sampler'):
    batch_size = tf.placeholder(tf.int32, shape=[], name='batch_size')
    batch = tf.random_uniform([batch_size, state_dim], -1, 1, dtype=OPTIONS.tf_dtype, name='batch')

In [None]:
# (Re-)initialize the variables of the value function, then of the policy
for _params in (value_function.parameters, policy.parameters):
    session.run(tf.variables_initializer(_params))

# Drop the objective histories of a previous training run, if there are any,
# so that the training cell starts recording from scratch.
try:
    del value_obj, policy_obj
except NameError:
    pass

## Training

In [None]:
# Hyperparameters fed to the graph. The `states` entry is only a dummy value
# here; the training cell overwrites it before every run.
feed_dict = {
    states: np.zeros((1, state_dim)),
    gamma: 0.965,
    value_learning_rate: 0.005,    # alternative noted by the author: 0.1
    policy_learning_rate: 0.6,
    batch_size: 100,
}

# Outer policy iterations, and gradient steps per evaluation / improvement phase
max_iters, value_iters, policy_iters = 200, 100, 10

# Approximate number of points in the test set
test_size = 1e3

# Uniformly-distributed test set.
# With N points per dimension, test_size = N^d, so N = test_size^(1/d); N is
# then mapped to an odd integer so that a symmetric grid includes 0.
points_per_dim = np.power(test_size, 1 / state_dim)
grid_length = int(2 * np.floor(points_per_dim / 2) + 1)
# States are normalized to [-1, 1]^d: one [lower, upper] row per dimension
state_limits = np.tile([-1., 1.], (state_dim, 1))
num_points = [grid_length for _ in range(state_dim)]
test_set = safe_learning.GridWorld(state_limits, num_points).all_points

In [None]:
# Histories are kept across re-runs of this cell so that training can be
# resumed; they are (re)created only when `value_obj` does not exist, which is
# the case after the initialization cell has deleted it.
if 'value_obj' not in locals():
    value_obj = []
    policy_obj = []
    value_param_changes = []
    policy_param_changes = []

converged = False
final_iter = 0    # stays 0 if max_iters == 0 and the loop body never runs
old_value_params = session.run(value_function.parameters)
old_policy_params = session.run(policy.parameters)

for i in tqdm(range(max_iters)):

    # Policy evaluation (value update): gradient steps on fresh random batches
    for _ in range(value_iters):
        feed_dict[states] = batch.eval(feed_dict)
        session.run(value_update, feed_dict)

    # Record the value objective on the fixed test set and the parameter change
    feed_dict[states] = test_set
    value_obj.append(value_objective.eval(feed_dict))
    new_value_params = session.run(value_function.parameters)
    value_change = get_parameter_change(old_value_params, new_value_params, 'inf')
    value_param_changes.append(value_change)
    old_value_params = list(new_value_params)

    # Policy improvement (policy update): gradient steps on fresh random batches
    for _ in range(policy_iters):
        feed_dict[states] = batch.eval(feed_dict)
        session.run(policy_update, feed_dict)

    # Record the policy objective on the fixed test set and the parameter change
    feed_dict[states] = test_set
    policy_obj.append(policy_objective.eval(feed_dict))
    new_policy_params = session.run(policy.parameters)
    policy_change = get_parameter_change(old_policy_params, new_policy_params, 'inf')
    policy_param_changes.append(policy_change)
    old_policy_params = list(new_policy_params)

    final_iter = i + 1

    # Stop once neither parameter set moved by more than the tolerance during
    # this iteration. Without this check `converged` could never become True.
    # OPTIONS.eps is reused as the tolerance; np.max() keeps the test valid
    # should get_parameter_change return an array instead of a scalar.
    if np.max(value_change) <= OPTIONS.eps and np.max(policy_change) <= OPTIONS.eps:
        converged = True
        break

if converged:
    print('Converged after {} iterations.'.format(final_iter))
else:
    print('Did not converge!')

### Results

In [None]:
# Training curves of the value function, one point per policy iteration:
# objective on the test set (top) and recorded parameter change (bottom).
fig = plt.figure(figsize=(5, 5), dpi=OPTIONS.dpi)
fig.subplots_adjust(wspace=0.5, hspace=0.35)

panels = (
    (value_obj, r'$G(\mathcal{X}_v, {\bf \theta}_k)$'),
    (value_param_changes, r'$||{\bf \theta}_k - {\bf \theta}_{k-1}||_\infty$'),
)
for row, (history, ylabel) in enumerate(panels, start=1):
    ax = fig.add_subplot(2, 1, row)
    ax.plot(history, '.-r')
    ax.set_xlabel(r'Policy iteration $k$', fontproperties=OPTIONS.fontproperties)
    ax.set_ylabel(ylabel, fontproperties=OPTIONS.fontproperties)
    for tick_label in ax.get_xticklabels() + ax.get_yticklabels():
        tick_label.set_fontproperties(OPTIONS.fontproperties)
    # Both quantities are plotted from zero; the upper limit stays automatic
    ax.set_ylim([0, None])

plt.show()

if OPTIONS.save_figs:
    # Drop the leading "0." of the discount factor for use in the file name
    gamma_string = str(feed_dict[gamma])[2:]
    fig_name = 'pendulum_policyiter_costfunc_training_gamma' + gamma_string + '.pdf'
    fig.savefig(OPTIONS.fig_path + fig_name, bbox_inches='tight')

In [None]:
# Training curves of the policy, one point per policy iteration:
# objective on the test set (top) and recorded parameter change (bottom).
fig = plt.figure(figsize=(5, 5), dpi=OPTIONS.dpi)
fig.subplots_adjust(wspace=0.5, hspace=0.35)

panels = (
    (policy_obj, r'$H(\mathcal{X}_v, {\bf \delta}_k)$'),
    (policy_param_changes, r'$||{\bf \delta}_k - {\bf \delta}_{k-1}||_\infty$'),
)
for row, (history, ylabel) in enumerate(panels, start=1):
    ax = fig.add_subplot(2, 1, row)
    ax.plot(history, '.-r')
    ax.set_xlabel(r'Policy iteration $k$', fontproperties=OPTIONS.fontproperties)
    ax.set_ylabel(ylabel, fontproperties=OPTIONS.fontproperties)
    for tick_label in ax.get_xticklabels() + ax.get_yticklabels():
        tick_label.set_fontproperties(OPTIONS.fontproperties)
    # The y-limits are left automatic for both panels

plt.show()

if OPTIONS.save_figs:
    # Drop the leading "0." of the discount factor for use in the file name
    gamma_string = str(feed_dict[gamma])[2:]
    fig_name = 'pendulum_policyiter_policy_training_gamma' + gamma_string + '.pdf'
    fig.savefig(OPTIONS.fig_path + fig_name, bbox_inches='tight')

In [None]:
# 3D comparison of the learned cost (negated value function) with `true_cost`,
# with the ROA stored in `roa` drawn as a filled contour in the z = 0 plane.
fig = plt.figure(figsize=(6, 5), dpi=OPTIONS.dpi)
ax = fig.add_subplot(111, projection='3d')
ax.set_xlabel(r'$\phi$ [deg]', labelpad=10, fontproperties=OPTIONS.fontproperties)
ax.set_ylabel(r'$\dot{\phi}$ [deg/s]', labelpad=10, fontproperties=OPTIONS.fontproperties)
for tick_label in ax.get_xticklabels() + ax.get_yticklabels() + ax.get_zticklabels():
    tick_label.set_fontproperties(OPTIONS.fontproperties)

# Costs: learned approximation in red, reference in blue
approx = - values.eval({states: grid.all_points}).reshape(grid.num_points)
truth = true_cost.reshape(grid.num_points)
surfaces = ((approx, 'r'), (truth, 'b'))
for surface_height, surface_color in surfaces:
    surface = ax.plot_surface(xx, yy, surface_height, color=surface_color, alpha=0.65)
    # Mirror the 3D colors onto the 2D attributes.
    # NOTE(review): presumably a workaround for legend/color handling of 3D
    # surfaces in the matplotlib version in use -- confirm it is still needed.
    surface._facecolors2d = surface._facecolors3d
    surface._edgecolors2d = surface._edgecolors3d

# ROA
ax.contourf(xx, yy, roa, cmap=BINARY_MAP, zdir='z', offset=0)

# Legend: colored rectangles stand in for the two surfaces and the ROA
legend_colors = [(0., 0., 1., 0.65), (1., 0., 0., 0.65), (0., 1., 0., 0.65)]
legend_labels = [r'$J_{\pi}({\bf x})$', r'$J_{\bf \theta}({\bf x})$', r'ROA for $\pi$']
proxy = [plt.Rectangle((0, 0), 1, 1, fc=color) for color in legend_colors]
ax.legend(proxy, legend_labels, prop=OPTIONS.fontproperties, loc=(0.85, 0.85))

ax.view_init(None, -45)
fig.tight_layout()

plt.show()

if OPTIONS.save_figs:
    # Drop the leading "0." of the discount factor for use in the file name
    gamma_string = str(feed_dict[gamma])[2:]
    fig_name = 'pendulum_policyiter_costfunc_gamma' + gamma_string + '.pdf'
    fig.savefig(OPTIONS.fig_path + fig_name, bbox_inches='tight')

In [None]:
# 3D comparison of the learned policy with the LQR policy, both scaled by u_max
fig = plt.figure(figsize=(6, 5), dpi=OPTIONS.dpi)
ax = fig.add_subplot(111, projection='3d')
ax.set_xlabel(r'$\phi$ [deg]', labelpad=10, fontproperties=OPTIONS.fontproperties)
ax.set_ylabel(r'$\dot{\phi}$ [deg/s]', labelpad=10, fontproperties=OPTIONS.fontproperties)
for tick_label in ax.get_xticklabels() + ax.get_yticklabels() + ax.get_zticklabels():
    tick_label.set_fontproperties(OPTIONS.fontproperties)

# Policies: learned policy in red, LQR policy in blue
approx = u_max * actions.eval({states: grid.all_points}).reshape(grid.num_points)
truth = u_max * policy_lqr(grid.all_points).eval().reshape(grid.num_points)
surfaces = ((approx, 'r'), (truth, 'b'))
for surface_height, surface_color in surfaces:
    surface = ax.plot_surface(xx, yy, surface_height, color=surface_color, alpha=0.65)
    # Mirror the 3D colors onto the 2D attributes.
    # NOTE(review): presumably a workaround for legend/color handling of 3D
    # surfaces in the matplotlib version in use -- confirm it is still needed.
    surface._facecolors2d = surface._facecolors3d
    surface._edgecolors2d = surface._edgecolors3d

# Legend: colored rectangles stand in for the two surfaces
legend_colors = [(0., 0., 1., 0.65), (1., 0., 0., 0.65)]
legend_labels = [r'$\pi({\bf x})$ [N]', r'$\pi_{\bf \delta}({\bf x})$ [N]']
proxy = [plt.Rectangle((0, 0), 1, 1, fc=color) for color in legend_colors]
ax.legend(proxy, legend_labels, prop=OPTIONS.fontproperties, loc=(0.85, 0.85))

ax.set_zlim([-0.4, 0.4])
ax.view_init(None, -45)
fig.tight_layout()

plt.show()

if OPTIONS.save_figs:
    # Drop the leading "0." of the discount factor for use in the file name
    gamma_string = str(feed_dict[gamma])[2:]
    fig_name = 'pendulum_policyiter_policy_gamma' + gamma_string + '.pdf'
    fig.savefig(OPTIONS.fig_path + fig_name, bbox_inches='tight')

### New ROA

In [None]:
def closed_loop_dynamics(x):
    """Return the successor states of `x` by evaluating `future_states`."""
    return future_states.eval({states: x})


# Settings handed to compute_roa for the closed loop defined above
roa_horizon = 500
tol = 0.01
new_trajectories, new_roa, new_dists = compute_roa(
    grid, closed_loop_dynamics, roa_horizon, tol, equilibrium=None)

In [None]:
# Compare the ROA stored in `roa` (legend: pi) with the newly computed
# `new_roa` (legend: pi_delta), and overlay sub-sampled closed-loop
# trajectories and flow directions.
fig, ax = plt.subplots(1, 1, figsize=(5, 5), dpi=OPTIONS.dpi)
plot_limits = np.column_stack((- maxes, maxes))
ax.set_aspect(maxes[0] / maxes[1])
ax.set_xlim(plot_limits[0])
ax.set_ylim(plot_limits[1])
ax.set_xlabel(r'$\phi$ [deg]', fontproperties=OPTIONS.fontproperties)
ax.set_ylabel(r'$\dot{\phi}$ [deg/s]', fontproperties=OPTIONS.fontproperties)
for label in (ax.get_xticklabels() + ax.get_yticklabels()):
    label.set_fontproperties(OPTIONS.fontproperties)

# Compare ROAs
# Cell codes: 0 = in neither ROA (transparent), 1 = only in the ROA for pi
# (green), 2 = in the ROA for pi_delta (red). Coding by membership instead of
# summing the two masks keeps cells that are only in the new ROA red, as the
# legend states; the result is unchanged wherever new_roa lies inside roa.
cmap = ListedColormap([(1., 1., 1., 0.), (0., 1., 0., 0.65), (1., 0., 0., 0.65)])
z = np.where(new_roa.astype(bool), 2, roa.astype(int))
# vmin/vmax pin the code -> color mapping; without them imshow rescales to the
# data range, so code 1 would be drawn red whenever code 2 is absent.
im = ax.imshow(z.T, origin='lower', extent=plot_limits.ravel(), aspect=maxes[0] / maxes[1], cmap=cmap, vmin=0, vmax=2)

# Sub-sample discretization for faster and clearer plotting
N_traj = 13
# At least 1, so the slice step stays valid when N < N_traj
skip = max(1, int(N / N_traj))
sub_idx = np.arange(grid.nindex).reshape(grid.num_points)
sub_idx = sub_idx[::skip, ::skip].ravel()
sub_trajectories = new_trajectories[sub_idx, :, :]

# Trajectories, rescaled by `norms` to plot units
# (assumes the trajectory axes are [initial state, state dimension, time step]
# -- TODO confirm against compute_roa)
for n in range(sub_trajectories.shape[0]):
    theta = sub_trajectories[n, 0, :] * norms[0]
    omega = sub_trajectories[n, 1, :] * norms[1]
    ax.plot(theta, omega, 'k--', linewidth=0.6)

# Flow direction at the sub-sampled states: finite difference of one closed-loop
# step, normalized to unit length so that only the direction is shown
sub_states = grid.all_points[sub_idx]
dx_dt = (future_states.eval({states: sub_states}) - sub_states) / dt
dx_dt = dx_dt / np.linalg.norm(dx_dt, ord=2, axis=1, keepdims=True)
ax.quiver(sub_states[:, 0] * norms[0], sub_states[:, 1] * norms[1], dx_dt[:, 0], dx_dt[:, 1], scale=None, pivot='mid', headwidth=4, headlength=8, color='k')

# Legend: colored rectangles stand in for the two ROA colors
proxy = [plt.Rectangle((0,0), 1, 1, fc=c) for c in [(0., 1., 0., 0.65), (1., 0., 0., 0.65)]]
legend = ax.legend(proxy, [r'ROA for $\pi$', r'ROA for $\pi_{\bf \delta}$'], prop=OPTIONS.fontproperties, loc='upper right')
legend.get_frame().set_alpha(1.)

plt.show()

if OPTIONS.save_figs:
    # Drop the leading "0." of the discount factor for use in the file name
    gamma_string = str(feed_dict[gamma])[2:]
    fig.savefig(OPTIONS.fig_path + 'pendulum_policyiter_roa_gamma' + gamma_string + '.pdf', bbox_inches='tight')

## Zero-Input and Step Responses

In [None]:
# Settings for the closed-loop comparison of `policy` against `policy_lqr`
ic = np.array([0.1, 0.2])    # passed as `ic` for the zero-input response
const = 0.4                  # passed as `const` for the step response
steps = 1000

# Zero-input response first, then step response; the two calls differ only in
# the response type and its one extra keyword argument.
response_cases = (
    ('zero', {'ic': ic}),
    ('step', {'const': const}),
)
for response_type, extra_kwargs in response_cases:
    plot_closedloop_response(dynamics, policy, policy_lqr, steps, dt, response_type,
                             labels=['Learned','True'], **extra_kwargs)

# Impulse response (disabled)
# plot_closedloop_response(dynamics, policy, policy_lqr, steps, dt, 'impulse', labels=['Learned','True'])

In [None]:
# plotting.show_graph(tf.get_default_graph())