# Reinforcement Learning for a Cart-Pole System

In [None]:
from __future__ import division, print_function

import numpy as np
import tensorflow as tf
from scipy.linalg import block_diag
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
%matplotlib notebook

import safe_learning
from utilities import uniform_state_sampler, compute_closedloop_response, CartPole, get_max_parameter_change

# Progress bars are optional: without tqdm, fall back to a pass-through
# that simply hands back the iterable it was given.
try:
    from tqdm import tqdm
except ImportError:
    def tqdm(x):
        return x

# Floating-point types taken from the safe_learning configuration
np_dtype = safe_learning.config.np_dtype
tf_dtype = safe_learning.config.dtype

# When this cell is re-run, close the session left over from the previous run.
try:
    session.close()
except NameError:
    # First run: no session has been created yet
    pass
session = tf.InteractiveSession()

# Flag checked by the training cells so variables are initialized only once
initialized = False

## Dynamics

In [None]:
# System parameters
m = 0.1     # pendulum mass
M = 5.      # cart mass
L = 0.5     # pole length
dt = 0.01   # sampling time
g = 9.81    # gravity

# State and action normalizers
# (the two angular bounds are given in degrees and converted to radians)
x_max = 10
theta_max = np.deg2rad(10)
x_dot_max = 5
theta_dot_max = np.deg2rad(5)
u_max = 5*g*m*theta_max

# Normalizer order: x, theta, x_dot, theta_dot (same order as the state
# names used by the plotting cells)
state_norm = (x_max, theta_max, x_dot_max, theta_dot_max)
input_norm = (u_max,)

# Define system and dynamics
cart_pole = CartPole(m, M, L, dt, [state_norm, input_norm])
# One [-1, 1] interval per state dimension
# (presumably in normalized coordinates -- TODO confirm against CartPole)
state_space = [[-1, 1]]*cart_pole.state_dim

# Linearized model of the cart-pole
# NOTE(review): the `d` suffix and the `dt` argument suggest discrete-time
# matrices -- confirm against CartPole.linearize
Ad, Bd = cart_pole.linearize()   
dynamics = safe_learning.functions.LinearSystem((Ad, Bd), name='dynamics')

print(Ad)
print(Bd)

## Reward Function

In [None]:
# Cost weights: identity on every state dimension (Q) and on every
# action dimension (R)
Q = np.eye(cart_pole.state_dim)
R = np.eye(cart_pole.input_dim)

# The reward is the negated quadratic cost, built from the block-diagonal
# matrix diag(-Q, -R)
reward_function = safe_learning.QuadraticFunction(
    block_diag(-Q, -R),
    name='reward_function',
)

## Exact Optimal Policy and Value Function for the True Dynamics

In [None]:
# Compute the LQR gain K and the value function matrix P for the true linearized
# dynamics; the optimal feedback law is u = -K.dot(s)
# NOTE(review): this comment used to say "Lyapunov equation". An LQR solve normally
# means the discrete-time algebraic Riccati equation -- confirm against
# safe_learning.utilities.dlqr
K, P = safe_learning.utilities.dlqr(Ad, Bd, Q, R)
print('Optimal linear gain:\n{}\n'.format(-K))
print('Quadratic value function:\n{}'.format(P))

# LQR policy (linear map with matrix -K)
lqr_policy = safe_learning.functions.LinearSystem(-K, name='LQR_policy')

# Optimal value function (negated because rewards are negative costs)
lqr_value_function = safe_learning.functions.QuadraticFunction(-P, name='LQR_value_function')

## Function Approximators

In [None]:
# Value function
# Two hidden ReLU layers of 64 units and a single output with no activation
layer_dims = [64, 64, 1]
activations = [tf.nn.relu, tf.nn.relu, None]
value_function = safe_learning.functions.NeuralNetwork(layer_dims, activations, name='value_function')

# Policy
# Same layer sizes, but with a tanh on the output; `scaling=10.` is passed to the
# network (presumably an output scale -- TODO confirm against NeuralNetwork)
layer_dims = [64, 64, 1]
activations = [tf.nn.relu, tf.nn.relu, tf.nn.tanh]
policy = safe_learning.functions.NeuralNetwork(layer_dims, activations, scaling=10., name='policy')

# For testing
# NOTE: this replaces the neural-network policy created above, so every graph node
# below (and the closed-loop plots later on) uses the exact LQR policy instead
# value_function = lqr_value_function
policy = lqr_policy

# TensorFlow graph
# `states` accepts a batch of any size with cart_pole.state_dim columns
states = tf.placeholder(tf_dtype, shape=[None, cart_pole.state_dim], name='states')
actions = policy(states)
rewards = reward_function(states, actions)
values = value_function(states)
# Value of the successor state reached by applying `actions` through `dynamics`
future_values = value_function(dynamics(states, actions))

## Supervised Learning on a "Wrong" Model

Create a cart-pole system with "wrong" parameters. Fit our parametric value function to the corresponding LQR solution in a supervised fashion as a starting point before tackling policy iteration for the real system.

In [None]:
# "Wrong" system
# NOTE: these assignments overwrite the module-level m, M and L of the true system.
m = 0.2     # pendulum mass
M = 4.5     # cart mass
L = 0.5     # pole length
wrong_cart_pole = CartPole(m, M, L, dt, [state_norm, input_norm])
Ad_wrong, Bd_wrong = wrong_cart_pole.linearize()
wrong_dynamics = safe_learning.LinearSystem((Ad_wrong, Bd_wrong), name='wrong_dynamics')
K_wrong, P_wrong = safe_learning.utilities.dlqr(Ad_wrong, Bd_wrong, Q, R)

# The supervised target must come from the LQR solution of the *wrong* system.
# Fix: this previously used P (and K) from the true system, which left P_wrong
# unused and made the "wrong" value function identical to the exact one.
# wrong_lqr_policy = safe_learning.functions.LinearSystem(-K_wrong, name='wrong_LQR_policy')
wrong_lqr_value_function = safe_learning.functions.QuadraticFunction(-P_wrong, name='wrong_LQR_value_function')

with tf.name_scope('supervised_learning'):
    supervised_learning_rate = tf.placeholder(tf_dtype, shape=[], name='learning_rate')
    wrong_values = wrong_lqr_value_function(states)

    # Candidate normalization: inverse magnitude of the target value at the
    # state 0.1*ones. It is evaluated but currently disabled by the override
    # on the next line.
    scaling = np.abs(1 / session.run(wrong_values, {states: 0.1*np.ones((1, wrong_cart_pole.state_dim))}))
    scaling = 1.

    # Mean absolute error between the parametric value function and the target
    supervised_obj = scaling*tf.reduce_mean(tf.abs(values - wrong_values), name='objective')
    optimizer = tf.train.GradientDescentOptimizer(supervised_learning_rate)
    # Only the value function parameters are updated
    wrong_value_update = optimizer.minimize(supervised_obj, var_list=value_function.parameters)

print(scaling)

In [None]:
# Allow this cell to be run repeatedly to continue training if desired
if not initialized:
    session.run(tf.global_variables_initializer())
    initialized = True

# Training hyperparameters
# Sample counts are written as integers rather than float literals (1e3, 1e5):
# NumPy rejects floats as array sizes.
# (assumes uniform_state_sampler forwards the count to NumPy -- TODO confirm)
max_iters = 100
batch_size = 1000
alpha = 1e-4
test_set = uniform_state_sampler(100000, state_space)

# Record objective values over time (entry 0 is the value before training)
supervised_obj_eval = np.zeros(max_iters + 1)
supervised_obj_eval[0] = session.run(supervised_obj, {states: test_set})

# TODO convergence check
# NOTE: `converged` is never set and `tol` is never read, so the summary below
# always reports 'Did not converge!'
converged = False
tol = 1e-2

for i in tqdm(range(max_iters)):
    # One gradient step on a fresh batch, then evaluate on the fixed test set
    batch = uniform_state_sampler(batch_size, state_space)
    session.run(wrong_value_update, feed_dict={states: batch, supervised_learning_rate: alpha})
    supervised_obj_eval[i+1] = session.run(supervised_obj, {states: test_set})

final_iter = i+1
if converged:
    print('Converged after {} iterations.'.format(final_iter))
else:
    print('Did not converge!')

fig = plt.figure(figsize=(10, 5), dpi=100)
fig.subplots_adjust(wspace=0.5, hspace=0.5)

ax = fig.add_subplot(121)
ax.plot(supervised_obj_eval[:final_iter+1], '.-r')
ax.set_xlabel('Iteration')
ax.set_ylabel('Supervised Learning Objective')

plt.show()

In [None]:
fig = plt.figure(figsize=(10, 5), dpi=100)
fig.subplots_adjust(wspace=0.1, hspace=0.2)
n_points = [51, 51]
# A complex step tells np.mgrid to produce that many points, endpoints included.
# The builtin `complex` replaces `np.complex`, which was deprecated in NumPy 1.20
# and removed in NumPy 1.24.
xx, yy = np.mgrid[-1:1:complex(0, n_points[0]), -1:1:complex(0, n_points[1])]

# Slice over the first two state dimensions (x, theta), last two held at zero
grid = np.column_stack((xx.ravel(), yy.ravel(), np.zeros_like(xx.ravel()), np.zeros_like(xx.ravel())))
true_values = session.run(wrong_values, feed_dict={states: grid}).reshape(n_points)
learned_values = session.run(values, feed_dict={states: grid}).reshape(n_points)

# Red: supervised target (wrong-model LQR values); blue: learned value function
ax = fig.add_subplot(1, 2, 1, projection='3d')
ax.plot_surface(xx, yy, true_values, color='red', alpha=0.75)
ax.plot_surface(xx, yy, learned_values, color='blue', alpha=0.75)
ax.set_title(r'Value function, $\dot{x} = \dot{\theta} = 0$', fontsize=18)
ax.set_xlabel(r'$x$', fontsize=14)
ax.set_ylabel(r'$\theta$', fontsize=14)
ax.set_zlabel(r'$V(s)$', fontsize=14)

# Slice over the last two state dimensions (x_dot, theta_dot), first two held at zero
grid = np.column_stack((np.zeros_like(xx.ravel()), np.zeros_like(xx.ravel()), xx.ravel(), yy.ravel()))
true_values = session.run(wrong_values, feed_dict={states: grid}).reshape(n_points)
learned_values = session.run(values, feed_dict={states: grid}).reshape(n_points)

ax = fig.add_subplot(1, 2, 2, projection='3d')
ax.plot_surface(xx, yy, true_values, color='red', alpha=0.75)
ax.plot_surface(xx, yy, learned_values, color='blue', alpha=0.75)
ax.set_title(r'Value function, $x = \theta = 0$', fontsize=18)
ax.set_xlabel(r'$\dot{x}$', fontsize=14)
ax.set_ylabel(r'$\dot{\theta}$', fontsize=14)
ax.set_zlabel(r'$V(s)$', fontsize=14)

# Diagnostics: the network's Lipschitz estimate and its value at the corner
# state (-1, -1, -1, -1)
lipschitz = session.run(value_function.lipschitz())
print(lipschitz)
print(session.run(values, {states: np.array([-1., -1., -1., -1.]).reshape(1, -1)}))

## Optimization Objectives

Now back to the true cart-pole system!

In [None]:
# Use the state and input spaces to compute the maximum absolute reward
# (evaluated at the all-ones state and the all-ones input scaled by input_multiplier)
max_state = np.ones(cart_pole.state_dim).reshape((1, -1))
input_multiplier = 1.
max_input = input_multiplier*np.ones(cart_pole.input_dim).reshape((1, -1))

# gamma multiplies the successor-state value in both objectives below
gamma = 1.
# Quadratic cost at (max_state, max_input); the result is a 1x1 array
r_max = max_state.dot(Q).dot(max_state.T) + max_input.dot(R).dot(max_input.T)
# Objective scaling is fixed to 1; r_max is only printed for reference
scaling = 1.

print('Maximum reward (abs): {}'.format(r_max))
print('Scaling: {}'.format(scaling))

# Bellman error objective for value update
with tf.name_scope('value_optimization'):
    value_learning_rate = tf.placeholder(tf_dtype, shape=[], name='learning_rate')
    # stop_gradient keeps the target fixed, so gradients flow only through `values`
    target = tf.stop_gradient(rewards + gamma*future_values, name='target')
    # Mean absolute difference between the current values and the target
    value_obj = scaling*tf.reduce_mean(tf.abs(values - target), name='objective')
    optimizer = tf.train.GradientDescentOptimizer(value_learning_rate)
    value_update = optimizer.minimize(value_obj, var_list=value_function.parameters)

# Pseudo-integration objective for policy update
with tf.name_scope('policy_optimization'):
    policy_learning_rate = tf.placeholder(tf_dtype, shape=[], name='learning_rate')
    # Negated mean of reward plus successor value
    policy_obj = -scaling*tf.reduce_mean(rewards + gamma*future_values, name='objective')
    optimizer = tf.train.GradientDescentOptimizer(policy_learning_rate)
    # The policy update op is disabled; policy_obj is still evaluated for monitoring
#     policy_update = optimizer.minimize(policy_obj, var_list=policy.parameters)

## Training: Policy Iteration

In [None]:
# Allow this cell to be run repeatedly to continue training if desired
if not initialized:
    session.run(tf.global_variables_initializer())
    initialized = True

# Training hyperparameters
# Sample counts are written as integers rather than float literals (1e3, 1e5):
# NumPy rejects floats as array sizes.
# (assumes uniform_state_sampler forwards the count to NumPy -- TODO confirm)
max_iters = 50
batch_size = 1000
test_set = uniform_state_sampler(100000, state_space)

value_iters = 1
value_alpha = 1e-3
value_tol = 1e-1

# The policy_* settings are only used by the disabled policy update below
policy_iters = 50
policy_alpha = 1e-1
policy_tol = 1e-1

# Record objective values over time (entry 0 is the value before training)
value_obj_eval = np.zeros(max_iters + 1)
value_obj_eval[0] = session.run(value_obj, {states: test_set})

policy_obj_eval = np.zeros(max_iters + 1)
policy_obj_eval[0] = session.run(policy_obj, {states: test_set})

# For convergence, check the parameter values
# NOTE: the break condition at the end of the loop is disabled, so `converged`
# stays False and `tol` is unused
converged = False
tol = 1e-2
value_param_changes = np.zeros(max_iters)
policy_param_changes = np.zeros(max_iters)
old_value_params = session.run(value_function.parameters)
# old_policy_params = session.run(policy.parameters)

for i in tqdm(range(max_iters)):
    # Value update
    for _ in range(value_iters):
        batch = uniform_state_sampler(batch_size, state_space)
        session.run(value_update, feed_dict={states: batch, value_learning_rate: value_alpha})

    # Policy update
#     for _ in range(policy_iters):
#         batch = uniform_state_sampler(batch_size, state_space)
#         session.run(policy_update, feed_dict={states: batch, policy_learning_rate: policy_alpha})

    # Evaluate both objectives on the fixed test set
    value_obj_eval[i+1] = session.run(value_obj, {states: test_set})
    policy_obj_eval[i+1] = session.run(policy_obj, {states: test_set})

    # To save time during debugging ...
    if np.isnan(value_obj_eval[i+1]) or np.isnan(policy_obj_eval[i+1]):
        raise ValueError('Encountered NAN value after {} iterations!'.format(i+1))

    # Value function parameter changes
    new_value_params = session.run(value_function.parameters)
    value_param_changes[i] = get_max_parameter_change(old_value_params, new_value_params)
    old_value_params = new_value_params

    # Policy parameter changes
#     new_policy_params = session.run(policy.parameters)
#     policy_param_changes[i] = get_max_parameter_change(old_policy_params, new_policy_params)
#     old_policy_params = new_policy_params

    # Break if converged
#     if value_param_changes[i] <= value_tol and policy_param_changes[i] <= policy_tol:
#         converged = True
#         break

final_iter = i+1
if converged:
    print('Converged after {} iterations.'.format(final_iter))
else:
    print('Did not converge!')

## Training: Results

In [None]:
fig = plt.figure(figsize=(10, 5), dpi=100)
fig.subplots_adjust(wspace=0.5, hspace=0.5)

# The objective history has one entry per iteration plus the initial value,
# so its valid range is [0, final_iter].
ax = fig.add_subplot(221)
ax.plot(value_obj_eval[:final_iter+1], '.-r')
ax.set_xlabel('Iteration')
ax.set_ylabel('Value Function Objective')

# ax = fig.add_subplot(222)
# ax.plot(policy_obj_eval[:final_iter+1], '.-r')
# ax.set_xlabel('Iteration')
# ax.set_ylabel('Policy Objective')

# The parameter-change history has exactly one entry per completed iteration
# (no initial entry), so only the first final_iter entries are valid. Slicing
# to final_iter+1 would plot an unset zero if training ever stops early.
ax = fig.add_subplot(223)
ax.plot(value_param_changes[:final_iter], '.-b')
ax.set_xlabel('Iteration')
ax.set_ylabel('Max. Value Param. Change')

# ax = fig.add_subplot(224)
# ax.plot(policy_param_changes[:final_iter], '.-b')
# ax.set_xlabel('Iteration')
# ax.set_ylabel('Max. Policy Param. Change')

plt.show()

## Learned Policy: Zero-Input and Step Response

In [None]:
state_names = [r'$x$', r'$\theta$', r'$\dot{x}$', r'$\dot{\theta}$']
# Subplot slots for the four states in a 2x3 grid; slot 3 holds the input
plot_idx = (1, 2, 4, 5)
steps = 2000


def _plot_closedloop_comparison(title, learned_states, true_states, learned_inputs, true_inputs):
    """Plot learned (blue) against true (red) closed-loop trajectories.

    Creates one figure with a subplot per state dimension (placed according to
    the module-level ``plot_idx`` and labelled with ``state_names``) plus one
    subplot for the input, and adds a shared 'Learned'/'True' legend.

    Parameters
    ----------
    title : str
        Figure title.
    learned_states, true_states : array-like
        State trajectories, indexed as ``traj[:, i]`` for state dimension ``i``.
    learned_inputs, true_inputs : array-like
        Input trajectories, passed directly to ``ax.plot``.

    Returns
    -------
    matplotlib.figure.Figure
        The newly created figure.
    """
    fig = plt.figure(figsize=(10, 5), dpi=100)
    fig.suptitle(title, fontsize=18)
    fig.subplots_adjust(wspace=0.5, hspace=0.5)

    for i in range(cart_pole.state_dim):
        ax = fig.add_subplot(2, 3, plot_idx[i])
        ax.plot(learned_states[:, i], 'b')
        ax.plot(true_states[:, i], 'r')
        ax.set_xlabel(r'$t$', fontsize=14)
        ax.set_ylabel(state_names[i], fontsize=14)

    ax = fig.add_subplot(2, 3, 3)
    learned = ax.plot(learned_inputs, 'b')
    true = ax.plot(true_inputs, 'r')
    ax.set_xlabel(r'$t$', fontsize=14)
    ax.set_ylabel(r'$u$', fontsize=14)
    fig.legend((learned[0], true[0]), ('Learned', 'True'), loc=(0.75, 0.2), fontsize=14)
    return fig


# Zero-input response from a non-zero initial state
# (last argument 0 -- presumably the reference input; confirm against
# compute_closedloop_response)
initial_condition = np.array([0.9, 0.3, -0.1, 0.1])
true_state_traj_zero, true_input_traj_zero = compute_closedloop_response(
    dynamics, lqr_policy, initial_condition, steps, 0)
learned_state_traj_zero, learned_input_traj_zero = compute_closedloop_response(
    dynamics, policy, initial_condition, steps, 0)
_plot_closedloop_comparison('Zero-Input Response',
                            learned_state_traj_zero, true_state_traj_zero,
                            learned_input_traj_zero, true_input_traj_zero)

# Step response, zero initial condition
initial_condition = np.array([0., 0., 0., 0.])
r = 1
learned_state_traj_step, learned_input_traj_step = compute_closedloop_response(
    dynamics, policy, initial_condition, steps, r)
true_state_traj_step, true_input_traj_step = compute_closedloop_response(
    dynamics, lqr_policy, initial_condition, steps, r)
_plot_closedloop_comparison('Step Response',
                            learned_state_traj_step, true_state_traj_step,
                            learned_input_traj_step, true_input_traj_step)

plt.show()

## Value Function and Policy Comparison

In [None]:
fig = plt.figure(figsize=(10, 4), dpi=100)
fig.subplots_adjust(wspace=0.1, hspace=0.2)
n_points = [51, 51]
# A complex step tells np.mgrid to produce that many points, endpoints included.
# The builtin `complex` replaces `np.complex`, which was deprecated in NumPy 1.20
# and removed in NumPy 1.24.
xx, yy = np.mgrid[-1:1:complex(0, n_points[0]), -1:1:complex(0, n_points[1])]
# scaling = r_max / (1 - gamma)
scaling = 1.

# Build the exact LQR value node once and reuse it for both slices. Calling
# lqr_value_function(states) inside each session.run adds new ops to the graph
# on every call.
lqr_values = lqr_value_function(states)

# Slice over the first two state dimensions (x, theta), last two held at zero
grid = np.column_stack((xx.ravel(), yy.ravel(), np.zeros_like(xx.ravel()), np.zeros_like(yy.ravel())))
true_values = session.run(lqr_values, feed_dict={states: grid}).reshape(n_points)
learned_values = session.run(values, feed_dict={states: grid}).reshape(n_points)

# Red: exact LQR value function; blue: learned value function
ax = fig.add_subplot(1, 2, 1, projection='3d')
ax.plot_surface(xx, yy, true_values, color='red', alpha=0.75)
ax.plot_surface(xx, yy, learned_values, color='blue', alpha=0.75)
ax.set_title(r'Value function, $\dot{x} = \dot{\theta} = 0$', fontsize=18)
ax.set_xlabel(r'$x$', fontsize=14)
ax.set_ylabel(r'$\theta$', fontsize=14)
ax.set_zlabel(r'$V(s)$', fontsize=14)

# Slice over the last two state dimensions (x_dot, theta_dot), first two held at zero
grid = np.column_stack((np.zeros_like(xx.ravel()), np.zeros_like(yy.ravel()), xx.ravel(), yy.ravel()))
true_values = session.run(lqr_values, feed_dict={states: grid}).reshape(n_points)
learned_values = session.run(values, feed_dict={states: grid}).reshape(n_points)

ax = fig.add_subplot(1, 2, 2, projection='3d')
ax.plot_surface(xx, yy, true_values, color='red', alpha=0.75)
ax.plot_surface(xx, yy, learned_values, color='blue', alpha=0.75)
ax.set_title(r'Value function, $x = \theta = 0$', fontsize=18)
ax.set_xlabel(r'$\dot{x}$', fontsize=14)
ax.set_ylabel(r'$\dot{\theta}$', fontsize=14)
ax.set_zlabel(r'$V(s)$', fontsize=14)


# #
# true_control = session.run(lqr_policy(states), feed_dict={states: grid})
# learned_control = session.run(policy(states), feed_dict={states: grid})
# true_effort = -np.sum(true_control.dot(R)*true_control, axis=1, keepdims=True).reshape(n_points)
# learned_effort = -np.sum(learned_control.dot(R)*learned_control, axis=1, keepdims=True).reshape(n_points)

# ax = fig.add_subplot(2, 2, 3, projection='3d')
# ax.plot_surface(xx, yy, true_effort, color='red', alpha=0.75)
# ax.plot_surface(xx, yy, learned_effort, color='blue', alpha=0.75)
# ax.set_title(r'Control effort, $\dot{x} = \dot{\theta} = 0$', fontsize=18)
# ax.set_xlabel(r'$x$', fontsize=14)
# ax.set_ylabel(r'$\theta$', fontsize=14)
# ax.set_zlabel(r'$r(0,u)$', fontsize=14)





# #
# true_control = session.run(lqr_policy(states), feed_dict={states: grid})
# learned_control = session.run(policy(states), feed_dict={states: grid})
# true_effort = -np.sum(true_control.dot(R)*true_control, axis=1, keepdims=True).reshape(n_points)
# learned_effort = -np.sum(learned_control.dot(R)*learned_control, axis=1, keepdims=True).reshape(n_points)

# ax = fig.add_subplot(2, 2, 4, projection='3d')
# ax.plot_surface(xx, yy, true_effort, color='red', alpha=0.75)
# ax.plot_surface(xx, yy, learned_effort, color='blue', alpha=0.75)
# ax.set_title(r'Control effort, $x = \theta = 0$', fontsize=18)
# ax.set_xlabel(r'$\dot{x}$', fontsize=14)
# ax.set_ylabel(r'$\dot{\theta}$', fontsize=14)
# ax.set_zlabel(r'$r(0,u)$', fontsize=14)

## TensorBoard Graph

In [None]:
# plotting.show_graph(tf.get_default_graph())