# Reinforcement Learning for a Cart-Pole System

In [None]:
from __future__ import division, print_function

import numpy as np
import tensorflow as tf
from scipy.linalg import block_diag
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from scipy.spatial import ConvexHull

import time
%matplotlib inline

import safe_learning
import plotting
from utilities import sample_box, sample_ellipsoid, compute_closedloop_response, CartPole, get_max_parameter_change

# tqdm only provides progress bars; when it is not installed, fall back to a
# pass-through wrapper so that the training loops below still run.
try:
    from tqdm import tqdm
except ImportError:
    def tqdm(x):
        """Return the iterable unchanged (stand-in for the real tqdm)."""
        return x

# Data types configured by safe_learning for NumPy and TensorFlow
np_dtype = safe_learning.config.np_dtype
tf_dtype = safe_learning.config.dtype

# Start from an empty default graph. On the first run ``session`` does not
# exist yet (NameError); on re-runs the previous session is closed first.
tf.reset_default_graph()
try:
    session.close()
except NameError:
    pass
session = tf.InteractiveSession()

# Flipped to True by the first cell that initializes the TensorFlow variables
initialized = False

In [None]:
def plot_value_function(tf_learned_values, tf_true_values, n_points, fixed_state, colors=['r','b'], show=True):
    """Plot a learned and a reference value function on two state-space slices.

    Both arguments are evaluated with the notebook-level ``session`` by
    feeding a grid of states into the notebook-level ``states`` placeholder.
    The left panel varies the first two state components over ``[-1, 1]``
    while the last two are held fixed; the right panel varies the last two
    while the first two are held fixed.

    Parameters
    ----------
    tf_learned_values : object accepted by ``session.run``
        Learned values; drawn with ``colors[0]``.
    tf_true_values : object accepted by ``session.run``
        Reference values; drawn with ``colors[1]``.
    n_points : sequence of two ints
        Number of grid points along each plotted axis.
    fixed_state : sequence of four floats
        ``(x, theta, x_dot, theta_dot)``; supplies the components that are
        held constant in each panel.
    colors : sequence of two matplotlib colors, optional
        Surface colors. Only read, never modified.
    show : bool, optional
        Call ``plt.show()`` when True.
    """
    fig = plt.figure(figsize=(12, 5), dpi=100)
    fig.subplots_adjust(wspace=0.1, hspace=0.2)

    # A complex step tells np.mgrid to produce that many points with both
    # endpoints included. The builtin ``complex`` is used because the
    # ``np.complex`` alias is deprecated and absent from recent NumPy releases.
    xx, yy = np.mgrid[-1:1:complex(0, n_points[0]), -1:1:complex(0, n_points[1])]
    first_axis, second_axis = xx.ravel(), yy.ravel()
    ones = np.ones_like(first_axis)

    x_fix, theta_fix, x_dot_fix, theta_dot_fix = fixed_state

    # One entry per panel: (grid of states, title, x-axis label, y-axis label)
    panels = [
        # Fix x_dot and theta_dot, plot the value function over x and theta
        (np.column_stack((first_axis, second_axis, x_dot_fix*ones, theta_dot_fix*ones)),
         r'Value function, $\dot{x} = %.3g,\ \dot{\theta} = %.3g$' % (x_dot_fix, theta_dot_fix),
         r'$x$', r'$\theta$'),
        # Fix x and theta, plot the value function over x_dot and theta_dot
        (np.column_stack((x_fix*ones, theta_fix*ones, first_axis, second_axis)),
         r'Value function, $x = %.3g,\ \theta = %.3g$' % (x_fix, theta_fix),
         r'$\dot{x}$', r'$\dot{\theta}$'),
    ]

    for column, (grid, title, xlabel, ylabel) in enumerate(panels, start=1):
        learned_values = session.run(tf_learned_values, feed_dict={states: grid}).reshape(n_points)
        true_values = session.run(tf_true_values, feed_dict={states: grid}).reshape(n_points)
        ax = fig.add_subplot(1, 2, column, projection='3d')
        ax.plot_surface(xx, yy, learned_values, color=colors[0], alpha=0.75)
        ax.plot_surface(xx, yy, true_values, color=colors[1], alpha=0.5)
        ax.set_title(title, fontsize=16)
        ax.set_xlabel(xlabel, fontsize=14)
        ax.set_ylabel(ylabel, fontsize=14)
        ax.set_zlabel(r'$V(s)$', fontsize=14)

    if show:
        plt.show()

        
def plot_policy(tf_actions, tf_true_actions, n_points, fixed_state, colors=['r','b'], show=True):
    """Plot a learned and a reference control law on two state-space slices.

    Both arguments are evaluated with the notebook-level ``session`` by
    feeding a grid of states into the notebook-level ``states`` placeholder.
    The left panel varies the first two state components over ``[-1, 1]``
    while the last two are held fixed; the right panel varies the last two
    while the first two are held fixed.

    Parameters
    ----------
    tf_actions : object accepted by ``session.run``
        Learned control; drawn with ``colors[0]``.
    tf_true_actions : object accepted by ``session.run``
        Reference control; drawn with ``colors[1]``.
    n_points : sequence of two ints
        Number of grid points along each plotted axis.
    fixed_state : sequence of four floats
        ``(x, theta, x_dot, theta_dot)``; supplies the components that are
        held constant in each panel.
    colors : sequence of two matplotlib colors, optional
        Surface colors. Only read, never modified.
    show : bool, optional
        Call ``plt.show()`` when True.
    """
    fig = plt.figure(figsize=(12, 5), dpi=100)
    fig.subplots_adjust(wspace=0.1, hspace=0.2)

    # A complex step tells np.mgrid to produce that many points with both
    # endpoints included. The builtin ``complex`` is used because the
    # ``np.complex`` alias is deprecated and absent from recent NumPy releases.
    xx, yy = np.mgrid[-1:1:complex(0, n_points[0]), -1:1:complex(0, n_points[1])]
    first_axis, second_axis = xx.ravel(), yy.ravel()
    ones = np.ones_like(first_axis)

    x_fix, theta_fix, x_dot_fix, theta_dot_fix = fixed_state

    # One entry per panel:
    # (grid of states, title, x-axis label, y-axis label, camera azimuth)
    panels = [
        # Fix x_dot and theta_dot, plot the control over x and theta
        (np.column_stack((first_axis, second_axis, x_dot_fix*ones, theta_dot_fix*ones)),
         r'Control, $\dot{x} = %.3g,\ \dot{\theta} = %.3g$' % (x_dot_fix, theta_dot_fix),
         r'$x$', r'$\theta$', 15.),
        # Fix x and theta, plot the control over x_dot and theta_dot
        (np.column_stack((x_fix*ones, theta_fix*ones, first_axis, second_axis)),
         r'Control, $x = %.3g,\ \theta = %.3g$' % (x_fix, theta_fix),
         r'$\dot{x}$', r'$\dot{\theta}$', 100.),
    ]

    for column, (grid, title, xlabel, ylabel, azimuth) in enumerate(panels, start=1):
        learned_control = session.run(tf_actions, feed_dict={states: grid}).reshape(n_points)
        true_control = session.run(tf_true_actions, feed_dict={states: grid}).reshape(n_points)
        ax = fig.add_subplot(1, 2, column, projection='3d')
        ax.plot_surface(xx, yy, learned_control, color=colors[0], alpha=0.75)
        ax.plot_surface(xx, yy, true_control, color=colors[1], alpha=0.5)
        ax.set_title(title, fontsize=16)
        ax.set_xlabel(xlabel, fontsize=14)
        ax.set_ylabel(ylabel, fontsize=14)
        ax.set_zlabel(r'$u$', fontsize=14)
        ax.view_init(elev=20., azim=azimuth)

    if show:
        plt.show()


def plot_closedloop_response(dynamics, policy1, policy2, steps, dt, reference='zero', const=1.0, ic=None,
                             labels=['Policy 1','Policy 2'], colors=['r','b'], show=True):
    """Plot state and action trajectories of two policies on the same dynamics.

    Each policy is rolled out with ``compute_closedloop_response``; the four
    state components and the action are then drawn in a 2x3 grid of axes,
    with one curve per policy.

    Parameters
    ----------
    dynamics, policy1, policy2
        Forwarded unchanged to ``compute_closedloop_response``.
    steps, dt
        Forwarded unchanged to ``compute_closedloop_response``.
    reference : str, optional
        Forwarded to ``compute_closedloop_response``. ``'zero'``,
        ``'impulse'`` and ``'step'`` get dedicated figure titles; any other
        value gets a generic title.
    const : float, optional
        Forwarded to ``compute_closedloop_response`` and shown in the title
        of a step response.
    ic : array-like with four entries, optional
        Initial condition forwarded to ``compute_closedloop_response``; when
        given it is appended to the title. May be flat or of shape (1, 4).
    labels : sequence of two str, optional
        Legend entries for ``policy1`` and ``policy2``. Only read.
    colors : sequence of two matplotlib colors, optional
        Line colors for ``policy1`` and ``policy2``. Only read.
    show : bool, optional
        Call ``plt.show()`` when True.
    """
    state_names = [r'$x$', r'$\theta$', r'$\dot{x}$', r'$\dot{\theta}$']
    # Subplot slots for the states in the 2x3 grid; slot 3 holds the action
    plot_idx = (1, 2, 4, 5)

    state_traj1, action_traj1, t, _ = compute_closedloop_response(dynamics, policy1, steps, dt, reference, const, ic)
    state_traj2, action_traj2, _, _ = compute_closedloop_response(dynamics, policy2, steps, dt, reference, const, ic)

    fig = plt.figure(figsize=(10, 5), dpi=100)
    fig.subplots_adjust(wspace=0.5, hspace=0.5)

    if reference == 'zero':
        title_string = r'Zero-Input Response'
    elif reference == 'impulse':
        title_string = r'Impulse Response'
    elif reference == 'step':
        title_string = r'Step Response, $r = %.1g$' % const
    else:
        # Without this branch the title was left unbound, so any other
        # reference crashed with an UnboundLocalError further down.
        title_string = r'Response, reference = %s' % (reference,)

    if ic is not None:
        # Flatten first so that both a flat and a (1, 4) initial condition work
        ic_tuple = tuple(np.ravel(ic)[:4])
        title_string = title_string + r', $s_0 = (%.1g, %.1g, %.1g, %.1g)$' % ic_tuple
    fig.suptitle(title_string, fontsize=18)

    # Iterate over the function's own layout instead of the notebook-level
    # ``cart_pole`` object, so the plot only depends on its arguments.
    for i, (name, slot) in enumerate(zip(state_names, plot_idx)):
        ax = fig.add_subplot(2, 3, slot)
        ax.plot(t, state_traj1[:, i], colors[0])
        ax.plot(t, state_traj2[:, i], colors[1])
        ax.set_xlabel(r'$t$', fontsize=14)
        ax.set_ylabel(name, fontsize=14)

    ax = fig.add_subplot(2, 3, 3)
    plot1 = ax.plot(t, action_traj1, color=colors[0])
    plot2 = ax.plot(t, action_traj2, color=colors[1])
    ax.set_xlabel(r'$t$', fontsize=14)
    ax.set_ylabel(r'$u$', fontsize=14)
    fig.legend((plot1[0], plot2[0]), (labels[0], labels[1]), loc=(0.75, 0.2), fontsize=14)

    if show:
        plt.show()

## True Dynamics

In [None]:
# Physical parameters of the true system
m = 0.1     # pendulum mass
M = 1.      # cart mass
L = 0.3     # pole length

# Constants
dt = 0.01   # sampling time
g = 9.81    # gravity

# Normalization constants, one per state component and one for the action
x_max = 5.
theta_max = np.deg2rad(15)
x_dot_max = 2.
theta_dot_max = np.deg2rad(15)
u_max = (m + M)*x_dot_max / (10*dt)

state_norm = (x_max, theta_max, x_dot_max, theta_dot_max)
action_norm = (u_max,)

# Cart-pole model built from the parameters and normalizers above
cart_pole = CartPole(m, M, L, dt, [state_norm, action_norm])

# One [-1, 1] interval per state / action dimension
state_limits = np.array([[-1., 1.] for _ in range(cart_pole.state_dim)])
action_limits = np.array([[-1., 1.] for _ in range(cart_pole.action_dim)])

# The notebook works with the linearization of the cart-pole throughout
A, B = cart_pole.linearize()
dynamics = safe_learning.functions.LinearSystem((A, B), name='dynamics')

## Reward Function

In [None]:
# Diagonal cost on the four state components
Q = np.diag([0.1, 0.1, 0.5, 0.5])

# Cost on the action(s)
R = 0.1*np.eye(cart_pole.action_dim)

# The reward is the negated quadratic cost over the stacked (state, action)
reward_function = safe_learning.QuadraticFunction(block_diag(-Q, -R), name='reward_function')

## Exact Optimal Policy and Value Function for the True Dynamics

In [None]:
# Compute the gain K of the feedback law u = -K.dot(s) and the matrix P of the
# quadratic value function (dlqr presumably solves the discrete-time LQR
# problem for (A, B, Q, R) -- confirm against its documentation)
K, P = safe_learning.utilities.dlqr(A, B, Q, R)
print('Optimal linear gain:\n{}\n'.format(-K))
print('Quadratic value function:\n{}'.format(P))

# 2-norm of the gain
print(np.linalg.norm(K, 2))

# LQR policy: linear feedback wrapped in a saturation with bounds -1 and 1
lqr_policy = safe_learning.Saturation(
    safe_learning.functions.LinearSystem((-K,), name='LQR_policy'), -1, 1)

# Optimal value function (negated because the notebook works with rewards)
lqr_value_function = safe_learning.functions.QuadraticFunction(-P, name='LQR_value_function')

# Manual rollout of the unsaturated closed loop, kept for reference; the live
# code below obtains the trajectories from compute_closedloop_response instead.
# T = 2000
# x = np.zeros((T, 4))
# u = np.zeros((T, 1))
# x[0, :] = np.array([1., 1., 0., 0.]).reshape(1, -1)
# u[0, :] = x[0, :].dot(-K.T)
# for t in range(T-1):
#     x[t+1, :] = x[t, :].dot((A - B.dot(K)).T)
#     u[t+1, :] = x[t+1, :].dot(-K.T)

# state_names = [r'$x$', r'$\theta$', r'$\dot{x}$', r'$\dot{\theta}$', r'$u$']
# plt.plot(np.concatenate((x, u), axis=1))
# plt.legend(state_names)
# plt.show()

# Zero-reference rollout of the LQR policy from a single (1, 4) initial state
T = 1000
ic = np.array([[1., 1., 0., 0.]])
x, u, t, r = compute_closedloop_response(dynamics, lqr_policy, T, dt, 'zero', ic=ic)

# One curve per state component plus one for the action
names = [r'$x$', r'$\theta$', r'$\dot{x}$', r'$\dot{\theta}$', r'$u$']
plt.plot(t, np.hstack((x, u)))
plt.legend(names)
plt.show()

## Function Approximators

In [None]:
# Scaling
max_state = np.ones(cart_pole.state_dim).reshape((1, -1))
max_action = np.ones(cart_pole.action_dim).reshape((1, -1))
# Each quadratic form below is a 1x1 matrix. ``.item()`` extracts the plain
# scalar so that ``scaling`` -- and every objective multiplied by it -- is a
# true scalar rather than a 1x1 array/tensor.
r_max = (np.linalg.multi_dot((max_state, Q, max_state.T))
         + np.linalg.multi_dot((max_action, R, max_action.T))).item()
gamma = 0.98
scaling = (1 - gamma) / r_max

print('Maximum reward (abs): {}'.format(r_max))
print('Scaling: {}'.format(scaling))


# TODO testing ****************************************#
# Switches read by the cells below:
#   train_policy / train_value_function -- use a trainable neural network
#       instead of the exact LQR solution
#   supervise -- pre-train on the "wrong" model before policy iteration
train_policy = True

train_value_function = True

supervise = False
#******************************************************#


# Value function
if train_value_function:
    layer_dims = [64, 64, 1]
    activations = [tf.nn.relu, tf.nn.relu, None]
    value_function = safe_learning.functions.NeuralNetwork(layer_dims, activations, name='value_function')
else:
    # Exact LQR value function, scaled like the training objectives
    value_function = safe_learning.functions.QuadraticFunction(-scaling*P)

# Policy
if train_policy:
    layer_dims = [64, 64, cart_pole.action_dim]
    # tanh on the output layer keeps each action component within (-1, 1)
    activations = [tf.nn.relu, tf.nn.relu, tf.nn.tanh]
    policy = safe_learning.functions.NeuralNetwork(layer_dims, activations, scaling=1., name='policy')
else:
    policy = lqr_policy

# TensorFlow graph
states = tf.placeholder(tf_dtype, shape=[None, cart_pole.state_dim], name='states')
actions = policy(states)
rewards = reward_function(states, actions)
future_states = dynamics(states, actions)

values = value_function(states)
# Successor states are clipped to [-1, 1] before they are evaluated
future_values = value_function(tf.clip_by_value(future_states, -1, 1))

# Supervised Learning on a "Wrong" Model

Create a cart-pole system with "wrong" parameters. Fit our parametric value function and policy to the corresponding LQR solution in a supervised fashion as a starting point before tackling policy iteration for the real system.

In [None]:
# "Wrong" system
# NOTE: these assignments rebind the notebook-level m, M and L
m = 0.7     # pendulum mass
M = 1.      # cart mass
L = 0.6     # pole length

wrong_cart_pole = CartPole(m, M, L, dt, [state_norm, action_norm])
A_wrong, B_wrong = wrong_cart_pole.linearize()
wrong_dynamics = safe_learning.LinearSystem((A_wrong, B_wrong), name='wrong_dynamics')
K_wrong, P_wrong = safe_learning.utilities.dlqr(A_wrong, B_wrong, Q, R)

# LinearSystem receives a tuple of matrices everywhere else in this notebook
# (e.g. ``(-K,)`` for the true LQR policy), so the gain is wrapped in a
# one-element tuple here as well instead of being passed as a bare matrix.
wrong_lqr_policy = safe_learning.functions.LinearSystem((-K_wrong,), name='wrong_LQR_policy')
wrong_lqr_policy = safe_learning.Saturation(wrong_lqr_policy, -1, 1)
wrong_lqr_value_function = safe_learning.functions.QuadraticFunction(-P_wrong, name='wrong_LQR_value_function')

# TODO testing
# wrong_lqr_policy = lqr_policy
# wrong_lqr_value_function = lqr_value_function

# Visualize closed-loop responses: both policies are run on the TRUE dynamics
# so that the effect of designing the controller for the wrong model is visible
ic = np.array([0.5, 0.2, -0.1, 0.1])
const = 0.7
steps = 2000
plot_closedloop_response(dynamics, wrong_lqr_policy, lqr_policy, steps, dt, 
                         'step', const, ic, ['Wrong','True'], ['g','b'])

In [None]:
# Supervision targets: value and action of the "wrong" LQR solution evaluated
# on the same `states` placeholder as the learned functions
wrong_values = wrong_lqr_value_function(states)
wrong_actions = wrong_lqr_policy(states)
# The step size is fed at run time, so it can be changed without rebuilding the graph
supervised_learning_rate = tf.placeholder(tf_dtype, shape=[], name='learning_rate')
# One optimizer is shared by both supervised updates below
optimizer = tf.train.GradientDescentOptimizer(supervised_learning_rate)

# The objectives and update ops only exist when `supervise` is set
if supervise:
    if train_value_function:
        with tf.name_scope('supervised_value_optimization'):
            # Scaled mean absolute error between learned and "wrong" LQR values
            supervised_value_obj = scaling*tf.reduce_mean(tf.abs(values - wrong_values), name='objective')
            # Only the value-function parameters are updated
            wrong_value_update = optimizer.minimize(supervised_value_obj, var_list=value_function.parameters)

    if train_policy:
        with tf.name_scope('supervised_policy_optimization'):
            # Mean absolute error between learned and "wrong" LQR actions (not scaled)
            supervised_policy_obj = tf.reduce_mean(tf.abs(actions - wrong_actions), name='objective')
            # Only the policy parameters are updated
            wrong_policy_update = optimizer.minimize(supervised_policy_obj, var_list=policy.parameters)

### Supervised Learning: Value Function

In [None]:
# Supervised pre-training of the value function on the "wrong" LQR values
if train_value_function and supervise:
    # Allow this cell to be run repeatedly to continue training if desired
    if not initialized:
        session.run(tf.global_variables_initializer())
        initialized = True

    # Training hyperparameters
    # NOTE(review): the sample counts are floats (1e3, 1e5); this assumes
    # sample_box accepts a float count -- confirm.
    max_iters = 3000
    batch_size = 1e3
    alpha = 0.5
    test_set = sample_box(state_limits, 1e5)

    # Record objective values over time
    # Only entry 0 is filled in: the per-iteration evaluation inside the loop
    # is commented out, so the remaining entries stay at zero.
    supervised_obj_eval = np.zeros(max_iters + 1)
    supervised_obj_eval[0] = session.run(supervised_value_obj, {states: test_set})

    # For checking convergence
    # `iter_memory` and `tol` are only used by the commented-out check below
    converged = False
    iter_memory = 100
    tol = 1e-1
    param_changes = np.zeros(max_iters)
    old_params = session.run(value_function.parameters)

    for i in tqdm(range(max_iters)):
        # One gradient step on a freshly sampled batch
        batch = sample_box(state_limits, batch_size)
        session.run(wrong_value_update, feed_dict={states: batch, supervised_learning_rate: alpha})
    #     supervised_obj_eval[i+1] = session.run(supervised_value_obj, {states: test_set})

        # Track how much the parameters moved in this step
        new_params = session.run(value_function.parameters)
        param_changes[i] = get_max_parameter_change(old_params, new_params)
        old_params = new_params

        # Break if converged
        # (disabled: `converged` therefore stays False and the loop always
        # runs for max_iters iterations)
#         if i >= iter_memory:
#             params_converged = np.all(param_changes[i-iter_memory+1:i+1] <= tol)
#             if params_converged:
#                 converged = True
#                 break
            
    # Number of iterations actually performed
    final_iter = i+1
    if converged:
        print('Converged after {} iterations.'.format(final_iter))
    else:
        print('Did not converge!')

In [None]:
if train_value_function and supervise:
    fig = plt.figure(figsize=(10, 5), dpi=100)
    fig.subplots_adjust(wspace=0.5, hspace=0.5)

    # Left: recorded objective values; right: per-iteration parameter change
    for _position, _series, _ylabel in (
            (121, supervised_obj_eval, 'Supervised Learning Objective'),
            (122, param_changes, 'Max. Value Param. Change')):
        ax = fig.add_subplot(_position)
        ax.plot(_series[:final_iter+1], '.-r')
        ax.set_xlabel('Iteration')
        ax.set_ylabel(_ylabel)

    plt.show()

    # Learned value function (red) against the "wrong" LQR value function
    # (green), on the slices through the origin
    fixed_state = [0., 0., 0., 0.]
    plot_value_function(values, wrong_values, [51, 51], fixed_state, ['r', 'g'])

### Supervised Learning: Policy

In [None]:
# Supervised pre-training of the policy on the "wrong" LQR actions
if train_policy and supervise:
    # Allow this cell to be run repeatedly to continue training if desired
    if not initialized:
        session.run(tf.global_variables_initializer())
        initialized = True

    # Training hyperparameters
    # NOTE(review): the sample counts are floats (1e3, 1e5); this assumes
    # sample_box accepts a float count -- confirm.
    max_iters = 500
    batch_size = 1e3
    alpha = 0.07
    test_set = sample_box(state_limits, 1e5)

    # Record objective values over time
    # Only entry 0 is filled in: the per-iteration evaluation inside the loop
    # is commented out, so the remaining entries stay at zero.
    supervised_obj_eval = np.zeros(max_iters + 1)
    supervised_obj_eval[0] = session.run(supervised_policy_obj, {states: test_set})

    # For checking convergence
    # `iter_memory` and `tol` are only used by the commented-out check below
    converged = False
    iter_memory = 50
    tol = 1e-3
    param_changes = np.zeros(max_iters)
    old_params = session.run(policy.parameters)

    for i in tqdm(range(max_iters)):
        # One gradient step on a freshly sampled batch
        batch = sample_box(state_limits, batch_size)
        session.run(wrong_policy_update, feed_dict={states: batch, supervised_learning_rate: alpha})
#         supervised_obj_eval[i+1] = session.run(supervised_policy_obj, {states: test_set})

        # Policy parameter changes
        new_params = session.run(policy.parameters)
        param_changes[i] = get_max_parameter_change(old_params, new_params)
        old_params = new_params

        # Break if converged
        # (disabled: `converged` therefore stays False and the loop always
        # runs for max_iters iterations)
#         if i >= iter_memory:
#             params_converged = np.all(param_changes[i-iter_memory+1:i+1] <= tol)
#             if params_converged:
#                 converged = True
#                 break

    # Number of iterations actually performed
    final_iter = i+1
    if converged:
        print('Converged after {} iterations.'.format(final_iter))
    else:
        print('Did not converge!')

In [None]:
if train_policy and supervise:
    fig = plt.figure(figsize=(10, 5), dpi=100)
    fig.subplots_adjust(wspace=0.5, hspace=0.5)

    # Left: recorded objective values; right: per-iteration parameter change
    for _position, _series, _ylabel in (
            (121, supervised_obj_eval, 'Supervised Learning Objective'),
            (122, param_changes, 'Max. Policy Param. Change')):
        ax = fig.add_subplot(_position)
        ax.plot(_series[:final_iter+1], '.-r')
        ax.set_xlabel('Iteration')
        ax.set_ylabel(_ylabel)

    plt.show()

    # Learned policy (red) against the "wrong" LQR policy (green), on the
    # slices through the origin
    fixed_state = [0., 0., 0., 0.]
    n_points = [51, 51]
    plot_policy(actions, wrong_actions, n_points, fixed_state, colors=['r','g'])

## Optimization Objectives

Now back to the true cart-pole system!

In [None]:
# Bellman error objective for value update
if train_value_function:
    with tf.name_scope('value_optimization'):
        value_learning_rate = tf.placeholder(tf_dtype, shape=[], name='learning_rate')
        # One-step target r + gamma*V(s'); stop_gradient treats it as a
        # constant, so gradients only flow through `values`
        target = tf.stop_gradient(rewards + gamma*future_values, name='target')
        # Scaled mean absolute Bellman error
        value_obj = scaling*tf.reduce_mean(tf.abs(values - target), name='objective')
        # NOTE: rebinds the notebook-level `optimizer`
        optimizer = tf.train.GradientDescentOptimizer(value_learning_rate)
        # Only the value-function parameters are updated
        value_update = optimizer.minimize(value_obj, var_list=value_function.parameters)

# Pseudo-integration objective for policy update
if train_policy:
    with tf.name_scope('policy_optimization'):
        policy_learning_rate = tf.placeholder(tf_dtype, shape=[], name='learning_rate')
        # Negated so that minimizing it maximizes the mean of r + gamma*V(s')
        policy_obj = -scaling*tf.reduce_mean(rewards + gamma*future_values, name='objective')
        # NOTE: rebinds the notebook-level `optimizer`
        optimizer = tf.train.GradientDescentOptimizer(policy_learning_rate)
        # Only the policy parameters are updated
        policy_update = optimizer.minimize(policy_obj, var_list=policy.parameters)

## Training: Policy Iteration

In [None]:
# Policy iteration: alternate gradient steps on the value objective (policy
# evaluation) and on the policy objective (policy improvement).
# NOTE: this cell uses value_obj/value_update and policy_obj/policy_update,
# which only exist when train_value_function and train_policy are both set.

# Allow this cell to be run repeatedly to continue training if desired
if not initialized:
    session.run(tf.global_variables_initializer())
    initialized = True

# Training hyperparameters
# (the trailing comments record alternative values)
max_iters = 5
value_iters = 100  # 2000
policy_iters = 10  # 500

value_alpha = 0.05 # 0.5
value_tol = 1e-1

policy_alpha = 0.01 # 0.5
policy_tol = 1e-3

# NOTE(review): the sample counts are floats (1e3, 1e5); this assumes
# sample_box accepts a float count -- confirm.
batch_size = 1e3
zero_batch = np.zeros((1, cart_pole.state_dim))     # always sample zero-state
test_set = sample_box(state_limits, 1e5)

# Record objective values over time
# Entry 0 holds the objectives before training, entry i+1 those after outer iteration i
value_obj_eval = np.zeros(max_iters + 1)
policy_obj_eval = np.zeros(max_iters + 1)
value_obj_eval[0] = session.run(value_obj, {states: test_set})
policy_obj_eval[0] = session.run(policy_obj, {states: test_set})

# For convergence, check the parameter values
# (`iter_memory`, `value_tol` and `policy_tol` are only used by the
# commented-out check at the end of the loop)
converged = False
iter_memory = 5
value_param_changes = np.zeros(max_iters)
policy_param_changes = np.zeros(max_iters)
old_value_params = session.run(value_function.parameters)
old_policy_params = session.run(policy.parameters)

for i in tqdm(range(max_iters)):
    
    # Policy evaluation (value update)
    for _ in range(value_iters):
        # Fresh random batch with the zero state appended as an extra row
        batch = sample_box(state_limits, batch_size)
        batch = np.concatenate((batch, zero_batch), axis=0)
        session.run(value_update, feed_dict={states: batch, value_learning_rate: value_alpha})
    # Parameter change accumulated over all value_iters steps
    new_value_params = session.run(value_function.parameters)
    value_param_changes[i] = get_max_parameter_change(old_value_params, new_value_params)
    old_value_params = new_value_params

    # Policy improvement (policy update)
    for _ in range(policy_iters):
        # Fresh random batch with the zero state appended as an extra row
        batch = sample_box(state_limits, batch_size)
        batch = np.concatenate((batch, zero_batch), axis=0)      
        session.run(policy_update, feed_dict={states: batch, policy_learning_rate: policy_alpha})
    # Parameter change accumulated over all policy_iters steps
    new_policy_params = session.run(policy.parameters)
    policy_param_changes[i] = get_max_parameter_change(old_policy_params, new_policy_params)
    old_policy_params = new_policy_params
    
#     # TODO debugging
    # Evaluate both objectives on the fixed test set and abort on NaN
    value_obj_eval[i+1] = session.run(value_obj, {states: test_set})
    policy_obj_eval[i+1] = session.run(policy_obj, {states: test_set})
    if np.isnan(value_obj_eval[i+1]) or np.isnan(policy_obj_eval[i+1]):
        raise ValueError('Encountered NAN value after {} iterations!'.format(i+1))
    
    # Break if converged
    # (disabled: `converged` therefore stays False and the loop always runs
    # for max_iters iterations)
#     if i >= iter_memory:
#         value_params_converged = np.all(value_param_changes[i-iter_memory+1:i+1] <= value_tol)
#         policy_params_converged = np.all(policy_param_changes[i-iter_memory+1:i+1] <= policy_tol)
#         if value_params_converged and policy_params_converged:
#             converged = True
#             break

if converged:
    print('Converged after {} iterations.'.format(i+1))
else:
    print('Did not converge!')

## Training: Results

In [None]:
# Number of outer policy-iteration steps that were performed
final_iter = i+1

fig = plt.figure(figsize=(10, 5), dpi=100)
fig.subplots_adjust(wspace=0.5, hspace=0.5)

# Left column: value function; right column: policy.
# Top row: objective on the test set; bottom row: parameter change.
for _position, _series, _ylabel in (
        (221, value_obj_eval, 'Value Function Objective'),
        (223, value_param_changes, 'Max. Value Param. Change'),
        (222, policy_obj_eval, 'Policy Objective'),
        (224, policy_param_changes, 'Max. Policy Param. Change')):
    ax = fig.add_subplot(_position)
    ax.plot(_series[:final_iter+1], '.-r')
    ax.set_xlabel('Iteration')
    ax.set_ylabel(_ylabel)

plt.show()

## Zero-Input and Step Responses

In [None]:
# NOTE(review): `ic` is defined here but neither call below passes it (the
# first passes ic=None explicitly, the second omits it) -- confirm whether
# that is intended.
ic = np.array([0.1, 0.2, 0.1, 0.1])
const = 0.1
steps = 3000

# Zero-input response of the learned policy against the LQR policy
plot_closedloop_response(dynamics, policy, lqr_policy, steps, dt,
                         reference='zero', ic=None, labels=['Learned','True'])

# Step response with reference magnitude `const`
plot_closedloop_response(dynamics, policy, lqr_policy, steps, dt,
                         reference='step', const=const, labels=['Learned','True'])

# Impulse response
# plot_closedloop_response(dynamics, policy, lqr_policy, steps, dt, 'impulse', labels=['Learned','True'])

## Value Function and Policy Comparison

In [None]:
# Slices through the origin, 51 grid points per plotted axis
fixed_state = [0., 0., 0., 0.]
n_points = [51, 51]

# Exact LQR value function and policy evaluated on the shared placeholder
true_values = lqr_value_function(states)
true_actions = lqr_policy(states)

# Learned quantities in red, exact LQR quantities in blue
for _plot, _learned, _exact in (
        (plot_value_function, values, true_values),
        (plot_policy, actions, true_actions)):
    _plot(_learned, _exact, n_points, fixed_state, colors=['r','b'])

## TensorBoard Graph

In [None]:
# plotting.show_graph(tf.get_default_graph())