# Cart-Pole Example

In [None]:
import numpy as np
from scipy.signal import cont2discrete
import tensorflow as tf
import os
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
%matplotlib inline

import safe_learning
import plotting

try:
    from tqdm import tqdm
except ImportError:
    tqdm = lambda x: x
    
from tqdm import tnrange, tqdm_notebook

In [None]:
def uniform_state_sampler(n_samples, state_space, scale=False):
    """
    Sample states uniformly from an axis-aligned box in the state space.

    Parameters
    ----------
    n_samples : int or float
        Number of samples to draw. Converted with ``int`` so that values
        such as ``1e3`` are accepted.
    state_space : sequence of [lower, upper] pairs
        Bounds of the box, one pair per state dimension.
    scale : bool, optional
        If True, every dimension is divided by the largest absolute bound of
        that dimension, so the returned samples lie in [-1, 1].

    Returns
    -------
    samples : ndarray
        Array of shape ``(n_samples, len(state_space))``. An empty
        ``state_space`` yields an array with zero columns.
    """
    n_samps = int(n_samples)
    columns = []
    for bounds in state_space:
        offset = bounds[0]
        delta = bounds[1] - offset
        column = delta*np.random.rand(n_samps, 1) + offset
        if scale:
            column = column / np.max(np.abs(bounds))
        columns.append(column)
    if not columns:
        # No dimensions to sample: return an empty array rather than failing
        # on an unbound local variable.
        return np.zeros((n_samps, 0))
    # Concatenate once instead of growing the array inside the loop.
    return np.concatenate(columns, axis=1)

def compute_trajectory(dynamics, policy, initial_condition, steps, input_type=None, dim_input=1):
    """
    Roll out the closed-loop system for a fixed number of steps.

    Relies on the notebook-level globals ``session`` (an open TensorFlow
    session) and ``dtype`` (data type of the state placeholder); both must be
    defined before this function is called.

    Parameters
    ----------
    dynamics : callable
        Maps ``(states, actions)`` tensors to the next-state tensor.
    policy : callable
        Maps a state tensor to an action tensor.
    initial_condition : array_like
        Initial state; its length determines the state dimension.
    steps : int
        Number of simulation steps.
    input_type : {None, 'step'}, optional
        Extra input added to the policy output at every step: ``None`` adds
        zeros, ``'step'`` adds ones.
    dim_input : int, optional
        Dimension of the input vector.

    Returns
    -------
    trajectory : ndarray
        Array of shape ``(steps + 1, dim_state)`` whose first row is the
        initial condition.

    Raises
    ------
    ValueError
        If ``input_type`` is neither ``None`` nor ``'step'``.
    """
    # Validate up front so an unsupported input type fails with a clear
    # message instead of an unbound local variable later on.
    if input_type is None:
        u = np.zeros((1, dim_input))
    elif input_type == 'step':
        u = np.ones((1, dim_input))
    else:
        raise ValueError(
            "Unknown input_type {!r}; expected None or 'step'.".format(input_type))

    dim_state = len(initial_condition)
    # Builtin float instead of the np.float alias, which newer NumPy removed.
    trajectory = np.zeros((steps+1, dim_state), dtype=float)
    trajectory[0, :] = initial_condition

    with tf.name_scope('compute_trajectory'):
        current_state = tf.placeholder(dtype, shape=[1, dim_state])
        next_state = dynamics(current_state, policy(current_state) + u)

    for i in range(steps):
        # trajectory[[i], :] keeps the leading axis so the feed matches the
        # (1, dim_state) placeholder shape.
        trajectory[i+1, :] = session.run(next_state, feed_dict={current_state: trajectory[[i], :]})
    return trajectory


### Discrete-time Linearized Cart-Pole Dynamics

In [None]:
# Physical parameters of the cart-pole
m = 10      # pendulum mass
M = 10      # cart mass
L = 100     # pole length
g = 9.81    # gravitational constant

# Continuous-time linearization with state ordering
# s = (q, q_dot) = (x, theta, x_dot, theta_dot)
dim_state = 4
dim_input = 1
A = np.zeros((dim_state, dim_state))
A[0, 2] = 1                  # d/dt x = x_dot
A[1, 3] = 1                  # d/dt theta = theta_dot
A[2, 1] = g*m/M
A[3, 1] = g*(m+M)/(L*M)
B = np.reshape([0, 0, 1/M, 1/(M*L)], (-1, dim_input))

# Zero-order-hold discretization; only the state and input matrices are kept
dt = 0.01    # sampling time
Ad, Bd = cont2discrete((A, B, 0, 0), dt, method='zoh')[:2]

# Symmetric bounds on the states and the input
x_max = 10
theta_max = np.deg2rad(45)
x_dot_max = 5
u_max = g*m*theta_max
theta_dot_max = (g*(m+M)*theta_max + u_max)/(M*L)

state_space = [[-bound, bound]
               for bound in (x_max, theta_max, x_dot_max, theta_dot_max)]
input_space = [[-u_max, u_max]]

# Diagonal scaling matrices built from the largest absolute bound of each
# dimension, and the dynamics expressed in the scaled coordinates
state_scale = [np.max(np.abs(bounds)) for bounds in state_space]
input_scale = [np.max(np.abs(bounds)) for bounds in input_space]
Ts = np.diag(state_scale)
Tu = np.diag(input_scale)
Ts_inv = np.diag([1/value for value in state_scale])
Tu_inv = np.diag([1/value for value in input_scale])
A_scl = np.dot(np.dot(Ts_inv, Ad), Ts)
B_scl = np.dot(np.dot(Ts_inv, Bd), Tu)

# TensorFlow wrapper for the dynamics
@safe_learning.utilities.with_scope('scaled_dynamics')
def scaled_dynamics(states, actions):
    """Apply the scaled linear dynamics to row-wise batches of states and actions."""
    state_term = tf.matmul(states, A_scl.T)
    input_term = tf.matmul(actions, B_scl.T)
    return state_term + input_term

### Cost Matrices and Exact LQR Solution

In [None]:
# Quadratic state cost weight
Q = 2*np.eye(dim_state)

# Quadratic input cost weight
R = np.eye(dim_input)

# Exact LQR solution on the scaled system: feedback gain K for the optimal
# law u = -K.dot(x), and the matrix P of the exact value function
K, P = safe_learning.utilities.dlqr(A_scl, B_scl, Q, R)

# TensorFlow wrapper for the LQR policy
@safe_learning.utilities.with_scope('LQR_policy')
def lqr_policy(states):
    """Return the LQR feedback actions u = -K x for a row-wise batch of states."""
    negative_gain = -K.T
    return tf.matmul(states, negative_gain)

# "Maximum norm" reward: the quadratic cost evaluated at the largest state and
# input. Unit vectors are used here; the bounds taken from the state and input
# spaces are kept below as a disabled alternative.
# max_state = np.array([np.max(np.abs(i)) for i in state_space]).reshape((1, dim_state))
# max_input = np.array([np.max(np.abs(i)) for i in input_space]).reshape((1, dim_input))
max_state = np.ones((1, dim_state))
max_input = np.ones((1, dim_input))
state_term = max_state.dot(Q).dot(max_state.T)
input_term = max_input.dot(R).dot(max_input.T)
max_reward = state_term + input_term

print('Maximum reward is: {}'.format(max_reward))

# TensorFlow wrapper for the reward function with scaling
@safe_learning.utilities.with_scope('reward_function')
def reward_function(states, actions):
    """Return the negative quadratic cost per row, divided by 1000, as a column."""
    state_cost = tf.matmul(states, Q)*states
    action_cost = tf.matmul(actions, R)*actions
    # Sum over the feature axis but keep it, so the result has one column
    total_cost = tf.reduce_sum(state_cost + action_cost, axis=1, keep_dims=True)
    return -total_cost / 1000

### Function Approximators

In [None]:
# Value function: three layers of sizes 64, 64 and 1, with ReLU on the first
# two layers and no activation on the output layer
layer_dims = [64, 64, 1]
activations = [tf.nn.relu, tf.nn.relu, None]
value_function = safe_learning.functions.NeuralNetwork(layer_dims,
                                                       activations,
                                                       name='value_function')

# Policy: a single layer with one output and no activation
layer_dims = [1]
activations = [None]
# initializer = tf.constant_initializer(-K.T)
# NOTE(review): `initializer` is created here but never passed to the
# NeuralNetwork constructor below, so it has no effect in this cell --
# confirm whether it was meant to be used.
initializer = tf.contrib.layers.xavier_initializer()
policy = safe_learning.functions.NeuralNetwork(layer_dims,
                                               activations,
                                               name='policy')

### States, Actions, Rewards, and Values

In [None]:
# Start from a clean default graph before creating any of the ops below
tf.reset_default_graph()
# Data type used for the placeholders in this notebook
dtype = safe_learning.config.dtype

# Batch of states; the batch dimension is left unspecified
states = tf.placeholder(dtype, shape=[None, dim_state], name='states')
# Actions chosen by the learned policy for those states
actions = policy(states)

# One-step rewards and successor states under the learned policy
rewards = reward_function(states, actions)
future_states = scaled_dynamics(states, actions)

# Value estimates at the current and at the successor states
values = value_function(states)
future_values = value_function(future_states)

### Optimization Objectives

In [None]:
# Discount factor and gradient-descent step sizes for the two updates
gamma = 0.98
value_learning_rate = 1e-1
policy_learning_rate = 1e-1

# TODO scale objectives for value and policy updates appropriately

# Bellman error objective for value update
with tf.name_scope('value_optimization'):
    # The one-step target is treated as a constant: no gradient flows through it
    target = tf.stop_gradient(rewards + gamma*future_values, name='target')
    # Normalize by the infinite series: (1 - gamma) is the reciprocal of the
    # geometric series sum 1 / (1 - gamma)
    value_obj = (1 - gamma)*tf.reduce_mean(tf.square(values - target), name='objective')
    value_summary = tf.summary.scalar('value_objective',
                                      value_obj)
    optimizer = tf.train.GradientDescentOptimizer(value_learning_rate)
    # Only the value function parameters are updated by this step
    value_update = optimizer.minimize(value_obj,
                                      var_list=value_function.parameters)

# Pseudo-integration objective for policy update
with tf.name_scope('policy_optimization'):
    # Minimizing the negative mean maximizes reward plus discounted future value
    policy_obj = -tf.reduce_mean(rewards + gamma*future_values, name='objective')
    policy_summary = tf.summary.scalar('policy_objective',
                                       policy_obj)
    optimizer = tf.train.GradientDescentOptimizer(policy_learning_rate)
    # Only the policy parameters are updated by this step
    policy_update = optimizer.minimize(policy_obj,
                                       var_list=policy.parameters)

### Run Optimization

In [None]:
# Close the session left over from a previous run of this cell, if there is one
try:
    session.close()
except NameError:
    pass
session = tf.Session()
session.run(tf.global_variables_initializer())

# Number of outer iterations and samples per mini-batch
# (batch_size is a float; uniform_state_sampler converts it with int())
max_iters = 100
batch_size = 1e3
# Fixed set of states used to evaluate the objectives
# NOTE(review): states are sampled with scale=False while the dynamics are
# expressed in scaled coordinates -- confirm that this is intended.
test_set = uniform_state_sampler(1e5, state_space, scale=False)

# Value updates per outer iteration and tolerance on the value parameter change
value_iters = 100
value_tol = 1e-3

# Policy updates per outer iteration and tolerance on the policy parameter change
policy_iters = 50
policy_tol = 1e-3

# Sanity check: largest reward magnitude on the test set
temp = session.run(rewards, {states: test_set})
print(np.max(np.abs(temp)))

In [None]:
def _max_param_change(new_params, old_params):
    """Return the largest absolute element-wise change between paired parameter arrays."""
    changes = [np.max(np.abs(new - old)) for new, old in zip(new_params, old_params)]
    return max(changes) if changes else 0


# Alternate value and policy updates until the parameters stop changing
# between consecutive outer iterations or max_iters is reached.
converged = False
value_obj_values = np.zeros(max_iters+1)
policy_obj_values = np.zeros(max_iters+1)
value_obj_values[0] = session.run(value_obj, {states: test_set})
policy_obj_values[0] = session.run(policy_obj, {states: test_set})

# Reference parameters for the convergence check; refreshed every iteration
old_value_params = session.run(value_function.parameters)
old_policy_params = session.run(policy.parameters)

for i in tnrange(max_iters):

    # Value update steps, each on a fresh mini-batch
    for _ in range(value_iters):
        batch = uniform_state_sampler(batch_size, state_space, scale=False)
        session.run(value_update, feed_dict={states: batch})

    # Policy update steps, each on a fresh mini-batch
    for _ in range(policy_iters):
        batch = uniform_state_sampler(batch_size, state_space, scale=False)
        session.run(policy_update, feed_dict={states: batch})

    # Get new objective values
    value_obj_values[i+1] = session.run(value_obj, {states: test_set})
    policy_obj_values[i+1] = session.run(policy_obj, {states: test_set})

    # Get new parameters
    value_params = session.run(value_function.parameters)
    policy_params = session.run(policy.parameters)

    if np.isnan(value_obj_values[i+1]) or np.isnan(policy_obj_values[i+1]):
        session.close()
        raise ValueError('Encountered NAN value on iteration {}!'.format(i))

    # Alternative stopping criterion based on objective changes (disabled):
#     # Compute objective changes
#     value_change = value_obj_values[i+1] - value_obj_values[i]
#     policy_change = policy_obj_values[i+1] - policy_obj_values[i]

#     # Stop conditions
#     minimal_change = ((np.abs(value_change) <= value_tol) and
#                       (np.abs(policy_change) <= policy_tol))
#     decrease = (value_change <= 0) and (policy_change <= 0)

    # Compute maximum parameter changes relative to the previous iteration
    max_val_change = _max_param_change(value_params, old_value_params)
    max_pol_change = _max_param_change(policy_params, old_policy_params)

    # Break if converged
    if max_val_change <= value_tol and max_pol_change <= policy_tol:
        converged = True
        break

    # Use this iteration's parameters as the reference for the next one, so
    # the check measures per-iteration change rather than the total drift
    # from the initial parameters.
    old_value_params = value_params
    old_policy_params = policy_params

final_iter = i+1
if converged:
    print('Converged after {} iterations.'.format(final_iter))
else:
    print('Did not converge!')
print('value objective: {} \npolicy objective: {}'.format(value_obj_values[i+1], policy_obj_values[i+1]))
print('value param change: {} \npolicy param change: {}'.format(max_val_change, max_pol_change))

### Visualizations

In [None]:
# Objective histories side by side: value objective on the left, policy
# objective on the right, up to the last completed iteration
fig = plt.figure()
for position, history in ((121, value_obj_values), (122, policy_obj_values)):
    ax = fig.add_subplot(position)
    ax.plot(history[:final_iter+1])

In [None]:
# Largest reward and value magnitudes over the test set
reward_magnitudes = np.abs(session.run([rewards], {states: test_set}))
print('Maximum reward: {}'.format(np.max(reward_magnitudes)))
value_magnitudes = np.abs(session.run([values], {states: test_set}))
print('Maximum value: {}'.format(np.max(value_magnitudes)))

In [None]:
# Closed-loop rollout from a small initial pole angle with no additional input
steps = 2500
initial_condition = np.array([0., 0.01, 0., 0.]).dot(Ts_inv)

# Simulate each controller in scaled coordinates, then return to the original
# dynamics scale
trajectory_rl, trajectory_lqr = [
    compute_trajectory(scaled_dynamics, controller, initial_condition, steps, None, dim_input).dot(Ts)
    for controller in (policy, lqr_policy)
]

# One subplot per state dimension: learned policy in red, LQR in blue
fig = plt.figure(1, figsize=(15, 10), dpi=70)
for i in range(dim_state):
    ax = fig.add_subplot(2, 2, i+1)
    for trajectory, color in ((trajectory_rl, 'r'), (trajectory_lqr, 'b')):
        ax.plot(trajectory[:, i], color)
plt.show()


### Step Response

In [None]:
# Closed-loop rollout from a small initial pole angle with a unit step added
# to the policy output at every time step
steps = 2500
initial_condition = np.array([0., 0.01, 0., 0.]).dot(Ts_inv)

# Simulate each controller in scaled coordinates, then return to the original
# dynamics scale
trajectory_rl, trajectory_lqr = [
    compute_trajectory(scaled_dynamics, controller, initial_condition, steps, 'step', dim_input).dot(Ts)
    for controller in (policy, lqr_policy)
]

# One subplot per state dimension: learned policy in red, LQR in blue
fig = plt.figure(1, figsize=(15, 10), dpi=70)
for i in range(dim_state):
    ax = fig.add_subplot(2, 2, i+1)
    for trajectory, color in ((trajectory_rl, 'r'), (trajectory_lqr, 'b')):
        ax.plot(trajectory[:, i], color)
plt.show()

In [None]:
# plotting.show_graph(tf.get_default_graph())