# Reinforcement Learning for an Inverted Pendulum

In [None]:
from __future__ import division, print_function

import numpy as np
import tensorflow as tf
from scipy.linalg import block_diag
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

import time
%matplotlib inline

import safe_learning
import plotting
from utilities import (sample_box, sample_box_boundary, sample_ellipsoid, constrained_batch_sampler, 
                       InvertedPendulum, compute_closedloop_response, get_max_parameter_change)

# Progress bars are optional: without tqdm, loops run over the plain iterable.
try:
    from tqdm import tqdm
except ImportError:
    def tqdm(iterable):
        """Fallback for a missing tqdm; hands back its argument unchanged."""
        return iterable

# Floating-point types configured in safe_learning (TensorFlow and NumPy).
tf_dtype = safe_learning.config.dtype
np_dtype = safe_learning.config.np_dtype

# Start from an empty default graph and a fresh interactive session. A session
# left over from an earlier run of this cell is closed first; on the very
# first run the name does not exist yet and the NameError is ignored.
tf.reset_default_graph()
try:
    session.close()
except NameError:
    pass
session = tf.InteractiveSession()

# Later cells check this flag before running the global variable initializer.
initialized = False


# TODO testing ****************************************#

# Experiment switches read by the cells below.

# True: `policy` is a neural network and is updated during training;
# False: `policy` is the LQR policy and no policy update op is built.
train_policy = True

# True: `value_function` is a neural network and is updated during training;
# False: `value_function` is the LQR value function.
train_value_function = True

# True: successor states are clipped to [-1, 1] before the value function is
# evaluated on them.
clip_states = False

# True: the value target is a discounted rollout (tf_rollout);
# False: the value target is the one-step target rewards + gamma*future_values.
use_rollout_target = False

# True: the LQR policy is wrapped in a Saturation with bounds -1 and 1, and
# the neural-network policy gets a tanh output activation.
saturate = True

#******************************************************#


In [None]:
class RBFNetwork(safe_learning.DeterministicFunction):
    """Radial basis function network with Gaussian units centred on a grid.

    The centres are all points of a ``safe_learning.GridWorld`` built from
    ``limits`` and ``num_states``. The output is a weighted sum of the
    Gaussian features, with one trainable weight per centre.

    Parameters
    ----------
    limits : array_like
        Converted with ``np.asarray`` and passed to ``GridWorld``.
    num_states : int or sequence
        Passed to ``GridWorld`` (presumably the number of grid points per
        dimension -- confirm against ``safe_learning.GridWorld``).
    variances : float or array_like, optional
        Variance of the Gaussian units. Defaults to the square of the
        smallest entry of the discretization's ``unit_maxes``.
    name : str, optional
        Name handed to the base class.
    """

    def __init__(self, limits, num_states, variances=None, name='rbf_network'):
        super(RBFNetwork, self).__init__(name=name)
        self.discretization = safe_learning.GridWorld(np.asarray(limits), num_states)
        self._hidden_units = self.discretization.nindex
        if variances is not None:
            self.variances = variances
        else:
            self.variances = np.min(self.discretization.unit_maxes)**2    # tau**2
        self._betas = 1 / (2*self.variances)
        self.centres = self.discretization.all_points
        # Lay the centres out as (1, state dimension, hidden units) so they
        # broadcast against a batch of states. The state dimension is inferred
        # from the centres themselves (-1) rather than read from a notebook
        # global, so the class works before `state_dim` has been defined.
        self._centres_3D = np.reshape(self.centres.T, (1, -1, self._hidden_units))

    def build_evaluation(self, states):
        """Build the TensorFlow ops that evaluate the network.

        Parameters
        ----------
        states : tf.Tensor
            Batch of states, one per row (assumed 2-D: batch x state
            dimension -- the code adds a third axis for the centres).

        Returns
        -------
        tf.Tensor
            ``matmul(phi(states), W)`` with one column, where ``phi`` are the
            Gaussian features and ``W`` the trainable ``'weights'`` variable.
        """
        initializer = tf.contrib.layers.xavier_initializer()
        W = tf.get_variable('weights', dtype=tf_dtype, shape=[self._hidden_units, 1], initializer=initializer)
        # (batch, dim, 1) - (1, dim, units) -> (batch, dim, units)
        states_3D = tf.expand_dims(states, axis=2)
        # Squared distance to every centre, summed over the state dimension.
        squared_distances = tf.reduce_sum(tf.square(states_3D - self._centres_3D), axis=1)
        phi_X = tf.exp(-self._betas*squared_distances)
        output = tf.matmul(phi_X, W)
        return output


def tf_rollout(initial_states, tf_dynamics, tf_policy, tf_reward_function, tf_horizon, gamma=1.0):
    """Accumulate discounted rewards along a closed-loop rollout.

    The rollout is built as a ``tf.while_loop``: at step ``k`` the policy is
    evaluated on the current states, ``gamma**k`` times the reward is added to
    the running total, and the dynamics advance the states. The loop runs
    while the step counter is smaller than ``tf_horizon``.

    Parameters
    ----------
    initial_states : tf.Tensor
        States the rollout starts from.
    tf_dynamics, tf_policy, tf_reward_function : callable
        Called as ``tf_policy(states)``, ``tf_reward_function(states, actions)``
        and ``tf_dynamics(states, actions)``.
    tf_horizon : tf.Tensor or number
        Upper bound (exclusive) for the step counter.
    gamma : float or tf.Tensor, optional
        Discount factor; tensors are cast to ``tf_dtype``.

    Returns
    -------
    rollout : tf.Tensor
        Accumulated discounted reward.
    final_states : tf.Tensor
        States after the last step.
    """
    discount = (tf.cast(gamma, tf_dtype) if isinstance(gamma, tf.Tensor)
                else tf.constant(gamma, dtype=tf_dtype))

    def keep_going(total, current_states, step):
        return step < tf_horizon

    def advance(total, current_states, step):
        current_actions = tf_policy(current_states)
        total = total + tf.pow(discount, step)*tf_reward_function(current_states, current_actions)
        next_states = tf_dynamics(current_states, current_actions)
        return total, next_states, step + 1

    # The step counter is a float so it can serve as the exponent in tf.pow
    # and be compared with a float-typed horizon.
    first_step = tf.constant(0, dtype=tf_dtype)
    # A single [1, 1] zero; broadcasting expands it to one total per state.
    zero_total = tf.constant(0, shape=[1, 1], dtype=tf_dtype)

    loop_vars = [zero_total, initial_states, first_step]
    # The running total grows from 1 row to N rows on the first iteration, so
    # its row count has to be left unspecified.
    invariants = [tf.TensorShape([None, 1]), initial_states.get_shape(), first_step.get_shape()]
    rollout, final_states, _ = tf.while_loop(keep_going, advance, loop_vars, invariants)
    return rollout, final_states


def plot_function_3D(tf_function_A, tf_function_B, n_points, colors=('r', 'b'), title='Value function', show=True):
    """Plot two functions of the 2-D state as overlaid 3-D surfaces.

    Both functions are evaluated on a regular grid over [-1, 1] x [-1, 1] in
    the default TensorFlow session. For every function that is an
    ``RBFNetwork`` its values at the centres are added as scatter points.

    Parameters
    ----------
    tf_function_A, tf_function_B : callable
        Map a tensor of grid points to a tensor supporting ``.eval()``.
    n_points : sequence of two ints
        Number of grid points along the first and second state.
    colors : sequence of two colors, optional
        Colors for function A and function B.
    title : str, optional
        Axes title.
    show : bool, optional
        Call ``plt.show()`` at the end.
    """
    # An imaginary step tells np.mgrid "this many points, endpoints included".
    # The builtin complex is used because the np.complex alias was removed
    # from NumPy.
    xx, yy = np.mgrid[-1:1:complex(0, n_points[0]), -1:1:complex(0, n_points[1])]
    grid = np.column_stack((xx.ravel(), yy.ravel()))
    tf_grid = tf.constant(grid, dtype=tf_dtype)

    functions = (tf_function_A, tf_function_B)
    surfaces = [function(tf_grid).eval().reshape(n_points) for function in functions]

    fig = plt.figure(figsize=(6, 5), dpi=100)
    fig.subplots_adjust(wspace=0.1, hspace=0.2)
    ax = fig.add_subplot(111, projection='3d')

    for k, surface in enumerate(surfaces):
        ax.plot_surface(xx, yy, surface, color=colors[k], alpha=0.5)

    for k, function in enumerate(functions):
        if isinstance(function, RBFNetwork):
            centre_vals = function(function.centres).eval()
            ax.scatter(function.centres[:, 0], function.centres[:, 1], centre_vals, color=colors[k], marker='o')

    ax.set_title(title, fontsize=16)
    # The pendulum state is (theta, omega), matching the state names used in
    # plot_closedloop_response.
    ax.set_xlabel(r'$\theta$', fontsize=14)
    ax.set_ylabel(r'$\omega$', fontsize=14)

    if show:
        plt.show()


def plot_closedloop_response(dynamics, policy1, policy2, steps, dt, reference='zero', const=1.0, ic=None,
                             labels=('Policy 1', 'Policy 2'), colors=('r', 'b'), show=True):
    """Plot the closed-loop responses of two policies side by side.

    Both responses come from ``compute_closedloop_response`` with identical
    arguments apart from the policy. Three panels are drawn: the two state
    trajectories and the action trajectory.

    Parameters
    ----------
    dynamics, policy1, policy2
        Passed to ``compute_closedloop_response``.
    steps, dt
        Passed to ``compute_closedloop_response``.
    reference : str, optional
        ``'zero'``, ``'impulse'`` or ``'step'`` select a specific figure
        title; any other value gets a generic title.
    const : float, optional
        Passed to ``compute_closedloop_response``; shown in the title for
        ``reference='step'``.
    ic : array_like, optional
        Initial state; any shape whose first two flattened entries are the
        two states (e.g. ``(2,)`` or ``(1, 2)``). Shown as (0, 0) if None.
    labels, colors : sequence of two, optional
        Legend labels and line colors for policy 1 and policy 2.
    show : bool, optional
        Call ``plt.show()`` at the end.
    """
    state_dim = 2
    state_traj1, action_traj1, t, _ = compute_closedloop_response(dynamics, policy1, state_dim,
                                                                  steps, dt, reference, const, ic)
    state_traj2, action_traj2, _, _ = compute_closedloop_response(dynamics, policy2, state_dim,
                                                                  steps, dt, reference, const, ic)

    fig = plt.figure(figsize=(10, 3), dpi=100)
    fig.subplots_adjust(wspace=0.5, hspace=0.5)

    if reference == 'zero':
        title_string = r'Zero-Input Response'
    elif reference == 'impulse':
        title_string = r'Impulse Response'
    elif reference == 'step':
        title_string = r'Step Response, $r = %.1g$' % const
    else:
        # Previously an unknown reference left title_string unbound.
        title_string = r'Closed-Loop Response'

    if ic is not None:
        # Flatten so that both (2,) and (1, 2) initial conditions work.
        ic_flat = np.ravel(ic)
        ic_tuple = (ic_flat[0], ic_flat[1])
    else:
        ic_tuple = (0, 0)
    title_string = title_string + r', $s_0 = (%.1g, %.1g)$' % ic_tuple
    fig.suptitle(title_string, fontsize=18)

    # Panels 1 and 2: state trajectories.
    state_names = [r'$\theta$', r'$\omega$']
    for i, state_name in enumerate(state_names):
        ax = fig.add_subplot(1, 3, i + 1)
        ax.plot(t, state_traj1[:, i], colors[0])
        ax.plot(t, state_traj2[:, i], colors[1])
        ax.set_xlabel(r'$t$', fontsize=14)
        ax.set_ylabel(state_name, fontsize=14, rotation=0)

    # Panel 3: action trajectories; their line handles feed the legend.
    ax = fig.add_subplot(1, 3, 3)
    plot1 = ax.plot(t, action_traj1, color=colors[0])
    plot2 = ax.plot(t, action_traj2, color=colors[1])
    ax.set_xlabel(r'$t$', fontsize=14)
    ax.set_ylabel(r'$u$', fontsize=14, rotation=0)
    fig.legend((plot1[0], plot2[0]), (labels[0], labels[1]), loc=(0.85, 0.2), fontsize=14)

    if show:
        plt.show()

## True Dynamics

In [None]:
# System parameters
m = 0.5       # pendulum mass [kg]
L = 1.        # rod length [m]
b = 0.        # rotational friction constant [kg.m**2/s]

# Constants
dt = 0.01   # sampling time
g = 9.81    # gravity

# State and action dimensions (dim_state / dim_input are kept as aliases)
state_dim = 2
action_dim = 1
dim_state = state_dim
dim_input = action_dim

# State and action normalizers
theta_max = np.deg2rad(30)
omega_max = np.sqrt(g/L)
u_max = 1.5*m*g*L*np.sin(theta_max)

state_norm = np.array([theta_max, omega_max])
action_norm = np.array([u_max])

# Define system and dynamics
pendulum = InvertedPendulum(m, L, b, dt, [state_norm, action_norm])

# Limits used for sampling and gridding states and actions
state_limits = np.array([[-1., 1.]]*state_dim)
action_limits = np.array([[-1., 1.]]*action_dim)

# The linear model comes from the pendulum object. The hand-written
# linearization that used to precede this was overwritten here before it was
# ever used, so it has been removed.
A, B = pendulum.linearize()
dynamics = safe_learning.functions.LinearSystem((A, B), name='true_dynamics')

## Reward Function

In [None]:
# Diagonal cost weights: 0.1 on each of the two states and on the action.
Q = 0.1 * np.eye(2)
R = 0.1 * np.eye(1)

# The reward is the negated quadratic cost, so both weight blocks enter with
# a minus sign on the block diagonal.
reward_function = safe_learning.QuadraticFunction(
    block_diag(-Q, -R),
    name='reward_function'
)

## Exact Optimal Policy and Value Function for the True Dynamics

In [None]:
# Solve the discrete-time LQR problem for the linear model: P is the cost
# matrix of the exact value function, K the gain of the feedback law
# u = -K.dot(s)
K, P = safe_learning.utilities.dlqr(A, B, Q, R)
print('LQR gain:\n{}\n'.format(-K))
print('Induced 2-norm of LQR gain:\n{}\n'.format(np.linalg.norm(K, 2)))
print('LQR cost matrix:\n{}\n'.format(P))

# LQR policy
lqr_policy = safe_learning.functions.LinearSystem((-K,), name='LQR_policy')
if saturate:
    lqr_policy = safe_learning.Saturation(lqr_policy, -1, 1)

# Optimal value function (negated cost, consistent with the reward convention)
lqr_value_function = safe_learning.functions.QuadraticFunction(-P, name='LQR_value_function')

# Approximate maximum ellipsoidal level set contained inside [-1, 1]**d via sampling.
# The sample count is passed as an int: a float such as 1e6 is not a valid
# array size for NumPy.
samples = sample_box_boundary(state_limits, int(1e6))
test_values = lqr_value_function(tf.constant(samples, tf_dtype)).eval()
c_min = np.amin(np.abs(test_values))
c_max = np.amax(np.abs(test_values))
print('Minimum LQR cost (approx.):\n{}\n'.format(c_min))
print('Maximum LQR cost (approx.):\n{}\n'.format(c_max))

# LQR response from the initial state (1, 0), given as a single row
T = 100
ic = np.array([1., 0.]).reshape(1, -1)
x, u, t, r = compute_closedloop_response(dynamics, lqr_policy, state_dim, T, dt, 'zero', ic=ic)

# States and action share one axis; the legend order follows the column order
names = [r'$\theta$', r'$\omega$', r'$u$']
plt.plot(t, np.concatenate((x, u), axis=1))
plt.legend(names)
plt.show()

## Function Approximators

In [None]:
# Scaling: r_max is the quadratic cost at the all-ones state and action, and
# (1 - gamma) / r_max is the factor applied to both training objectives.
max_state = np.ones((1, state_dim))
max_action = np.ones((1, action_dim))
r_max = (np.linalg.multi_dot((max_state, Q, max_state.T))
         + np.linalg.multi_dot((max_action, R, max_action.T)))
gamma = tf.placeholder(tf_dtype, shape=[], name='discount_factor')
scaling = (1 - gamma) / r_max

# Value function: the exact LQR value function unless a network is trained
# (two ReLU layers of 64 units and a linear scalar output).
if not train_value_function:
    value_function = lqr_value_function
else:
    layer_dims = [64]*2 + [1]
    activations = [tf.nn.relu]*2 + [None]
    value_function = safe_learning.functions.NeuralNetwork(layer_dims, activations, name='value_function')

# Policy: the LQR policy unless a network is trained (two ReLU layers of 64
# units; the output layer is tanh when saturating and linear otherwise).
if not train_policy:
    policy = lqr_policy
else:
    layer_dims = [64]*2 + [action_dim]
    activations = [tf.nn.relu]*2 + [tf.nn.tanh if saturate else None]
    policy = safe_learning.functions.NeuralNetwork(layer_dims, activations, scaling=1., name='policy')

## TensorFlow Graph

In [None]:
# Build the shared graph nodes used by the optimization cells, then check the
# rollout op against the exact LQR value function for one initial state.

# Batch of states (one per row) and the quantities the current policy induces.
states = tf.placeholder(tf_dtype, shape=[None, state_dim], name='states')
actions = policy(states)
rewards = reward_function(states, actions)
future_states = dynamics(states, actions)

# Value estimates at the current and at the successor states; the successor
# states are optionally clipped to [-1, 1] first.
values = value_function(states)
if clip_states:
    future_values = value_function(tf.clip_by_value(future_states, -1, 1))
else:
    future_values = value_function(future_states)

# The same quantities under the LQR policy and LQR value function.
true_values = lqr_value_function(states)
true_actions = lqr_policy(states)
true_rewards = reward_function(states, true_actions)
true_future_states = dynamics(states, true_actions)

# Rollout
# The rollout uses the LQR policy and tf_rollout's default gamma of 1.0.
horizon = tf.placeholder(tf_dtype, shape=[], name='rollout_horizon')
rollout_op = tf_rollout(states, dynamics, lqr_policy, reward_function, horizon)

# Variables are initialized here for the check below; `initialized` is left
# untouched, so the training cell runs the initializer again.
session.run(tf.global_variables_initializer())
N = 100
T = 100
# NOTE(review): `iters` is not used anywhere in this cell.
iters = 200
# NOTE(review): the sampled state is overwritten by the fixed state on the
# next line.
initial_states = sample_box(state_limits, 1)
initial_states = np.asarray([1., -1.]).reshape(1, -1)

# TensorFlow implementation
# Each run advances the states by one step (horizon 1) and returns that
# step's reward; the cumulative sum turns the per-step rewards into returns.
# NOTE(review): there is a single initial state, so the one reward per step
# is broadcast into all N rows of rollout_values.
rollout_values = np.zeros((N, T))
final_states = initial_states
for i in range(T):
    temp, final_states = session.run(rollout_op, {states: final_states, horizon: 1})
    rollout_values[:, i] = temp.ravel()
rollout_values = np.cumsum(rollout_values, axis=1)

# Largest absolute gap between the T-step return and the LQR value.
true = true_values.eval({states: initial_states})
diff = np.abs(rollout_values[:, -1] - true)
print(np.amax(diff))

# Return after each step against the constant LQR value.
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(rollout_values[0, :])
ax.plot(true[0]*np.ones(T))
plt.show()

## Optimization Objectives

In [None]:
# Bellman error or rollout objective for value update
with tf.name_scope('value_optimization'):
    value_learning_rate = tf.placeholder(tf_dtype, shape=[], name='learning_rate')

    # Target for the value function: either a discounted rollout under the
    # current policy, or the one-step target, which is wrapped in
    # stop_gradient so no gradient flows through it.
    if use_rollout_target:
        rollout_horizon = tf.placeholder(tf_dtype, shape=[], name='rollout_horizon')
        target = tf_rollout(states, dynamics, policy, reward_function, rollout_horizon, gamma)[0]
    else:
        target = tf.stop_gradient(rewards + gamma*future_values, name='target')

    # Scaled mean absolute difference between value estimates and target.
    value_obj = scaling*tf.reduce_mean(tf.abs(values - target), name='objective')
    optimizer = tf.train.GradientDescentOptimizer(value_learning_rate)
    # The update op only exists when the value function is trained, and it
    # only touches the value function's parameters.
    if train_value_function:
        value_update = optimizer.minimize(value_obj, var_list=value_function.parameters)

# Pseudo-integration objective for policy update
with tf.name_scope('policy_optimization'):
    policy_learning_rate = tf.placeholder(tf_dtype, shape=[], name='learning_rate')
    # Negated so that minimizing it maximizes reward plus discounted
    # successor value.
    policy_obj = -scaling*tf.reduce_mean(rewards + gamma*future_values, name='objective')
    # NOTE: this rebinds `optimizer`; the value update op was built above.
    optimizer = tf.train.GradientDescentOptimizer(policy_learning_rate)
    # The update op only exists when the policy is trained, and it only
    # touches the policy's parameters.
    if train_policy:
        policy_update = optimizer.minimize(policy_obj, var_list=policy.parameters)

## Training: Policy Iteration

In [None]:
# Allow this cell to be run repeatedly to continue training if desired
if not initialized:
    session.run(tf.global_variables_initializer())
    initialized = True
# Histories persist across runs of this cell; create them on the first run
if 'value_obj_history' not in locals():
    value_obj_history = np.zeros(0)
    value_param_history = np.zeros(0)
    policy_obj_history = np.zeros(0)
    policy_param_history = np.zeros(0)

# Uniformly distributed test set: a grid with an odd number of points per
# dimension and roughly test_size points in total
test_size = 1e3
grid_length = np.power(test_size, 1 / state_dim)
grid_length = int(2*np.floor(grid_length / 2) + 1)
test_set = safe_learning.GridWorld(state_limits, [grid_length,]*state_dim).all_points

# Training hyperparameters
max_iters = 200
min_iters = 100
level = c_max
# An int, since the batch size is used as a sample count
batch_size = 100
batch = constrained_batch_sampler(dynamics, policy, state_dim, batch_size, action_limit=0.99, zero_pad=0)

value_iters = 100 # 1
policy_iters = 10 # 10

value_tol = 2e-3
policy_tol = 1e-3

feed_dict = {
    states:               test_set,
    gamma:                0.98,
    value_learning_rate:  0.01, # 0.1
    policy_learning_rate: 0.5,  # 0.1
}
if use_rollout_target:
    feed_dict[rollout_horizon] = 25

# Record objective values over time (entry 0 is the value before training)
value_obj_eval = np.zeros(max_iters + 1)
policy_obj_eval = np.zeros(max_iters + 1)
value_obj_eval[0] = value_obj.eval(feed_dict)
policy_obj_eval[0] = policy_obj.eval(feed_dict)

# For convergence, check the parameter values
converged = False
value_param_changes = np.zeros(max_iters)
policy_param_changes = np.zeros(max_iters)
old_value_params = session.run(value_function.parameters)
old_policy_params = session.run(policy.parameters)

# Least-squares slope over the last iter_memory objective values. The design
# matrix gets its own name: storing it in `A` overwrote the dynamics matrix
# that the LQR cell passes to dlqr.
iter_memory = 5
col1 = np.arange(iter_memory) + 1
col2 = np.ones(iter_memory)
lstsq_design = np.column_stack((col1, col2))
lstsq_slope = lambda y: np.linalg.lstsq(lstsq_design, y)[0][0]

# Policy iteration: each outer iteration runs value_iters value updates, then
# policy_iters policy updates, then evaluates both objectives on the test set.
for i in tqdm(range(max_iters)):
    
    # Policy evaluation (value update)
    if train_value_function:
        for _ in range(value_iters):
#             feed_dict[states] = sample_box(state_limits, batch_size)
#             feed_dict[states] = sample_ellipsoid(P, level, batch_size)
            # Fetch a fresh batch of states, then apply one update on it.
            feed_dict[states] = session.run(batch)
            session.run(value_update, feed_dict)
        # Largest parameter change over this outer iteration.
        new_value_params = session.run(value_function.parameters)
        value_param_changes[i] = get_max_parameter_change(old_value_params, new_value_params)
        old_value_params = new_value_params

    # Policy improvement (policy update)
    if train_policy:
        for _ in range(policy_iters):
#             feed_dict[states] = sample_box(state_limits, batch_size)
#             feed_dict[states] = sample_ellipsoid(P, level, batch_size)
            # Fetch a fresh batch of states, then apply one update on it.
            feed_dict[states] = session.run(batch)
            session.run(policy_update, feed_dict)
        # Largest parameter change over this outer iteration.
        new_policy_params = session.run(policy.parameters)
        policy_param_changes[i] = get_max_parameter_change(old_policy_params, new_policy_params)
        old_policy_params = new_policy_params
    
    # Objectives are always recorded on the fixed test set; slot 0 holds the
    # pre-training value, hence the i+1.
    feed_dict[states] = test_set
    value_obj_eval[i+1] = value_obj.eval(feed_dict)
    policy_obj_eval[i+1] = policy_obj.eval(feed_dict)
    
    # TODO debugging    
    if np.isnan(value_obj_eval[i+1]) or np.isnan(policy_obj_eval[i+1]):
        raise ValueError('Encountered NAN value after {} iterations!'.format(i+1))
    
    # Break if converged
    # NOTE(review): both convergence checks below are commented out, so
    # `converged` is never set and the loop always runs max_iters iterations.
#     if i >= iter_memory and i >= min_iters:
#         value_params_converged = np.all(value_param_changes[i-iter_memory+1:i+1] <= value_tol)
#         policy_params_converged = np.all(policy_param_changes[i-iter_memory+1:i+1] <= policy_tol)
#         if value_params_converged and policy_params_converged:
#             converged = True
#             break

#     value_obj_converged = lstsq_slope(value_obj_eval[i-iter_memory+1:i+1]) <= value_tol
#     policy_obj_converged = lstsq_slope(policy_obj_eval[i-iter_memory+1:i+1]) <= policy_tol
#     if i > min_iters and value_obj_converged and policy_obj_converged:
#         converged = True
#         break
        

# Number of outer iterations that actually ran
final_iter = i+1
if converged:
    print('Converged after {} iterations.'.format(final_iter))
else:
    print('Did not converge!')

# Append this run to the persistent histories. The objective arrays hold the
# pre-training value plus one entry per iteration (final_iter + 1 entries);
# the parameter-change arrays hold exactly one entry per iteration, so they
# are cut at final_iter. Cutting them at final_iter + 1 would append a
# spurious zero whenever the loop stops early.
value_obj_history = np.concatenate((value_obj_history, value_obj_eval[:final_iter+1]))
value_param_history = np.concatenate((value_param_history, value_param_changes[:final_iter]))
policy_obj_history = np.concatenate((policy_obj_history, policy_obj_eval[:final_iter+1]))
policy_param_history = np.concatenate((policy_param_history, policy_param_changes[:final_iter]))

## Training: Results

In [None]:
# Training curves: objectives in the top row, parameter changes in the bottom
# row; value function on the left, policy on the right.
fig = plt.figure(figsize=(10, 5), dpi=100)
fig.subplots_adjust(wspace=0.5, hspace=0.5)

# Values above `cap` are clipped for readability; [start:end] selects the
# plotted window. end = None keeps the most recent entry, which a slice end
# of -1 would silently drop.
cap = 20
start = 0
end = None

panels = (
    (221, value_obj_history, 'Value Function Objective'),
    (223, value_param_history, 'Max. Value Param. Change'),
    (222, policy_obj_history, 'Policy Objective'),
    (224, policy_param_history, 'Max. Policy Param. Change'),
)
for position, history, ylabel in panels:
    ax = fig.add_subplot(position)
    ax.plot(np.clip(history[start:end], None, cap), '.-r')
    ax.set_xlabel('Iteration')
    ax.set_ylabel(ylabel)

plt.show()

## Zero-Input and Step Responses

In [None]:
# Settings for the response comparison between the learned and LQR policies
ic = np.array([0.1, 0.2])   # initial state for the zero-input response
const = 0.4                 # reference value for the step response
steps = 1000                # number of simulated steps

# Zero-input response
plot_closedloop_response(
    dynamics, policy, lqr_policy, steps, dt,
    reference='zero', ic=ic, labels=['Learned','True']
)

# Step response
plot_closedloop_response(
    dynamics, policy, lqr_policy, steps, dt,
    reference='step', const=const, labels=['Learned','True']
)

# Impulse response
# plot_closedloop_response(dynamics, policy, lqr_policy, steps, dt, 'impulse', labels=['Learned','True'])

## Value Function and Policy Comparison

In [None]:
# If training was skipped, the variables still need initial values before
# the functions can be evaluated for plotting.
if not initialized:
    session.run(tf.global_variables_initializer())
    initialized = True
    time.sleep(1.5)
    print('Initialized!')

# 51 grid points along each state
n_points = [51] * 2

# Learned versus LQR value function, then learned versus LQR control law
plot_function_3D(value_function, lqr_value_function, n_points=n_points, title='Value function')
plot_function_3D(policy, lqr_policy, n_points=n_points, title='Control')