In [1]:
import numpy as np
from scipy.signal import cont2discrete
import tensorflow as tf
import os
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
%matplotlib inline

import safe_learning
import plotting

try:
    from tqdm import tqdm
except ImportError:
    tqdm = lambda x: x

In [2]:
def uniform_state_sampler(n_samples, domain, scaling=None):
    """
    Samples uniformly from a box in the state space.
    """
    dim_state = len(domain)
    n_samps = int(n_samples)
    for i in range(dim_state):
        delta = domain[i][1]-domain[i][0]
        offset = domain[i][0]
        single_dim = delta*np.random.rand(n_samps, 1) + offset
        if scaling is not None:
            single_dim = single_dim / scaling[i]
        if i == 0:
            samples = single_dim
        else:
            samples = np.concatenate((samples, single_dim), axis=1)
    return samples


def get_unique_subfolder(data_dir, prefix):
    """
    Return directory string consisting of prefix appended by an underscore and
    then an integer. Ensures string is unique from other similarly prefixed
    folder names.
    """
    subfolders = [f for f in os.listdir(data_dir) if f[:len(prefix)] == prefix]
    if len(subfolders) == 0:
        unique_name = prefix + '_0'
    else:
        indices = [int(sf[len(prefix)+1:]) for sf in subfolders]
        max_index = np.max(indices)
        unique_name = prefix + '_' + str(max_index + 1)
    return unique_name

In [3]:
# %% Linearized cart-pole system

# System parameters
m = 10      # pendulum mass
M = 10      # cart mass
L = 100       # pole length
g = 9.81    # gravitational constant

# Linearized continuous-time system matrices for state vector:
# s = (q, q_dot) = (x, theta, x_dot, theta_dot)
dim_state = 4
dim_input = 1
A = np.array([[0, 0,             1, 0],
              [0, 0,             0, 1],
              [0, g*m/M,         0, 0],
              [0, g*(m+M)/(L*M), 0, 0]])
B = np.array([0, 0, 1/M, 1/(M*L)]).reshape(-1, dim_input)

# Discretized system matrices
Ts = 0.01    # sampling time
Ad, Bd, _, _, _ = cont2discrete((A, B, 0, 0), Ts, method='zoh')

# Domain and scaling
x_max = 10
theta_max = np.deg2rad(45)
x_dot_max = 5
u_max = g*m*theta_max
theta_dot_max = (g*(m+M)*theta_max + u_max)/(M*L)

domain = [[-x_max, x_max], [-theta_max, theta_max],
          [-x_dot_max, x_dot_max], [-theta_dot_max, theta_dot_max]]
state_scaling = [x_max, theta_max, x_dot_max, theta_dot_max]
action_scaling = [u_max]

Ts = np.diag(state_scaling)
Tu = np.diag(action_scaling)
Ts_inv = np.diag([1/i for i in state_scaling])
Tu_inv = np.diag([1/i for i in action_scaling])
A_scl = Ts_inv.dot(A).dot(Ts)
B_scl = Ts_inv.dot(B).dot(Tu)

In [4]:
# %% Cost matrices and exact value function

# State cost matrix
Q = 2*np.identity(dim_state)

# Input cost matrix
R = np.identity(dim_input)

# Solve Lyapunov equation for the exact value function and optimal feedback law
# u = -K.dot(x)
K, P = safe_learning.utilities.dlqr(A_scl, B_scl, Q, R)

In [5]:
# %% TensorFlow functions for the cost function and the scaled, discretized, linearized dynamics

@safe_learning.utilities.with_scope('cost_function')
def cost_function(states, actions):
    """
    """
    costs = tf.reduce_sum(tf.matmul(states, Q)*states
                          + tf.matmul(actions, R)*actions,
                          axis=1,
                          keep_dims=True)
    return costs


@safe_learning.utilities.with_scope('dynamics')
def dynamics(states, actions):
    """
    """
    future_states = tf.matmul(states, A_scl.T) + tf.matmul(actions, B_scl.T)
    return future_states

In [6]:
# %% Function approximators

# Value function
layer_dims = [64, 64, 1]
activations = [tf.nn.relu, tf.nn.relu, None]
value_function = safe_learning.functions.NeuralNetwork(layer_dims,
                                                       activations,
                                                       name='value_function')

# Policy
layer_dims = [1]
activations = [None]
# initializer = tf.constant_initializer(-K.T)
initializer = tf.contrib.layers.xavier_initializer()
policy = safe_learning.functions.NeuralNetwork(layer_dims,
                                               activations,
                                               name='policy')

In [7]:
# %% States, actions, stage costs, and values

tf.reset_default_graph()
dtype = safe_learning.config.dtype

states = tf.placeholder(dtype, shape=[None, dim_state], name='states')
actions = policy(states)

costs = cost_function(states, actions)
future_states = dynamics(states, actions)

values = value_function(states)
future_values = value_function(future_states)

In [8]:
# %% Optimization
gamma = 0.98
value_learning_rate = 1e-5
policy_learning_rate = 1e-3

# Bellman error objective for value update
with tf.name_scope('value_optimization'):
    target = tf.stop_gradient(costs + gamma*future_values, name='target')
    value_obj = tf.reduce_mean(tf.square(values - target), name='objective')
    value_summary = tf.summary.scalar('value_objective',
                                      value_obj)
    optimizer = tf.train.GradientDescentOptimizer(value_learning_rate)
    value_update = optimizer.minimize(value_obj,
                                      var_list=value_function.parameters)

# Pseudo-integration objective for policy update
with tf.name_scope('policy_optimization'):
    policy_obj = tf.reduce_mean(costs + gamma*future_values, name='objective')
    policy_summary = tf.summary.scalar('policy_objective',
                                       policy_obj)
    optimizer = tf.train.GradientDescentOptimizer(policy_learning_rate)
    policy_update = optimizer.minimize(policy_obj,
                                       var_list=policy.parameters)

In [11]:
# %% TensorFlow session
try:
    session.close()
except NameError:
    pass
session = tf.InteractiveSession()

# log_dir = '/data/cart_pole'
# run_dir = '/' + get_unique_subfolder(os.getcwd() + log_dir, 'run')
# path = os.getcwd() + log_dir + run_dir
# writer = tf.summary.FileWriter(path, session.graph)
session.run(tf.global_variables_initializer())

max_iters = 25
batch_size = 1e3
test_set = uniform_state_sampler(1e5, domain, state_scaling)

value_iters = 200
value_tol = 1e-1

policy_iters = 50
policy_tol = 1e-1

summary_period = 1

converged = False
old_value_obj_val = session.run(value_obj, {states: test_set})
old_policy_obj_val = session.run(policy_obj, {states: test_set})

for i in tqdm(range(max_iters)):

    for _ in range(value_iters):
        batch = uniform_state_sampler(batch_size, domain, state_scaling)
        session.run(value_update, feed_dict={states: batch})

    for _ in range(policy_iters):
        batch = uniform_state_sampler(batch_size, domain, state_scaling)
        session.run(policy_update, feed_dict={states: batch})

    # Get new objective values
    value_obj_val = session.run(value_obj, {states: test_set})
    policy_obj_val = session.run(policy_obj, {states: test_set})

    if i % summary_period == 0:
        # Record data
        value_rec, policy_rec = session.run([value_summary, policy_summary],
                                            {states: test_set})
#         writer.add_summary(value_rec, i)
#         writer.add_summary(policy_rec, i)

    # Throw error if NAN to save time during tuning
    if np.isnan(value_obj_val) or np.isnan(policy_obj_val):
#         writer.close()
        raise ValueError('Encountered NAN value on iteration {}!'.format(i))

    # Compute objective changes
    value_change = value_obj_val - old_value_obj_val
    policy_change = policy_obj_val - old_policy_obj_val

    # Stop conditions
    minimal_change = ((np.abs(value_change) <= value_tol) and
                      (np.abs(policy_change) <= policy_tol))
    decrease = (value_change <= 0) and (policy_change <= 0)

    # Break if converged
    if minimal_change:
        converged = True
        break
    else:
        old_value_obj_val = value_obj_val
        old_policy_obj_val = policy_obj_val

if converged:
    print('\nConverged after {} iterations.'.format(i + 1))
else:
    print('\nDid not converge!')
print('value objective: {} \npolicy objective: {}'.format(value_obj_val,
      policy_obj_val))

# writer.close()


  0%|          | 0/25 [00:00<?, ?it/s][A
  4%|▍         | 1/25 [00:01<00:27,  1.14s/it][A
  8%|▊         | 2/25 [00:02<00:25,  1.12s/it][A
 12%|█▏        | 3/25 [00:03<00:24,  1.12s/it][A
 16%|█▌        | 4/25 [00:04<00:23,  1.12s/it][A
 20%|██        | 5/25 [00:05<00:22,  1.13s/it][A
 24%|██▍       | 6/25 [00:06<00:21,  1.14s/it][A
 28%|██▊       | 7/25 [00:07<00:20,  1.13s/it][A
Exception in thread Thread-4:
Traceback (most recent call last):
  File "/Users/spenrich/Tools/anaconda3/envs/thesis/lib/python3.6/threading.py", line 916, in _bootstrap_inner
    self.run()
  File "/Users/spenrich/Tools/anaconda3/envs/thesis/lib/python3.6/site-packages/tqdm/_tqdm.py", line 144, in run
    for instance in self.tqdm_cls._instances:
  File "/Users/spenrich/Tools/anaconda3/envs/thesis/lib/python3.6/_weakrefset.py", line 60, in __iter__
    for itemref in self.data:
RuntimeError: Set changed size during iteration

 68%|██████▊   | 17/25 [00:18<00:08,  1.12s/it]


Converged after 18 iterations.
value objective: 8.851085218344487 
policy objective: 2.904700835172338





In [12]:
plotting.show_graph(tf.get_default_graph())