In [None]:
from __future__ import division, print_function

import tensorflow as tf
import numpy as np
from scipy import linalg
import matplotlib.pyplot as plt
%matplotlib inline

from plotting import show_graph

import safe_learning

## Specify RL problem

In [None]:
# State domain used for both discretizations, and the discount factor
# handed to policy iteration.
domain = [-1, 1]
gamma = .98

# Scalar system coefficients and quadratic weights.
# NOTE(review): presumably the dynamics are x' = a * x + b * u -- confirm
# against safe_learning.LinearSystem.
a = 1.2
b = 1.
q = np.array([[1.]])
r = np.array([[1.]])

# Standard LQR setup
dynamics = safe_learning.LinearSystem((a, b))
# The quadratic function is built from the block-diagonal matrix diag(q, r).
reward = safe_learning.QuadraticFunction(linalg.block_diag(q, r))

# Compute optimal value function and controller
k_opt, s_opt = safe_learning.utilities.dlqr(a, b, q, r)

# Discretize the world (same domain as the policy grid below)
discretization = safe_learning.GridWorld(domain, 101)
state_space = discretization.all_points

# Specify the value function, initialized with s_opt * x**2 at every grid point
value_function = safe_learning.Triangulation(discretization,
                                             s_opt * state_space ** 2,
                                             project=True,
                                             name='value_function')

# Specify a policy on a coarser grid, initialized to all zeros.
# The builtin `float` is used for the dtype: `np.float` was only an alias for
# it and has been removed from recent NumPy releases.
policy_discretization = safe_learning.GridWorld(domain, 15)
policy = safe_learning.Triangulation(policy_discretization,
                                     np.zeros((policy_discretization.nindex, 1), dtype=float),
                                     name='policy')

rl = safe_learning.PolicyIteration(
    policy,
    dynamics,
    reward,
    value_function,
    gamma=gamma)

# Create a tensorflow session. When this cell is re-run, the session left over
# from the previous run is closed first; on a fresh kernel `session` does not
# exist yet, which is the NameError being ignored.
try:
    session.close()
except NameError:
    pass
finally:
    session = tf.InteractiveSession()
    session.run(tf.global_variables_initializer())

## Plot value function

The value function is already optimal, but the policy is all zeros. So let's optimize the policy with respect to this value function.

In [None]:
# Current value-function parameters, drawn over the discretized state space.
value_parameters = rl.value_function.parameters[0].eval()
plt.plot(state_space, value_parameters)
# The quadratic used to initialize the value function can be overlaid with:
# plt.plot(state_space, s_opt * state_space ** 2)
plt.show()

# Output of the current policy at every discretized state.
policy_actions = rl.policy(rl.state_space).eval()
plt.plot(state_space, policy_actions)
plt.show()
        

## Create optimization problem

In [None]:
with tf.variable_scope('optimization/objective'):
    # Scalar objective: sum of rl.future_values over all discretized states.
    future_values = rl.future_values(rl.state_space)
    loss = tf.reduce_sum(future_values, name='loss')

    # Scalar constraint tensor: the smallest value of 0.3 - |policy(state)|
    # over all states. It is non-negative exactly when every policy output
    # has magnitude at most 0.3.
    actions = rl.policy(rl.state_space)
    action_margin = 0.3 - tf.abs(actions)
    input_inequality = tf.reduce_min(action_margin, name='input_inequality')

In [None]:
# Short alias for the SciPy-backed optimizer wrapper in tf.contrib.
ScipyOptimizerInterface = tf.contrib.opt.ScipyOptimizerInterface

with tf.variable_scope('optimization/optimizer'):
    # Only the policy parameters are optimized; the input bound is passed to
    # SLSQP as an inequality constraint.
    optimizer = ScipyOptimizerInterface(
        loss,
        var_list=rl.policy.parameters,
        inequalities=[input_inequality],
        method='SLSQP')

    # Run the optimization in the interactive session, feeding the values
    # supplied by the dynamics object.
    dynamics_feed = rl.dynamics.feed_dict
    optimizer.minimize(session, feed_dict=dynamics_feed)


# Display the default graph using the plotting helper.
show_graph(tf.get_default_graph())

In [None]:
# Values the policy parameters were initialized with.
initial_param = rl.policy.parameters[0].initial_value.eval()

# Initial parameters, drawn at the points of the policy grid.
policy_grid = policy_discretization.all_points
plt.plot(policy_grid, initial_param, label='init')

# Linear feedback -k_opt * x from the LQR solution, drawn wide and translucent
# so the curve plotted on top of it stays visible.
lqr_actions = -k_opt * state_space
plt.plot(state_space, lqr_actions, label='opt', lw=5, alpha=0.5)

# Current policy evaluated on the fine state grid.
current_actions = rl.policy(state_space).eval()
plt.plot(state_space, current_actions, label='new')
plt.legend()