In [None]:
from __future__ import division, print_function

import GPflow
import tensorflow as tf
ScipyOptimizerInterface = tf.contrib.opt.ScipyOptimizerInterface
import numpy as np
import scipy
import cvxpy
from scipy import linalg
import matplotlib.pyplot as plt
%matplotlib inline

from scipy.optimize import check_grad
from plotting import plot_lyapunov_1d

import safe_learning

## Specify RL problem

In [None]:
# State domain and resolution of the grid on which the value function lives.
domain = [-1, 1]
n_points = 101

# Discount factor passed to the policy-iteration object below.
gamma = 0.98

# Scalar linear system (a, b) and quadratic weights (q, r), each kept as a
# 1x1 array.
a = np.array([[1.2]])
b = np.array([[1.0]])
q = np.array([[1.0]])
r = np.array([[1.0]])

dynamics = safe_learning.LinearSystem(a, b)

discretization = safe_learning.GridWorld(domain, n_points)
state_space = discretization.all_points

# LQR solution for (a, b, q, r). `k_opt` is later plotted as the feedback
# law -k_opt * x and `s` as the quadratic value s * x^2.
k_opt, s = safe_learning.utilities.dlqr(a, b, q, r)

# Start the value-function approximator at the LQR values on the grid.
lqr_values = s * state_space ** 2
value_function = safe_learning.Triangulation(discretization, lqr_values, project=True)

# Quadratic function built from the block-diagonal matrix diag(q, r);
# presumably evaluated on stacked (state, action) pairs -- confirm against
# safe_learning.QuadraticFunction.
reward = safe_learning.QuadraticFunction(linalg.block_diag(q, r))

# The policy gets its own, coarser grid (15 points) and starts at zero
# everywhere.
policy_discretization = safe_learning.GridWorld([-1, 1], 15)
zero_policy_values = policy_discretization.all_points * 0.
policy = safe_learning.Triangulation(policy_discretization, zero_policy_values)

rl = safe_learning.PolicyIteration(state_space,
                                   policy,
                                   dynamics,
                                   reward,
                                   function_approximator=value_function,
                                   gamma=gamma)

# Re-running this cell should not leak the previous TensorFlow session, so
# close it first when one exists.
try:
    session.close()
except NameError:
    # First execution on a fresh kernel: `session` has not been created yet.
    pass
except tf.errors.OpError as error:
    # Closing is best-effort; report the problem instead of hiding it, then
    # carry on with a fresh session.
    print('Could not close the previous session: {}'.format(error))

session = tf.Session()
session.run(tf.global_variables_initializer())

## Plot value function (optimize if wanted)

In [None]:
# Tensor.eval() needs a default session, hence the context manager.
with session.as_default():
    # Figure 1: current value-function parameters against the LQR
    # quadratic s * x^2 on the same grid.
    plt.plot(rl.state_space, rl.value_function.parameters.eval(),
             label='value function parameters')
    plt.plot(rl.state_space, s * rl.state_space ** 2, label='s * x^2 (LQR)')
    plt.xlabel('state')
    plt.ylabel('value')
    plt.legend()
    plt.show()

    # Figure 2: output of the current policy on the value-function grid.
    plt.plot(rl.state_space, rl.policy(rl.state_space).eval(), label='policy')
    plt.xlabel('state')
    plt.ylabel('control input')
    plt.legend()
    plt.show()
        

## Create optimization problem

In [None]:
# Objective: future values summed over every state of the discretization.
loss = tf.reduce_sum(rl.future_values(rl.state_space))

# tf.gradients returns a list with one entry per differentiated tensor; only
# the policy parameters are differentiated here, so keep the first entry.
policy_gradients = tf.gradients(loss, rl.policy.parameters)
gradient = tf.convert_to_tensor(policy_gradients[0])

# Scalar constraint value: it is nonnegative exactly when every policy output
# on the grid has magnitude at most 0.3.
policy_outputs = rl.policy(rl.state_space)
input_inequality = tf.reduce_min(0.3 - tf.abs(policy_outputs))

In [None]:
# Minimize `loss` over the policy parameters with SciPy's SLSQP solver.
# Tensors listed under `inequalities` are kept nonnegative by the optimizer,
# which is how the input bound enters the problem.
optimizer = ScipyOptimizerInterface(
    loss,
    var_list=[rl.policy.parameters],
    inequalities=[input_inequality],
    method='SLSQP',
)

# The dynamics object supplies the feed_dict used while evaluating the graph.
optimizer.minimize(session, feed_dict=rl.dynamics.feed_dict)

In [None]:
# Compare the policy before and after optimization with the LQR feedback law.
# Tensor.eval() needs a default session, hence the context manager.
with session.as_default():
    # Parameters the policy variable was initialized with, drawn on the
    # policy's own (coarser) grid.
    initial_param = rl.policy.parameters.initial_value.eval()
    plt.plot(policy_discretization.all_points, initial_param, label='init')

    # LQR feedback -k_opt * x. It is not subject to the input bound used in
    # the optimization cell, so the two curves need not coincide.
    plt.plot(state_space, -k_opt * state_space, label='opt', lw=5, alpha=0.5)

    # Policy output with the parameters currently stored in the session.
    plt.plot(state_space, rl.policy(state_space).eval(), label='new')

    plt.xlabel('state')
    plt.ylabel('control input')
    plt.legend()
    # Matches the other plotting cell and keeps the Legend repr out of the
    # cell output.
    plt.show()