In [None]:
from __future__ import division, print_function

import GPy
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from scipy.optimize import check_grad
# Try to import safe_learning from the system;
# if that fails, get it from the main folder directly instead.
import utilities
from plotting import plot_lyapunov_1d

# If library not installed, import it from '../'
safe_learning = utilities.import_from_directory('safe_learning', '../')

In [None]:
# TODO: multi-dim state-space?
# Two triangulations built with identical leading arguments but different
# vertex values: one plays the role of the value function, the other is the
# policy whose parameters are differentiated in the gradient check.
# NOTE(review): the first two arguments look like (domain, number of points),
# matching the `domain` / `n_points` names used later -- confirm.
value_function = safe_learning.Triangulation(
    [0, 1], 2, vertex_values=[0, -1])
policy = safe_learning.Triangulation(
    [0, 1], 2, vertex_values=[1, 3])

# GP regression model on two training rows (two input columns, one target
# column), wrapped so it can be used as the dynamics model.
gp = GPy.models.GPRegression(
    np.array([[-0.1, -0.1], [0.9, 0.9]]),
    np.array([[1], [2]]),
    noise_var=0.01 ** 2)
dynamics = safe_learning.GPyGaussianProcess(gp, beta=2.0)

test = np.array([0.1, 0.1])

In [None]:
import scipy
from functools import partial
# 100 evenly spaced test states in [-1, 1], stored as a column (one per row).
x_test = np.linspace(-1, 1, 100).reshape(-1, 1)

def fun(param, x):
    """Return the summed value of the successor states under the policy.

    Side effect: overwrites ``policy.parameters`` with ``param``.

    Parameters
    ----------
    param : array-like
        Policy parameters to evaluate.
    x : array-like
        States at which the objective is evaluated (promoted to 2-D).
    """
    states = np.atleast_2d(x)
    policy.parameters = param
    actions = policy(states)
    # Only the first element returned by the dynamics model is used
    # (presumably the predictive mean -- TODO confirm).
    successor = dynamics(states, actions)[0]
    return np.sum(value_function(successor))
    
def gradient(param, x):
    """Return the analytic derivative of ``fun`` with respect to ``param``.

    Chains the value-function gradient, the dynamics gradient and the
    derivative of the policy output with respect to its parameters, then sums
    the per-state contributions.

    Side effect: overwrites ``policy.parameters`` with ``param``.
    """
    states = np.atleast_2d(x)
    policy.parameters = param

    actions = policy(states)

    dvalue_dnext = value_function.gradient(dynamics(states, actions)[0])
    # Drop the first column of the dynamics gradient (presumably the
    # derivative w.r.t. the state, leaving the action part -- TODO confirm).
    dnext_daction = dynamics.gradient(states, actions)[0][:, 1:]
    daction_dparam = policy.parameter_derivative(states).toarray()

    per_state = (dvalue_dnext * dnext_daction) * daction_dparam
    return np.sum(per_state, axis=0)


# Compare the analytic gradient against a finite-difference estimate at the
# parameter vector [0.5, 1]; the cell output is the reported discrepancy.
check_grad(fun, gradient, [0.5, 1], x_test)
# gradient([0.5, 1], x_test)

In [None]:
# Discretization of the state domain; this rebinds ``value_function`` to a
# new triangulation over ``domain``.
domain, n_points = [-1, 1], 50
value_function = safe_learning.Triangulation(
    domain, n_points, project=True)

# The discretization points of the triangulation serve as the state space.
state_space = value_function.all_points
# Candidate actions: 11 evenly spaced values in [-1, 1], one per row.
action_space = np.linspace(-1, 1, 11).reshape(-1, 1)

# Discount factor passed to the policy-iteration object.
gamma = 0.98

# Scalar linear system x' = a * x + b * u with cost q * x**2 + r * u**2.
a, b = 1.2, 1.0
q, r = 1.0, 1.0

def dynamics(states, actions):
    """Return the next states of the linear system ``a * x + b * u``.

    Both inputs are promoted to at least 2-D arrays before the update is
    applied; ``a`` and ``b`` are the notebook-level system constants.
    """
    state_arr, action_arr = np.atleast_2d(states), np.atleast_2d(actions)
    return a * state_arr + b * action_arr

def dynamics_gradient(states, actions):
    """Return the derivative of the linear dynamics w.r.t. states and actions.

    The dynamics are ``a * x + b * u``, so the derivative is the constant
    ``a`` for every state entry and ``b`` for every action entry.

    Parameters
    ----------
    states : array-like
        States; only the shape is used.
    actions : array-like
        Actions; only the shape is used.

    Returns
    -------
    ndarray
        State derivatives and action derivatives stacked horizontally
        (state columns first).
    """
    # Use the builtin ``float``: the ``np.float`` alias was deprecated in
    # NumPy 1.20 and removed in NumPy 1.24, where it raises AttributeError.
    state_derivative = a * np.ones_like(states, dtype=float)
    action_derivative = b * np.ones_like(actions, dtype=float)
    return np.hstack((state_derivative, action_derivative))

def reward(states, actions, next_states):
    """Return the quadratic cost ``q * x**2 + r * u**2``.

    ``next_states`` is accepted but not used. States and actions are promoted
    to at least 2-D arrays first.
    """
    state_cost = q * np.square(np.atleast_2d(states))
    action_cost = r * np.square(np.atleast_2d(actions))
    return state_cost + action_cost

def reward_gradient(states, actions):
    """Return the derivative of the quadratic cost w.r.t. states and actions.

    The result stacks ``2 * q * x`` (state columns) and ``2 * r * u`` (action
    columns) horizontally.
    """
    state_part = q * np.atleast_2d(states)
    action_part = r * np.atleast_2d(actions)
    return 2 * np.hstack((state_part, action_part))
    
# Wrap the plain callables (function plus gradient) into safe_learning
# function objects so they expose a ``.gradient`` method.
reward_function = safe_learning.DeterministicFunction.from_callable(
    reward, reward_gradient)

dynamics_fun = safe_learning.DeterministicFunction.from_callable(
    dynamics, dynamics_gradient)

In [None]:
# Quick check of the wrapped dynamics gradient on two states (both 1) and two
# actions (both 2); the result is shown as the cell output.
dynamics_fun.gradient([[1],[1]], [[2],[2]])

In [None]:
# Policy-iteration object built from the discretized state and action spaces,
# the wrapped dynamics and reward functions, the triangulated value function
# as function approximator, and the discount factor.
rl = safe_learning.PolicyIteration(
    state_space,
    action_space,
    dynamics_fun,
    reward_function,
    function_approximator=value_function,
    gamma=gamma)


# Policy to be optimized: a separate triangulation on [-1, 1].
# NOTE(review): the second argument (5) is presumably the number of
# discretization points, as with `n_points` above -- confirm.
policy = safe_learning.Triangulation([-1, 1], 5)

# Start from all-zero policy parameters; `initial_param` is kept so the
# initial policy can be plotted next to the optimized one later.
initial_param = 0 * policy.all_points
policy.parameters = initial_param


# Reference solution from the discrete-time LQR helper. Further down,
# ``-k * x`` is plotted as the optimal policy and ``s * x**2`` is used as the
# value function, so ``k`` is treated as the feedback gain and ``s`` as the
# quadratic cost coefficient.
# NOTE(review): assumes `dlqr` follows the standard convention u = -k * x.
k, s = safe_learning.utilities.dlqr(a, b, q, r)

# Initialize the value-function parameters with the quadratic LQR cost.
rl.value_function.parameters = s * rl.state_space ** 2

# Sanity check: for a scalar system the LQR gain has the closed form
# a * b * s / (r + b**2 * s), so both printed numbers should agree. The
# previous expression dropped `s` from the denominator and flipped the sign,
# so it could never match `k`.
print(k, a * b * s / (r + b**2 * s))

In [None]:
# Values stored in the policy-iteration object over the state space.
fig, ax = plt.subplots()
ax.plot(rl.state_space, rl.values)
ax.set_xlabel('state')
ax.set_ylabel('value')
ax.set_title('Values over the state space')
plt.show()

# Actions chosen by the current policy at the same states.
fig, ax = plt.subplots()
ax.plot(rl.state_space, policy(rl.state_space))
ax.set_xlabel('state')
ax.set_ylabel('action')
ax.set_title('Current policy');

In [None]:
def fun(param, x):
    """Return the summed one-step cost plus successor value for a policy.

    A fresh triangulation policy is created on every call and loaded with
    ``param``, so the notebook-level ``policy`` object is not modified.

    NOTE(review): the discount factor ``gamma`` is not applied to the
    successor value here -- confirm that this is intended.

    Parameters
    ----------
    param : array-like
        Policy parameters to evaluate.
    x : array-like
        States at which the objective is evaluated (promoted to 2-D).
    """
    states = np.atleast_2d(x)

    candidate = safe_learning.Triangulation([-1, 1], 5)
    candidate.parameters = param
    actions = candidate(states)

    # The current states are passed in the ``next_states`` slot of the reward.
    stage_cost = rl.reward_function(states, actions, states)
    successor_value = rl.value_function(rl.dynamics(states, actions))
    return np.sum(stage_cost + successor_value)
    
def gradient(param, x):
    """Return the analytic derivative of ``fun`` with respect to ``param``.

    Applies the chain rule through the reward, the value function, the
    dynamics and the policy, then sums the per-state contributions. As in
    ``fun``, a fresh policy object is created on every call.
    """
    states = np.atleast_2d(x)

    candidate = safe_learning.Triangulation([-1, 1], 5)
    candidate.parameters = param
    actions = candidate(states)

    dvalue_dnext = rl.value_function.gradient(rl.dynamics(states, actions))
    # Column 0 of these gradients is the state derivative (see the hstack
    # order in dynamics_gradient / reward_gradient); keep the action part.
    dnext_daction = rl.dynamics.gradient(states, actions)[:, 1:]
    daction_dparam = candidate.parameter_derivative(states).toarray()

    dreward_daction = rl.reward_function.gradient(states, actions)[:, 1:]

    per_state = (dreward_daction + dvalue_dnext * dnext_daction) * daction_dparam
    return np.sum(per_state, axis=0)

from scipy.optimize import check_grad, minimize

# check_grad(fun, gradient, -0.5 * policy.all_points, state_space)

# gradient(-0.5 * policy.all_points, state_space)

In [None]:
# Optimize the policy parameters with BFGS, using the analytic gradient.
# ``args`` is meant to be a tuple of extra arguments; ``(state_space)`` is
# only a parenthesized expression, so spell out the one-element tuple.
res = minimize(fun, policy.parameters.copy(), args=(state_space,),
               jac=gradient, method='BFGS')
print(res)

# Load the optimized parameters into the notebook-level policy.
policy.parameters = res.x

In [None]:
# Compare the initial policy, the LQR reference (-k * x) and the optimized
# policy on one set of axes.
fig, ax = plt.subplots()
ax.plot(policy.all_points, initial_param, label='init')
ax.plot(state_space, -k * state_space, label='opt', lw=5, alpha=0.5)
ax.plot(state_space, policy(state_space), label='new')
ax.set_xlabel('state')
ax.set_ylabel('action')
ax.set_title('Policy before and after optimization')
ax.legend();