In [1]:
import os
# Select MXNet's 'NaiveEngine' via the environment; this is set before the
# MXFusion import below so the setting is already in place when it loads.
os.environ['MXNET_ENGINE_TYPE'] = 'NaiveEngine'
from mxfusion.common import config
# Make float64 the MXFusion default dtype; the rest of the notebook also
# creates its arrays and layers with dtype='float64'.
config.DEFAULT_DTYPE = 'float64'

### Problem Setup

Reinforcement learning (RL).

Experiments and simulations are expensive: either computationally expensive, or physically limited/expensive (e.g. the motors of a real robot).
-> Learn a model of the physical world -> use the experimental/simulated data efficiently.

The model needs to know which parts of the world it does not know (i.e. to quantify its own uncertainty). -> Gaussian Processes (GPs)




### Define Environment

In [2]:
import gym
# The gym environment that every later cell drives and samples actions from.
env = gym.make('Pendulum-v0')
# Sizes used later to build the policy network input (obs_dim) and the
# placeholder model input of width action_dim + obs_dim.
action_dim = 1
# Observations are laid out as [cos(theta), sin(theta), ~momentum(theta)],
# matching the cost function's description of the state.
obs_dim = 3

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


In [4]:
import numpy as np
# Raw environment state that run_one_episode writes into env.env.state.
# Presumably [theta, theta_dot] -- TODO confirm against the gym Pendulum source.
INITIAL_GYM_STATE = [np.pi, 1.]
# The same starting point expressed as [cos(theta), sin(theta), second raw component].
INITIAL_STATE = [np.cos(INITIAL_GYM_STATE[0]), np.sin(INITIAL_GYM_STATE[0]), INITIAL_GYM_STATE[1]]

### Run Environment

In [12]:
import numpy as np

def run_one_episode(env, policy, initial_state=None, max_steps=200, verbose=False, render=False):
    """
    Drives an episode of the OpenAI gym environment using the policy to decide next actions.

    The episode ends when the environment reports ``done`` or after exactly
    ``max_steps`` environment steps, whichever comes first.

    :param env: gym environment exposing ``reset()``, ``step(action)`` and ``render()``;
        ``env.env`` is accessed when ``initial_state`` is given.
    :param policy: callable mapping an observation to an action accepted by ``env.step``.
    :param initial_state: optional raw state; when given it is written to ``env.env.state``
        after the reset and the first observation is re-read via ``env.env._get_obs()``.
    :param max_steps: maximum number of environment steps to take.
    :param verbose: if True, print every observation before acting on it.
    :param render: if True, call ``env.render()`` before every step.
    :returns: ``(total_reward, observations, actions)`` where ``observations`` holds the
        initial observation plus one per step (one more row than ``actions``), both as
        float64 arrays.
    """
    observation = env.reset()
    if initial_state is not None:
        env.env.state = initial_state
        observation = env.env._get_obs()
    env._max_episode_steps = max_steps
    step_idx = 0
    done = False
    total_reward = 0
    all_actions = []
    all_observations = [observation]
    # step_idx counts completed steps, so `< max_steps` allows exactly max_steps of them.
    while not done and step_idx < max_steps:
        if render:
            env.render()
        if verbose:
            print(observation)
        action = policy(observation)
        observation, reward, done, info = env.step(action)
        all_observations.append(observation)
        all_actions.append(action)
        total_reward += reward
        step_idx += 1
    # When the step budget is exhausted the cap is reported as the reason, even if the
    # environment also flagged `done` on that same step (e.g. through its own time limit).
    hit_step_limit = step_idx >= max_steps
    print("Episode finished after {} timesteps because {}".format(
        step_idx, "Max timesteps reached" if hit_step_limit else "'done' reached"))
    return total_reward, np.array(all_observations, dtype=np.float64,), np.array(all_actions, dtype=np.float64)

### Model Definition and Fitting

In [6]:
from mxfusion import Model, Variable
from mxfusion.components.variables import PositiveTransformation
from mxfusion.components.distributions.gp.kernels import RBF
from mxfusion.modules.gp_modules import SparseGPRegression, SparseGPRegressionSamplingPrediction
from mxfusion.modules.gp_modules import GPRegression, GPRegressionSamplingPrediction
import mxnet as mx
from mxfusion.inference import GradBasedInference, MAP

def prepare_data(state_list, action_list, win_in):
    """
    Turns recorded trajectories into (input, target) training pairs for the Gaussian Process.

    For every trajectory and every window start ``t``, the input row is the flattened
    ``win_in`` consecutive states starting at ``t`` followed by the flattened ``win_in``
    actions taken at those steps; the target row is the state right after the window.

    :param state_list: list of state arrays, one per trajectory.
    :param action_list: list of action arrays; each must have exactly one row fewer
        than its matching state array.
    :param win_in: number of consecutive time steps fed to the model as input.
    :returns: ``(X, Y)`` with all input rows and all target rows stacked vertically.
    """
    inputs = []
    targets = []

    for states, actions in zip(state_list, action_list):
        n_states = states.shape[0]
        # One action per transition: a trajectory with n states carries n-1 actions.
        assert n_states-1 == actions.shape[0]

        for start in range(n_states - win_in):
            stop = start + win_in
            past_states = states[start:stop].flatten()
            past_actions = actions[start:stop].flatten()
            inputs.append(np.hstack([past_states, past_actions]))
            # Keep the target 2-D (one row) so the final vstack yields one row per sample.
            targets.append(states[stop:stop+1])

    return np.vstack(inputs), np.vstack(targets)

def fit_model(state_list, action_list, win_in, verbose=True):
    """
    Learns a sparse Gaussian Process model of the environment dynamics.

    The recorded state / action trajectories are converted into training pairs with
    ``prepare_data`` and a sparse GP regression (50 inducing points, ARD RBF kernel)
    is fitted to them by MAP inference. The fitted model stands in for the real
    environment during policy optimization.

    See mxfusion.gp_modules for additional types of GP models to fit,
    including Sparse GP and Stochastic Variational Inference Sparse GP.

    :param state_list: list of state arrays, one per episode.
    :param action_list: list of action arrays, one per episode.
    :param win_in: number of past time steps used as model input.
    :param verbose: forwarded to the inference run.
    :returns: ``(model, inference)`` -- the MXFusion model and the inference object
        that was run on it.
    """
    X, Y = prepare_data(state_list, action_list, win_in)
    input_dim = X.shape[-1]
    output_dim = Y.shape[-1]

    gp_model = Model()
    gp_model.N = Variable()
    gp_model.X = Variable(shape=(gp_model.N, input_dim))
    gp_model.noise_var = Variable(shape=(1,), transformation=PositiveTransformation(), initial_value=0.01)
    gp_model.kernel = RBF(input_dim=input_dim, variance=1, lengthscale=1, ARD=True)
    gp_model.Y = SparseGPRegression.define_variable(
        X=gp_model.X,
        kernel=gp_model.kernel,
        inducing_num=50,
        noise_var=gp_model.noise_var,
        shape=(gp_model.N, output_dim))

    # Register a sampling-based prediction algorithm on the GP module under 'gp_predict'.
    gp_module = gp_model.Y.factor
    sampling_prediction = SparseGPRegressionSamplingPrediction(
        gp_module._module_graph,
        gp_module._extra_graphs[0],
        [gp_module._module_graph.X],
        jitter=1e-6)
    gp_module.attach_prediction_algorithms(
        targets=gp_module.output_names,
        conditionals=gp_module.input_names,
        algorithm=sampling_prediction,
        alg_name='gp_predict')

    map_algorithm = MAP(model=gp_model, observed=[gp_model.X, gp_model.Y])
    infr = GradBasedInference(inference_algorithm=map_algorithm)
    X_nd = mx.nd.array(X, dtype='float64')
    Y_nd = mx.nd.array(Y, dtype='float64')
    infr.run(X=X_nd, Y=Y_nd, max_iter=1000, learning_rate=0.5, verbose=verbose)
    return gp_model, infr

### Policy Optimization

In [7]:
from mxfusion.inference import GradTransferInference, PILCOAlgorithm, BatchInferenceLoop
from mxnet.gluon import Block
from mxnet.gluon.parameter import ParameterDict

def optimize_policy(policy, cost_func, model, infr, 
                    initial_state_generator, num_grad_steps,
                    learning_rate=1e-2, num_time_steps=100, 
                    num_samples=10, verbose=True):
    """
    Improves the policy against the learned model for ``num_grad_steps`` gradient steps.

    :param policy: the policy to optimize; only a Gluon ``Block`` contributes
        trainable parameters, anything else is optimized with an empty parameter set.
    :param cost_func: cost function handed to the PILCO algorithm.
    :param model: fitted MXFusion model (its ``X`` variable is the observed input).
    :param infr: inference object from model fitting; its parameters are transferred.
    :param initial_state_generator: callable producing rollout start states.
    :param num_grad_steps: number of optimizer iterations.
    :param learning_rate: learning rate of the policy optimization.
    :param num_time_steps: rollout length passed to PILCO as ``n_time_steps``.
    :param num_samples: number of sampled trajectories per rollout.
    :param verbose: forwarded to the inference run.
    :returns: the same ``policy`` object that was passed in.
    """
    # GradTransferInference, PILCOAlgorithm, Block and ParameterDict come from the
    # imports at the top of this cell.
    pilco = PILCOAlgorithm(
        model=model,
        observed=[model.X],
        cost_function=cost_func,
        policy=policy,
        n_time_steps=num_time_steps,
        initial_state_generator=initial_state_generator,
        num_samples=num_samples)

    if isinstance(policy, Block):
        trainable = policy.collect_params()
    else:
        trainable = ParameterDict()

    transfer_inference = GradTransferInference(
        pilco, infr_params=infr.params, train_params=trainable)

    # All-zero input of width action_dim + obs_dim; presumably only a placeholder
    # for the observed `model.X` -- TODO confirm against the PILCOAlgorithm docs.
    placeholder_X = mx.nd.array(np.zeros((1,action_dim+obs_dim)), dtype='float64')
    transfer_inference.run(
        max_iter=num_grad_steps,
        X=placeholder_X,
        verbose=verbose,
        learning_rate=learning_rate)
    return policy

### Define Cost Function

In [8]:
class TruePendulumCostFunction(mx.gluon.HybridBlock):
    """
    Cost that rewards bringing the pendulum upright and keeping it still,
    modelled on the cost used by the Pendulum environment.
    """
    def hybrid_forward(self, F, state, action):
        """
        Returns the sum of three penalties, each reduced over the last axis.

        :param state: indexed with three axes; the last axis holds
            [np.cos(theta), np.sin(theta), ~ momentum(theta)]
        :param action: the action(s) taken; penalised by their square.

        angle term    -> 0 when cos(theta) == 1 (upright), largest when cos(theta) == -1.
        action term   -> penalty for taking action
        momentum term -> penalty for pendulum momentum
        """
        angle_weight, action_weight, momentum_weight = 2., .001, .1

        cos_theta = state[:,:,0:1]
        momentum = state[:,:,2:3]

        angle_cost = F.sum(angle_weight * (cos_theta -1) ** 2, axis=-1)
        action_cost = F.sum(action_weight * action ** 2, axis=-1)
        momentum_cost = F.sum(momentum_weight * momentum ** 2, axis=-1)
        return (angle_cost + momentum_cost + action_cost)

# Single shared instance handed to the policy optimization.
cost = TruePendulumCostFunction()

### Define Initial State Generation

In [9]:
def initial_state_generator(num_initial_states):
    """
    Draws random but valid start states: theta is sampled from [0, 2*pi) and the
    momentum from [-8, 8), then the state is assembled as
    [cos(theta), sin(theta), momentum], joined along mx.nd.concat's default axis.

    :param num_initial_states: number of start states to draw.
    """
    sample_shape = (num_initial_states, 1)
    # Keep the sampling order (angle first, then momentum) so the random stream is consumed identically.
    angle = mx.nd.random.uniform(low=0., high=2*np.pi, shape=sample_shape, dtype='float64')
    momentum = mx.nd.random.uniform(low=-8, high=8, shape=sample_shape, dtype='float64')
    cos_angle = mx.nd.cos(angle)
    sin_angle = mx.nd.sin(angle)
    return mx.nd.concat(cos_angle, sin_angle, momentum)

###  Define Policy

In [10]:
from mxnet.gluon.nn import HybridSequential
from mxnet.gluon import HybridBlock

def random_policy(state):
    """
    Policy that ignores ``state`` and returns an action sampled from the
    action space of the notebook-level ``env``.
    """
    sampled_action = env.action_space.sample()
    return sampled_action

class MultiplyByTwo(HybridBlock):
    """
    Parameter-free layer that doubles its input. make_nonlinear_policy places it
    after a tanh layer, stretching that layer's output range by a factor of two.
    """
    def hybrid_forward(self, F, X):
        doubled = X * 2
        return doubled


class EpsilonGreed(HybridBlock):
    """
    Epsilon-greedy exploration layer: with probability epsilon the incoming
    action is discarded and replaced by a uniformly random one.

    :param epsilon: probability of replacing the input with a random action.
    :param bounds: (low, high) range the random action is drawn from.
    """
    def __init__(self, epsilon, bounds, **kwargs):
        super(EpsilonGreed, self).__init__(**kwargs)
        self.bounds = bounds
        self.epsilon = epsilon

    def hybrid_forward(self, F, X):
        # A single draw decides for the whole input, so either all of X is kept
        # or all of it is replaced.
        draw = F.random.uniform()
        if draw >= self.epsilon:
            return X
        # NOTE(review): uses X.shape / X.dtype, so this presumably only works on
        # NDArray inputs (non-hybridized) -- confirm before hybridizing.
        return F.random.uniform(low=self.bounds[0],
                                high=self.bounds[1],
                                shape=X.shape,
                                dtype=X.dtype)

def make_nonlinear_policy(dense_units, epsilon=None):
    """
    Builds a small neural-network policy with a single hidden layer.

    The network maps an ``obs_dim``-sized observation through a ReLU hidden layer
    of ``dense_units`` units to one tanh output, which is then doubled.
    When ``epsilon`` is given, an epsilon-greedy layer drawing random actions
    from (-2, 2) is appended.

    :param dense_units: width of the hidden layer.
    :param epsilon: optional exploration probability; None disables exploration.
    :returns: the (uninitialized) HybridSequential policy.
    """
    net = HybridSequential()
    hidden_layer = mx.gluon.nn.Dense(dense_units, in_units=obs_dim, dtype='float64', activation='relu')
    output_layer = mx.gluon.nn.Dense(1, in_units=dense_units, dtype='float64', activation='tanh')
    for layer in (hidden_layer, output_layer, MultiplyByTwo()):
        net.add(layer)
    if epsilon is None:
        return net
    net.add(EpsilonGreed(epsilon, (-2,2)))
    return net

# Exploration probability handed to the epsilon-greedy layer of the policy.
epsilon = 0.2
# Width of the policy network's hidden layer.
dense_units = 50
policy = make_nonlinear_policy(dense_units, epsilon=epsilon)
# Alternative: a purely linear policy. NOTE(review): in_units=2 does not match
# obs_dim (3) used elsewhere in this notebook -- adjust before enabling.
# policy = mx.gluon.nn.Dense(1, in_units=2, dtype='float64')
# Initialize all policy weights with Xavier initialization before first use.
policy.collect_params().initialize(mx.init.Xavier(magnitude=3.))

### Run an episode with a random policy

In [17]:
# Baseline run: drive the real environment with random actions and render it.
# The result is the (total_reward, observations, actions) tuple from run_one_episode.
parameters1 = run_one_episode(env, random_policy, max_steps=200, render=True)

Episode finished after 200 timesteps because Max timesteps reached


### Run the training loop to optimize the policy

In [13]:
# Trajectories are accumulated across episodes so that every model fit uses
# all data collected so far.
all_states = []
all_actions = []

num_episode = 40 # how many model fit + policy optimization episodes to run
num_samples = 20 # how many sample trajectories the policy optimization loop uses
num_grad_steps = 50 # how many gradient steps the optimizer takes per episode
num_time_steps = 250 # how far to roll out each sample trajectory
learning_rate = 1e-3 # learning rate for the policy optimization

"""
If true, the first environment run will be driven with a random policy
instead of the real policy for better exploration
"""
initialize_with_random_policy = False

# Each iteration: (1) collect one real episode, (2) refit the dynamics model on
# all data so far, (3) optimize the policy against that model.
for i_ep in range(num_episode):
    print('Start Episode '+str(i_ep+1)+'.')
    
    # Run an episode and collect data.
    if i_ep==0 and initialize_with_random_policy:
        print("Using a random policy to drive the real enviroment to start with.")
        policy_func = random_policy       
    else:
        print("Using a learned policy to drive the real enviroment.")
        # Adapter between gym and the Gluon policy: the NumPy observation becomes a
        # float64 NDArray with a leading batch axis, and the first row of the
        # policy output is handed back as a NumPy array.
        policy_func = lambda x: policy(mx.nd.expand_dims(mx.nd.array(x, dtype='float64'), axis=0)).asnumpy()[0]
    # The real episode always starts from the fixed INITIAL_GYM_STATE.
    total_reward, states, actions = run_one_episode(env, policy_func, initial_state=INITIAL_GYM_STATE,
                                                    max_steps=num_time_steps, render=True)
    print("Actions:", actions[:5], actions[-5:])
    all_states.append(states)
    all_actions.append(actions)
    
    # Fit a model.
    # win_in=1: the model predicts the next state from a single (state, action) pair.
    print('Fit the model.')
    model, infr = fit_model(all_states, all_actions, win_in=1, verbose=False)
        
    # Optimize the policy.
    # optimize_policy returns the policy object it was given, so `policy` keeps
    # referring to the same (now updated) network.
    print('Optimize the policy.')
    policy = optimize_policy(policy, cost, model,
                             infr,
                             initial_state_generator,
                             num_grad_steps=num_grad_steps, 
                             num_samples=num_samples,
                             learning_rate=learning_rate,
                             num_time_steps=num_time_steps)

Start Episode 1.
Using a learned policy to drive the real enviroment.
Episode finished after 250 timesteps because Max timesteps reached
Actions: [[1.33215238]
 [0.3080711 ]
 [0.30306303]
 [0.29957521]
 [0.31597629]] [[0.47422693]
 [0.47814984]
 [0.48317796]
 [0.49019598]
 [0.49536993]]
Fit the model.




Optimize the policy.
Iteration 6 loss: 33387.987342385986
Parameter dense0_weight (shape=(50, 3), dtype=float64) 
[[-2.25242156e+03 -2.53446219e+02  1.09424000e+04]
 [-3.29997485e+02  1.63455204e+02 -6.30067680e+02]
 [-2.05610890e+03  3.45686635e+02 -5.15113261e+03]
 [-1.89044416e+02  8.19253275e+00 -2.50152084e+03]
 [-5.88802936e+02  4.27443850e+01 -1.40859534e+03]
 [ 2.52005917e+03  3.23117279e+02 -1.21378082e+04]
 [-3.01184354e+01 -4.75352252e+00 -2.19234449e+03]
 [-3.53790872e+02 -4.43381421e+01  1.83880599e+03]
 [ 4.60137326e-01  7.84260748e+00  8.99371453e+01]
 [-5.56735195e+01  5.17633784e+01 -1.53731267e+02]
 [ 8.07754596e+02 -4.39748004e+01  1.83489187e+03]
 [-8.18355927e+02  8.43466420e+01 -1.81588897e+03]
 [-8.71932150e+02 -1.00486423e+02  4.24807533e+03]
 [ 1.43831650e+03  1.78130380e+02 -6.89888476e+03]
 [ 1.48841163e+03  6.95281375e+02 -1.32910716e+04]
 [ 7.94561595e+02  6.63713291e+01 -4.17301746e+03]
 [-2.40118391e+03 -3.07961088e+02  1.15727934e+04]
 [ 6.40834028e+02  

Iteration 16 loss: 30128.307615137648
Parameter dense0_weight (shape=(50, 3), dtype=float64) 
[[-1.25764634e+03 -2.16902492e+01 -9.31867923e+03]
 [-3.41661242e+02 -1.71781650e+01  3.20349339e+03]
 [-1.46046708e+03 -2.14355387e+02  9.70960228e+03]
 [-1.49532086e+02 -5.30551556e+01  2.90869455e+03]
 [-3.90395465e+02 -6.64228828e+01  2.80068942e+03]
 [ 1.44707475e+03  4.69312988e+01  1.09067687e+04]
 [-2.73865429e+01 -3.91661880e+01  5.56896665e+02]
 [-1.54306323e+02 -1.05197690e+01 -1.41424870e+03]
 [-6.94813642e+00  8.41274659e+00 -5.73478800e+02]
 [-1.23489895e+02 -6.80949433e+01  3.96027247e+03]
 [ 5.16726630e+02  1.11937121e+02 -3.59085209e+03]
 [-5.36990519e+02 -9.71865667e+01  3.62746746e+03]
 [-4.53428796e+02 -5.04303951e+00 -3.41647935e+03]
 [ 8.54649223e+02  1.90197507e+01  6.32342747e+03]
 [ 1.09847787e+03  9.75573122e+01  9.32788567e+03]
 [ 4.87301754e+02 -1.84174105e+01  3.83730328e+03]
 [-1.30465428e+03 -5.14788176e+01 -9.83439692e+03]
 [ 3.90032900e+02  1.99795291e+01  3.05

Iteration 26 loss: 28495.616446302034
Parameter dense0_weight (shape=(50, 3), dtype=float64) 
[[-8.13306535e+02 -2.60378850e+02  3.77401720e+03]
 [-2.67988941e+02  4.73234383e+01 -1.10138122e+03]
 [-1.05590607e+03  7.08423619e+01 -4.18039330e+03]
 [-1.13125881e+02  3.96753348e+01 -1.35777370e+03]
 [-2.93933148e+02  2.03262034e+01 -1.17533695e+03]
 [ 9.64506276e+02  3.39809779e+02 -4.48363942e+03]
 [-2.43878405e+01  1.12770087e+01 -9.23580049e+02]
 [-1.11271650e+02 -4.15580480e+01  5.53131066e+02]
 [-6.65313501e+00 -4.80261905e+00  2.38726615e+01]
 [-8.59505391e+01  1.19090225e+01 -6.05939738e+02]
 [ 3.89229638e+02 -3.27811525e+00  1.55693432e+03]
 [-3.97941786e+02  1.71401498e+01 -1.54216673e+03]
 [-2.95086486e+02 -1.03425883e+02  1.37173987e+03]
 [ 5.63705171e+02  1.97776288e+02 -2.61034400e+03]
 [ 7.72117281e+02  3.86618080e+02 -4.50791992e+03]
 [ 3.38013467e+02  1.07748481e+02 -1.64257862e+03]
 [-8.57269095e+02 -3.09030997e+02  4.00468451e+03]
 [ 2.66059948e+02  9.84010543e+01 -1.25

Iteration 31 loss: 30607.467192578162
Parameter dense0_weight (shape=(50, 3), dtype=float64) 
[[-1.00726726e+03 -5.88764789e+02  8.81926252e+03]
 [-2.96324230e+02  1.26333902e+02 -1.51084799e+03]
 [-1.31603074e+03  2.96678048e+02 -5.71476581e+03]
 [-1.32759200e+02  1.10433478e+02 -1.72551787e+03]
 [-3.71922430e+02  8.16330231e+01 -1.62943693e+03]
 [ 1.17240880e+03  7.26384094e+02 -1.02795157e+04]
 [-2.98857903e+01  2.50964163e+01 -1.23844236e+03]
 [-1.33382183e+02 -9.32080496e+01  1.36448059e+03]
 [-2.96727119e+01 -3.57137569e+01  4.62444514e+02]
 [-1.40058030e+02  1.26431460e+02 -1.21880319e+03]
 [ 4.87294879e+02 -8.30613677e+01  2.13746776e+03]
 [-5.04896107e+02  1.03693065e+02 -2.11360031e+03]
 [-3.67183116e+02 -2.23343327e+02  3.23025522e+03]
 [ 6.82682136e+02  4.15682406e+02 -5.95942267e+03]
 [ 7.21519220e+02  7.88000176e+02 -1.03746256e+04]
 [ 3.66132827e+02  2.36441045e+02 -3.76157837e+03]
 [-1.06476117e+03 -6.66927669e+02  9.32370563e+03]
 [ 3.18106064e+02  2.05190491e+02 -2.82

Iteration 37 loss: 29078.60966125523

KeyboardInterrupt: 

### Test the trained policy

In [None]:
# The Gluon policy cannot be called on gym's NumPy observations directly: it needs
# an NDArray with a leading batch axis and returns an NDArray. Wrap it the same way
# the training loop does, so the environment receives a plain NumPy action.
# The result is the (total_reward, observations, actions) tuple from run_one_episode.
parameters1 = run_one_episode(
    env,
    lambda obs: policy(mx.nd.expand_dims(mx.nd.array(obs, dtype='float64'), axis=0)).asnumpy()[0],
    max_steps=3000, render=True)