In [None]:
# Select MXNet's 'NaiveEngine' through the environment. This is done before mxfusion
# is imported below (presumably so the setting is picked up when MXNet initializes).
import os
os.environ['MXNET_ENGINE_TYPE'] = 'NaiveEngine'
from mxfusion.common import config
# Make float64 MXFusion's default dtype; the rest of the notebook builds its arrays as float64.
config.DEFAULT_DTYPE = 'float64'

### Problem Setup

RL

Experiments and simulations are expensive: they are either computationally expensive or physically limited / expensive (e.g. the robot motor example).
-> We therefore learn a model of the physical world, which lets us be data efficient with the experimental/simulated data.

The model needs to know which parts of the world it does not know, i.e. it needs uncertainty estimates. -> Gaussian processes (GPs)




### Define Environment

In [None]:
import gym
# The OpenAI gym pendulum task; this is the "real" environment the policy is run against.
env = gym.make('Pendulum-v0')
# Sizes of the action and observation vectors. obs_dim is used as the input width
# of the policy network defined later in the notebook.
action_dim = 1
obs_dim = 3

In [None]:
import numpy as np
# Internal gym state that each training episode is started from.
# Presumably laid out as [theta, theta_dot] (index 0 is used as an angle below) — TODO confirm against gym.
INITIAL_GYM_STATE = [np.pi, 1.]
# The same state written as [cos(theta), sin(theta), theta_dot].
INITIAL_STATE = [np.cos(INITIAL_GYM_STATE[0]), np.sin(INITIAL_GYM_STATE[0]), INITIAL_GYM_STATE[1]]

### Run Environment

In [None]:
import numpy as np

def run_one_episode(env, policy, initial_state=None, max_steps=200, verbose=False, render=False):
    """
    Roll out a single episode of an OpenAI gym environment, asking `policy` for every action.

    :param env: gym environment. When `initial_state` is given, the wrapped environment
        (`env.env`) must expose a `state` attribute and a `_get_obs()` method.
    :param policy: callable mapping an observation to an action.
    :param initial_state: optional state written into `env.env.state` right after the reset.
    :param max_steps: stored on the environment as `_max_episode_steps`; the rollout
        stops once `max_steps - 1` steps have been taken (but always takes at least one step).
    :param verbose: print each observation before acting on it.
    :param render: render the environment before every step.
    :return: tuple `(total_reward, observations, actions)`. `observations` and `actions`
        are float64 numpy arrays; `observations` holds one more entry than `actions`
        because it includes the starting observation.
    """
    observation = env.reset()
    if initial_state is not None:
        # Overwrite the freshly reset state and re-read the matching observation.
        env.env.state = initial_state
        observation = env.env._get_obs()
    env._max_episode_steps = max_steps

    total_reward = 0
    observations_seen = [observation]
    actions_taken = []
    steps_taken = 0
    while True:
        if render:
            env.render()
        if verbose:
            print(observation)

        action = policy(observation)
        observation, reward, done, info = env.step(action)
        observations_seen.append(observation)
        actions_taken.append(action)
        total_reward += reward
        steps_taken += 1

        if done or steps_taken >= max_steps - 1:
            if done:
                reason = "'done' reached"
            else:
                reason = "Max timesteps reached"
            print("Episode finished after {} timesteps because {}".format(steps_taken + 1, reason))
            break

    observations_array = np.array(observations_seen, dtype=np.float64)
    actions_array = np.array(actions_taken, dtype=np.float64)
    return total_reward, observations_array, actions_array

### Model Definition and Fitting

In [None]:
from mxfusion import Model, Variable
from mxfusion.components.variables import PositiveTransformation
from mxfusion.components.distributions.gp.kernels import RBF
from mxfusion.modules.gp_modules import SparseGPRegression, SparseGPRegressionSamplingPrediction
from mxfusion.modules.gp_modules import GPRegression, GPRegressionSamplingPrediction
import mxnet as mx
from mxfusion.inference import GradBasedInference, MAP

def prepare_data(state_list, action_list, win_in):
    """
    Turn recorded episodes into (input, target) training pairs for the Gaussian Process.

    Each input row is a window of `win_in` consecutive states followed by the
    `win_in` actions taken in those states, all flattened into one vector.
    The matching target row is the state that directly follows the window.

    :param state_list: list of per-episode state arrays; each must have exactly
        one more row than the corresponding action array.
    :param action_list: list of per-episode action arrays.
    :param win_in: number of consecutive time steps in each input window.
    :return: tuple `(X, Y)` with all inputs and all targets stacked row-wise.
    """
    inputs = []
    targets = []

    for states, actions in zip(state_list, action_list):
        # An episode with T actions must carry T + 1 states (including the initial one).
        assert states.shape[0]-1 == actions.shape[0]

        n_windows = states.shape[0] - win_in
        for start in range(n_windows):
            stop = start + win_in
            window = np.hstack([states[start:stop].flatten(), actions[start:stop].flatten()])
            targets.append(states[stop:stop+1])
            inputs.append(window)

    X = np.vstack(inputs)
    Y = np.vstack(targets)
    return X, Y

def fit_model(state_list, action_list, win_in, verbose=True):
    """
    Fits a sparse Gaussian Process regression model to the state / action pairs passed in.
    This creates a model of the environment which is used during
    policy optimization instead of querying the environment directly.

    The model uses an ARD RBF kernel, 50 inducing points and a positive noise
    variance, and its parameters are estimated by gradient-based MAP inference.

    See mxfusion.gp_modules for additional types of GP models to fit,
    including full GP and Stochastic Variational Inference Sparse GP.

    :param state_list: list of per-episode state arrays.
    :param action_list: list of per-episode action arrays.
    :param win_in: number of consecutive time steps used as model input (see prepare_data).
    :param verbose: passed through to the inference run.
    :return: tuple (model, inference object, X, Y) where X and Y are the
        numpy training inputs and targets the model was fitted on.
    """
    X, Y = prepare_data(state_list, action_list, win_in)

    # Model definition: Y is a sparse GP function of X with observation noise noise_var.
    m = Model()
    m.N = Variable()
    m.X = Variable(shape=(m.N, X.shape[-1]))
    m.noise_var = Variable(shape=(1,), transformation=PositiveTransformation(), initial_value=0.01)
    m.kernel = RBF(input_dim=X.shape[-1], variance=1, lengthscale=1, ARD=True)
    m.Y = SparseGPRegression.define_variable(X=m.X, kernel=m.kernel,
                                             inducing_num = 50,
                                             noise_var=m.noise_var, shape=(m.N, Y.shape[-1]))
    
    # Register a sampling-based prediction algorithm on the GP module under the
    # name 'gp_predict'. NOTE(review): gp._extra_graphs[0] is presumably the
    # module's posterior graph — confirm against the MXFusion module internals.
    gp = m.Y.factor
    gp.attach_prediction_algorithms(targets=gp.output_names, conditionals=gp.input_names,
                algorithm=SparseGPRegressionSamplingPrediction(
                    gp._module_graph,
                    gp._extra_graphs[0],
                    [gp._module_graph.X],
                    jitter = 1e-6,), 
                    alg_name='gp_predict')

    # Estimate the model parameters with X and Y observed.
    infr = GradBasedInference(inference_algorithm=MAP(model=m, observed=[m.X, m.Y]))
    infr.run(X=mx.nd.array(X, dtype='float64'), Y=mx.nd.array(Y, dtype='float64'),
             max_iter=1000, learning_rate=0.5, verbose=verbose)
    return m, infr, X, Y

### Policy Optimization

In [None]:
from mxfusion.inference import GradTransferInference, PILCOAlgorithm, BatchInferenceLoop
from mxnet.gluon import Block
from mxnet.gluon.parameter import ParameterDict

def optimize_policy(policy, cost_func, model, infr, model_data_X, model_data_Y,
                    initial_state_generator, num_grad_steps,
                    learning_rate=1e-2, num_time_steps=100, 
                    num_samples=10, verbose=True):
    """
    Improve `policy` against the learned dynamics model instead of the real environment.

    :param policy: the policy to optimize. Only gluon `Block` policies contribute
        trainable parameters; for anything else an empty `ParameterDict` is trained.
    :param cost_func: cost function handed to the policy-update algorithm.
    :param model: fitted dynamics model exposing `X` and `Y` variables.
    :param infr: inference object of the fitted model; its `params` are transferred
        to the policy-update inference.
    :param model_data_X: numpy training inputs the model was fitted on.
    :param model_data_Y: numpy training targets the model was fitted on.
    :param initial_state_generator: callable producing rollout start states.
    :param num_grad_steps: number of optimizer iterations to run.
    :param learning_rate: learning rate for the policy optimization.
    :param num_time_steps: length of each simulated rollout.
    :param num_samples: number of sample trajectories passed to the update algorithm.
    :param verbose: passed through to the inference run.
    :return: the same `policy` object that was passed in.
    """
    from mxfusion.inference.pilco_alg import PolicyUpdateGPParametricApprox

    policy_update = PolicyUpdateGPParametricApprox(
        model=model,
        observed=[model.X, model.Y],
        cost_function=cost_func,
        policy=policy,
        n_time_steps=num_time_steps,
        initial_state_generator=initial_state_generator,
        num_samples=num_samples)

    # Plain callables have no gluon parameters, so there is nothing to train for them.
    if isinstance(policy, Block):
        trainable_params = policy.collect_params()
    else:
        trainable_params = ParameterDict()

    transfer_inference = GradTransferInference(policy_update,
                                               infr_params=infr.params,
                                               train_params=trainable_params)
    transfer_inference.run(max_iter=num_grad_steps,
                           X=mx.nd.array(model_data_X, dtype='float64'),
                           Y=mx.nd.array(model_data_Y, dtype='float64'),
                           verbose=verbose,
                           learning_rate=learning_rate)
    return policy

### Define Cost Function

In [None]:
class TruePendulumCostFunction(mx.gluon.HybridBlock):
    """
    Cost that rewards getting the pendulum upright and stable as quickly as possible.
    Taken from the code for Pendulum.
    """
    def hybrid_forward(self, F, state, action):
        """
        Sum three penalty terms, each reduced over the last axis.

        :param state: array indexed as state[:, :, k] whose last axis holds
            [np.cos(theta), np.sin(theta), ~ momentum(theta)]
        :param action: the action(s) taken; penalized quadratically.
        :return: upright penalty + momentum penalty + action penalty, where
            the upright penalty is 0 when cos(theta) == 1 (pendulum upright)
            and largest when the pendulum is hanging down completely.
        """
        upright_weight = 2.
        action_weight = .001
        momentum_weight = .1

        cos_theta = state[:,:,0:1]
        momentum = state[:,:,2:3]

        upright_penalty = F.sum(upright_weight * (cos_theta - 1) ** 2, axis=-1)
        action_penalty = F.sum(action_weight * action ** 2, axis=-1)
        momentum_penalty = F.sum(momentum_weight * momentum ** 2, axis=-1)
        return upright_penalty + momentum_penalty + action_penalty

# Shared cost-function instance used by the policy optimization.
cost = TruePendulumCostFunction()

### Define Initial State Generation

In [None]:
def initial_state_generator(num_initial_states):
    """
    Sample valid rollout start states.

    Draws an angle theta uniformly from [0, 2*pi) and a momentum term uniformly
    from [-8, 8), then returns the concatenation of cos(theta), sin(theta) and
    the momentum term, so state[0:2] is always a consistent (cos, sin) pair.

    :param num_initial_states: number of start states to draw.
    :return: float64 NDArray with one row per sampled state.
    """
    column_shape = (num_initial_states, 1)
    angle = mx.nd.random.uniform(low=0., high=2*np.pi, shape=column_shape, dtype='float64')
    momentum = mx.nd.random.uniform(low=-8, high=8, shape=column_shape, dtype='float64')
    cos_angle = mx.nd.cos(angle)
    sin_angle = mx.nd.sin(angle)
    return mx.nd.concat(cos_angle, sin_angle, momentum)

###  Define Policy

In [None]:
from mxnet.gluon.nn import HybridSequential
from mxnet.gluon import HybridBlock

def random_policy(state):
    """
    Policy that ignores `state` and samples an action from the
    action space of the notebook-level `env`.
    """
    sampled_action = env.action_space.sample()
    return sampled_action

class MultiplyByTwo(HybridBlock):
    """
    Parameter-free block that doubles its input.
    """
    def hybrid_forward(self, F, X):
        doubled = X * 2
        return doubled


class EpsilonGreed(HybridBlock):
    """
    Epsilon-greedy exploration layer: with probability epsilon the incoming
    action is replaced by a random one, otherwise it is passed through unchanged.
    """
    def __init__(self, epsilon, bounds, **kwargs):
        """
        :param epsilon: probability of replacing the computed action.
        :param bounds: indexable (low, high) limits for the random action.
        """
        super(EpsilonGreed, self).__init__(**kwargs)
        self.bounds = bounds
        self.epsilon = epsilon

    def hybrid_forward(self, F, X):
        """
        Return X itself, or a uniform random array with X's shape and dtype.
        A single random draw decides for the whole input at once.
        """
        draw = F.random.uniform()
        if draw >= self.epsilon:
            # Greedy case: keep the computed action.
            return X
        lower = self.bounds[0]
        upper = self.bounds[1]
        return F.random.uniform(low=lower, high=upper, shape=X.shape, dtype=X.dtype)

def make_nonlinear_policy(dense_units, epsilon=None):
    """
    Build a small feed-forward policy network with a single hidden layer.

    The network maps an observation of size `obs_dim` through a ReLU hidden layer
    to one tanh output, which is then doubled so that actions lie in [-2, 2].

    :param dense_units: number of units in the hidden layer.
    :param epsilon: if not None, an EpsilonGreed layer with bounds (-2, 2) is
        appended, making the policy epsilon-greedy.
    :return: the assembled (uninitialized) HybridSequential policy.
    """
    network = HybridSequential()
    layers = [
        mx.gluon.nn.Dense(dense_units, in_units=obs_dim, dtype='float64', activation='relu'),
        mx.gluon.nn.Dense(1, in_units=dense_units, dtype='float64', activation='tanh'),
        MultiplyByTwo(),
    ]
    for layer in layers:
        network.add(layer)
    if epsilon is not None:
        network.add(EpsilonGreed(epsilon, (-2,2)))
    return network

# Probability with which the policy's EpsilonGreed layer replaces the computed action by a random one.
epsilon = 0.2
# Number of units in the policy network's single hidden layer.
dense_units = 50
policy = make_nonlinear_policy(dense_units, epsilon=epsilon)
# Alternative: a purely linear policy (kept for reference).
# policy = mx.gluon.nn.Dense(1, in_units=2, dtype='float64')
# Initialize all policy parameters with the Xavier initializer before first use.
policy.collect_params().initialize(mx.init.Xavier(magnitude=3.))

### Run an episode with a random policy

In [None]:
# Sanity check of the environment loop: one rendered rollout driven by randomly sampled actions.
# `results` is the (total_reward, observations, actions) tuple returned by run_one_episode.
results = run_one_episode(env, random_policy, max_steps=200, render=True)

### Run the training loop to optimize the policy

In [None]:
# Data collected from the real environment, one array per episode. The lists grow
# across episodes, so every model fit uses all data gathered so far.
all_states = []
all_actions = []

num_episode = 40 # how many model fit + policy optimization episodes to run
num_samples = 20 # how many sample trajectories the policy optimization loop uses
num_grad_steps = 50 # how many gradient steps the optimizer takes per episode
num_time_steps = 100 # how far to roll out each sample trajectory
learning_rate = 1e-3 # learning rate for the policy optimization

"""
If true, the first environment run will be driven with a random policy
instead of the real policy for better exploration
"""
initialize_with_random_policy = False

# Each iteration: (1) run the current policy on the real environment,
# (2) refit the dynamics model on all collected data, (3) optimize the policy on the model.
for i_ep in range(num_episode):
    print('Start Episode '+str(i_ep+1)+'.')
    
    # Run an episode and collect data.
    if i_ep==0 and initialize_with_random_policy:
        print("Using a random policy to drive the real enviroment to start with.")
        policy_func = random_policy       
    else:
        print("Using a learned policy to drive the real enviroment.")
        # Adapter between gym and gluon: turn the numpy observation into a float64 NDArray
        # with a leading batch axis, run the policy, and hand back the first row as numpy.
        policy_func = lambda x: policy(mx.nd.expand_dims(mx.nd.array(x, dtype='float64'), axis=0)).asnumpy()[0]
    total_reward, states, actions = run_one_episode(env, policy_func, initial_state=INITIAL_GYM_STATE,
                                                    max_steps=num_time_steps, render=True)
    # Show the first and last few actions of the episode.
    print("Actions:", actions[:5], actions[-5:])
    all_states.append(states)
    all_actions.append(actions)
    
    # Fit a model.
    print('Fit the model.')
    model, infr, model_data_X, model_data_Y = fit_model(all_states, all_actions, win_in=1, verbose=False)
        
    # Optimize the policy.
    print('Optimize the policy.')
    policy = optimize_policy(policy, cost, model,
                             infr,
                             model_data_X, model_data_Y,
                             initial_state_generator,
                             num_grad_steps=num_grad_steps, 
                             num_samples=num_samples,
                             learning_rate=learning_rate,
                             num_time_steps=num_time_steps)

### Test the trained policy

In [None]:
# run_one_episode feeds the policy raw numpy observations from gym and passes the
# returned action straight to env.step, but the gluon policy only accepts MXNet
# NDArrays and returns an NDArray. Wrap it the same way the training loop does:
# add a batch axis, convert to a float64 NDArray, and convert the first output row
# back to numpy.
# Note: if the policy was built with an epsilon, its EpsilonGreed layer is still
# active here, so some test actions are random.
trained_policy_func = lambda x: policy(mx.nd.expand_dims(mx.nd.array(x, dtype='float64'), axis=0)).asnumpy()[0]
# `parameters1` is the (total_reward, observations, actions) tuple of the test rollout.
parameters1 = run_one_episode(env, trained_policy_func, max_steps=3000, render=True)