In [None]:
import os
# Set the MXNet engine type through the environment before the MXFusion import below.
# NOTE(review): presumably this must precede the first MXNet import to take effect — confirm.
os.environ['MXNET_ENGINE_TYPE'] = 'NaiveEngine'
from mxfusion.common import config
# Use double precision as MXFusion's default dtype; the arrays built later in this
# notebook are likewise created with dtype='float64'.
config.DEFAULT_DTYPE = 'float64'

In [None]:
import gym
# Global environment instance; later cells roll out policies in it and sample from
# its observation space.
env = gym.make('MountainCarContinuous-v0')

In [None]:
import numpy as np

def run_one_episode(env, policy, max_steps=None, verbose=False, render=False):
    """Roll out ``policy`` in ``env`` for one episode and record the trajectory.

    Args:
        env: Environment exposing ``reset()`` (returning the first observation),
            ``step(action)`` (returning ``(observation, reward, done, info)``)
            and, when ``render`` is set, ``render()``.
        policy: Callable mapping the current observation to an action.
        max_steps: Optional upper bound on the number of environment steps.
            ``None`` runs until the environment reports ``done``.
        verbose: If true, print each observation before acting on it.
        render: If true, call ``env.render()`` before every step.

    Returns:
        Tuple ``(total_reward, observations, actions)``. Both arrays are
        float64. ``observations`` starts with the initial observation and gains
        one row per step, so it always has one more row than ``actions``.
    """
    observation = env.reset()
    total_reward = 0
    all_observations = [np.array(observation)]
    all_actions = []
    step_idx = 0
    done = False
    # The step budget is part of the loop condition so that exactly `max_steps`
    # steps are taken (the previous post-increment check stopped one step early
    # and could not distinguish max_steps=1 from max_steps=2).
    while not done and (max_steps is None or step_idx < max_steps):
        if render:
            env.render()
        if verbose:
            print(observation)
        action = policy(observation)
        observation, reward, done, info = env.step(action)
        all_observations.append(observation)
        all_actions.append(action)
        total_reward += reward
        step_idx += 1
    # step_idx now equals the number of steps actually executed.
    print("Episode finished after {} timesteps".format(step_idx))
    return total_reward, np.array(all_observations, dtype=np.float64), np.array(all_actions, dtype=np.float64)

In [None]:
def random_policy(state):
    """Return a one-element action list drawn uniformly from [-1, 1).

    ``state`` is accepted so the function has the same call shape as other
    policies, but it is not used.
    """
    unit_draw = np.random.rand()  # single draw from [0, 1)
    return [2 * unit_draw - 1]

In [None]:
def prepare_data(state_list, action_list, win_in):
    """Build regression inputs and targets from recorded trajectories.

    For every trajectory and every window start ``t``, the input row is the
    ``win_in`` consecutive states beginning at ``t`` followed by the ``win_in``
    actions beginning at ``t`` (each flattened), and the target row is the
    state at index ``t + win_in``.

    Args:
        state_list: Sequence of per-episode state arrays.
        action_list: Sequence of per-episode action arrays; each must have
            exactly one row fewer than the matching state array.
        win_in: Number of consecutive time steps packed into one input row.

    Returns:
        Tuple ``(X, Y)`` of row-stacked inputs and targets over all episodes.
    """
    inputs = []
    targets = []

    for states, actions in zip(state_list, action_list):
        # A trajectory holds the initial state plus one state per action taken.
        assert states.shape[0]-1 == actions.shape[0]

        n_windows = states.shape[0] - win_in
        for start in range(n_windows):
            stop = start + win_in
            targets.append(states[stop:stop + 1])
            window = (states[start:stop].ravel(), actions[start:stop].ravel())
            inputs.append(np.concatenate(window))

    return np.vstack(inputs), np.vstack(targets)

In [None]:
from mxfusion import Model, Variable
from mxfusion.components.variables import PositiveTransformation
from mxfusion.components.distributions.gp.kernels import RBF
from mxfusion.modules.gp_modules import GPRegression, GPRegressionSamplingPrediction
import mxnet as mx
from mxfusion.inference import GradBasedInference, MAP

    
def fit_model(state_list, action_list, win_in, verbose=False):
    """Fit a GP regression model that predicts the next state from recorded data.

    The training set comes from ``prepare_data``. The model uses an ARD RBF
    kernel over the input columns and a positive-constrained noise variance,
    and gets a ``GPRegressionSamplingPrediction`` algorithm attached under the
    name ``'gp_predict'``. Parameters are fitted with ``GradBasedInference``
    wrapping ``MAP`` for 100 iterations at learning rate 0.5.

    Args:
        state_list: Per-episode state arrays, forwarded to ``prepare_data``.
        action_list: Per-episode action arrays, forwarded to ``prepare_data``.
        win_in: Input window length, forwarded to ``prepare_data``.
        verbose: Forwarded to the inference ``run`` call.

    Returns:
        Tuple ``(model, infr)``: the MXFusion model and the inference object
        that was run on it.
    """
    X, Y = prepare_data(state_list, action_list, win_in)
    input_dim = X.shape[-1]
    output_dim = Y.shape[-1]

    # Targets are used as-is; no standardisation of Y is applied.
    model = Model()
    model.N = Variable()
    model.X = Variable(shape=(model.N, input_dim))
    model.noise_var = Variable(shape=(1,),
                               transformation=PositiveTransformation(),
                               initial_value=0.01)
    model.kernel = RBF(input_dim=input_dim, variance=1, lengthscale=1, ARD=True)
    model.Y = GPRegression.define_variable(X=model.X,
                                           kernel=model.kernel,
                                           noise_var=model.noise_var,
                                           shape=(model.N, output_dim))
    model.Y.factor.gp_log_pdf.jitter = 1e-6

    # Replace the GP module's prediction algorithm with the sampling variant.
    gp_module = model.Y.factor
    sampling_alg = GPRegressionSamplingPrediction(
        gp_module._module_graph,
        gp_module._extra_graphs[0],
        [gp_module._module_graph.X])
    gp_module.attach_prediction_algorithms(targets=gp_module.output_names,
                                           conditionals=gp_module.input_names,
                                           algorithm=sampling_alg,
                                           alg_name='gp_predict')

    map_alg = MAP(model=model, observed=[model.X, model.Y])
    infr = GradBasedInference(inference_algorithm=map_alg)
    infr.run(X=mx.nd.array(X, dtype='float64'),
             Y=mx.nd.array(Y, dtype='float64'),
             max_iter=100, learning_rate=0.5, verbose=verbose)
    return model, infr

In [None]:
# Per-episode state and action arrays, appended to by the data-collection cells.
all_states, all_actons = [], []

In [None]:
# Roll out the random policy once (max_steps=100) and add the trajectory to the
# dataset lists. Re-running this cell appends another episode.
total_reward, states, actions = run_one_episode(env, random_policy, max_steps=100)
all_states.append(states)
all_actons.append(actions)

In [None]:
# Fit the GP regression model on all episodes collected so far, using a single
# time step per input row (win_in=1).
model, infr = fit_model(all_states, all_actons, win_in=1)

In [None]:
from mxfusion.inference import GradTransferInference, ModelBasedAlgorithm, BatchInferenceLoop
from mxnet.gluon import Block
from mxnet.gluon.parameter import ParameterDict

def optimize_policy(policy, cost_func, model, infr, 
                    initial_state_generator, num_grad_steps,
                    learning_rate=1e-2, num_time_steps=100, 
                    num_samples=10, verbose=True, model_input_dim=3):
    """Improve ``policy`` by gradient steps through the fitted model.

    Builds a ``ModelBasedAlgorithm`` around ``model``, ``cost_func`` and
    ``policy``, then runs it with ``GradTransferInference``, reusing the
    parameters already learned by ``infr``.

    Args:
        policy: Policy to optimise. If it is a gluon ``Block`` its parameters
            are trained; otherwise an empty ``ParameterDict`` is used.
        cost_func: Cost function handed to ``ModelBasedAlgorithm``.
        model: Fitted MXFusion model; ``model.X`` is the observed variable.
        infr: Inference object whose ``params`` are transferred.
        initial_state_generator: Callable handed to ``ModelBasedAlgorithm``
            for producing initial states.
        num_grad_steps: Number of optimisation iterations (``max_iter``).
        learning_rate: Learning rate for the optimisation.
        num_time_steps: Rollout length handed to ``ModelBasedAlgorithm``.
        num_samples: Number of samples handed to ``ModelBasedAlgorithm``.
        verbose: Forwarded to the inference ``run`` call.
        model_input_dim: Number of columns of the dummy ``X`` array passed to
            the inference run. The default of 3 reproduces the previously
            hard-coded value. NOTE(review): presumably this must equal the
            width of the model's input (state + action dims) — confirm.

    Returns:
        The same ``policy`` object that was passed in.
    """
    mb_alg = ModelBasedAlgorithm(model=model, 
                                 observed=[model.X], 
                                 cost_function=cost_func, 
                                 policy=policy, 
                                 n_time_steps=num_time_steps,
                                 initial_state_generator=initial_state_generator,
                                 num_samples=num_samples)
    
    # Only gluon Blocks expose trainable parameters; other callables train nothing.
    train_params = policy.collect_params() if isinstance(policy, Block) else ParameterDict()
    infr_pred = GradTransferInference(mb_alg, 
                                  infr_params=infr.params, train_params=train_params)
    # model.X is declared as observed, so an array must be supplied for it;
    # an all-zeros array of the right width is passed.
    infr_pred.run(max_iter=num_grad_steps, 
                  X=mx.nd.array(np.zeros((1, model_input_dim)), dtype='float64'),
                  verbose=verbose,
                  learning_rate=learning_rate)
    return policy

In [None]:
class CostFunction(mx.gluon.HybridBlock):
    """Quadratic penalty on how far the first state feature is from 0.45."""

    def hybrid_forward(self, F, x):
        """Return ``10 * (x[..., 0] - 0.45)**2`` summed over the last axis.

        ``x`` is indexed with three subscripts, so it must be rank 3.
        NOTE(review): presumably laid out as (samples, time, state_dim) with
        the position as feature 0 — confirm against the caller.
        """
        first_feature = x[:, :, 0:1]
        squared_error = (first_feature - 0.45) ** 2
        return F.sum(10 * squared_error, axis=-1)


cost = CostFunction()

In [None]:
def initial_state_generator(num_initial_states):
    """Sample random initial states as a float64 array of shape (num_initial_states, 2).

    The first column (position) is uniform between -1.2 and 0.6, the second
    column (velocity) is uniform between -0.07 and 0.07.
    """
    column_shape = (num_initial_states, 1)
    positions = mx.nd.random.uniform(low=-1.2, high=0.6,
                                     shape=column_shape, dtype='float64')
    velocities = mx.nd.random.uniform(low=-0.07, high=0.07,
                                      shape=column_shape, dtype='float64')
    return mx.nd.concat(positions, velocities, dim=-1)

In [None]:
# Quick check: draw 10 initial states and show them as the cell output.
initial_state_generator(10)

# Full PILCO

In [None]:
# Show one random sample from the environment's observation space.
env.observation_space.sample()

In [None]:
from mxnet.gluon.nn import HybridSequential
# Initial step: create the cost and a freshly initialised policy before the learning loop.

cost = CostFunction()
# Policy network: one Dense layer with 2 inputs and 1 output, tanh activation, float64.
policy = HybridSequential()
policy.add(mx.gluon.nn.Dense(1, in_units=2, dtype='float64', activation='tanh'))
# Alternative without the tanh activation, kept for reference:
# policy = mx.gluon.nn.Dense(1, in_units=2, dtype='float64')
policy.collect_params().initialize(mx.init.Xavier(magnitude=3.))

In [None]:
# Start from an empty dataset; each episode below appends one trajectory.
all_states = []
all_actons = []

# Number of collect / fit / optimise rounds, and samples used per policy optimisation.
num_episode = 10
num_samples = 10

# Settings forwarded to optimize_policy in every round.
num_grad_steps = 500
learning_rate = 1e-1

for i_ep in range(num_episode):
    print('Start Episode '+str(i_ep+1)+'.')
    
    # Run an episode and collect data.
    # The first episode uses the random policy; later ones wrap the current policy
    # network so it accepts a raw observation and returns a numpy action.
    if i_ep==0:
        policy_func = random_policy
    else:
        # `policy` is looked up when the lambda is called, so it always uses the
        # most recently optimised policy. The observation gets a leading batch
        # axis before the call and the first row of the output is returned.
        policy_func = lambda x: policy(mx.nd.expand_dims(mx.nd.array(x, dtype='float64'), axis=0)).asnumpy()[0]
    total_reward, states, actions = run_one_episode(env, policy_func, max_steps=100, render=True)
    all_states.append(states)
    all_actons.append(actions)
    
    # Fit a model on all trajectories collected so far (refitted from scratch each round).
    print('Fit the model.')
    model, infr = fit_model(all_states, all_actons, win_in=1, verbose=False)
    
    # NOTE(review): `initial_state` is not used anywhere below in this cell;
    # optimize_policy receives `initial_state_generator` instead — confirm whether this line is still needed.
    initial_state = mx.nd.array(env.observation_space.sample()[None,:], dtype='float64')
    
    # Optimize the policy against the freshly fitted model.
    print('Optimize the policy.')
    policy = optimize_policy(policy, cost, model,
                             infr, initial_state_generator,
                             num_grad_steps=num_grad_steps, 
                             num_samples=num_samples,
                             learning_rate=learning_rate)

1. update the optimize policy function.
2. multiple initial states.
3. use the real reward function. (https://github.com/openai/gym/blob/master/gym/envs/classic_control/continuous_mountain_car.py)
4. visualize the intermediate and final performance of policy.
5. Make a notebook to show.
6. Try a non-linear policy.
7. add epsilon-greedy policy

In [None]:
# Roll out `policy_func` as left by the learning loop, with max_steps=10000 and rendering on.
# `parameters1` holds the returned (total_reward, observations, actions) tuple.
parameters1 = run_one_episode(env, policy_func, max_steps=10000, render=True)

In [None]:
# Show the third element of the rollout result, i.e. the recorded actions array.
parameters1[2]