In [1]:
# Environment and library configuration for the rest of the notebook.
import os
# Select MXNet's 'NaiveEngine' through the environment.
# NOTE(review): presumably this has to be set before mxnet is first imported
# to take effect -- confirm against the MXNet documentation.
os.environ['MXNET_ENGINE_TYPE'] = 'NaiveEngine'
from mxfusion.common import config
# Make float64 MXFusion's default dtype; the later cells also build their
# arrays and policy layers with dtype='float64'.
config.DEFAULT_DTYPE = 'float64'

In [2]:
import gym
# Continuous-action Mountain Car environment; later cells use it through the
# global name `env` for rollouts and for sampling observations.
env = gym.make('MountainCarContinuous-v0')

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


In [22]:
import numpy as np

def run_one_episode(env, policy, max_steps=None, verbose=False, render=False):
    """Roll out `policy` in `env` for one episode and record the trajectory.

    Parameters
    ----------
    env : object
        Environment exposing `reset()` and `step(action)`, where `step`
        returns `(observation, reward, done, info)`. `render()` is only
        called when `render` is true.
    policy : callable
        Maps the current observation to an action.
    max_steps : int or None
        Maximum number of environment steps to take. `None` (default) runs
        until the environment reports `done`.
    verbose : bool
        Print every observation before the policy acts on it.
    render : bool
        Call `env.render()` before every step.

    Returns
    -------
    total_reward : number
        Sum of the rewards returned by `env.step`.
    observations : numpy.ndarray (float64)
        The initial observation followed by one observation per step, i.e.
        one more row than `actions`.
    actions : numpy.ndarray (float64)
        One entry per step taken.
    """
    observation = env.reset()
    step_idx = 0
    done = False
    total_reward = 0
    all_observations = [np.array(observation)]
    all_actions = []
    # Take exactly `max_steps` steps at most (the previous condition stopped
    # one step early), or stop as soon as the environment signals `done`.
    while not done and (max_steps is None or step_idx < max_steps):
        if render:
            env.render()
        if verbose:
            print(observation)
        action = policy(observation)
        observation, reward, done, info = env.step(action)
        all_observations.append(np.array(observation))
        all_actions.append(action)
        total_reward += reward
        step_idx += 1
    # `step_idx` already counts the steps taken, so report it as is.
    print("Episode finished after {} timesteps".format(step_idx))
    return (total_reward,
            np.array(all_observations, dtype=np.float64),
            np.array(all_actions, dtype=np.float64))

In [14]:
def random_policy(state):
    """Return a random one-element action; `state` is ignored.

    The action is a single value drawn uniformly from [-1, 1), wrapped in a
    list.
    """
    unit_draw = np.random.rand()  # uniform sample on [0, 1)
    action_value = unit_draw * 2 - 1  # rescale to [-1, 1)
    return [action_value]

In [4]:
def prepare_data(state_list, action_list, win_in):
    """Turn recorded trajectories into (input, target) regression pairs.

    For every trajectory and every window start `i`, the input row is the
    flattened states `i .. i+win_in-1` followed by the flattened actions
    `i .. i+win_in-1`; the target row is the state at index `i+win_in`.

    Parameters
    ----------
    state_list : sequence of numpy.ndarray
        One array of states per trajectory, indexed by time along axis 0.
    action_list : sequence of numpy.ndarray
        One array of actions per trajectory; each must have exactly one entry
        fewer than the matching state array.
    win_in : int
        Number of consecutive time steps packed into one input row.

    Returns
    -------
    (X, Y) : tuple of numpy.ndarray
        Row-stacked inputs and targets over all trajectories.
    """
    inputs, targets = [], []

    for states, actions in zip(state_list, action_list):
        n_states = states.shape[0]
        # Each action leads from one state to the next, so there is one less.
        assert n_states - 1 == actions.shape[0]

        for start in range(n_states - win_in):
            stop = start + win_in
            window = [states[start:stop].flatten(), actions[start:stop].flatten()]
            targets.append(states[stop:stop + 1])
            inputs.append(np.hstack(window))

    return np.vstack(inputs), np.vstack(targets)

In [30]:
from mxfusion import Model, Variable
from mxfusion.components.variables import PositiveTransformation
from mxfusion.components.distributions.gp.kernels import RBF
from mxfusion.modules.gp_modules import GPRegression, GPRegressionSamplingPrediction
import mxnet as mx
from mxfusion.inference import GradBasedInference, MAP

    
def fit_model(state_list, action_list, win_in, verbose=False, max_iter=100, learning_rate=0.5):
    """Fit a GP regression model to the recorded transitions.

    The trajectories are converted to regression pairs with `prepare_data`,
    an MXFusion `GPRegression` model with an ARD RBF kernel is defined over
    them, and its parameters are estimated with gradient-based MAP inference.
    The targets are used as they are (no normalisation is applied).

    Parameters
    ----------
    state_list, action_list, win_in
        Forwarded unchanged to `prepare_data`.
    verbose : bool
        Forwarded to the inference `run` call.
    max_iter : int
        Number of optimisation iterations (default 100, the value that was
        previously hard-coded).
    learning_rate : float
        Optimiser learning rate (default 0.5, the value that was previously
        hard-coded).

    Returns
    -------
    (m, infr) : tuple
        The MXFusion `Model` and the `GradBasedInference` object that was run
        on it.
    """
    X, Y = prepare_data(state_list, action_list, win_in)

    m = Model()
    m.N = Variable()
    m.X = Variable(shape=(m.N, X.shape[-1]))
    # Noise variance is constrained to be positive and starts at 0.01.
    m.noise_var = Variable(shape=(1,), transformation=PositiveTransformation(), initial_value=0.01)
    m.kernel = RBF(input_dim=X.shape[-1], variance=1, lengthscale=1, ARD=True)
    m.Y = GPRegression.define_variable(X=m.X, kernel=m.kernel, noise_var=m.noise_var, shape=(m.N, Y.shape[-1]))
    # NOTE(review): presumably a diagonal jitter for numerical stability of the
    # GP log-pdf -- confirm against the MXFusion implementation.
    m.Y.factor.gp_log_pdf.jitter = 1e-6

    # Register a sampling-based prediction algorithm on the GP module under the
    # name 'gp_predict'.
    gp = m.Y.factor
    gp.attach_prediction_algorithms(targets=gp.output_names, conditionals=gp.input_names,
                algorithm=GPRegressionSamplingPrediction(
                    gp._module_graph, gp._extra_graphs[0], [gp._module_graph.X]),
                alg_name='gp_predict')

    infr = GradBasedInference(inference_algorithm=MAP(model=m, observed=[m.X, m.Y]))
    infr.run(X=mx.nd.array(X, dtype='float64'), Y=mx.nd.array(Y, dtype='float64'),
             max_iter=max_iter, learning_rate=learning_rate, verbose=verbose)
    return m, infr

In [45]:
# Buffers holding one state array / one action array per collected episode.
# The spelling `all_actons` is kept because the other cells refer to that name.
all_states, all_actons = [], []

In [46]:
# Collect one trajectory with the random policy, capped via max_steps=100.
total_reward, states, actions = run_one_episode(env, random_policy, max_steps=100)
# Add the trajectory to the data set. Re-running this cell appends another
# episode, so the buffers grow with every execution.
all_states.append(states)
all_actons.append(actions)

Episode finished after 100 timesteps


In [47]:
# Fit the GP model on all episodes collected so far; win_in=1 means each input
# row is built from a single time step (one state plus one action).
model, infr = fit_model(all_states, all_actons, win_in=1)

Iteration 11 loss: -715.0870941751563
Iteration 21 loss: -1073.0623347819194
Iteration 31 loss: -1118.3911558269015
Iteration 41 loss: -1121.8266419700898
Iteration 51 loss: -1123.8544475845758
Iteration 61 loss: -1125.1624248402813
Iteration 71 loss: -1125.9986969495155
Iteration 81 loss: -1126.6653574107725
Iteration 91 loss: -1127.2053957573646
Iteration 100 loss: -1127.630256266021

In [26]:
from mxfusion.inference import GradTransferInference, ModelBasedAlgorithm, BatchInferenceLoop

def optimize_policy(policy, cost_func, model, infr, initial_states, num_grad_steps, learning_rate=1e-2, num_time_steps=100, num_samples=10, verbose=True, n_prints=10):
    """Optimise `policy` with Adam against rollouts through the fitted model.

    A `ModelBasedAlgorithm` is built from `model`, `cost_func` and `policy`,
    wrapped in a `GradTransferInference` that reuses `infr.params`, and its
    first output is used as the loss whose gradient updates the policy
    parameters.

    Parameters
    ----------
    policy : mx.gluon.Block
        Policy whose parameters (`policy.collect_params()`) are trained in place.
    cost_func
        Cost function handed to `ModelBasedAlgorithm`.
    model, infr
        Fitted model and its inference object, as returned by `fit_model`.
    initial_states
        Initial state(s) handed to `ModelBasedAlgorithm`.
    num_grad_steps : int
        Number of gradient updates.
    learning_rate : float
        Adam learning rate.
    num_time_steps : int
        Passed as `n_time_steps` to `ModelBasedAlgorithm`.
    num_samples : int
        Passed as `num_samples` to `ModelBasedAlgorithm`.
    verbose : bool
        Print the loss after every step on a self-overwriting line.
    n_prints : int
        Approximate number of progress lines that are kept (not overwritten).

    Returns
    -------
    The same `policy` object that was passed in.
    """
    mb_alg = ModelBasedAlgorithm(model=model,
                                 observed=[model.X],
                                 cost_function=cost_func,
                                 policy=policy,
                                 n_time_steps=num_time_steps,
                                 initial_state=initial_states, num_samples=num_samples)
    infr_pred = GradTransferInference(mb_alg,
                                      infr_params=infr.params)

    trainer = mx.gluon.Trainer(policy.collect_params(),
                               optimizer='adam',
                               optimizer_params={'learning_rate':
                                                 learning_rate})
    # Guard against n_prints <= 0, which used to raise ZeroDivisionError.
    iter_step = max(num_grad_steps // max(n_prints, 1), 1)
    last_step = num_grad_steps - 1
    for i in range(num_grad_steps):
        with mx.autograd.record():
            # NOTE(review): X looks like a placeholder of width 3 (presumably
            # 2 state dims + 1 action dim for this environment) -- confirm how
            # ModelBasedAlgorithm uses it. `verbose=True` is passed to `run`
            # unconditionally, exactly as before.
            loss_for_gradient = infr_pred.run(X=mx.nd.array(np.zeros((1,3)), dtype='float64'), verbose=True)[0]
            loss_for_gradient.backward()
        if verbose:
            print('\rIteration {} loss: {}'.format(i + 1, loss_for_gradient.asscalar()),
                  end='')
            # Keep every `iter_step`-th line, and always terminate the final
            # line so that later output does not get glued onto it.
            if (i % iter_step == 0 and i > 0) or i == last_step:
                print()
        trainer.step(batch_size=1, ignore_stale_grad=True)
    return policy

In [7]:
class CostFunction(mx.gluon.HybridBlock):
    """Squared distance of the first state component from 0.45.

    NOTE(review): 0.45 is presumably the goal position of the Mountain Car
    environment, and the first component the car position -- confirm.
    """

    def hybrid_forward(self, F, x):
        # `x` is indexed with three axes; only component 0 of the last axis is
        # used, kept as a length-1 slice so the sum below removes that axis.
        first_component = x[:, :, 0:1]
        squared_error = (first_component - 0.45) ** 2
        return F.sum(squared_error, axis=-1)


cost = CostFunction()

In [50]:
# Single fixed starting state (1 row, 2 components) for the simulated rollouts.
# NOTE(review): the first component is 10., far from the sampled observation
# shown further down (about -0.21) -- confirm this value is intended.
initial_state = mx.nd.array([[10., 0]], dtype='float64') # TODO: draw initial states from a proposal distribution instead of reusing the same one.
# Linear policy: a dense layer mapping the 2 state components to 1 action.
linear_policy = mx.gluon.nn.Dense(1, in_units=2, dtype='float64')
# Every parameter starts at the constant 0.
linear_policy.collect_params().initialize(mx.init.Constant(0))

In [51]:
# Train the linear policy for 500 gradient steps with optimize_policy's default
# settings. The returned policy is the cell's last expression, so it is
# displayed as the cell output.
optimize_policy(linear_policy, cost, model, infr, initial_state, num_grad_steps=500)

Iteration 1 loss: 1128.661684575488



Iteration 51 loss: 886.76195960013247
Iteration 101 loss: 680.8295883913074
Iteration 151 loss: 613.6087695956486
Iteration 201 loss: 598.4990246624612
Iteration 251 loss: 604.59476011392335
Iteration 301 loss: 488.04544184232117
Iteration 351 loss: 412.35642645646953
Iteration 401 loss: 441.32428483916243
Iteration 451 loss: 410.05913475920187
Iteration 500 loss: 476.74095051006924

Dense(2 -> 1, linear)

# Full PILCO

In [24]:
# Draw one sample from the environment's observation space to inspect its
# shape and dtype (the loop below samples initial states the same way).
env.observation_space.sample()

array([-0.2121357 ,  0.03012651], dtype=float32)

In [31]:
# Full loop: alternate between collecting data in the real environment,
# refitting the GP model on all data so far, and optimising the policy on it.

# Fresh cost function and zero-initialised linear policy (2 inputs -> 1 action).
cost = CostFunction()
policy = mx.gluon.nn.Dense(1, in_units=2, dtype='float64')
policy.collect_params().initialize(mx.init.Constant(0))

# Start from empty data buffers (this discards episodes collected earlier).
all_states = []
all_actons = []

num_episode = 10
num_samples = 10

for i_ep in range(num_episode):
    print('Start Episode '+str(i_ep+1)+'.')
    
    # Run an episode and collect data.
    if i_ep==0:
        # No trained policy yet, so the first episode uses random actions.
        policy_func = random_policy
    else:
        # Wrap the Gluon policy for the environment: add a batch axis to the
        # observation, evaluate the policy, and return the first row as numpy.
        policy_func = lambda x: policy(mx.nd.expand_dims(mx.nd.array(x, dtype='float64'), axis=0)).asnumpy()[0]
    total_reward, states, actions = run_one_episode(env, policy_func, max_steps=100, render=True)
    all_states.append(states)
    all_actons.append(actions)
    
    # Fit a model on every episode collected so far.
    print('Fit the model.')
    model, infr = fit_model(all_states, all_actons, win_in=1, verbose=False)
    
    # Sample a new initial state (with a leading batch axis) for the simulated rollouts.
    initial_state = mx.nd.array(env.observation_space.sample()[None,:], dtype='float64')
    
    # Optimize the policy; the same policy object is carried over between episodes.
    print('Optimize the policy.')
    policy = optimize_policy(policy, cost, model, infr, initial_state, num_grad_steps=500, 
                             num_samples=num_samples, learning_rate=1e-1)

Episode finished after 100 timesteps
Fit the model.
Optimize the policy.
Iteration 51 loss: 324.73691639350056
Iteration 101 loss: 348.38366805579176
Iteration 151 loss: 296.61599860145683
Iteration 201 loss: 291.04795194208975
Iteration 251 loss: 308.28248005763183
Iteration 301 loss: 290.44231868810135
Iteration 351 loss: 271.26223955134463
Iteration 401 loss: 259.59560418354766
Iteration 451 loss: 255.81599553249623
Iteration 500 loss: 241.18436722712633Episode finished after 100 timesteps
Fit the model.
Optimize the policy.
Iteration 51 loss: 296.97328512796815
Iteration 101 loss: 330.50228379947683
Iteration 151 loss: 285.55035519206354
Iteration 201 loss: 307.42084914103816
Iteration 251 loss: 302.62367196917756
Iteration 301 loss: 322.00147128213366
Iteration 351 loss: 300.46278218699075
Iteration 401 loss: 293.72044580923894
Iteration 451 loss: 313.47061331105944
Iteration 500 loss: 303.0943083723705Episode finished after 100 timesteps
Fit the model.
Optimize the policy.
Iterat

KeyboardInterrupt: 

1. Update the `optimize_policy` function.
2. Support multiple initial states.
3. Use the real reward function. (https://github.com/openai/gym/blob/master/gym/envs/classic_control/continuous_mountain_car.py)
4. Visualize the intermediate and final performance of the policy.
5. Make a notebook to show the results.
6. Try a non-linear policy.