In [1]:
import os
# Select MXNet's 'NaiveEngine' via the environment. This is done before the
# first mxnet import below (mxfusion imports mxnet) so the setting is already
# in place when the library is loaded.
os.environ['MXNET_ENGINE_TYPE'] = 'NaiveEngine'
from mxfusion.common import config
# Every array in this notebook is created with dtype='float64'; make that the
# MXFusion default as well.
config.DEFAULT_DTYPE = 'float64'
import mxnet as mx
# Use the GPU when one is present, otherwise fall back to the CPU so the
# notebook still runs on machines without CUDA.
mx.context.Context.default_ctx = mx.gpu() if mx.context.num_gpus() > 0 else mx.cpu()

In [2]:
import gym
# Global environment used by every rollout and by initial_state_generator().
# NOTE(review): some recorded outputs further down show 2-component
# observations, which does not match the 3-input Controller; they may come from
# a run with a different environment — confirm before relying on them.
env = gym.make('Pendulum-v0')

In [3]:
import matplotlib.pyplot as plt
%matplotlib inline

from matplotlib import animation, rc
from IPython.display import HTML


def display_frames_as_gif(frames):
    """
    Build a matplotlib animation that plays back a sequence of frames.

    The first frame is drawn with ``plt.imshow`` on the current pyplot figure
    (axes hidden); every animation tick then swaps the image data for the
    frame with the matching index.

    Args:
        frames: indexable, non-empty sequence of images accepted by
            ``plt.imshow`` (``frames[0]`` is read immediately).

    Returns:
        The ``animation.FuncAnimation`` object. Nothing is rendered here; the
        caller decides how to show it (e.g. ``HTML(anim.to_jshtml())``).
    """
    image = plt.imshow(frames[0])
    plt.axis('off')
    figure = plt.gcf()

    def _show_frame(frame_idx):
        # Reuse the single image artist and only replace its pixel data.
        image.set_data(frames[frame_idx])

    return animation.FuncAnimation(figure, _show_frame, frames=len(frames), interval=20)
#     HTML(anim.to_html5_video())
#     HTML(anim.to_jshtml())
#     display(display_animation(anim, default_mode='loop'))

In [4]:
import numpy as np

def run_one_episode(env, policy, max_steps=None, verbose=False, render=False):
    """Roll out ``policy`` in ``env`` for one episode and record the trajectory.

    Args:
        env: object exposing ``reset()``, ``step(action)`` returning
            ``(observation, reward, done, info)`` and, when ``render`` is set,
            ``render(mode='rgb_array')``.
        policy: callable mapping an observation to an action.
        max_steps: maximum number of environment steps to take; ``None`` means
            run until the environment reports ``done``.
        verbose: print every ``(observation, action)`` pair.
        render: collect a frame before each step and build an animation.

    Returns:
        ``(total_reward, observations, actions)`` where ``observations`` holds
        the initial observation plus one row per step and ``actions`` holds one
        row per step, both as float64 arrays. With ``render=True`` the
        animation returned by ``display_frames_as_gif`` is appended as a fourth
        element.
    """
    observation = env.reset()
    step_idx = 0
    done = False
    total_reward = 0
    all_observations = [np.array(observation)]
    all_actions = []
    frames = []
    while not done:
        if render:
            frames.append(env.render(mode='rgb_array'))
        action = policy(observation)
        if verbose:
            print(observation, action)
        observation, reward, done, info = env.step(action)
        all_observations.append(observation)
        all_actions.append(action)
        total_reward += reward
        step_idx += 1
        # step_idx already counts the step just taken, so it is compared with
        # max_steps directly and reported as-is (previously the cap was
        # max_steps-1 and the message printed step_idx+1).
        if done or (max_steps is not None and step_idx >= max_steps):
            print("Episode finished after {} timesteps".format(step_idx))
            break

    observations = np.array(all_observations, dtype=np.float64)
    actions = np.array(all_actions, dtype=np.float64)
    if render:
        anim = display_frames_as_gif(frames)
        return total_reward, observations, actions, anim
    return total_reward, observations, actions


In [5]:
def random_policy(state):
    """Return a one-element action drawn uniformly from [-1, 1).

    ``state`` is accepted so the function has the same call shape as a real
    policy, but it is not used.
    """
    unit_sample = np.random.rand()
    action = unit_sample * 2 - 1
    return [action]

In [6]:
def prepare_data(state_list, action_list, win_in):
    """Turn recorded episodes into regression inputs and targets.

    For every episode and every window start ``i`` the input row is the
    flattened states ``[i, i+win_in)`` followed by the flattened actions
    ``[i, i+win_in)``; the target row is the state at index ``i+win_in``.

    Args:
        state_list: list of per-episode state arrays (first axis is time).
        action_list: list of per-episode action arrays; each must have exactly
            one row fewer than the matching state array.
        win_in: number of consecutive time steps packed into one input row.

    Returns:
        ``(X, Y)``: all input rows and all target rows stacked vertically.
    """
    inputs, targets = [], []

    for states, actions in zip(state_list, action_list):
        n_states = states.shape[0]
        # One action per transition, hence one fewer than the states.
        assert n_states - 1 == actions.shape[0]

        for start in range(n_states - win_in):
            stop = start + win_in
            window = np.hstack([states[start:stop].flatten(),
                                actions[start:stop].flatten()])
            targets.append(states[stop:stop + 1])
            inputs.append(window)

    X = np.vstack(inputs)
    Y = np.vstack(targets)
    return X, Y

In [7]:
from mxfusion import Model, Variable
from mxfusion.components.variables import PositiveTransformation
from mxfusion.components.distributions.gp.kernels import RBF
from mxfusion.modules.gp_modules import GPRegression, GPRegressionSamplingPrediction
import mxnet as mx
from mxfusion.inference import GradBasedInference, MAP

    
def fit_model(state_list, action_list, win_in, verbose=False, max_iter=100, learning_rate=0.5):
    """Fit a GP regression model that predicts the next state from a window of
    past states and actions.

    The training set is built with ``prepare_data``. The model uses an ARD RBF
    kernel over the input columns and a positive noise variance initialised to
    0.01; its parameters are estimated by MAP with both ``X`` and ``Y``
    observed.

    Args:
        state_list: list of per-episode state arrays.
        action_list: list of per-episode action arrays.
        win_in: window length forwarded to ``prepare_data``.
        verbose: forwarded to the inference ``run`` call.
        max_iter: number of optimisation iterations (previously fixed at 100).
        learning_rate: optimiser step size (previously fixed at 0.5).

    Returns:
        ``(model, inference, X, Y)`` — the MXFusion model, the inference object
        that was run on it, and the numpy training inputs/targets.
    """
    X, Y = prepare_data(state_list, action_list, win_in)

#     Y_mean = Y.mean()
#     Y_std = Y.std()
#     Y = (Y-Y_mean)/(Y_std + 1e-10)

    m = Model()
    m.N = Variable()
    m.X = Variable(shape=(m.N, X.shape[-1]))
    m.noise_var = Variable(shape=(1,), transformation=PositiveTransformation(), initial_value=0.01)
    m.kernel = RBF(input_dim=X.shape[-1], variance=1, lengthscale=1, ARD=True)
    m.Y = GPRegression.define_variable(X=m.X, kernel=m.kernel, noise_var=m.noise_var, shape=(m.N, Y.shape[-1]))
    m.Y.factor.gp_log_pdf.jitter = 1e-6

    # Register a sampling-based predictor under the name 'gp_predict' on the GP
    # module.
    # NOTE(review): gp._extra_graphs[0] is presumably the posterior graph —
    # confirm against the MXFusion module implementation.
    gp = m.Y.factor
    gp.attach_prediction_algorithms(targets=gp.output_names, conditionals=gp.input_names,
                algorithm=GPRegressionSamplingPrediction(
                    gp._module_graph, gp._extra_graphs[0], [gp._module_graph.X]),
                alg_name='gp_predict')

    infr = GradBasedInference(inference_algorithm=MAP(model=m, observed=[m.X, m.Y]))
    infr.run(X=mx.nd.array(X, dtype='float64'), Y=mx.nd.array(Y, dtype='float64'),
             max_iter=max_iter, learning_rate=learning_rate, verbose=verbose)
    return m, infr, X, Y #, Y_mean, Y_std

In [13]:
from mxfusion.inference import GradTransferInference, ModelBasedAlgorithm, BatchInferenceLoop, TransferInference
from mxnet.gluon import Block
from mxnet.gluon.parameter import ParameterDict

def optimize_policy(policy, cost_func, model, infr, model_data_X, model_data_Y,
                    initial_state_generator, num_grad_steps,
                    learning_rate=1e-2, momentum=0, num_time_steps=100, 
                    num_samples=10, verbose=True):
    """Improve ``policy`` by gradient steps on the cost of rollouts simulated
    with the fitted GP model.

    Args:
        policy: the controller; when it is a Gluon ``Block`` its parameters are
            the ones being trained, otherwise an empty ``ParameterDict`` is
            passed as the trainable set.
        cost_func: cost function handed to the policy-update algorithm.
        model: fitted MXFusion model; ``model.X`` and ``model.Y`` are observed.
        infr: inference object from model fitting; its ``params`` are reused.
        model_data_X: training inputs of the model (converted to float64).
        model_data_Y: training targets of the model (converted to float64).
        initial_state_generator: callable forwarded to the algorithm to draw
            rollout start states.
        num_grad_steps: number of optimisation iterations (``max_iter``).
        learning_rate: forwarded to the inference ``run`` call.
        momentum: forwarded to the inference ``run`` call.
        num_time_steps: rollout length (``n_time_steps`` of the algorithm).
        num_samples: number of samples used by the algorithm.
        verbose: forwarded to the inference ``run`` call.

    Returns:
        The same ``policy`` object that was passed in.
    """
    # Import only the names this function uses.
    from mxfusion.inference import GradTransferInference
    from mxfusion.inference.model_based_alg import PolicyUpdateGPParametricApprox

    mb_alg = PolicyUpdateGPParametricApprox(model=model,
                                            observed=[model.X, model.Y],
                                            cost_function=cost_func,
                                            policy=policy,
                                            n_time_steps=num_time_steps,
                                            initial_state_generator=initial_state_generator,
                                            num_samples=num_samples)

    train_params = policy.collect_params() if isinstance(policy, Block) else ParameterDict()
    infr_pred = GradTransferInference(mb_alg,
                                      infr_params=infr.params, train_params=train_params)
    infr_pred.run(max_iter=num_grad_steps,
                  X=mx.nd.array(model_data_X, dtype='float64'), Y=mx.nd.array(model_data_Y, dtype='float64'),
                  verbose=verbose,
                  learning_rate=learning_rate, momentum=momentum)
    return policy

# def optimize_policy(policy, cost_func, model, infr, model_data_X, model_data_Y,
#                     initial_state_generator, num_grad_steps,
#                     learning_rate=1e-2, num_time_steps=100, 
#                     num_samples=10, verbose=True):
    
#     from mxfusion.inference import GradTransferInference, ModelBasedAlgorithm, BatchInferenceLoop
#     from mxfusion.inference.model_based_alg import PolicyUpdateGPParametricApprox
#     mb_alg = ModelBasedAlgorithm(model=model, 
#                                  observed=[model.X], 
#                                  cost_function=cost_func, 
#                                  policy=policy, 
#                                  n_time_steps=num_time_steps,
#                                  initial_state_generator=initial_state_generator,
#                                  num_samples=num_samples)
    
#     train_params = policy.collect_params() if isinstance(policy, Block) else ParameterDict()
#     infr_pred = GradTransferInference(mb_alg, 
#                                   infr_params=infr.params, train_params=train_params)
#     infr_pred.run(max_iter=num_grad_steps, 
#                   X=mx.nd.array(model_data_X, dtype='float64'),
#                   verbose=verbose,
#                   learning_rate=learning_rate)
#     return policy

In [14]:
class CostFunction(mx.gluon.HybridBlock):
    """Quadratic cost evaluated on a rank-3 input.

    Penalises the squared distance of the feature at index 0 (last axis) from 1
    with weight 2, plus the squared feature at index 2 with weight 0.1. The
    result is summed over the last axis.
    NOTE(review): for Pendulum these features are presumably cos(theta) and the
    angular velocity — confirm against the environment's observation layout.
    """
    def hybrid_forward(self, F, x):
        first_feature = x[:, :, 0:1]
        third_feature = x[:, :, 2:3]
        per_step_cost = 2*(first_feature - 1)**2 + .1*third_feature**2
        return F.sum(per_step_cost, axis=-1)

cost = CostFunction()

In [15]:
def initial_state_generator(num_initial_states):
    """Draw ``num_initial_states`` samples from the global ``env``'s
    observation space and return them as one float64 MXNet array."""
    samples = [env.observation_space.sample() for _ in range(num_initial_states)]
    return mx.nd.array(samples, dtype='float64')
#     return mx.nd.array([[0, 1, 0]], dtype='float64')

In [16]:
class Controller(mx.gluon.HybridBlock):
    """Small feed-forward policy: 3 inputs -> 50 tanh units -> 1 tanh unit.

    The final tanh output is multiplied by 2, so actions lie in [-2, 2] without
    an explicit clip. The layers use fixed prefixes ('dense1_', 'dense2_').
    """
    def __init__(self, prefix=None, params=None):
        super().__init__(prefix=prefix, params=params)
        layer_kwargs = dict(dtype='float64', activation='tanh')
        self.dense1 = mx.gluon.nn.Dense(50, in_units=3, prefix='dense1_', **layer_kwargs)
        self.dense2 = mx.gluon.nn.Dense(1, in_units=50, prefix='dense2_', **layer_kwargs)

    def hybrid_forward(self, F, x):
        hidden = self.dense1(x)
        squashed = self.dense2(hidden)
        # tanh is bounded by [-1, 1]; the factor 2 rescales it to [-2, 2].
        return squashed * 2

In [17]:
# Sanity check: draw ten initial states and display them.
initial_state_generator(10)


[[-0.47088876  0.5484674  -0.70159471]
 [ 0.1368679  -0.9624204   1.88216794]
 [ 0.22419144  0.23386799  7.09996939]
 [ 0.36364061 -0.28098419 -1.00748873]
 [ 0.39526239 -0.87954909  2.66826749]
 [ 0.34127575 -0.5792349  -5.93717909]
 [-0.36914331 -0.27257845  1.12314832]
 [-0.12279698  0.97674769 -6.36728287]
 [-0.58224648 -0.67738098  2.44973326]
 [-0.49341679 -0.06737845 -4.08919048]]
<NDArray 10x3 @gpu(0)>

In [18]:
# optimize_policy(linear_policy, cost, model, infr, initial_state_generator, num_grad_steps=500, learning_rate=1e-1)

# Full PILCO

In [19]:
# Initial step: create the cost and the policy, then alternate between
# (1) collecting one episode, (2) refitting the GP model on all data so far and
# (3) improving the policy against that model.

cost = CostFunction()

# policy = mx.gluon.nn.Dense(1, in_units=2, dtype='float64', activation='tanh', prefix='dense_')
policy = Controller()
policy.collect_params().initialize(mx.initializer.Xavier(magnitude=1))
# policy.collect_params()['dense_weight'].set_data(mx.nd.array(np.random.randn(1,3), dtype='float64'))
# policy.collect_params()['dense_bias'].set_data(mx.nd.array(np.random.randn(1), dtype='float64'))

# Per-episode trajectories. They are never cleared, and a later cell appends to
# the same lists, so each refit sees every episode collected so far.
# NOTE(review): `all_actons` is a misspelling of "all_actions"; the name is
# used by a later cell too, so rename it everywhere at once.
all_states = []
all_actons = []

num_episode = 20
num_samples = 100

num_grad_steps = 200
learning_rate = 1e-3
# Passed through to optimize_policy below.
momentum = None

# Only referenced by the commented-out retry logic further down.
n_retries = 2

for i_ep in range(num_episode):
    print('Start Episode '+str(i_ep+1)+'.')
    
    # Run an episode and collect data.
    # The first episode uses random actions; afterwards the current policy is
    # used. The lambda looks up the global `policy` when it is called.
    if i_ep==0:
        policy_func = random_policy
    else:
        policy_func = lambda x: policy(mx.nd.expand_dims(mx.nd.array(x, dtype='float64'), axis=0)).asnumpy()[0]
    total_reward, states, actions = run_one_episode(env, policy_func, max_steps=100, render=False)
    all_states.append(states)
    all_actons.append(actions)
    
    # Fit a model.
    print('Fit the model.')
    model, infr, model_data_X, model_data_Y = fit_model(all_states, all_actons, win_in=1, verbose=False)
    
    # NOTE(review): `initial_state` is not used below; optimize_policy receives
    # `initial_state_generator` instead.
    initial_state = mx.nd.array(env.observation_space.sample()[None,:], dtype='float64')
    
    # Optimize the policy.
    print('Optimize the policy.')
#     policies = []
#     scores = []
#     for i in range(n_retries):
    policy = optimize_policy(policy, cost, model,
                             infr, model_data_X, model_data_Y, initial_state_generator,
                             num_grad_steps=num_grad_steps, 
                             num_samples=num_samples,
                             num_time_steps=100,
                             learning_rate=learning_rate, momentum=momentum)
#         policies.append(policy)
#         scores.append(score)
#         if i < n_retries-1:
#             policy = LinearController()
#             policy.collect_params().initialize()
#             policy.collect_params()['dense_weight'].set_data(mx.nd.array(np.random.randn(1,2)*5, dtype='float64'))
#             policy.collect_params()['dense_bias'].set_data(mx.nd.array(np.random.randn(1)*5, dtype='float64'))
#     print()
#     print(scores)
#     policy = policies[np.argmin(scores)]

Start Episode 1.
Episode finished after 100 timesteps
Fit the model.




Optimize the policy.
Iteration 21 loss: 536.6989388274932				
Iteration 41 loss: 540.9356310977009			
Iteration 61 loss: 546.4806935937497			
Iteration 81 loss: 513.0374396095021			
Iteration 101 loss: 540.3277068192117			
Iteration 121 loss: 510.2596982695519				
Iteration 141 loss: 519.6777978569775				
Iteration 161 loss: 504.68989218272503			
Iteration 181 loss: 504.11917588006736			
Iteration 200 loss: 519.1583384143808				Start Episode 2.
Episode finished after 100 timesteps
Fit the model.
Optimize the policy.
Iteration 21 loss: 593.3852109170084			
Iteration 41 loss: 528.6397783987336			
Iteration 61 loss: 516.3607802665642				
Iteration 81 loss: 523.1279481139561				
Iteration 101 loss: 523.8419760089201				
Iteration 121 loss: 497.519062525686					
Iteration 141 loss: 522.4229016396534				
Iteration 161 loss: 532.2371318504001				
Iteration 181 loss: 517.7914075009136				
Iteration 200 loss: 511.58524523442196			Start Episode 3.
Episode finished after 100 timesteps
Fit the mod

Iteration 61 loss: 373.22584182202974			
Iteration 81 loss: 392.3060803029719				
Iteration 101 loss: 392.40483055824006			
Iteration 121 loss: 384.61515586753956			
Iteration 141 loss: 379.0871681105746				
Iteration 161 loss: 379.60583356004054			
Iteration 181 loss: 391.2478367081629				
Iteration 200 loss: 382.33663434833636			Start Episode 19.
Episode finished after 100 timesteps
Fit the model.
Optimize the policy.
Iteration 21 loss: 373.0286417781532				
Iteration 41 loss: 392.15123278227964			
Iteration 61 loss: 363.0639431335255				
Iteration 81 loss: 386.67682491159917			
Iteration 101 loss: 385.63270967029786			
Iteration 121 loss: 363.7373839139558				
Iteration 141 loss: 368.44148917847843			
Iteration 161 loss: 380.04465544565574			
Iteration 181 loss: 378.6802268029749				
Iteration 200 loss: 384.15783008299593			Start Episode 20.
Episode finished after 100 timesteps
Fit the model.
Optimize the policy.
Iteration 21 loss: 401.5071707921794				
Iteration 41 loss: 371.363084

In [21]:
# Continue training with more episodes and more gradient steps per episode.
# This cell does not reset `policy`, `all_states` or `all_actons`: it keeps
# improving the existing policy and appends to the data already collected.
# `num_samples` and `cost` are reused from the earlier cell.
num_episode = 80
num_grad_steps = 300
learning_rate = 1e-3

for i_ep in range(num_episode):
    print('Start Episode '+str(i_ep+1)+'.')
    
    # Run an episode and collect data.
    # NOTE(review): the first episode of this cell uses random_policy again,
    # even though a trained policy already exists — confirm this is intended.
    if i_ep==0:
        policy_func = random_policy
    else:
        policy_func = lambda x: policy(mx.nd.expand_dims(mx.nd.array(x, dtype='float64'), axis=0)).asnumpy()[0]
    total_reward, states, actions = run_one_episode(env, policy_func, max_steps=100, render=False)
    all_states.append(states)
    all_actons.append(actions)
    
    # Fit a model.
    print('Fit the model.')
    model, infr, model_data_X, model_data_Y = fit_model(all_states, all_actons, win_in=1, verbose=False)
    
    # NOTE(review): `initial_state` is not used below.
    initial_state = mx.nd.array(env.observation_space.sample()[None,:], dtype='float64')
    
    # Optimize the policy.
    # `momentum` is not passed here, so optimize_policy uses its default.
    print('Optimize the policy.')
    policy = optimize_policy(policy, cost, model,
                             infr, model_data_X, model_data_Y, initial_state_generator,
                             num_grad_steps=num_grad_steps, 
                             num_samples=num_samples,
                             num_time_steps=100, 
                             learning_rate=learning_rate)

Start Episode 1.
Episode finished after 100 timesteps
Fit the model.




Optimize the policy.
Iteration 31 loss: 356.0213495276767				
Iteration 61 loss: 379.15706050894437			
Iteration 91 loss: 363.61406229656933			
Iteration 121 loss: 352.13767195440346			
Iteration 151 loss: 370.79124480092946			
Iteration 181 loss: 415.0024992566608				
Iteration 211 loss: 413.6583535637743				
Iteration 241 loss: 406.28727699498666			
Iteration 271 loss: 410.9122680891474				
Iteration 300 loss: 378.010685977479					Start Episode 2.
Episode finished after 100 timesteps
Fit the model.
Optimize the policy.
Iteration 31 loss: 345.6609988735587				
Iteration 61 loss: 349.039043436732					
Iteration 91 loss: 360.4454243410408				
Iteration 121 loss: 367.57647887746566			
Iteration 151 loss: 372.07194702553863			
Iteration 181 loss: 383.50313947458477			
Iteration 211 loss: 383.63227154648325			
Iteration 241 loss: 364.76345716714553			
Iteration 271 loss: 363.07280719234683			
Iteration 300 loss: 453.95798694002605			Start Episode 3.
Episode finished after 100 timesteps
Fit

Iteration 61 loss: 256.85675727910234			
Iteration 91 loss: 249.0897219283795				
Iteration 121 loss: 224.5829090086954				
Iteration 151 loss: 236.14832824887674			
Iteration 181 loss: 238.9553213178432				
Iteration 211 loss: 226.30619723812416			
Iteration 241 loss: 250.9770340157791				
Iteration 271 loss: 244.0396285319947				
Iteration 300 loss: 216.91090168350945			Start Episode 18.
Episode finished after 100 timesteps
Fit the model.
Optimize the policy.
Iteration 31 loss: 251.49052309814132			
Iteration 61 loss: 225.30662012047776			
Iteration 91 loss: 227.73380927316222			
Iteration 121 loss: 237.802444040973					
Iteration 151 loss: 235.42280987771267			
Iteration 181 loss: 242.69602121820637			
Iteration 211 loss: 240.49280109919476			
Iteration 241 loss: 241.20278706383618			
Iteration 271 loss: 241.49643921450985			
Iteration 300 loss: 233.74718626864924			Start Episode 19.
Episode finished after 100 timesteps
Fit the model.
Optimize the policy.
Iteration 31 loss: 238.0755

Iteration 91 loss: 203.66019948780374			
Iteration 121 loss: 209.918215956522					
Iteration 151 loss: 195.33337510095177			
Iteration 181 loss: 192.8004666991584				
Iteration 211 loss: 195.20024380927876			
Iteration 241 loss: 233.14353522090494			
Iteration 271 loss: 242.49638087634466			
Iteration 300 loss: 212.64003976670264			Start Episode 34.
Episode finished after 100 timesteps
Fit the model.
Optimize the policy.
Iteration 31 loss: 202.4095892113306				
Iteration 61 loss: 272.61913782698184			
Iteration 91 loss: 233.0372518603656				
Iteration 121 loss: 248.4610021493088				
Iteration 151 loss: 265.81209024375266			
Iteration 181 loss: 243.30267284194807			
Iteration 211 loss: 237.41829717654025			
Iteration 241 loss: 226.16852793673806			
Iteration 271 loss: 219.4715216125207				
Iteration 300 loss: 215.87993585030065			Start Episode 35.
Episode finished after 100 timesteps
Fit the model.
Optimize the policy.
Iteration 31 loss: 220.58012038645077			
Iteration 61 loss: 177.3211

Iteration 121 loss: 203.3641736845871				
Iteration 151 loss: 185.34516471942118			
Iteration 181 loss: 177.67662009835865			
Iteration 211 loss: 215.97754922885187			
Iteration 241 loss: 187.72948988571252			
Iteration 271 loss: 192.8662813803843				
Iteration 300 loss: 193.72889275668288			Start Episode 50.
Episode finished after 100 timesteps
Fit the model.
Optimize the policy.
Iteration 31 loss: 186.80289003049268			
Iteration 61 loss: 191.23497782605745			
Iteration 91 loss: 181.95053777891647			
Iteration 121 loss: 179.93607684169558			
Iteration 151 loss: 208.7210067075965				
Iteration 181 loss: 215.56135388480587			
Iteration 211 loss: 215.93124347871526			
Iteration 241 loss: 207.39686648531872			
Iteration 271 loss: 208.1211725915152				
Iteration 300 loss: 173.1416898283395				Start Episode 51.
Episode finished after 100 timesteps
Fit the model.
Optimize the policy.
Iteration 31 loss: 183.84484280564564			
Iteration 61 loss: 185.07336583175905			
Iteration 91 loss: 182.1328

MXNetError: [12:04:46] src/storage/./pooled_storage_manager.h:119: cudaMalloc failed: out of memory

Stack trace returned 10 entries:
[bt] (0) /home/ubuntu/anaconda3/envs/mxnet_p36/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x36161a) [0x7f43a86f561a]
[bt] (1) /home/ubuntu/anaconda3/envs/mxnet_p36/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x361c31) [0x7f43a86f5c31]
[bt] (2) /home/ubuntu/anaconda3/envs/mxnet_p36/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x310838b) [0x7f43ab49c38b]
[bt] (3) /home/ubuntu/anaconda3/envs/mxnet_p36/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x310d115) [0x7f43ab4a1115]
[bt] (4) /home/ubuntu/anaconda3/envs/mxnet_p36/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x2aea416) [0x7f43aae7e416]
[bt] (5) /home/ubuntu/anaconda3/envs/mxnet_p36/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x2aea6c7) [0x7f43aae7e6c7]
[bt] (6) /home/ubuntu/anaconda3/envs/mxnet_p36/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x2aeaae7) [0x7f43aae7eae7]
[bt] (7) /home/ubuntu/anaconda3/envs/mxnet_p36/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x2a42ad9) [0x7f43aadd6ad9]
[bt] (8) /home/ubuntu/anaconda3/envs/mxnet_p36/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x2a46acb) [0x7f43aaddaacb]
[bt] (9) /home/ubuntu/anaconda3/envs/mxnet_p36/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x2a454bf) [0x7f43aadd94bf]



In [16]:
# Inspect the most recently fitted model, its inference object and the training arrays.
model, infr, model_data_X, model_data_Y

(Model (62ae5)
 Variable Y (a23e2) ~ GPRegression(X=Variable X (051b1), noise_var=Variable noise_var (985ca)),
 <mxfusion.inference.grad_based_inference.GradBasedInference at 0x7f9826a60eb8>,
 array([[-0.54760231, -0.83673873, -0.01615798,  0.86967763],
        [-0.56889289, -0.82241163, -0.51326039,  0.47127194],
        [-0.61163688, -0.79113862, -1.05937832,  0.17301347],
        ...,
        [-0.89800275,  0.43998985,  0.56750647, -2.        ],
        [-0.91074475,  0.41296972,  0.59749885, -2.        ],
        [-0.92286139,  0.38513225,  0.60722615, -2.        ]]),
 array([[-0.56889289, -0.82241163, -0.51326039],
        [-0.61163688, -0.79113862, -1.05937832],
        [-0.67389419, -0.73882787, -1.62678027],
        ...,
        [-0.91074475,  0.41296972,  0.59749885],
        [-0.92286139,  0.38513225,  0.60722615],
        [-0.93392824,  0.35746054,  0.59607533]]))

In [21]:
# Column-wise maximum of the training targets.
model_data_Y.max(0)

array([-0.36840844,  0.01382462])

In [16]:
from mxfusion.modules.gp_modules.gp_regression import GPRegressionMeanVariancePrediction
# Attach a mean/variance predictor to the fitted GP module under the name
# 'gp_predict'. fit_model registered a sampling predictor under the same name,
# so this presumably replaces it — confirm against attach_prediction_algorithms.
gp = model.Y.factor
gp.attach_prediction_algorithms(targets=gp.output_names, conditionals=gp.input_names,
            algorithm=GPRegressionMeanVariancePrediction(
                gp._module_graph, gp._extra_graphs[0], [gp._module_graph.X]),
            alg_name='gp_predict')

from mxfusion.inference import TransferInference, ModulePredictionAlgorithm
# Prediction-time inference: only X is observed, Y is the target, and the
# parameters come from the fitting inference `infr`.
infr_pred = TransferInference(ModulePredictionAlgorithm(model=model, observed=[model.X], target_variables=[model.Y]),
                              infr_params=infr.params)

In [17]:
# Predict at the training inputs using the transferred parameters.
res = infr_pred.run(X=mx.nd.array(model_data_X, dtype='float64'))[0]
# NOTE(review): res[0] / res[1] are treated as predictive mean and variance;
# the trailing [0] presumably drops a leading sample/batch axis — confirm.
f_mean, f_var = res[0].asnumpy()[0], res[1].asnumpy()[0]



In [22]:
# Largest absolute difference between the predicted mean and the training targets.
np.abs(f_mean - model_data_Y).max()

0.00031203284975017676

In [17]:
# Shape of the training inputs: (number of rows, input dimension).
model_data_X.shape

(796, 3)

In [31]:
# Shape of the training inputs: (number of rows, input dimension).
model_data_X.shape

(796, 3)

In [40]:
def run_one_episode(env, policy, max_steps=None, verbose=False, render=False):
    """Roll out ``policy`` in ``env`` for one episode and record the trajectory.

    This redefinition adds model diagnostics to verbose mode: besides printing
    ``(observation, action)``, it runs the module-level ``infr_pred`` on the
    concatenated observation/action and prints the predicted mean and variance.
    Verbose mode therefore needs the globals ``infr_pred`` and ``mx`` to exist.

    Args:
        env: object exposing ``reset()``, ``step(action)`` returning
            ``(observation, reward, done, info)`` and, when ``render`` is set,
            ``render(mode='rgb_array')``.
        policy: callable mapping an observation to an action.
        max_steps: maximum number of environment steps to take; ``None`` means
            run until the environment reports ``done``.
        verbose: print per-step diagnostics (see above).
        render: collect a frame before each step and build an animation.

    Returns:
        ``(total_reward, observations, actions)`` where ``observations`` holds
        the initial observation plus one row per step and ``actions`` holds one
        row per step, both as float64 arrays. With ``render=True`` the
        animation returned by ``display_frames_as_gif`` is appended as a fourth
        element.
    """
    observation = env.reset()
    step_idx = 0
    done = False
    total_reward = 0
    all_observations = [np.array(observation)]
    all_actions = []
    frames = []
    while not done:
        if render:
            frames.append(env.render(mode='rgb_array'))
        action = policy(observation)
        if verbose:
            print(observation, action)
            # One-row model input: observation followed by action.
            res = infr_pred.run(X=mx.nd.array(np.hstack([observation, action])[None,:], dtype='float64'))[0]
            f_mean, f_var = res[0].asnumpy()[0], res[1].asnumpy()[0]
            print(f_mean, f_var)

        observation, reward, done, info = env.step(action)
        all_observations.append(observation)
        all_actions.append(action)
        total_reward += reward
        step_idx += 1
        # step_idx already counts the step just taken, so it is compared with
        # max_steps directly and reported as-is (previously the cap was
        # max_steps-1 and the message printed step_idx+1).
        if done or (max_steps is not None and step_idx >= max_steps):
            print("Episode finished after {} timesteps".format(step_idx))
            break

    observations = np.array(all_observations, dtype=np.float64)
    actions = np.array(all_actions, dtype=np.float64)
    if render:
        anim = display_frames_as_gif(frames)
        return total_reward, observations, actions, anim
    return total_reward, observations, actions


In [41]:
def policy_func(x):
    """Evaluate the global `policy` on one observation and return the action as a numpy array."""
    batch = mx.nd.expand_dims(mx.nd.array(x, dtype='float64'), axis=0)
    return policy(batch).asnumpy()[0]

# Verbose rollout: prints each observation/action together with the model's prediction.
run_one_episode(env, policy_func, max_steps=500, render=False, verbose=True)

[-0.55135563  0.        ] [-0.51190813]
[[-0.55191995 -0.00056136]] [3.22222293e-09]
[-0.55191556 -0.00055993] [-0.51195038]
[[-0.55303615 -0.00111745]] [3.18390203e-09]




[-0.55303129 -0.00111573] [-0.51194175]
[[-0.55470021 -0.00166541]] [3.15457305e-09]
[-0.55469447 -0.00166319] [-0.51188229]
[[-0.55689963 -0.00220107]] [3.13443049e-09]
[-0.55689261 -0.00219813] [-0.51177243]
[[-0.55961781 -0.00272036]] [3.12328208e-09]
[-0.55960911 -0.0027165 ] [-0.51161298]
[[-0.56283423 -0.00321933]] [3.12058024e-09]
[-0.56282347 -0.00321437] [-0.5114051]
[[-0.56652459 -0.00369421]] [3.12548298e-09]
[-0.56651144 -0.00368797] [-0.51115032]
[[-0.57066104 -0.00414141]] [3.13692494e-09]
[-0.57064518 -0.00413374] [-0.51085051]
[[-0.57521236 -0.00455758]] [3.15369597e-09]
[-0.57519351 -0.00454834] [-0.51050789]
[[-0.58014425 -0.0049396 ]] [3.17452287e-09]
[-0.58012219 -0.00492868] [-0.51012498]
[[-0.58541959 -0.00528465]] [3.19813864e-09]
[-0.58539416 -0.00527197] [-0.50970462]
[[-0.59099878 -0.00559019]] [3.22334159e-09]
[-0.59096988 -0.00557571] [-0.5092499]
[[-0.59684002 -0.00585403]] [3.24903571e-09]
[-0.59680761 -0.00583774] [-0.50876418]
[[-0.60289971 -0.00607429]]

[-0.69349316 -0.00350473] [-0.49954865]
[[-0.69656836 -0.00305436]] [3.33627215e-09]
[-0.69652746 -0.0030343 ] [-0.49919059]
[[-0.69910988 -0.00256256]] [3.33275896e-09]
[-0.69907098 -0.00254352] [-0.49887405]
[[-0.70114382 -0.00205387]] [3.33361072e-09]
[-0.70110672 -0.00203574] [-0.4986012]
[[-0.70265669 -0.00153175]] [3.33963612e-09]
[-0.70262108 -0.00151437] [-0.49837391]
[[-0.70363844 -0.00099971]] [3.35147066e-09]
[-0.70360397 -0.00098288] [-0.49819375]
[[-7.04082526e-01 -4.61286739e-04]] [3.36954153e-09]
[-7.04048763e-01 -4.44798394e-04] [-0.49806195]
[[-7.03985917e-01  7.99679398e-05]] [3.3940406e-09]
[-7.03952421e-01  9.63421736e-05] [-0.49797942]
[[-7.03349125e-01  6.20492994e-04]] [3.42490791e-09]
[-7.03315434e-01  6.36987243e-04] [-0.49794671]
[[-0.70217619  0.00115674]] [3.46182771e-09]
[-0.70214185  0.00117358] [-0.49796408]
[[-0.70047467  0.00168519]] [3.50423912e-09]
[-0.70043926  0.00170259] [-0.49803141]
[[-0.69825562  0.00220235]] [3.55135654e-09]
[-0.69821875  0.002

[[-0.64562218  0.00630842]] [4.08982803e-09]
[-0.64556804  0.00633513] [-0.50228643]
[[-0.6391455   0.00644977]] [4.11043288e-09]
[-0.63909185  0.00647619] [-0.50285512]
[[-0.6325736   0.00654501]] [4.12494572e-09]
[-0.63252097  0.00657088] [-0.5034398]
[[-0.62595387  0.00659314]] [4.13337942e-09]
[-0.62590279  0.00661818] [-0.50403628]
[[-0.61933441  0.00659345]] [4.13575352e-09]
[-0.61928537  0.00661741] [-0.50464026]
[[-0.61276363  0.00654562]] [4.13205514e-09]
[-0.61271709  0.00656829] [-0.50524736]
[[-0.60628989  0.0064497 ]] [4.1222088e-09]
[-0.60624622  0.00647087] [-0.50585316]
[[-0.59996108  0.00630609]] [4.1060737e-09]
[-0.5999206   0.00632561] [-0.50645323]
[[-0.59382429  0.0061156 ]] [4.08345491e-09]
[-0.59378724  0.00613336] [-0.50704318]
[[-0.58792538  0.00587941]] [4.05414768e-09]
[-0.58789191  0.00589533] [-0.50761866]
[[-0.58230861  0.00559907]] [4.01798772e-09]
[-0.5822788   0.00561311] [-0.50817543]
[[-0.57701627  0.0052765 ]] [3.97492173e-09]
[-0.57699012  0.0052886

[[-0.69949708 -0.00297394]] [3.39946338e-09]
[-0.69945833 -0.00295495] [-0.49890786]
[[-0.70193967 -0.00246263]] [3.3977523e-09]
[-0.70190303 -0.0024447 ] [-0.498597]
[[-0.70385601 -0.00193518]] [3.4007126e-09]
[-0.70382121 -0.00191818] [-0.4983317]
[[-0.70523342 -0.00139516]] [3.40913564e-09]
[-0.70520013 -0.00137891] [-0.49811377]
[[-0.7060628  -0.00084618]] [3.42362094e-09]
[-0.70603058 -0.00083046] [-0.49794471]
[[-7.06338601e-01 -2.91856362e-04]] [3.44453888e-09]
[-7.06307005e-01 -2.76423711e-04] [-0.49782567]
[[-7.06058916e-01  2.64165003e-04]] [3.47200579e-09]
[-7.06027448e-01  2.79556688e-04] [-0.49775747]
[[-0.70522544  0.00081825]] [3.50587381e-09]
[-0.7051936   0.00083385] [-0.49774057]
[[-0.70384347  0.00136677]] [3.54572682e-09]
[-0.70381078  0.00138282] [-0.49777511]
[[-0.7019219   0.00190613]] [3.59090047e-09]
[-0.70188791  0.00192286] [-0.49786088]
[[-0.69947321  0.00243278]] [3.64050079e-09]
[-0.69943752  0.00245039] [-0.49799729]
[[-0.69651335  0.00294323]] [3.6934486

[[-0.58113956 -0.00546357]] [3.24481153e-09]
[-0.58111729 -0.00545243] [-0.51012192]
[[-0.58693163 -0.00580143]] [3.27471206e-09]
[-0.58690566 -0.00578836] [-0.50965355]
[[-0.59301629 -0.0060959 ]] [3.30559136e-09]
[-0.59298654 -0.00608089] [-0.50915114]
[[-0.599348  -0.0063449]] [3.33635386e-09]
[-0.5993145  -0.00632796] [-0.50861839]
[[-0.60587954 -0.0065467 ]] [3.36607053e-09]
[-0.60584239 -0.00652789] [-0.50805925]
[[-0.61256239 -0.00669996]] [3.3939731e-09]
[-0.61252178 -0.00667939] [-0.50747784]
[[-0.61934714 -0.00680374]] [3.41944029e-09]
[-0.61930334 -0.00678156] [-0.50687844]
[[-0.62618389 -0.00685751]] [3.4419787e-09]
[-0.62613726 -0.00683392] [-0.50626547]
[[-0.63302264 -0.00686113]] [3.4612051e-09]
[-0.63297361 -0.00683635] [-0.50564341]
[[-0.6398137  -0.00681485]] [3.47683571e-09]
[-0.63976276 -0.00678915] [-0.50501682]
[[-0.64650804 -0.00671932]] [3.48868401e-09]
[-0.64645572 -0.00669297] [-0.50439028]
[[-0.65305769 -0.00657555]] [3.49666784e-09]
[-0.65300457 -0.00654885]

(-12.717820205247302, array([[-5.51355630e-01,  0.00000000e+00],
        [-5.51915557e-01, -5.59926290e-04],
        [-5.53031288e-01, -1.11573134e-03],
        [-5.54694475e-01, -1.66318684e-03],
        [-5.56892605e-01, -2.19813065e-03],
        [-5.59609106e-01, -2.71650055e-03],
        [-5.62823474e-01, -3.21436757e-03],
        [-5.66511442e-01, -3.68796846e-03],
        [-5.70645179e-01, -4.13373722e-03],
        [-5.75193514e-01, -4.54833527e-03],
        [-5.80122194e-01, -4.92867986e-03],
        [-5.85394165e-01, -5.27197038e-03],
        [-5.90969877e-01, -5.57571206e-03],
        [-5.96807614e-01, -5.83773674e-03],
        [-6.02863834e-01, -6.05622020e-03],
        [-6.09093529e-01, -6.22969574e-03],
        [-6.15450593e-01, -6.35706371e-03],
        [-6.21888190e-01, -6.43759686e-03],
        [-6.28359131e-01, -6.47094127e-03],
        [-6.34816244e-01, -6.45711304e-03],
        [-6.41212735e-01, -6.39649075e-03],
        [-6.47502539e-01, -6.28980410e-03],
        [-6

In [22]:
# Persist the current policy parameters to a file in the working directory.
policy.collect_params().save('./pendulum_policy.params')

In [18]:
%pwd

'/home/ubuntu/MXFusion/examples/notebooks'

1. Update the optimize policy function.
2. multiple initial states.
3. use the real reward function. (https://github.com/openai/gym/blob/master/gym/envs/classic_control/continuous_mountain_car.py)
4. visualize the intermediate and final performance of policy.
5. Make a notebook to show.
6. Try a non-linear policy.

In [40]:
# Render the episode animation inline.
# NOTE(review): `anim` is not assigned in any visible cell; it is the fourth
# value returned by run_one_episode(..., render=True) — assign it before running this cell.
HTML(anim.to_jshtml())

In [16]:
from pyglet.gl import *

NameError: name 'base' is not defined