In [None]:
import os
os.environ['MXNET_ENGINE_TYPE'] = 'NaiveEngine'
from mxfusion.common import config
config.DEFAULT_DTYPE = 'float64'

In [None]:
import gym
env = gym.make('Pendulum-v0')

In [None]:
action_dim = 1
env.action_space

In [None]:
obs_dim = 3
env.observation_space

In [None]:
import numpy as np

def run_one_episode(env, policy, initial_state=None, max_steps=10000, verbose=False, render=False):
    """Roll out ``policy`` in ``env`` for one episode and record the trajectory.

    Args:
        env: environment exposing ``reset()``, ``step(action)`` and ``render()``;
            when ``initial_state`` is given it must also expose ``env.env.state``
            and ``env.env._get_obs()``.
        policy: callable mapping an observation to an action.
        initial_state: optional simulator state written to ``env.env.state``
            right after the reset.
        max_steps: step budget; also stored on ``env._max_episode_steps``.
            The loop stops once ``max_steps - 1`` steps have been taken
            (at least one step is always taken).
        verbose: print the current observation before each action.
        render: call ``env.render()`` before each action.

    Returns:
        ``(total_reward, observations, actions)`` where the last two are float64
        arrays; ``observations`` has one more row than ``actions`` because it
        includes the initial observation.
    """
    obs = env.reset()
    if initial_state is not None:
        # Overwrite the freshly reset simulator state and re-read the observation.
        inner_env = env.env
        inner_env.state = initial_state
        obs = inner_env._get_obs()
    env._max_episode_steps = max_steps

    observations = [obs]
    actions = []
    total_reward = 0
    steps_taken = 0
    while True:
        if render:
            env.render()
        if verbose:
            print(obs)
        act = policy(obs)
        obs, reward, done, _info = env.step(act)
        observations.append(obs)
        actions.append(act)
        total_reward += reward
        steps_taken += 1

        # Decide whether (and why) the episode ends after this step.
        if done:
            reason = "'done' reached"
        elif steps_taken >= max_steps-1:
            reason = "Max timesteps reached"
        else:
            continue
        print("Episode finished after {} timesteps because {}".format(steps_taken+1, reason))
        break

    obs_array = np.array(observations, dtype=np.float64)
    action_array = np.array(actions, dtype=np.float64)
    return total_reward, obs_array, action_array
#     return total_reward, all_observations, all_actions

In [None]:
env.action_space.sample().shape

In [None]:
def random_policy(state):
    """Ignore ``state`` and draw an action from the global ``env``'s action space."""
    sampled_action = env.action_space.sample()
    return sampled_action

def spin_policy(state):
    """Ignore ``state`` and return a shape-(1,) action drawn uniformly from [-2, 0)."""
    torque = np.random.uniform(low=-2, high=0.)
    return np.array([torque])

In [None]:
def prepare_data(state_list, action_list, win_in):
    """Turn recorded episodes into regression inputs and targets.

    For every episode and every window start ``t``, the input row is the
    flattened ``win_in`` consecutive states followed by the flattened
    ``win_in`` consecutive actions starting at ``t``; the target row is the
    state at ``t + win_in``.

    Args:
        state_list: list of per-episode state arrays (one row per time step).
        action_list: list of per-episode action arrays; each must have exactly
            one row fewer than the matching state array.
        win_in: number of consecutive time steps packed into one input row.

    Returns:
        ``(X, Y)``: vertically stacked input rows and target rows.
    """
    inputs, targets = [], []
    for states, actions in zip(state_list, action_list):
        n_states = states.shape[0]
        # One action per transition, hence one fewer action than states.
        assert n_states-1 == actions.shape[0]

        for start in range(n_states - win_in):
            stop = start + win_in
            window = np.concatenate([states[start:stop].ravel(),
                                     actions[start:stop].ravel()])
            inputs.append(window)
            targets.append(states[stop:stop+1])
    return np.vstack(inputs), np.vstack(targets)

In [None]:
from mxfusion import Model, Variable
from mxfusion.components.variables import PositiveTransformation
from mxfusion.components.distributions.gp.kernels import RBF
from mxfusion.modules.gp_modules import SparseGPRegression, SparseGPRegressionSamplingPrediction
from mxfusion.modules.gp_modules import GPRegression, GPRegressionSamplingPrediction
import mxnet as mx
from mxfusion.inference import GradBasedInference, MAP

    
def fit_model_synthetic(input_list, output_list, win_in, verbose=True):
    """Fit a GP regression model on ready-made input/output arrays.

    Builds an MXFusion model with an ARD RBF kernel over the input columns and
    a learnable positive noise variance, attaches a sampling-based prediction
    algorithm under the name ``'gp_predict'``, and fits it by MAP with 100
    gradient iterations at learning rate 0.5.

    Args:
        input_list: 2-D array of regression inputs (rows are samples).
        output_list: 2-D array of regression targets.
        win_in: unused here; kept so the signature mirrors ``fit_model``.
        verbose: forwarded to the inference ``run`` call.

    Returns:
        ``(model, inference, X, Y)`` where ``X``/``Y`` are the arrays passed in.
    """
    X, Y = input_list, output_list
    n_inputs = X.shape[-1]
    n_outputs = Y.shape[-1]

    gp_model = Model()
    gp_model.N = Variable()
    gp_model.X = Variable(shape=(gp_model.N, n_inputs))
    gp_model.noise_var = Variable(shape=(1,), transformation=PositiveTransformation(),
                                  initial_value=0.01)
    gp_model.kernel = RBF(input_dim=n_inputs, variance=1, lengthscale=1, ARD=True)
    gp_model.Y = GPRegression.define_variable(X=gp_model.X, kernel=gp_model.kernel,
                                              noise_var=gp_model.noise_var,
                                              shape=(gp_model.N, n_outputs))
    gp_model.Y.factor.gp_log_pdf.jitter = 1e-6

    # Swap in a sampling-based predictor so predictions can be drawn as samples.
    gp = gp_model.Y.factor
    sampler = GPRegressionSamplingPrediction(
        gp._module_graph, gp._extra_graphs[0], [gp._module_graph.X])
    gp.attach_prediction_algorithms(targets=gp.output_names,
                                    conditionals=gp.input_names,
                                    algorithm=sampler,
                                    alg_name='gp_predict')

    map_algorithm = MAP(model=gp_model, observed=[gp_model.X, gp_model.Y])
    infr = GradBasedInference(inference_algorithm=map_algorithm)
    infr.run(X=mx.nd.array(X, dtype='float64'), Y=mx.nd.array(Y, dtype='float64'),
             max_iter=100, learning_rate=0.5, verbose=verbose)
    return gp_model, infr, X, Y
    
def fit_model(state_list, action_list, win_in, verbose=False):
    """Fit the GP dynamics model on recorded episodes.

    The episodes are converted to (input, target) rows with ``prepare_data``
    and the actual model construction and MAP fit are delegated to
    ``fit_model_synthetic``, so both entry points always build the same model.

    Args:
        state_list: list of per-episode state arrays.
        action_list: list of per-episode action arrays (one row fewer than the
            matching state array).
        win_in: number of consecutive time steps packed into one input row.
        verbose: forwarded to the inference ``run`` call.

    Returns:
        ``(model, inference, X, Y)`` where ``X``/``Y`` are the training arrays
        produced by ``prepare_data``.
    """
    X, Y = prepare_data(state_list, action_list, win_in)
    return fit_model_synthetic(X, Y, win_in, verbose=verbose)

In [None]:
all_states = []
all_actions = []

In [None]:
total_reward, states, actions = run_one_episode(env, spin_policy, max_steps=200, render=True)
all_states.append(states)
all_actions.append(actions)

In [None]:
states.shape

In [None]:
actions.shape

In [None]:
model, infr, model_train_X, model_train_Y = fit_model(all_states, all_actions, win_in=1)

In [None]:
from mxfusion.inference import TransferInference, PILCOAlgorithm, BatchInferenceLoop
from mxnet.gluon import Block
from mxnet.gluon.parameter import ParameterDict

def run_policy(policy, cost_func, model, infr, model_data_X, model_data_Y,
                    initial_state_generator, num_grad_steps,
                    learning_rate=1e-2, num_time_steps=100, 
                    num_samples=10, verbose=True):
    """Evaluate ``policy`` under the learned model without updating it.

    Builds a ``PolicyUpdateGPParametricApprox`` algorithm on ``model`` and runs
    it through ``TransferInference`` using the parameters already held by
    ``infr``. No trainable parameters are passed, so the policy itself is left
    untouched.

    Args:
        policy: policy block/callable handed to the algorithm.
        cost_func: cost block handed to the algorithm.
        model: fitted model exposing ``X`` and ``Y`` variables.
        infr: inference object whose ``params`` are reused.
        model_data_X, model_data_Y: training data the model was fitted on.
        initial_state_generator: callable producing initial states.
        num_grad_steps: forwarded as ``max_iter``.
        learning_rate: forwarded to ``run``.
        num_time_steps: rollout horizon (``n_time_steps``).
        num_samples: number of samples used by the algorithm.
        verbose: forwarded to ``run``.

    Returns:
        Whatever ``TransferInference.run`` returns for this algorithm.
    """
    from mxfusion.inference.pilco_alg import PolicyUpdateGPParametricApprox
    mb_alg = PolicyUpdateGPParametricApprox(model=model, 
                                 observed=[model.X, model.Y], 
                                 cost_function=cost_func, 
                                 policy=policy,
                                 n_time_steps=num_time_steps,
                                 initial_state_generator=initial_state_generator,
                                 num_samples=num_samples)

    infr_pred = TransferInference(mb_alg, 
                                  infr_params=infr.params)
    rewards = infr_pred.run(max_iter=num_grad_steps, 
                  X=mx.nd.array(model_data_X, dtype='float64'),
                  Y=mx.nd.array(model_data_Y, dtype='float64'),
                  verbose=verbose,
                  learning_rate=learning_rate)
    return rewards

In [None]:
from mxfusion.inference import GradTransferInference, PILCOAlgorithm, BatchInferenceLoop, TransferInference
from mxnet.gluon import Block
from mxnet.gluon.parameter import ParameterDict

def optimize_policy(policy, cost_func, model, infr, model_data_X, model_data_Y,
                    initial_state_generator, num_grad_steps,
                    learning_rate=1e-2, momentum=0, num_time_steps=100, 
                    num_samples=10, verbose=True):
    """Run gradient steps on the policy parameters under the learned model.

    A ``PolicyUpdateGPParametricApprox`` algorithm is wrapped in
    ``GradTransferInference``; model parameters come from ``infr`` and the
    trainable set is the policy's own parameters when the policy is a Gluon
    ``Block`` (an empty ``ParameterDict`` otherwise).

    Args:
        policy: policy to optimize.
        cost_func: cost block handed to the algorithm.
        model: fitted model exposing ``X`` and ``Y`` variables.
        infr: inference object whose ``params`` are reused.
        model_data_X, model_data_Y: training data the model was fitted on.
        initial_state_generator: callable producing initial states.
        num_grad_steps: forwarded as ``max_iter``.
        learning_rate, momentum: forwarded to ``run``.
        num_time_steps: rollout horizon (``n_time_steps``).
        num_samples: number of samples used by the algorithm.
        verbose: forwarded to ``run``.

    Returns:
        The same ``policy`` object that was passed in.
    """
    from mxfusion.inference.pilco_alg import PolicyUpdateGPParametricApprox

    policy_update = PolicyUpdateGPParametricApprox(
        model=model,
        observed=[model.X, model.Y],
        cost_function=cost_func,
        policy=policy,
        n_time_steps=num_time_steps,
        initial_state_generator=initial_state_generator,
        num_samples=num_samples,
    )

    # Only Gluon blocks carry parameters that can be trained.
    if isinstance(policy, Block):
        trainable = policy.collect_params()
    else:
        trainable = ParameterDict()

    grad_infr = GradTransferInference(policy_update,
                                      infr_params=infr.params,
                                      train_params=trainable)
    data_x = mx.nd.array(model_data_X, dtype='float64')
    data_y = mx.nd.array(model_data_Y, dtype='float64')
    grad_infr.run(max_iter=num_grad_steps,
                  X=data_x,
                  Y=data_y,
                  verbose=verbose,
                  learning_rate=learning_rate, momentum=momentum)
    return policy

In [None]:
class MountainCarCostFunction(mx.gluon.HybridBlock):
    """Quadratic cost on the distance of the first state feature from 0.45.

    ``action`` is accepted but not used. Presumably feature 0 is the car
    position and 0.45 the goal position; confirm against the environment.
    """
    def hybrid_forward(self, F, state, action):
        offset = state[:,:,0:1] - 0.45
        return F.sum(10 * offset**2, axis=-1)


def angle_normalize(x):
    """Wrap an angle (in radians) into the interval [-pi, pi)."""
    full_turn = 2*np.pi
    shifted = x + np.pi
    # The `%` operator is kept (rather than np.mod) so non-NumPy array types still work.
    wrapped = shifted % full_turn
    return wrapped - np.pi

class TruePendulumCostFunction(mx.gluon.HybridBlock):
    """
    Pendulum objective adapted from the gym environment code.

    The angle is recovered from the first state feature with ``arccos`` and
    combined with the third state feature (angular velocity) and the action:
    ``angle**2 + 0.1 * velocity**2 + 0.001 * action**2``.

    Note that the returned value is the *negative* of that sum (i.e. it has
    the sign of a reward), unlike the other cost blocks in this notebook.
    """
    def hybrid_forward(self, F, state, action):
        cos_th, thdot = state[:,:,0], state[:,:,2]
        # Use the backend handle `F` passed by Gluon instead of hard-coding
        # `mx.nd`, consistent with the sibling cost blocks.
        th = F.arccos(cos_th)
        # Give the action the same shape as the angle so the terms add element-wise.
        tmp_a = F.reshape_like(action, th)
        return  - (angle_normalize(th)**2 + .1*thdot**2 + .001*(tmp_a**2))

    
class SimplePendulumCostFunction(mx.gluon.HybridBlock):
    """
    Simplified pendulum cost built from three weighted quadratic terms.

    * ``2 * (state[..., 0] - 1)**2``, the distance of the first state feature from 1
    * ``0.1 * state[..., 2]**2``, the third state feature
    * ``0.001 * action**2``, the action magnitude

    Each term is summed over the last axis before the three are added.
    """
    def hybrid_forward(self, F, state, action):
        angle_weight = 2.
        action_weight = .001
        velocity_weight = .1

        angle_term = F.sum(angle_weight * (state[:,:,0:1] -1) ** 2, axis=-1)
        action_term = F.sum(action_weight * action ** 2, axis=-1)
        velocity_term = F.sum(velocity_weight * state[:,:,2:3] ** 2, axis=-1)
        return (angle_term + velocity_term + action_term)
    
cost = SimplePendulumCostFunction()

In [None]:
# Per-column (min, max) of the most recently recorded `states` array.
bounds = [
    (np.min(states[:, col]), np.max(states[:, col]))
    for col in range(states.shape[1])
]

In [None]:
num_samples = 20

In [None]:
# class InitialStateGenerator:
#     def __init__(self, bounds, dtype='float64'):
#         self.bounds = bounds
#         self.dtype = dtype
#     def __call__(self, num_initial_states):
#         states = None
#         for mini, maxi in self.bounds:
#             b = mx.nd.random.uniform(low=mini, high=maxi, shape=(num_initial_states, 1), dtype=self.dtype)
#             if states is None:
#                 states = b
#             else:
#                 states = mx.nd.concat(states, b)
#         return states

In [None]:
def initial_state_generator(num_initial_states):
    """Sample random initial observations ``[cos(theta), sin(theta), theta_dot]``.

    ``theta`` is uniform on [0, 2*pi) and ``theta_dot`` uniform on [-8, 8);
    the result has one row per requested state and is float64.
    """
    column_shape = (num_initial_states, 1)
    theta = mx.nd.random.uniform(low=0., high=2*np.pi, shape=column_shape, dtype='float64')
    thdot = mx.nd.random.uniform(low=-8, high=8, shape=column_shape, dtype='float64')
    features = [mx.nd.cos(theta), mx.nd.sin(theta), thdot]
    return mx.nd.concat(*features)

In [None]:
# initial_state_generator = InitialStateGenerator([(-1,1),(-1,1),(-2,2)])

In [None]:
initial_state_generator(num_samples)

# Full PILCO

In [None]:
env.observation_space.sample()

In [None]:
from mxnet.gluon.nn import HybridSequential
from mxnet.gluon import HybridBlock
# Initial step

class MultiplyByTwo(HybridBlock):
    """Parameter-free block that doubles its input."""
    def hybrid_forward(self, F, X):
        doubled = X * 2
        return doubled


class EpsilonGreed(HybridBlock):
    """Epsilon-greedy exploration layer.

    On each forward call a single uniform number is drawn. If it is at least
    ``epsilon`` the input is passed through unchanged; otherwise the whole
    input is replaced by uniform noise in ``[bounds[0], bounds[1])`` with the
    input's shape and dtype.
    """
    def __init__(self, epsilon, bounds, **kwargs):
        super().__init__(**kwargs)
        self.bounds = bounds
        self.epsilon = epsilon

    def hybrid_forward(self, F, X):
        draw = F.random.uniform()
        # Greedy branch: keep the action proposed by the preceding layers.
        if draw >= self.epsilon:
            return X
        # Exploration branch: ignore the input and sample within the bounds.
        return F.random.uniform(low=self.bounds[0],
                                high=self.bounds[1],
                                shape=X.shape,
                                dtype=X.dtype)
    
# Policy network: observation -> ReLU hidden layer -> tanh output in (-1, 1)
# -> scaled by 2 -> epsilon-greedy exploration with bounds (-2, 2).
epsilon = 0.2
dense_units = 50

policy = HybridSequential()
policy_layers = [
    mx.gluon.nn.Dense(dense_units, in_units=obs_dim, dtype='float64', activation='relu'),
    mx.gluon.nn.Dense(1, in_units=dense_units, dtype='float64', activation='tanh'),
    MultiplyByTwo(),
    EpsilonGreed(epsilon, (-2,2)),
]
for layer in policy_layers:
    policy.add(layer)

policy.collect_params().initialize(mx.init.Xavier(magnitude=3.))

In [None]:
policy

In [None]:
policy.collect_params()

In [None]:
def sample_and_set_policy_parameters(policy):
    """Overwrite every policy parameter with fresh uniform samples from [-5, 5).

    Args:
        policy: Gluon block whose ``collect_params()`` are re-sampled in place.

    Returns:
        dict mapping each parameter name to the float64 NDArray that was
        written into it, so callers can record which parameters were used.
    """
    sampled = {}
    for name, p in policy.collect_params().items():
        values = mx.nd.random.uniform(low=-5, high=5, shape=p.shape, dtype='float64')
        p.set_data(values)
        sampled[name] = values
    return sampled

In [None]:
INITIAL_GYM_STATE = [np.pi, 1.]
INITIAL_STATE = [np.cos(INITIAL_GYM_STATE[0]), np.sin(INITIAL_GYM_STATE[0]), INITIAL_GYM_STATE[1]]

In [None]:
def static_state_generator(num_samples):
    """Return ``num_samples`` identical copies of ``INITIAL_STATE`` as a float64 NDArray."""
    repeated_rows = [INITIAL_STATE for _ in range(num_samples)]
    return mx.nd.array(repeated_rows, dtype='float64')

In [None]:
static_state_generator(1).shape

### Test Model fit on manually generated state,action pairs

In [None]:
def evaluate_pair(env, initial_state, action):
    """Force the simulator into ``initial_state`` and apply one ``action``.

    Args:
        env: wrapped environment; the state is written to ``env.env.state``.
        initial_state: indexable ``(theta, theta_dot)``.
        action: action passed to ``env.step``.

    Returns:
        ``(features, action, observation, reward)`` where ``features`` is
        ``[cos(theta), sin(theta), theta_dot]`` for the initial state.
    """
    env.env.state = initial_state
    theta = initial_state[0]
    theta_dot = initial_state[1]
    start_features = [np.cos(theta), np.sin(theta), theta_dot]
    next_obs, reward, _done, _info = env.step(action)
    return (start_features, action, next_obs, reward)

In [None]:
def sample_state_action_pair():
    """Draw a random 2-component state and a 1-component action as NumPy arrays.

    Both state components are uniform on [-2*pi, 2*pi); the action is uniform
    on [-2, 2).
    """
    state_batch = mx.nd.random.uniform(low=-2*np.pi, high=2*np.pi, shape=(1,2), dtype='float64')
    action_nd = mx.nd.random.uniform(low=-2, high=2, shape=(1,), dtype='float64')
    state_np = state_batch.asnumpy()[0]
    action_np = action_nd.asnumpy()
    return state_np, action_np

In [None]:
# Build a synthetic one-step data set: 500 randomly sampled (state, action)
# pairs, each pushed through a single environment step via `evaluate_pair`.
# Note that this rebinds `all_states` / `all_actions` used by earlier cells.
all_rewards = []
all_states = []
all_actions = []
all_obs = []
for i in range(500):
    state, action = sample_state_action_pair()
    # `state` is replaced by the first element returned from `evaluate_pair`
    # (the [cos, sin, velocity] features of the sampled state).
    state, action, obs, reward = evaluate_pair(env, state, action)
    all_states.append(state)
    all_actions.append(action)
    all_obs.append(obs)
    all_rewards.append(reward)

In [None]:
# Index of the sampled transition with the highest one-step reward.
best_reward = np.argmax(all_rewards)
# Entries of `all_states` are stored as [cos(theta), sin(theta), theta_dot].
gym_state = all_states[best_reward]
# Recover theta with arctan2 so the sign of sin(theta) is kept; arccos(cos(theta))
# alone folds every angle into [0, pi] and would render a mirrored pendulum
# whenever sin(theta) < 0.
gym_state = [np.arctan2(gym_state[1], gym_state[0]), gym_state[2]]
env.env.state = gym_state
env.render()

In [None]:
X = np.concatenate((np.array(all_states), np.array(all_actions)), axis=1)

In [None]:
X.shape

In [None]:
Y = np.array(all_obs)

In [None]:
Y.shape

In [None]:
model, infr, model_data_X, model_data_Y = fit_model_synthetic(X, Y, win_in=1, verbose=False)

In [None]:
from mxfusion.inference import TransferInference, ModulePredictionAlgorithm
infr_pred = TransferInference(ModulePredictionAlgorithm(model=model, observed=[model.X], target_variables=[model.Y], num_samples=100), 
                              infr_params=infr.params)

In [None]:
# Run the prediction algorithm on the training inputs; `[0]` keeps the first
# element of what `run` returns.
res = infr_pred.run(X=mx.nd.array(X, dtype='float64'))[0]
# NOTE(review): the names suggest a predictive mean/variance, but these are simply
# `res[0]` and `res[1]`, each indexed once more with `[0]`. Presumably `res` is a
# stack of the num_samples=100 prediction samples, in which case these are rows of
# the first two samples; confirm. Neither name is used by the cells below.
f_mean, f_var = res[0].asnumpy()[0], res[1].asnumpy()[0]

In [None]:
np.max(np.abs(Y - np.mean(res.asnumpy(), axis=0)))

### Test reward correlation from state, action pairs

In [None]:
# Compare the environment's own episode reward with the reward implied by our
# cost function evaluated on the very same trajectory.
policy_parameters = []
rewards = []
all_states = []
all_actions = []
for i in range(500):
    params = sample_and_set_policy_parameters(policy)
    policy_parameters.append(params)
    policy_func = lambda x: policy(mx.nd.expand_dims(mx.nd.array(x, dtype='float64'), axis=0)).asnumpy()[0]
    # NOTE(review): the rollout uses `random_policy`, not the freshly sampled
    # `policy_func` defined just above; confirm this is intended.
    true_rewards, states, actions = run_one_episode(env, random_policy, initial_state=INITIAL_GYM_STATE, max_steps=100, render=False)

    # Insert a singleton axis at position 1, matching the 3-D indexing used by the
    # cost block; keep float64 like the rest of the notebook (mx.nd.array
    # defaults to float32). The initial observation is dropped so states and
    # actions line up one-to-one.
    s = mx.nd.expand_dims(mx.nd.array(states[1:], dtype='float64'), axis=1)
    a = mx.nd.expand_dims(mx.nd.array(actions, dtype='float64'), axis=1)

    # Reduce to a plain scalar so each row of `rewards` is two numbers rather
    # than a scalar paired with a length-1 array.
    our_rewards = -mx.nd.sum(cost(s, a)).asscalar()
    print("Paired:", true_rewards, our_rewards)
    rewards.append([true_rewards, our_rewards])
    all_states.append(states)
    all_actions.append(actions)
rewards = np.array(rewards, dtype=np.float64)

In [None]:
%matplotlib inline
from pylab import *
plt.plot(rewards[:,0], rewards[:,1], 'o')

### Test  reward correlations from full runs

In [None]:
# Compare the true episode reward of randomly parameterised policies with the
# value obtained by running the same policy through the learned model
# (`run_policy`), over a short horizon of `timesteps` steps.
policy_parameters = []
rewards = []
timesteps = 5
for i in range(5):
    params = sample_and_set_policy_parameters(policy)
    policy_parameters.append(params)
    # Wrap the Gluon policy so it accepts a single NumPy observation and
    # returns a NumPy action.
    policy_func = lambda x: policy(mx.nd.expand_dims(mx.nd.array(x, dtype='float64'), axis=0)).asnumpy()[0]
    true_rewards, states, actions = run_one_episode(env, policy_func,
                                                    initial_state=INITIAL_GYM_STATE,
                                                    max_steps=timesteps, render=False)
#     print("True states:", states)
    # The model rollout always starts from INITIAL_STATE (static generator).
    our_costs, _ = run_policy(policy, cost, model, infr, model_data_X, model_data_Y,
                    static_state_generator, 100,
                    learning_rate=1e-2, num_time_steps=timesteps, 
                    num_samples=10, verbose=True)
    # NOTE(review): the divisor 10 presumably mirrors num_samples=10 above
    # (averaging over samples); confirm and keep the two in sync.
    our_costs = -our_costs / 10.
    print("Rewards: ", true_rewards, our_costs.asnumpy())
    rewards.append(np.array([true_rewards, our_costs.asnumpy()]))
rewards = np.array(rewards)

In [None]:
%matplotlib inline
from pylab import *
plt.plot(rewards[:,0], rewards[:,1], 'o')

In [None]:
# The policy's layers are, in order: Dense, Dense, MultiplyByTwo, EpsilonGreed.
# The exploration layer is therefore at index 3; index 2 is MultiplyByTwo, where
# setting `epsilon` would have no effect.
policy[3].epsilon = 0.05

In [None]:
# Main PILCO-style loop: alternate between rolling out the current policy in the
# real environment and improving the policy against the learned model.
all_states = []
all_actions = []

num_episode = 40

# Policy-optimisation settings used in every episode.
num_grad_steps = 50
num_time_steps = 250
learning_rate = 1e-4
use_random_policy = False
policies = []

for i_ep in range(num_episode):
    print('Start Episode '+str(i_ep+1)+'.')
    
    # Run an episode and collect data.    
    if i_ep==0 and use_random_policy:
        print("Using random policy")
        policy_func = random_policy       
#         learning_rate = 1e-2
    else:
        print("Using learned policy")
        # Wrap the Gluon policy so it accepts a single NumPy observation and
        # returns a NumPy action.
        policy_func = lambda x: policy(mx.nd.expand_dims(mx.nd.array(x, dtype='float64'), axis=0)).asnumpy()[0]
#         learning_rate = 1e-2
    total_reward, states, actions = run_one_episode(env, policy_func, initial_state=INITIAL_GYM_STATE,
                                                    max_steps=num_time_steps, render=True)
    print("Actions:", actions[:5], actions[-5:])
    all_states.append(states)
    all_actions.append(actions)
    
    # Fit a model.
    # NOTE(review): the refit below is commented out, so `model`, `infr`,
    # `model_data_X` and `model_data_Y` are whatever an earlier cell produced and
    # the episodes collected in this loop never reach the model; confirm.
#     print('Fit the model.')
#     model, infr, model_data_X, model_data_Y, = fit_model(all_states, all_actions, win_in=1, verbose=False)
        
    # Optimize the policy.
    # Rollouts inside the optimiser start from randomly sampled initial states
    # (`initial_state_generator`), not from INITIAL_STATE.
    print('Optimize the policy.')
    policy = optimize_policy(policy, cost, model,
                             infr,
                             model_data_X, model_data_Y,
                             initial_state_generator,
                             num_grad_steps=num_grad_steps, 
                             num_samples=num_samples,
                             learning_rate=learning_rate,
                             num_time_steps=num_time_steps)
#     datum = (policy.collect_params()['dense0_weight'].data().asnumpy(),
#              policy.collect_params()['dense0_bias'].data().asnumpy())
#     policies.append(datum)

1. Update the `optimize_policy` function.
2. multiple initial states.
3. use the real reward function. (https://github.com/openai/gym/blob/master/gym/envs/classic_control/continuous_mountain_car.py)
4. visualize the intermediate and final performance of policy.
5. Make a notebook to show.
6. Try a non-linear policy.
7. add epsilon-greedy policy

In [None]:
parameters1 = run_one_episode(env, policy_func, max_steps=3000, render=True)

In [None]:
parameters1[2]

In [None]:
# Overwrite the first layer's weight and bias with hand-picked values.
# NOTE(review): the literals below have shapes (1, 2) and (1,), whereas the policy's
# first layer is created as Dense(dense_units, in_units=obs_dim) with
# dense_units = 50 and obs_dim = 3. This cell looks left over from an earlier,
# smaller policy; confirm the shapes (and the 'dense0_*' parameter names) before
# running it.
policy[0].collect_params()['dense0_weight'].set_data(mx.nd.array([[-2.26582996, -4.80595503]], dtype='float64'))
policy[0].collect_params()['dense0_bias'].set_data(mx.nd.array([1.73686036], dtype='float64'))

In [None]:
parameters1 = run_one_episode(env, policy_func, max_steps=10000, render=True)