In [None]:
# Imports specifically so we can render outputs in Jupyter.
from JSAnimation.IPython_display import display_animation
from matplotlib import animation
import matplotlib.pyplot as plt
from IPython.display import display
from tqdm import tnrange, tqdm_notebook
import numpy as np
from collections import deque
from q_learning import plot_running_avg, FeatureTransformer

%matplotlib inline

def display_frames_as_gif(frames):
    """
    Show a sequence of image frames as an inline animation with playback controls.

    frames -- indexable, sized collection of images that plt.imshow accepts.
              The first frame is drawn right away; the animation then swaps
              in frames[i] for each animation step i.
    """
    image = plt.imshow(frames[0])
    plt.axis('off')

    def show_frame(index):
        # Reuse the single image artist and only replace its pixel data.
        image.set_data(frames[index])

    figure = plt.gcf()
    n_frames = len(frames)
    movie = animation.FuncAnimation(figure, show_frame, frames=n_frames, interval=50)
    widget = display_animation(movie, default_mode='once')
    display(widget)

Now we create the environment (MountainCarContinuous-v0)

In [None]:
import gym
# Build the continuous-action Mountain Car task. `.env` takes the inner
# environment object held by whatever gym.make returns -- presumably to drop
# gym's built-in episode step limit (TODO confirm); play_one applies its own
# iteration cap instead.
env = gym.make("MountainCarContinuous-v0").env

Then we define a single fully connected layer that our networks will be built from

In [None]:
import tensorflow as tf

class HiddenLayer:
    """
    One fully connected layer computing f(X @ W) or f(X @ W + b).

    M1, M2   -- input and output widths; W has shape (M1, M2).
    f        -- activation applied to the affine output (tf.nn.tanh by default).
    use_bias -- when True, a zero-initialised bias vector of length M2 is added.
    zeros    -- when True, W starts as all zeros; otherwise it is initialised
                from tf.random_normal.

    `params` lists the layer's variables: W first, then b when a bias is used.
    """

    def __init__(self, M1, M2, f=tf.nn.tanh, use_bias=True, zeros=False):
        self.f = f
        self.use_bias = use_bias

        # Pick the initial weight value, then wrap it in a variable once.
        if zeros:
            initial_W = np.zeros((M1, M2)).astype(np.float32)
        else:
            initial_W = tf.random_normal(shape=(M1, M2))
        self.W = tf.Variable(initial_W)
        self.params = [self.W]

        if use_bias:
            self.b = tf.Variable(np.zeros(M2).astype(np.float32))
            self.params.append(self.b)

    def forward(self, X):
        """Apply the layer to X and return the activated output."""
        affine = tf.matmul(X, self.W)
        if self.use_bias:
            affine = affine + self.b
        return self.f(affine)

Next we create the policy model. This time it has two outputs: the mean and the standard deviation of the normal distribution that actions are sampled from

In [None]:
class PolicyModel:
    """
    Gaussian policy for a single continuous action.

    A stack of shared hidden layers feeds two one-unit output heads: one for
    the mean and one (through softplus) for the standard deviation of a
    normal distribution. Actions are sampled from that distribution and
    clipped to [-1, 1].

    ft                 -- feature transformer; its transform(X) output is fed
                          to the network and must have D columns.
    D                  -- width of the transformed feature vector.
    hidden_layer_sizes -- widths of the shared hidden layers (default: none).
    """

    def __init__(self, ft, D, hidden_layer_sizes=None):
        # None instead of a literal [] so instances never share one default list.
        if hidden_layer_sizes is None:
            hidden_layer_sizes = []

        # save inputs for copy
        self.ft = ft
        self.D = D
        self.hidden_layer_sizes = hidden_layer_sizes

        ##### shared hidden layers #####
        self.hidden_layers = []
        M1 = D
        for M2 in hidden_layer_sizes:
            layer = HiddenLayer(M1, M2)
            self.hidden_layers.append(layer)
            M1 = M2

        # output head for the mean: identity activation, zero-initialised weights
        self.mean_layer = HiddenLayer(M1, 1, lambda x: x, use_bias=False, zeros=True)

        # output head for the standard deviation: softplus keeps it non-negative
        self.stdv_layer = HiddenLayer(M1, 1, tf.nn.softplus, use_bias=False, zeros=False)

        # gather params from every layer the model owns
        # (previously read self.mean_layers / self.var_layers, which do not exist)
        self.params = []
        for layer in self.hidden_layers + [self.mean_layer, self.stdv_layer]:
            self.params += layer.params

        # inputs and targets
        self.X = tf.placeholder(tf.float32, shape=(None, D), name='X')
        self.actions = tf.placeholder(tf.float32, shape=(None,), name='actions')
        self.advantages = tf.placeholder(tf.float32, shape=(None,), name='advantages')

        # run the shared hidden stack
        Z = self.X
        for layer in self.hidden_layers:
            Z = layer.forward(Z)

        mean = self.mean_layer.forward(Z)
        # (previously called self.var_layer, which does not exist)
        stdv = self.stdv_layer.forward(Z)

        # flatten the (N, 1) head outputs to (N,)
        mean = tf.reshape(mean, [-1])
        # small constant keeps the scale strictly positive if softplus underflows to 0
        stdv = tf.reshape(stdv, [-1]) + 1e-5

        norm = tf.contrib.distributions.Normal(mean, stdv)
        self.predict_op = tf.clip_by_value(norm.sample(), -1, 1)

        # policy-gradient objective with an entropy bonus weighted by 0.1
        log_probs = norm.log_prob(self.actions)
        cost = -tf.reduce_sum(self.advantages * log_probs + 0.1 * norm.entropy())
        # note: 10e-3 == 0.01
        self.train_op = tf.train.AdamOptimizer(10e-3).minimize(cost)

    def set_session(self, session):
        """Store the TensorFlow session used by partial_fit and predict."""
        self.session = session

    def partial_fit(self, X, actions, advantages):
        """Run one optimiser step on the given states, actions and advantages."""
        X = np.atleast_2d(X)
        X = self.ft.transform(X)

        actions = np.atleast_1d(actions)
        advantages = np.atleast_1d(advantages)
        self.session.run(self.train_op, feed_dict={self.X: X, self.actions: actions, self.advantages: advantages})

    def predict(self, X):
        """Return one sampled, clipped action per row of X."""
        X = np.atleast_2d(X)
        X = self.ft.transform(X)
        return self.session.run(self.predict_op, feed_dict={self.X: X})

    def sample_action(self, X):
        """Return the sampled action for a single observation."""
        p = self.predict(X)[0]
        # print("action:", p)
        return p


Since we are no longer using hill climbing, we also need a model of the value function

In [None]:
class ValueModel:
    """
    Feed-forward network that maps transformed states to a scalar value estimate.

    D                  -- width of the transformed feature vector.
    ft                 -- feature transformer, stored on the instance.
    hidden_layer_sizes -- widths of the hidden layers (default: none).

    Attributes built here: `layers` (hidden layers plus a linear one-unit
    output layer), placeholders `X` and `Y`, `predict_op` (flattened network
    output) and `cost` (sum of squared errors against Y).

    NOTE(review): no training op or session/predict methods are defined yet;
    the class looks unfinished -- confirm what the training loop expects.
    """

    def __init__(self, D, ft, hidden_layer_sizes=None):
        # None instead of a literal [] so the default list is never shared.
        if hidden_layer_sizes is None:
            hidden_layer_sizes = []

        self.ft = ft
        self.costs = []
        ##### Create layers #####
        self.layers = []
        M1 = D
        for M2 in hidden_layer_sizes:
            layer = HiddenLayer(M1, M2)
            self.layers.append(layer)
            M1 = M2
        # final layer: one linear output unit
        layer = HiddenLayer(M1, 1, lambda x: x)
        self.layers.append(layer)
        # Declare placeholders
        self.X = tf.placeholder(tf.float32, shape=(None, D), name='X')
        self.Y = tf.placeholder(tf.float32, shape=(None, ), name='Y')
        # Get the output by running X through every layer, output layer included
        # (previously iterated self.hidden_layers, which this class never defines)
        Z = self.X
        for layer in self.layers:
            Z = layer.forward(Z)
        # flatten (N, 1) to (N,) so it lines up with Y
        Y_hat = tf.reshape(Z, [-1])
        self.predict_op = Y_hat

        # keep the loss on the instance instead of discarding it as a local
        self.cost = tf.reduce_sum(tf.square(self.Y - Y_hat))
        
        

Next we will create the play_one function

In [None]:
def play_one(env, pmodel, gamma, max_iters=2000):
    """
    Play a single episode with actions sampled from `pmodel`.

    env       -- environment with reset() -> observation and
                 step(action) -> (observation, reward, done, info).
    pmodel    -- policy exposing sample_action(observation).
    gamma     -- not used by this function; kept so existing callers that
                 pass it keep working.
    max_iters -- maximum number of steps before the episode is cut off, so a
                 policy that never finishes cannot loop forever (default 2000).

    Returns the sum of rewards collected during the episode.
    """
    observation = env.reset()
    totalreward = 0

    for _ in range(max_iters):
        action = pmodel.sample_action(observation)
        # oddly, the mountain car environment requires the action to be in
        # an object where the actual action is stored in object[0]
        observation, reward, done, _info = env.step([action])
        totalreward += reward
        if done:
            break
    return totalreward


def play_multiple_episodes(env, T, pmodel, gamma, print_iters=False, status=False):
    """
    Play T episodes with `pmodel` and return the mean total reward.

    env, pmodel, gamma -- passed straight through to play_one.
    T                  -- number of episodes to play.
    print_iters        -- when True, print the running average after each episode.
    status             -- when True, wrap the episode loop in a tqdm_notebook
                          progress bar.

    The final average is printed as well as returned.
    """
    totalrewards = np.empty(T)
    episodes = range(T)
    if status:
        # Swap in the progress-bar iterator; the loop below must iterate over it
        # (not a fresh range) for the bar to advance.
        episodes = tqdm_notebook(episodes, desc='Episodes')
    for i in episodes:
        totalrewards[i] = play_one(env, pmodel, gamma)
        if print_iters:
            print(i, "avg so far:", totalrewards[:(i+1)].mean())

    avg_totalrewards = totalrewards.mean()
    print("avg totalrewards:", avg_totalrewards)
    return avg_totalrewards

We create the random search function to optimize parameters