- main (global policy and value networks)
    - Create and coordinate workers 
    - check CPUs available, create threads and workers
    - Initialize a global thread-safe counter so every worker knows when to quit
    
- worker (contains local policy and value networks)
    - copy weights from global network
    - play episodes
    - send gradients back to master

\begin{align*}
g_{local} &= \frac{\partial L(\theta_{local})}{\partial \theta_{local}} \\
\theta_{global} &= \theta_{global} - \eta \, g_{local}
\end{align*}
    
- nets
    - definition of the policy and value networks
    - variable scopes
    - "reuse" arguments

In [5]:
import gym
import sys
import os
import numpy as np

import tensorflow as tf

### Create Policy and Value Networks

In [2]:
def build_feature_extractor(input_):
    """Build the convolutional feature extractor shared by both networks.

    The layers are created under the fixed scope names "conv1", "conv2" and
    "fc1". Whether their weights are created or reused is therefore decided
    by the enclosing ``tf.variable_scope``: the weights should be created
    once, and every later call should run with ``reuse=True``.

    Args:
        input_: Batch of image tensors with pixel values in 0..255.

    Returns:
        Output tensor of the 256-unit fully connected layer "fc1".
    """
    # Normalise pixel intensities from 0..255 down to 0..1.
    scaled = tf.to_float(input_) / 255.0

    # Convolutional stack; positional args are
    # (num output feature maps, kernel size, stride).
    features = tf.contrib.layers.conv2d(
        scaled, 16, 8, 4, activation_fn=tf.nn.relu, scope="conv1")
    features = tf.contrib.layers.conv2d(
        features, 32, 4, 2, activation_fn=tf.nn.relu, scope="conv2")

    # Collapse the feature maps into one feature vector per example.
    flattened = tf.contrib.layers.flatten(features)

    # Dense layer on top of the flattened features.
    return tf.contrib.layers.fully_connected(
        inputs=flattened, num_outputs=256, scope="fc1")


In [3]:
class PolicyNetwork:
    """Policy head on top of the shared feature extractor.

    Builds the placeholders, the action distribution, the policy-gradient
    loss with an entropy bonus, and the (gradient, variable) pairs needed to
    apply updates later.

    The feature extractor is created under the "shared" scope with
    ``reuse=False``, so a PolicyNetwork MUST be created before the matching
    ValueNetwork, which opens the same scope with ``reuse=True``.
    """

    def __init__(self, num_outputs, reg=0.01):
        """Create the policy graph.

        Args:
            num_outputs: Number of discrete actions (size of the logits).
            reg: Weight of the entropy term added to the objective.
        """
        self.num_outputs = num_outputs

        # states: N x 84 x 84 x 4 (four stacked 84x84 frames)
        self.states = tf.placeholder(shape=[None, 84, 84, 4], dtype=tf.uint8, name="X")

        # Advantage = G - V(s)
        self.advantage = tf.placeholder(shape=[None], dtype=tf.float32, name="y")

        # Selected actions
        self.actions = tf.placeholder(shape=[None], dtype=tf.int32, name="actions")

        # reuse=False: this call creates the shared weights that the
        # ValueNetwork later reuses.
        with tf.variable_scope("shared", reuse=False):
            fc1 = build_feature_extractor(self.states)
            self.debug_fc1 = fc1

        # Use a separate scope for output and loss
        with tf.variable_scope("policy_network"):
            self.logits = tf.contrib.layers.fully_connected(fc1, num_outputs, activation_fn=None)
            self.probs = tf.nn.softmax(self.logits)

            # Take log-probabilities straight from the logits. log(softmax(x))
            # becomes -inf (and 0 * -inf = NaN) as soon as one probability
            # underflows to 0; log_softmax stays finite.
            log_probs = tf.nn.log_softmax(self.logits)

            # Sample an action
            cdist = tf.distributions.Categorical(logits=self.logits)
            self.sample_action = cdist.sample()

            # Entropy term: added to the objective to increase exploration
            self.entropy = -tf.reduce_sum(self.probs * log_probs, axis=1)

            # Get the predictions for the chosen actions only: index into the
            # flattened [N * num_actions] tensors at row * num_actions + action.
            batch_size = tf.shape(self.states)[0]
            gather_indices = tf.range(batch_size) * tf.shape(self.probs)[1] + self.actions
            self.selected_action_probs = tf.gather(tf.reshape(self.probs, [-1]), gather_indices)
            selected_log_probs = tf.gather(tf.reshape(log_probs, [-1]), gather_indices)

            # Objective to maximise: log pi(a|s) * advantage + reg * entropy.
            # The loss is its negation, summed over the batch.
            self.loss = selected_log_probs * self.advantage + reg * self.entropy
            self.loss = -tf.reduce_sum(self.loss, name="loss")

            # training
            # tf.train.RMSPropOptimizer(learning_rate,decay,momentum,epsilon,...)
            self.optimizer = tf.train.RMSPropOptimizer(0.00025, 0.99, 0.0, 1e-6)
            # Alternative: tf.train.AdamOptimizer(learning_rate,beta1,beta2,epsilon,...)
            # self.optimizer = tf.train.AdamOptimizer(0.00025)

            # we'll need these later for running gradient descent steps;
            # variables the loss does not depend on (grad is None) are dropped
            self.grads_and_vars = self.optimizer.compute_gradients(self.loss)
            self.grads_and_vars = [[grad, var] for grad, var in self.grads_and_vars if grad is not None]


            
class ValueNetwork:
    """State-value head on top of the shared feature extractor.

    Builds the placeholders, the scalar value prediction, a squared-error
    loss against the supplied targets, and the (gradient, variable) pairs
    needed to apply updates later.

    The "shared" scope is opened with ``reuse=True``, so the PolicyNetwork
    (which opens it with ``reuse=False``) MUST have been created first.
    """

    def __init__(self):
        """Create the value graph."""
        # Placeholders for our input
        # After resizing we have 4 consecutive frames of size 84 x 84
        self.states = tf.placeholder(shape=[None, 84, 84, 4], dtype=tf.uint8, name="X")
        # The TD target value, reward value
        self.targets = tf.placeholder(shape=[None], dtype=tf.float32, name="y")

        # reuse=True: share the feature-extractor weights that the
        # PolicyNetwork created.
        with tf.variable_scope("shared", reuse=True):
            fc1 = build_feature_extractor(self.states)

        # Use a separate scope for output and loss
        with tf.variable_scope("value_network"):
            self.vhat = tf.contrib.layers.fully_connected(
                inputs=fc1,
                num_outputs=1,
                activation_fn=None)
            # Keep the un-squeezed (N, 1) tensor around for debugging.
            self.debug_vhat = self.vhat
            # (N, 1) -> (N,). `axis` replaces the deprecated `squeeze_dims`.
            self.vhat = tf.squeeze(self.vhat, axis=[1], name="vhat")

            # Sum over the batch of squared differences between the
            # prediction and the target.
            self.loss = tf.squared_difference(self.vhat, self.targets)
            self.loss = tf.reduce_sum(self.loss, name="loss")

            # training
            self.optimizer = tf.train.RMSPropOptimizer(0.00025, 0.99, 0.0, 1e-6)
            # self.optimizer = tf.train.AdamOptimizer(0.00025)

            # we'll need these later for running gradient descent steps;
            # variables the loss does not depend on (grad is None) are dropped
            self.grads_and_vars = self.optimizer.compute_gradients(self.loss)
            self.grads_and_vars = [[grad, var] for grad, var in self.grads_and_vars if grad is not None]
            
            

            

In [4]:
# Should use this to create networks
# to ensure they're created in the correct order
def create_networks(num_outputs):
    """Build a policy network and a value network in the required order.

    The PolicyNetwork is constructed first because it creates the "shared"
    variables (reuse=False); the ValueNetwork then reuses them (reuse=True).

    Args:
        num_outputs: Number of outputs of the policy network.

    Returns:
        A ``(policy_network, value_network)`` tuple.
    """
    policy = PolicyNetwork(num_outputs=num_outputs)
    value = ValueNetwork()
    return policy, value


### Worker

In [6]:
class Step:
    """Plain record of one environment transition.

    Attributes are exactly the constructor arguments: ``state``, ``action``,
    ``reward``, ``next_state`` and ``done``.
    """

    def __init__(self, state, action, reward, next_state, done):
        self.state, self.action, self.reward = state, action, reward
        self.next_state, self.done = next_state, done

In [None]:
# Transform raw images for input into the neural network
# 1) Convert to grayscale
# 2) Crop
# 3) Resize
class ImageTransformer:
    """Graph ops that preprocess a single raw frame.

    Pipeline, in the order the ops are built: RGB -> grayscale, crop to a
    160x160 box starting at row 34, nearest-neighbour resize to 84x84, then
    squeeze away the size-1 dimensions.
    """

    def __init__(self):
        with tf.variable_scope("image_transformer"):
            # One raw frame: 210 x 160 x 3, uint8.
            self.input_state = tf.placeholder(shape=[210, 160, 3], dtype=tf.uint8)
            gray = tf.image.rgb_to_grayscale(self.input_state)
            cropped = tf.image.crop_to_bounding_box(gray, 34, 0, 160, 160)
            resized = tf.image.resize_images(
                cropped,
                [84, 84],
                method=tf.image.ResizeMethod.NEAREST_NEIGHBOR)
            self.output = tf.squeeze(resized)

    def transform(self, state, sess=None):
        """Run the preprocessing ops on ``state`` and return the result.

        Args:
            state: Raw frame fed into ``self.input_state``.
            sess: Session to run in; falls back to the default session.
        """
        session = sess or tf.get_default_session()
        feed = {self.input_state: state}
        return session.run(self.output, feed)


In [7]:
# Create initial state by repeating the same frame 4 times
# generating (84,84,4)
def repeat_frame(frame):
    """Stack four copies of ``frame`` along a new axis 2.

    For an (84, 84) frame this yields the initial (84, 84, 4) state.
    """
    copies = [frame for _ in range(4)]
    return np.stack(copies, axis=2)



In [None]:
# Create next state by shifting each frame by 1
# Throw out the oldest frame
# And concatenate the newest frame
# 84 * 84 * ( 3(old) + 1(next_frame))
def shift_frames(state, next_frame):
    """Return a new state with the oldest frame dropped and ``next_frame`` appended.

    The first slice along axis 2 is discarded and ``next_frame`` becomes the
    last slice, so the size along axis 2 is unchanged.
    """
    kept = state[:, :, 1:]
    newest = np.expand_dims(next_frame, 2)
    return np.concatenate((kept, newest), axis=2)


In [8]:
# Make a Tensorflow op to copy weights from one scope to another
def get_copy_params_op(src_vars, dst_vars):
    """Build assign ops that copy each source variable into its destination.

    Both collections are sorted by ``.name`` and then paired positionally, so
    the two sides must sort into corresponding order. Pairing stops at the
    shorter collection.

    Args:
        src_vars: Variables to read from.
        dst_vars: Variables to overwrite.

    Returns:
        List with one ``dst.assign(src)`` op per pair.
    """
    def by_name(variable):
        return variable.name

    ordered_src = sorted(src_vars, key=by_name)
    ordered_dst = sorted(dst_vars, key=by_name)
    return [dst.assign(src) for src, dst in zip(ordered_src, ordered_dst)]

def make_train_op(local_net, global_net):
    """
    Build the op that updates the global network with local gradients.

    ``apply_gradients`` is handed (gradient, variable) pairs. Here the
    gradients come from ``local_net`` and the variables from ``global_net``,
    paired positionally:
        [(local_g1, global_v1), (local_g2, global_v2), ...]
    Both networks' ``grads_and_vars`` lists must therefore line up
    entry by entry.
    """
    # Variables to update: the second element of each global pair.
    _, target_vars = zip(*global_net.grads_and_vars)

    # Gradients to apply: the first element of each local pair,
    # clipped by global norm (5.0) to avoid large values.
    worker_grads, _ = zip(*local_net.grads_and_vars)
    clipped_grads, _ = tf.clip_by_global_norm(worker_grads, 5.0)

    grad_var_pairs = list(zip(clipped_grads, target_vars))

    # The global network's optimizer performs the update step and
    # increments the global step.
    step_counter = tf.train.get_global_step()
    return global_net.optimizer.apply_gradients(
        grad_var_pairs, global_step=step_counter)

In [None]:
# Worker object to be run in a thread
# name (String) should be unique for each thread
# env (OpenAI Gym Environment) should be unique for each thread
# policy_net (PolicyNetwork) should be a global passed to every worker
# value_net (ValueNetwork) should be a global passed to every worker
# returns_list (List) should be a global passed to every worker
class Worker:
    """One training worker, intended to be run in its own thread.

    Each worker owns local policy/value networks (built under a variable
    scope named after the worker), an op that copies the global weights into
    those local networks, and train ops that apply local gradients to the
    global networks.

    NOTE(review): ``run_n_steps`` and ``update`` are still stubs, so ``run``
    cannot complete an iteration yet.
    """

    def __init__(self,name,env,policy_net,value_net,global_counter,returns_list
                 ,discount_factor=0.99,max_global_steps=None):
        """Build the worker's local networks and its copy/train ops.

        Args:
            name: Variable-scope name for the local networks; should be
                unique for each thread.
            env: Environment instance; should be unique for each thread.
            policy_net: Global PolicyNetwork shared by all workers.
            value_net: Global ValueNetwork shared by all workers.
            global_counter: Shared counter; only stored here.
            returns_list: Shared list of returns; only stored here.
            discount_factor: Stored for later use; not read in the code
                visible in this class.
            max_global_steps: If not None, ``run`` requests a stop once the
                global step reaches this value.
        """

        self.name = name
        self.env = env
        self.global_policy_net = policy_net
        self.global_value_net = value_net
        self.global_counter = global_counter
        self.discount_factor = discount_factor
        self.max_global_steps = max_global_steps
        self.global_step = tf.train.get_global_step()
        self.img_transformer = ImageTransformer()
        
        # Create local policy and value networks that belong only to this worker
        with tf.variable_scope(name):
            # Equivalent to building the two networks by hand, in this order:
            # self.policy_net = PolicyNetwork(num_outputs=policy_net.num_outputs)
            # self.value_net = ValueNetwork()
            self.policy_net, self.value_net = create_networks(policy_net.num_outputs)
            
        # We will use this op to copy the global network weights
        # back to the local policy and value networks
        # NOTE(review): presumably the global networks are built under a
        # variable scope named "global" - confirm against the code that
        # creates them. The source filter has no trailing '/', unlike the
        # destination filter.
        self.copy_params_op = get_copy_params_op(
                  tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope="global"),
                  tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.name+'/'))
        
        # These will take the gradients from the local networks
        # and use those gradients to update the global network
        self.vnet_train_op = make_train_op(self.value_net, self.global_value_net)
        self.pnet_train_op = make_train_op(self.policy_net, self.global_policy_net)

        self.state = None # Keep track of the current state
        self.total_reward = 0. # After each episode print the total (sum of) reward
        self.returns_list = returns_list # Global returns list to plot later
        
    def run(self, sess, coord, t_max):
        """Worker loop: sync weights, collect experience, update global nets.

        Args:
            sess: Session to run ops in; made the default for this call.
            coord: Coordinator polled via ``should_stop()`` and signalled
                via ``request_stop()``.
            t_max: Passed to ``run_n_steps`` as the number of steps.

        Returns early when ``max_global_steps`` is reached or when a
        ``tf.errors.CancelledError`` is raised inside the loop.

        NOTE(review): ``run_n_steps`` currently returns None, so the tuple
        unpacking below raises TypeError until it is implemented.
        """
        with sess.as_default(), sess.graph.as_default():
            # Assign the initial state: preprocess the first frame and
            # repeat it 4 times
            self.state = repeat_frame(self.img_transformer.transform(self.env.reset()))

            try:
                while not coord.should_stop():
                    # Copy weights from  global networks to local networks
                    sess.run(self.copy_params_op)

                    # Collect some experience
                    steps, global_step = self.run_n_steps(t_max, sess)

                    # Stop once the max number of global steps has been reached
                    if self.max_global_steps is not None and global_step >= self.max_global_steps:
                        coord.request_stop()
                        return

                    # Update the global networks using local gradients
                    self.update(steps, sess)

            except tf.errors.CancelledError:
                return
    
    def sample_action(self, state, sess):
        """Sample one action for ``state`` from the local policy network."""
        # Make input N x D (N = 1)
        feed_dict = { self.policy_net.states: [state] }
        actions = sess.run(self.policy_net.sample_action, feed_dict)
        # Prediction is a 1-D array of length N, just want the first value
        return actions[0]
    
    def get_value_prediction(self, state, sess):
        """Return the local value network's prediction for ``state``."""
        # Make input N x D (N = 1)
        feed_dict = { self.value_net.states: [state] }
        vhat = sess.run(self.value_net.vhat, feed_dict)
        # Prediction is a 1-D array of length N, just want the first value
        return vhat[0]
    
    def run_n_steps(self, n, sess):
        """Not implemented yet.

        ``run`` unpacks the return value as ``(steps, global_step)``.
        """
        pass
    
    
    def update(self, steps, sess):
        """Not implemented yet.

        ``run`` calls this with the steps returned by ``run_n_steps``.
        """
        pass

### Scratch

In [4]:
# Open a session for the scratch experiments.
sess = tf.InteractiveSession()

In [5]:
# Build a policy network with 4 outputs and display the shared feature tensor.
pnet = PolicyNetwork(4)
pnet.debug_fc1

<tf.Tensor 'shared/fc1/Relu:0' shape=(?, 256) dtype=float32>

In [14]:
# Display the policy network's main tensors to check their names, shapes and dtypes.
pnet.logits,pnet.probs,pnet.sample_action,pnet.actions,pnet.entropy,pnet.selected_action_probs

(<tf.Tensor 'policy_network/fully_connected/BiasAdd:0' shape=(?, 4) dtype=float32>,
 <tf.Tensor 'policy_network/Softmax:0' shape=(?, 4) dtype=float32>,
 <tf.Tensor 'policy_network/Categorical/sample/Reshape_1:0' shape=(?,) dtype=int32>,
 <tf.Tensor 'actions:0' shape=(?,) dtype=int32>,
 <tf.Tensor 'policy_network/Neg:0' shape=(?,) dtype=float32>,
 <tf.Tensor 'policy_network/GatherV2:0' shape=(?,) dtype=float32>)

In [18]:
# Build a value network and display its prediction before and after the squeeze.
vnet = ValueNetwork()
vnet.debug_vhat,vnet.vhat

Instructions for updating:
Use the `axis` argument instead


(<tf.Tensor 'value_network/fully_connected/BiasAdd:0' shape=(?, 1) dtype=float32>,
 <tf.Tensor 'value_network/vhat:0' shape=(?,) dtype=float32>)