In [13]:
%matplotlib inline

import gym
import itertools
import numpy as np
import os
import random
import sys
import tensorflow as tf
import math
if "../" not in sys.path:
  sys.path.append("../")

from lib import plotting
from collections import deque, namedtuple
import matplotlib.pyplot as plt
from IPython.display import clear_output
%matplotlib inline

In [14]:
# Create the CartPole-v0 environment; later cells read its observation_space
# and action_space to size the networks and step it during training.
env = gym.envs.make("CartPole-v0")

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


In [15]:
def plot(frame_idx, rewards, losses):
    """
    Redraw the training progress figure in the notebook output area.

    Args:
        frame_idx: Current frame/step counter, shown in the reward panel title.
        rewards: Sequence of rewards; the title reports the mean of the last 10.
        losses: Sequence of loss values.
    """
    # Replace the previous figure instead of stacking a new one below it.
    clear_output(wait=True)
    plt.figure(figsize=(20, 5))

    # Left panel: reward curve, titled with the recent average.
    plt.subplot(1, 3, 1)
    recent_mean = np.mean(rewards[-10:])
    plt.title('frame %s. reward: %s' % (frame_idx, recent_mean))
    plt.plot(rewards)

    # Middle panel: loss curve.
    plt.subplot(1, 3, 2)
    plt.title('loss')
    plt.plot(losses)

    plt.show()

In [16]:
class Estimator():
    """
    Q-Value Estimator neural network (TensorFlow 1.x graph mode).

    This network is used for both the Q-Network and the Target Network.
    It is a small fully connected network over a flat state vector, with an
    optional dueling head (separate value and advantage streams).
    """

    def __init__(self, sess ,state_dim, action_space, scope="estimator", summaries_dir='./', deulling=False):
        """
        Builds the Tensorflow graph.

        Args:
          sess: Tensorflow session; its graph is written to the summary directory.
          state_dim: Length of the flat state vector fed to the network.
          action_space: Number of discrete actions (width of the output layer).
          scope: Variable scope under which all variables are created.
          summaries_dir: Parent directory; summaries go to
            `<summaries_dir>/summaries_<scope>`.
          deulling: If True, build the dueling architecture
            (Q = V + (A - mean_a A)); otherwise a plain MLP head.
        """
        self.summary_writer = None

        with tf.variable_scope(scope):
            self.scope = scope
            # Placeholders for our input
            self.X_pl = tf.placeholder(shape=(None, state_dim), dtype=tf.float32, name="X")
            # The TD target value
            self.y_pl = tf.placeholder(shape=(None,), dtype=tf.float32, name="y")
            # Per-sample importance-sampling weights
            self.weights = tf.placeholder(shape=(None,), dtype=tf.float32, name="weights")
            # Integer id of the action taken in each sampled transition
            self.actions_pl = tf.placeholder(shape=(None,), dtype=tf.int32, name="act")

            # Shared first layer (common to the value and advantage streams)
            with tf.name_scope('fcl1'):
                self.fcl1 = tf.layers.dense( self.X_pl, 128, activation=tf.nn.relu, name='fcl1')

            if deulling is True:
                # value prediction layers
                with tf.name_scope('fcl2_value'):
                    self.fcl2_value = tf.layers.dense( self.fcl1, 128, activation=tf.nn.relu, name='fcl2_value' )

                with tf.name_scope('fcl3_value'):
                    self.value_prediction = tf.layers.dense( self.fcl2_value, 1, name='fcl3_value')

                # advantage prediction layers
                with tf.name_scope('fcl2_advantage'):
                    self.fcl2_advantage = tf.layers.dense( self.fcl1, 128, activation=tf.nn.relu, name='fcl2_advantage')

                with tf.name_scope('fcl3_advantage'):
                    self.advantage_prediction = tf.layers.dense( self.fcl2_advantage, action_space, name='fcl3_advantage')

                # Centre the advantages per state (mean over the action axis only).
                # Averaging over the whole batch would couple unrelated samples and
                # make a state's Q-values depend on what else is in the batch.
                self.mean_advantage = tf.expand_dims(
                    tf.reduce_mean(self.advantage_prediction, axis=1), 1)
                self.predictions = self.value_prediction + (self.advantage_prediction - self.mean_advantage)

            else:
                with tf.name_scope('fcl2'):
                    self.fcl2 = tf.layers.dense( self.fcl1, 128, activation=tf.nn.relu )

                with tf.name_scope('fcl3'):
                    self.predictions = tf.layers.dense( self.fcl2, action_space )

            # Pick Q(s, a) for the action actually taken in each row. The row
            # indices are derived from the runtime batch size so the graph works
            # for any batch size, not only 32.
            batch_size = tf.shape(self.X_pl)[0]
            self.indices = tf.stack([tf.range(batch_size), self.actions_pl], axis=1)
            self.state_action_predictions = tf.gather_nd(self.predictions, self.indices)

            # Calculate the loss
            with tf.name_scope('loss'):
                self.losses = tf.squared_difference(self.y_pl, self.state_action_predictions)
                self.weighted_losses = self.losses * self.weights
                # Small constant keeps every priority strictly positive.
                self.prios = self.weighted_losses + tf.constant(1e-5)
                self.loss = tf.reduce_mean(self.weighted_losses)

            # Optimizer and training
            with tf.name_scope('train'):
                self.optimizer = tf.train.AdamOptimizer()
                self.train_op = self.optimizer.minimize(self.loss, global_step=tf.contrib.framework.get_global_step())

            # Summaries for Tensorboard
            self.summaries = tf.summary.merge([
                tf.summary.scalar("loss", self.loss),
                tf.summary.histogram("loss_hist", self.losses),
                tf.summary.histogram("q_values_hist", self.predictions),
                tf.summary.scalar("max_q_value", tf.reduce_max(self.predictions))
            ])

            summary_dir = os.path.join(summaries_dir, "summaries_{}".format(scope))
            if not os.path.exists(summary_dir):
                os.makedirs(summary_dir)
            self.summary_writer = tf.summary.FileWriter(summary_dir)
            self.summary_writer.add_graph(sess.graph)


    def predict(self, sess, s):
        """
        Predicts action values.

        Args:
          sess: Tensorflow session
          s: State input of shape [batch_size, state_dim]

        Returns:
          Array of shape [batch_size, action_space] containing the estimated
          action values.
        """
        return sess.run(self.predictions, { self.X_pl: s })

    def update(self, sess, s, a, y, w, replay, buf_ptrs):
        """
        Updates the estimator towards the given targets and refreshes the
        priorities of the sampled transitions.

        Args:
          sess: Tensorflow session object
          s: State input of shape [batch_size, state_dim]
          a: Chosen actions of shape [batch_size]
          y: Targets of shape [batch_size]
          w: Importance-sampling weights of shape [batch_size]
          replay: Buffer exposing `update(buffer_ptrs, priorities)`; may be
            None to skip the priority refresh.
          buf_ptrs: Buffer indices of the sampled transitions, aligned with `s`.

        Returns:
          The calculated loss on the batch.
        """
        feed_dict = { self.X_pl: s, self.y_pl: y , self.actions_pl: a, self.weights: w }
        # Only fetch tensors that exist for both the dueling and the plain head;
        # value_prediction / mean_advantage are not defined when deulling=False.
        summaries, global_step, loss, prios, _ = sess.run(
            [self.summaries, tf.contrib.framework.get_global_step(), self.loss, self.prios, self.train_op],
            feed_dict)
        if replay is not None:
            replay.update(buf_ptrs, prios)
        if self.summary_writer:
            self.summary_writer.add_summary(summaries, global_step)
        return loss

In [28]:
class PriorityBuffer():
    """
    A quick and dirty prioritized replay buffer backed by one numpy array.

    Each row is laid out as
    [priority | state | action | reward | next_state | done].

    Args:
      capacity: capacity of priority buffer
      action_dim: dimension of actions. Accepted for interface compatibility
        but currently ignored: exactly one scalar action is stored per row.
      state_dim: dimension of states
    """
    def __init__(self, capacity, action_dim, state_dim):
        self.capacity = capacity
        self.state_dim = state_dim
        self.action_dim = 1
        self.reward_dim = 1
        self.priority_dim = 1
        self.done_dim = 1
        self.buffer = np.zeros(( capacity, self.priority_dim + self.state_dim + self.action_dim + self.reward_dim + self.state_dim + self.done_dim ), dtype=np.float64)
        self.ptr = 0    # next row to write (wraps around when full)
        self.size = 0   # number of valid rows

    def add(self, state, action, reward, next_state, done):
        """
        Adds a state, action, reward, next_state tuple into buffer,
        overwriting the oldest entry once the buffer is full.

        Args:
        state: current state (1-D array of length state_dim)
        action: action taken in current state (scalar)
        reward: immediate reward received from current action
        next_state: next state the agent transitions into after taking action
        done: indicates whether the current episode is done
        """
        # assign highest priority to any new transitions
        priority = np.max(self.buffer[:self.size, 0]) if self.size > 0 else 1.0
        self.buffer[self.ptr, :] = np.concatenate((np.array([priority]), state, np.array([action, reward]), next_state, np.array([done])), axis=0)

        self.ptr = (self.ptr + 1) % self.capacity
        if self.size < self.capacity:
            self.size += 1


    def update(self, buffer_ptrs, priorities):
        """
        Overwrites the priorities of the given rows (called after new TD errors
        have been computed for a sampled batch).
        """
        self.buffer[buffer_ptrs, 0] = priorities

    # return associated buffer pointers for easy update op
    def sample(self, batch_size, alpha , beta):
        """
        Used for sampling the priority buffer via prioritized sampling

        Args:
        batch_size: the size of the training batch
        alpha: prioritization exponent; 0 gives uniform sampling, 1 samples
          proportionally to the stored priorities
        beta: exponent of the importance sampling weights

        Returns:
        (indices, states, actions, rewards, next_states, dones, weights) where
        indices are the sampled buffer rows (with replacement), states and
        next_states have shape (batch_size, state_dim) and every other array
        has shape (batch_size,). Weights are normalised so their maximum is 1.

        Raises:
        ValueError: if the buffer is empty.
        """
        if self.size == 0:
            raise ValueError("cannot sample from an empty PriorityBuffer")

        # prioritized sampling: P(i) = p_i^alpha / sum_k p_k^alpha
        scaled_priorities = self.buffer[:self.size, 0] ** alpha
        probs = scaled_priorities / np.sum(scaled_priorities)
        sample_indices = np.random.choice(self.size, batch_size, p=probs )

        # column offsets of each field inside a row
        action_col = self.priority_dim + self.state_dim
        reward_col = action_col + self.action_dim
        next_state_col = reward_col + self.reward_dim
        done_col = next_state_col + self.state_dim

        rows = self.buffer[sample_indices]
        states = rows[:, self.priority_dim:action_col]
        # Scalar fields are selected by column index (not squeeze()) so that a
        # batch of one still yields 1-D arrays of shape (1,).
        actions = rows[:, action_col]
        rewards = rows[:, reward_col]
        next_states = rows[:, next_state_col:done_col]
        dones = rows[:, done_col]

        # importance sampling weights
        weights  = ( ( self.size * probs[sample_indices] ) ** ( -beta ) )
        weights /= np.max(weights)
        return sample_indices, states, actions, rewards, next_states, dones, weights
        

In [29]:
def copy_model_parameters(sess, estimator1, estimator2):
    """
    Copies the model parameters of one estimator to another.

    Variables are paired by sorted name, so both estimators must have been
    built with the same architecture.

    Args:
      sess: Tensorflow session instance
      estimator1: Estimator to copy the parameters from
      estimator2: Estimator to copy the parameters to

    Raises:
      ValueError: if the two scopes hold a different number of trainable
        variables.
    """
    def _scope_variables(scope):
        # Match on "<scope>/" so that e.g. "q_estimator" does not also pick up
        # variables that live under "q_estimator_2".
        prefix = scope if scope.endswith("/") else scope + "/"
        matching = [v for v in tf.trainable_variables() if v.name.startswith(prefix)]
        return sorted(matching, key=lambda v: v.name)

    e1_params = _scope_variables(estimator1.scope)
    e2_params = _scope_variables(estimator2.scope)
    if len(e1_params) != len(e2_params):
        # zip() would silently drop the unmatched tail and leave stale weights.
        raise ValueError(
            "Cannot copy parameters: scope '{}' has {} trainable variables but scope '{}' has {}".format(
                estimator1.scope, len(e1_params), estimator2.scope, len(e2_params)))

    # NOTE: every call adds fresh assign ops to the graph.
    update_ops = [e2_v.assign(e1_v) for e1_v, e2_v in zip(e1_params, e2_params)]
    sess.run(update_ops)

In [30]:
def make_epsilon_greedy_policy(estimator, nA):
    """
    Build an epsilon-greedy policy on top of a Q-value estimator.

    Args:
        estimator: Object whose `predict(sess, states)` returns Q-values for a
            batch of states.
        nA: Number of actions in the environment.

    Returns:
        A function `policy_fn(sess, observation, epsilon)` that returns a numpy
        array of length nA with the probability of selecting each action.
    """
    def policy_fn(sess, observation, epsilon):
        # Evaluate the single observation as a batch of one.
        batched_observation = np.expand_dims(observation, 0)
        greedy_action = np.argmax(estimator.predict(sess, batched_observation))

        # Spread epsilon uniformly, then hand the remaining mass to the greedy action.
        action_probs = np.full(nA, epsilon / nA, dtype=float)
        action_probs[greedy_action] += (1.0 - epsilon)
        return action_probs

    return policy_fn

In [31]:
def deep_q_learning(sess,
                    env,
                    q_estimator,
                    target_estimator,
                    n_steps,
                    replay_memory_init_size=64,
                    replay_memory_size=10000,
                    update_target_estimator_every=100,
                    discount_factor=0.99,
                    batch_size=32):
    """
    Double Q-Learning for off-policy TD control using function approximation
    and prioritized experience replay with importance sampling.
    Finds the optimal greedy policy while following an epsilon-greedy policy.

    This is a generator: it runs `n_steps` environment steps and yields once
    per finished episode.

    Args:
        sess: Tensorflow Session object
        env: OpenAI environment
        q_estimator: Estimator object used for the q values
        target_estimator: Estimator object used for the targets
        n_steps: number of training steps to run for
        replay_memory_init_size: Number of epsilon-greedy experiences to sample
          when initializing the replay memory.
        replay_memory_size: Size of the replay memory
        update_target_estimator_every: Copy parameters from the Q estimator to the
          target estimator every N steps
        discount_factor: Gamma discount factor
        batch_size: Size of batches to sample from the replay memory

    Yields:
        (total_t, stats) after every finished episode, where total_t is the
        running step counter and stats is an EpisodeStats whose episode_lengths
        and episode_rewards arrays cover all episodes finished so far.
    """

    # The replay memory
    replay_memory = PriorityBuffer(replay_memory_size, 1, env.observation_space.shape[0])

    # Keeps track of useful statistics. At most one episode can end per step,
    # so n_steps + 1 slots can never overflow.
    max_episodes = n_steps + 1
    stats = plotting.EpisodeStats(
        episode_lengths=np.zeros(max_episodes),
        episode_rewards=np.zeros(max_episodes))

    # Exponential epsilon decay, held constant after `epsilon_schedule_len` frames.
    epsilon_start = 1.0
    epsilon_final = 0.01
    epsilon_decay = 1000
    epsilon_schedule_len = 10000

    def epsilon_by_frame(frame_idx):
        frame_idx = min(frame_idx, epsilon_schedule_len - 1)
        return epsilon_final + (epsilon_start - epsilon_final) * math.exp(-1. * frame_idx / epsilon_decay)

    # Prioritization exponent and per-episode annealing of the IS exponent.
    priority_alpha = 0.6
    betas = np.linspace(0.4, 1.0, 1000)

    # The policy we're following
    n_actions = env.action_space.n
    policy = make_epsilon_greedy_policy(q_estimator, n_actions)

    # Populate the replay memory with initial experience
    print("Populating replay memory...")

    # Get the current time step
    total_t = sess.run(tf.contrib.framework.get_global_step())

    state = env.reset()
    for _ in range(replay_memory_init_size):
        epsilon = epsilon_by_frame(total_t)
        # Choose the action from the state the environment is actually in.
        action = np.random.choice(range(n_actions), p=policy(sess, state, epsilon))
        next_state, reward, done, _info = env.step(action)
        replay_memory.add(state, action, reward, next_state, done)
        # Always advance `state` so the training loop starts from the live state.
        state = env.reset() if done else next_state
        total_t += 1

    print ('Finished Init Repo Memory')

    # Start with the target network equal to the online network instead of
    # leaving it at an unrelated random initialisation until the first
    # periodic copy.
    copy_model_parameters(sess, q_estimator, target_estimator)

    i_episode = 0
    episode_t = 0
    for step in range(n_steps):

        epsilon = epsilon_by_frame(total_t)

        if total_t % update_target_estimator_every == 0:
            copy_model_parameters(sess, q_estimator, target_estimator)

        action = np.random.choice(range(n_actions), p=policy(sess, state, epsilon))
        next_state, reward, done, _info = env.step(action)

        replay_memory.add(state, action, reward, next_state, done)

        beta = betas[min(len(betas) - 1, i_episode)]
        buffer_ptrs, minibatch_states, minibatch_actions, minibatch_rewards, minibatch_next_states, minibatch_dones, minibatch_weights = replay_memory.sample(batch_size, priority_alpha, beta)

        # ****DOUBLE DQN COMES HERE****
        # decoupling action selection (using q_estimator) from action evaluation (using target_estimator)
        max_actions = np.argmax(q_estimator.predict(sess, minibatch_next_states), axis=1)
        next_q_values = target_estimator.predict(sess, minibatch_next_states)[np.arange(len(max_actions)), max_actions]
        # Terminal transitions do not bootstrap from the next state.
        targets = minibatch_rewards + (1 - minibatch_dones) * discount_factor * next_q_values

        # use loss and do backprop update on neural network estimator
        q_estimator.update(sess, minibatch_states, minibatch_actions, targets, minibatch_weights, replay_memory, buffer_ptrs)

        episode_t += 1
        stats.episode_rewards[i_episode] += reward
        stats.episode_lengths[i_episode] = episode_t

        total_t += 1

        if done:
            # Add summaries to tensorboard
            episode_summary = tf.Summary()
            episode_summary.value.add(simple_value=stats.episode_rewards[i_episode], node_name="episode_reward", tag="episode_reward")
            episode_summary.value.add(simple_value=stats.episode_lengths[i_episode], node_name="episode_length", tag="episode_length")
            q_estimator.summary_writer.add_summary(episode_summary, total_t)
            q_estimator.summary_writer.flush()
            yield total_t, plotting.EpisodeStats(
                episode_lengths=stats.episode_lengths[:i_episode+1],
                episode_rewards=stats.episode_rewards[:i_episode+1])
            i_episode += 1
            episode_t = 0
            # Reset the environment
            state = env.reset()
        else:
            state = next_state

    return stats

In [32]:
import shutil
from datetime import datetime
now = datetime.now()

# Start from an empty graph so re-running this cell does not collide with
# variables created by a previous run.
tf.reset_default_graph()

experiment_dir_q = os.path.abspath('./experiments')
experiment_dir_target = os.path.abspath('./experiments_target')

# Looked up by the estimators via tf.contrib.framework.get_global_step().
global_step = tf.Variable(0, name="global_step", trainable=False)


# Run it!
with tf.Session() as sess:
    # Create estimators
    q_estimator = Estimator(sess, env.observation_space.shape[0], env.action_space.n, scope='q_estimator', summaries_dir=experiment_dir_q, deulling=True)
    target_estimator = Estimator(sess, env.observation_space.shape[0], env.action_space.n, scope='target_estimator', summaries_dir=experiment_dir_target, deulling=True)
    # tf.initialize_all_variables() is deprecated in favour of
    # tf.global_variables_initializer().
    sess.run(tf.global_variables_initializer())
    # deep_q_learning is a generator that yields once per finished episode.
    for t, stats in deep_q_learning(sess,
                        env,
                        q_estimator=q_estimator,
                        target_estimator=target_estimator,
                        n_steps=10000,
                        replay_memory_size=1000,
                        replay_memory_init_size=64,
                        update_target_estimator_every=100,
                        discount_factor=0.99,
                        batch_size=32):

        print("\nEpisode Reward: {}".format(stats.episode_rewards[-1]))


Populating replay memory...
Finished Init Repo Memory

Episode Reward: 11.0

Episode Reward: 15.0

Episode Reward: 23.0

Episode Reward: 15.0

Episode Reward: 13.0

Episode Reward: 19.0

Episode Reward: 19.0

Episode Reward: 17.0

Episode Reward: 14.0

Episode Reward: 21.0

Episode Reward: 45.0

Episode Reward: 17.0

Episode Reward: 11.0

Episode Reward: 11.0

Episode Reward: 9.0

Episode Reward: 23.0

Episode Reward: 22.0

Episode Reward: 11.0

Episode Reward: 17.0

Episode Reward: 17.0

Episode Reward: 23.0

Episode Reward: 35.0

Episode Reward: 18.0

Episode Reward: 12.0

Episode Reward: 19.0

Episode Reward: 28.0

Episode Reward: 40.0

Episode Reward: 46.0

Episode Reward: 44.0

Episode Reward: 14.0

Episode Reward: 80.0

Episode Reward: 10.0

Episode Reward: 10.0

Episode Reward: 40.0

Episode Reward: 13.0

Episode Reward: 12.0

Episode Reward: 12.0

Episode Reward: 103.0

Episode Reward: 35.0

Episode Reward: 42.0

Episode Reward: 200.0

Episode Reward: 88.0

Episode Reward: 127.