In [1]:
%matplotlib inline

import gym
from gym.wrappers import Monitor
import itertools
import numpy as np
import os
import random
import sys
import tensorflow as tf

if "../" not in sys.path:
  sys.path.append("../")

import plotting
from collections import deque, namedtuple

In [2]:
env = gym.envs.make("Breakout-v0")

[2017-04-21 15:51:54,192] Making new env: Breakout-v0


In [3]:
# Atari Actions: 0 (noop), 1 (fire), 2 (left) and 3 (right) are valid actions
VALID_ACTIONS = [0, 1, 2, 3]

In [4]:
class StateProcessor():
    """
    Processes a raw Atari iamges. Resizes it and converts it to grayscale.
    """
    def __init__(self):
        # Build the Tensorflow graph
        with tf.variable_scope("state_processor"):
            self.input_state = tf.placeholder(shape=[210, 160, 3], dtype=tf.uint8)
            self.output = tf.image.rgb_to_grayscale(self.input_state)
            self.output = tf.image.crop_to_bounding_box(self.output, 34, 0, 160, 160)
            self.output = tf.image.resize_images(
                self.output, [84, 84], method=tf.image.ResizeMethod.NEAREST_NEIGHBOR)
            self.output = tf.squeeze(self.output)

    def process(self, sess, state):
        """
        Args:
            sess: A Tensorflow session object
            state: A [210, 160, 3] Atari RGB State

        Returns:
            A processed [84, 84, 1] state representing grayscale values.
        """
        return sess.run(self.output, { self.input_state: state })

In [5]:
class Estimator():
    """Q-Value Estimator neural network.

    This network is used for both the Q-Network and the Target Network.
    """

    def __init__(self, scope="estimator", summaries_dir=None):
        self.scope = scope
        # Writes Tensorboard summaries to disk
        self.summary_writer = None
        with tf.variable_scope(scope):
            # Build the graph
            self._build_model()
            if summaries_dir:
                summary_dir = os.path.join(summaries_dir, "summaries_{}".format(scope))
                if not os.path.exists(summary_dir):
                    os.makedirs(summary_dir)
                self.summary_writer = tf.summary.FileWriter(summary_dir)

    def _build_model(self):
        """
        Builds the Tensorflow graph.
        """

        # Placeholders for our input
        # Our input are 4 RGB frames of shape 160, 160 each
        self.X_pl = tf.placeholder(shape=[None, 84, 84, 4], dtype=tf.uint8, name="X")
        # The TD target value
        self.y_pl = tf.placeholder(shape=[None], dtype=tf.float32, name="y")
        # Integer id of which action was selected
        self.actions_pl = tf.placeholder(shape=[None], dtype=tf.int32, name="actions")

        X = tf.to_float(self.X_pl) / 255.0
        batch_size = tf.shape(self.X_pl)[0]

        # Three convolutional layers
        conv1 = tf.contrib.layers.conv2d(
            X, 32, 8, 4, activation_fn=tf.nn.relu)
        conv2 = tf.contrib.layers.conv2d(
            conv1, 64, 4, 2, activation_fn=tf.nn.relu)
        conv3 = tf.contrib.layers.conv2d(
            conv2, 64, 3, 1, activation_fn=tf.nn.relu)

        # Fully connected layers
        flattened = tf.contrib.layers.flatten(conv3)
        fc1 = tf.contrib.layers.fully_connected(flattened, 512)
        self.predictions = tf.contrib.layers.fully_connected(fc1, len(VALID_ACTIONS))

        # Get the predictions for the chosen actions only
        gather_indices = tf.range(batch_size) * tf.shape(self.predictions)[1] + self.actions_pl
        self.action_predictions = tf.gather(tf.reshape(self.predictions, [-1]), gather_indices)

        # Calcualte the loss
        self.losses = tf.squared_difference(self.y_pl, self.action_predictions)
        self.loss = tf.reduce_mean(self.losses)

        # Optimizer Parameters from original paper
        self.optimizer = tf.train.RMSPropOptimizer(0.00025, 0.99, 0.0, 1e-6)
        self.train_op = self.optimizer.minimize(self.loss, global_step=tf.contrib.framework.get_global_step())

        # Summaries for Tensorboard
        self.summaries = tf.summary.merge([
            tf.summary.scalar("loss", self.loss),
            tf.summary.histogram("loss_hist", self.losses),
            tf.summary.histogram("q_values_hist", self.predictions),
            tf.summary.scalar("max_q_value", tf.reduce_max(self.predictions))
        ])

    def predict(self, sess, s):
        """
        Predicts action values.

        Args:
          sess: Tensorflow session
          s: State input of shape [batch_size, 4, 160, 160, 3]

        Returns:
          Tensor of shape [batch_size, NUM_VALID_ACTIONS] containing the estimated 
          action values.
        """
        return sess.run(self.predictions, { self.X_pl: s })

    def update(self, sess, s, a, y):
        """
        Updates the estimator towards the given targets.

        Args:
          sess: Tensorflow session object
          s: State input of shape [batch_size, 4, 160, 160, 3]
          a: Chosen actions of shape [batch_size]
          y: Targets of shape [batch_size]

        Returns:
          The calculated loss on the batch.
        """
        feed_dict = { self.X_pl: s, self.y_pl: y, self.actions_pl: a }
        summaries, global_step, _, loss = sess.run(
            [self.summaries, tf.contrib.framework.get_global_step(), self.train_op, self.loss],
            feed_dict)
        if self.summary_writer:
            self.summary_writer.add_summary(summaries, global_step)
        return loss

In [6]:
def copy_model_parameters(sess, estimator1, estimator2):
    """
    Copies the model parameters of one estimator to another.

    Args:
      sess: Tensorflow session instance
      estimator1: Estimator to copy the paramters from
      estimator2: Estimator to copy the parameters to
    """
    e1_params = [t for t in tf.trainable_variables() if t.name.startswith(estimator1.scope)]
    e1_params = sorted(e1_params, key=lambda v: v.name)
    e2_params = [t for t in tf.trainable_variables() if t.name.startswith(estimator2.scope)]
    e2_params = sorted(e2_params, key=lambda v: v.name)

    update_ops = []
    for e1_v, e2_v in zip(e1_params, e2_params):
        op = e2_v.assign(e1_v)
        update_ops.append(op)

    sess.run(update_ops)

In [7]:
def make_epsilon_greedy_policy(estimator, nA):
    """
    Creates an epsilon-greedy policy based on a given Q-function approximator and epsilon.

    Args:
        estimator: An estimator that returns q values for a given state
        nA: Number of actions in the environment.

    Returns:
        A function that takes the (sess, observation, epsilon) as an argument and returns
        the probabilities for each action in the form of a numpy array of length nA.

    """
    def policy_fn(sess, observation, epsilon):
        A = np.ones(nA, dtype=float) * epsilon / nA
        q_values = estimator.predict(sess, np.expand_dims(observation, 0))[0]
        best_action = np.argmax(q_values)
        A[best_action] += (1.0 - epsilon)
        return A
    return policy_fn

In [8]:
def deep_q_learning(sess,
                    env,
                    q_estimator,
                    target_estimator,
                    state_processor,
                    num_episodes,
                    experiment_dir,
                    replay_memory_size=500000,
                    replay_memory_init_size=50000,
                    update_target_estimator_every=10000,
                    discount_factor=0.99,
                    epsilon_start=1.0,
                    epsilon_end=0.1,
                    epsilon_decay_steps=500000,
                    batch_size=32,
                    record_video_every=50):
    """
    Q-Learning algorithm for fff-policy TD control using Function Approximation.
    Finds the optimal greedy policy while following an epsilon-greedy policy.

    Args:
        sess: Tensorflow Session object
        env: OpenAI environment
        q_estimator: Estimator object used for the q values
        target_estimator: Estimator object used for the targets
        state_processor: A StateProcessor object
        num_episodes: Number of episodes to run for
        experiment_dir: Directory to save Tensorflow summaries in
        replay_memory_size: Size of the replay memory
        replay_memory_init_size: Number of random experiences to sampel when initializing 
          the reply memory.
        update_target_estimator_every: Copy parameters from the Q estimator to the 
          target estimator every N steps
        discount_factor: Lambda time discount factor
        epsilon_start: Chance to sample a random action when taking an action.
          Epsilon is decayed over time and this is the start value
        epsilon_end: The final minimum value of epsilon after decaying is done
        epsilon_decay_steps: Number of steps to decay epsilon over
        batch_size: Size of batches to sample from the replay memory
        record_video_every: Record a video every N episodes

    Returns:
        An EpisodeStats object with two numpy arrays for episode_lengths and episode_rewards.
    """

    Transition = namedtuple("Transition", ["state", "action", "reward", "next_state", "done"])

    # The replay memory
    replay_memory = []

    # Keeps track of useful statistics
    stats = plotting.EpisodeStats(
        episode_lengths=np.zeros(num_episodes),
        episode_rewards=np.zeros(num_episodes))

    # Create directories for checkpoints and summaries
    checkpoint_dir = os.path.join(experiment_dir, "checkpoints")
    checkpoint_path = os.path.join(checkpoint_dir, "model")
    monitor_path = os.path.join(experiment_dir, "monitor")
    
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
    if not os.path.exists(monitor_path):
        os.makedirs(monitor_path)

    saver = tf.train.Saver()
    # Load a previous checkpoint if we find one
    latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
    if latest_checkpoint:
        print("Loading model checkpoint {}...\n".format(latest_checkpoint))
        saver.restore(sess, latest_checkpoint)
    
    # Get the current time step
    total_t = sess.run(tf.contrib.framework.get_global_step())

    # The epsilon decay schedule
    epsilons = np.linspace(epsilon_start, epsilon_end, epsilon_decay_steps)

    # The policy we're following
    policy = make_epsilon_greedy_policy(
        q_estimator,
        len(VALID_ACTIONS))

    # Populate the replay memory with initial experience
    print("Populating replay memory...")
    state = env.reset()
    state = state_processor.process(sess, state)
    state = np.stack([state] * 4, axis=2)
    for i in range(replay_memory_init_size):
        action_probs = policy(sess, state, epsilons[min(total_t, epsilon_decay_steps-1)])
        action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
        next_state, reward, done, _ = env.step(VALID_ACTIONS[action])
        next_state = state_processor.process(sess, next_state)
        next_state = np.append(state[:,:,1:], np.expand_dims(next_state, 2), axis=2)
        replay_memory.append(Transition(state, action, reward, next_state, done))
        if done:
            state = env.reset()
            state = state_processor.process(sess, state)
            state = np.stack([state] * 4, axis=2)
        else:
            state = next_state


    # Record videos
    # Add env Monitor wrapper
    env = Monitor(env, directory=monitor_path, video_callable=lambda count: count % record_video_every == 0, resume=True)

    for i_episode in range(num_episodes):

        # Save the current checkpoint
        saver.save(tf.get_default_session(), checkpoint_path)

        # Reset the environment
        state = env.reset()
        state = state_processor.process(sess, state)
        state = np.stack([state] * 4, axis=2)
        loss = None

        # One step in the environment
        for t in itertools.count():

            # Epsilon for this time step
            epsilon = epsilons[min(total_t, epsilon_decay_steps-1)]

            # Add epsilon to Tensorboard
            episode_summary = tf.Summary()
            episode_summary.value.add(simple_value=epsilon, tag="epsilon")
            q_estimator.summary_writer.add_summary(episode_summary, total_t)

            # Maybe update the target estimator
            if total_t % update_target_estimator_every == 0:
                copy_model_parameters(sess, q_estimator, target_estimator)
                print("\nCopied model parameters to target network.")

            # Print out which step we're on, useful for debugging.
            print("\rStep {} ({}) @ Episode {}/{}, loss: {}".format(
                    t, total_t, i_episode + 1, num_episodes, loss), end="")
            sys.stdout.flush()

            # Take a step
            action_probs = policy(sess, state, epsilon)
            action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
            next_state, reward, done, _ = env.step(VALID_ACTIONS[action])
            next_state = state_processor.process(sess, next_state)
            next_state = np.append(state[:,:,1:], np.expand_dims(next_state, 2), axis=2)

            # If our replay memory is full, pop the first element
            if len(replay_memory) == replay_memory_size:
                replay_memory.pop(0)

            # Save transition to replay memory
            replay_memory.append(Transition(state, action, reward, next_state, done))   

            # Update statistics
            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t

            # Sample a minibatch from the replay memory
            samples = random.sample(replay_memory, batch_size)
            states_batch, action_batch, reward_batch, next_states_batch, done_batch = map(np.array, zip(*samples))

            # Calculate q values and targets
            q_values_next = target_estimator.predict(sess, next_states_batch)
            targets_batch = reward_batch + np.invert(done_batch).astype(np.float32) * discount_factor * np.amax(q_values_next, axis=1)

            # Perform gradient descent update
            states_batch = np.array(states_batch)
            loss = q_estimator.update(sess, states_batch, action_batch, targets_batch)

            if done:
                break

            state = next_state
            total_t += 1

        # Add summaries to tensorboard
        episode_summary = tf.Summary()
        episode_summary.value.add(simple_value=stats.episode_rewards[i_episode], node_name="episode_reward", tag="episode_reward")
        episode_summary.value.add(simple_value=stats.episode_lengths[i_episode], node_name="episode_length", tag="episode_length")
        q_estimator.summary_writer.add_summary(episode_summary, total_t)
        q_estimator.summary_writer.flush()

        yield total_t, plotting.EpisodeStats(
            episode_lengths=stats.episode_lengths[:i_episode+1],
            episode_rewards=stats.episode_rewards[:i_episode+1])

    return stats

In [9]:
tf.reset_default_graph()

# Where we save our checkpoints and graphs
experiment_dir = os.path.abspath("./experiments/{}".format(env.spec.id))

# Create a glboal step variable
global_step = tf.Variable(0, name='global_step', trainable=False)
    
# Create estimators
q_estimator = Estimator(scope="q", summaries_dir=experiment_dir)
target_estimator = Estimator(scope="target_q")

# State processor
state_processor = StateProcessor()

# Run it!
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for t, stats in deep_q_learning(sess,
                                    env,
                                    q_estimator=q_estimator,
                                    target_estimator=target_estimator,
                                    state_processor=state_processor,
                                    experiment_dir=experiment_dir,
                                    num_episodes=10000,
                                    replay_memory_size=500000,
                                    replay_memory_init_size=50000,
                                    update_target_estimator_every=10000,
                                    epsilon_start=1.0,
                                    epsilon_end=0.1,
                                    epsilon_decay_steps=5000,
                                    discount_factor=0.99,
                                    batch_size=32,
                                    record_video_every=25):

        print("\nEpisode Reward: {}".format(stats.episode_rewards[-1]))

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Loading model checkpoint C:\Users\Brad Pospeck\Desktop\Classes\Senior II\CS480\Project\experiments\Breakout-v0\checkpoints\model...

Populating replay memory...


[2017-04-21 16:00:32,218] Starting new video recorder writing to C:\Users\Brad Pospeck\Desktop\Classes\Senior II\CS480\Project\experiments\Breakout-v0\monitor\openaigym.video.0.13868.video000000.mp4


Step 747 (537008) @ Episode 1/10000, loss: 0.0013130886945873546
Episode Reward: 9.0
Step 1193 (538201) @ Episode 2/10000, loss: 0.0039718081243336274
Episode Reward: 28.0
Step 638 (538839) @ Episode 3/10000, loss: 0.0074564456008374694
Episode Reward: 8.0
Step 978 (539817) @ Episode 4/10000, loss: 0.0026686000637710094
Episode Reward: 21.0
Step 182 (539999) @ Episode 5/10000, loss: 0.0017242176691070242
Copied model parameters to target network.
Step 1070 (540887) @ Episode 5/10000, loss: 0.0031045754440128803
Episode Reward: 26.0
Step 935 (541822) @ Episode 6/10000, loss: 0.0032195870298892265
Episode Reward: 14.0
Step 1707 (543529) @ Episode 7/10000, loss: 0.0018776498036459088
Episode Reward: 38.0
Step 714 (544243) @ Episode 8/10000, loss: 0.0263510756194591522
Episode Reward: 13.0
Step 843 (545086) @ Episode 9/10000, loss: 0.0015223659574985504
Episode Reward: 14.0
Step 967 (546053) @ Episode 10/10000, loss: 0.0029497803188860416
Episode Reward: 16.0
Step 913 (546966) @ Episode 11

[2017-04-22 00:54:38,273] Starting new video recorder writing to C:\Users\Brad Pospeck\Desktop\Classes\Senior II\CS480\Project\experiments\Breakout-v0\monitor\openaigym.video.0.13868.video000025.mp4


Step 1110 (561961) @ Episode 26/10000, loss: 0.0020231464877724648
Episode Reward: 23.0
Step 817 (562778) @ Episode 27/10000, loss: 0.0023309942334890366
Episode Reward: 14.0
Step 931 (563709) @ Episode 28/10000, loss: 0.0015298910439014435
Episode Reward: 16.0
Step 1149 (564858) @ Episode 29/10000, loss: 0.0037581166252493865
Episode Reward: 24.0
Step 1387 (566245) @ Episode 30/10000, loss: 0.0078518465161323556
Episode Reward: 30.0
Step 1171 (567416) @ Episode 31/10000, loss: 0.0023411428555846214
Episode Reward: 25.0
Step 882 (568298) @ Episode 32/10000, loss: 0.0008527461905032396
Episode Reward: 14.0
Step 852 (569150) @ Episode 33/10000, loss: 0.00214619701728224756
Episode Reward: 14.0
Step 849 (569999) @ Episode 34/10000, loss: 0.00356583157554268843
Copied model parameters to target network.
Step 919 (570069) @ Episode 34/10000, loss: 0.0089018382132053387
Episode Reward: 16.0
Step 1137 (571206) @ Episode 35/10000, loss: 0.0177932623773813256
Episode Reward: 23.0
Step 1085 (572

[2017-04-22 03:29:48,233] Starting new video recorder writing to C:\Users\Brad Pospeck\Desktop\Classes\Senior II\CS480\Project\experiments\Breakout-v0\monitor\openaigym.video.0.13868.video000050.mp4


Step 584 (586816) @ Episode 51/10000, loss: 0.0024636820890009403
Episode Reward: 9.0
Step 1435 (588251) @ Episode 52/10000, loss: 0.0027625896036624916
Episode Reward: 32.0
Step 767 (589018) @ Episode 53/10000, loss: 0.0033588022924959666
Episode Reward: 10.0
Step 963 (589981) @ Episode 54/10000, loss: 0.0016912777209654454
Episode Reward: 21.0
Step 18 (589999) @ Episode 55/10000, loss: 0.0043867975473403933
Copied model parameters to target network.
Step 833 (590814) @ Episode 55/10000, loss: 0.0037047625519335275
Episode Reward: 16.0
Step 1000 (591814) @ Episode 56/10000, loss: 0.0027993228286504745
Episode Reward: 20.0
Step 748 (592562) @ Episode 57/10000, loss: 0.0020372213330119854
Episode Reward: 19.0
Step 805 (593367) @ Episode 58/10000, loss: 0.0017923295963555574
Episode Reward: 21.0
Step 906 (594273) @ Episode 59/10000, loss: 0.0041341111063957214
Episode Reward: 15.0
Step 839 (595112) @ Episode 60/10000, loss: 0.0276703368872404153
Episode Reward: 20.0
Step 1068 (596180) @ 

[2017-04-22 05:43:22,037] Starting new video recorder writing to C:\Users\Brad Pospeck\Desktop\Classes\Senior II\CS480\Project\experiments\Breakout-v0\monitor\openaigym.video.0.13868.video000075.mp4


Step 623 (608532) @ Episode 76/10000, loss: 0.0037103537470102317
Episode Reward: 9.0
Step 740 (609272) @ Episode 77/10000, loss: 0.0024918932467699053
Episode Reward: 14.0
Step 727 (609999) @ Episode 78/10000, loss: 0.0027835727669298653
Copied model parameters to target network.
Step 1049 (610321) @ Episode 78/10000, loss: 0.0031819832511246204
Episode Reward: 22.0
Step 644 (610965) @ Episode 79/10000, loss: 0.0148785803467035357
Episode Reward: 12.0
Step 1134 (612099) @ Episode 80/10000, loss: 0.0039373808540403842
Episode Reward: 27.0
Step 1306 (613405) @ Episode 81/10000, loss: 0.0021230592392385006
Episode Reward: 31.0
Step 582 (613987) @ Episode 82/10000, loss: 0.0039422130212187775
Episode Reward: 8.0
Step 924 (614911) @ Episode 83/10000, loss: 0.0039977175183594233
Episode Reward: 15.0
Step 1067 (615978) @ Episode 84/10000, loss: 0.0040218681097030647
Episode Reward: 15.0
Step 771 (616749) @ Episode 85/10000, loss: 0.0035931358579546213
Episode Reward: 14.0
Step 944 (617693) @

[2017-04-22 07:50:43,492] Starting new video recorder writing to C:\Users\Brad Pospeck\Desktop\Classes\Senior II\CS480\Project\experiments\Breakout-v0\monitor\openaigym.video.0.13868.video000100.mp4


Step 848 (629119) @ Episode 101/10000, loss: 0.0017954665236175069
Episode Reward: 17.0
Step 340 (629459) @ Episode 102/10000, loss: 0.0019484555814415216
Episode Reward: 4.0
Step 540 (629999) @ Episode 103/10000, loss: 0.0030596330761909485
Copied model parameters to target network.
Step 923 (630382) @ Episode 103/10000, loss: 0.0059623103588819554
Episode Reward: 19.0
Step 916 (631298) @ Episode 104/10000, loss: 0.0086729293689131746
Episode Reward: 14.0
Step 869 (632167) @ Episode 105/10000, loss: 0.0068852454423904426
Episode Reward: 18.0
Step 685 (632852) @ Episode 106/10000, loss: 0.0052137495949864395
Episode Reward: 11.0
Step 1123 (633975) @ Episode 107/10000, loss: 0.0028420747257769108
Episode Reward: 18.0
Step 1304 (635279) @ Episode 108/10000, loss: 0.0019181842217221856
Episode Reward: 45.0
Step 783 (636062) @ Episode 109/10000, loss: 0.0026263399049639784
Episode Reward: 12.0
Step 784 (636846) @ Episode 110/10000, loss: 0.0032066460698843002
Episode Reward: 17.0
Step 944 

[2017-04-22 15:28:45,976] Starting new video recorder writing to C:\Users\Brad Pospeck\Desktop\Classes\Senior II\CS480\Project\experiments\Breakout-v0\monitor\openaigym.video.0.13868.video000125.mp4


Step 1026 (651139) @ Episode 126/10000, loss: 0.0237857326865196238
Episode Reward: 15.0
Step 1174 (652313) @ Episode 127/10000, loss: 0.0026527259033173323
Episode Reward: 21.0
Step 850 (653163) @ Episode 128/10000, loss: 0.0067873364314436917
Episode Reward: 21.0
Step 718 (653881) @ Episode 129/10000, loss: 0.0030155994463711977
Episode Reward: 11.0
Step 923 (654804) @ Episode 130/10000, loss: 0.0160769317299127585
Episode Reward: 19.0
Step 392 (655196) @ Episode 131/10000, loss: 0.0025855023413896563
Episode Reward: 4.0
Step 1089 (656285) @ Episode 132/10000, loss: 0.0189290046691894532
Episode Reward: 19.0
Step 955 (657240) @ Episode 133/10000, loss: 0.0048877447843551645
Episode Reward: 18.0
Step 834 (658074) @ Episode 134/10000, loss: 0.0048808818683028224
Episode Reward: 16.0
Step 826 (658900) @ Episode 135/10000, loss: 0.0046622594818472862
Episode Reward: 17.0
Step 863 (659763) @ Episode 136/10000, loss: 0.0039520068094134335
Episode Reward: 18.0
Step 236 (659999) @ Episode 13

[2017-04-22 18:00:39,509] Starting new video recorder writing to C:\Users\Brad Pospeck\Desktop\Classes\Senior II\CS480\Project\experiments\Breakout-v0\monitor\openaigym.video.0.13868.video000150.mp4


Step 818 (671835) @ Episode 151/10000, loss: 0.0070923282764852058
Episode Reward: 13.0
Step 27 (671862) @ Episode 152/10000, loss: 0.0026051653549075127

KeyboardInterrupt: 