## Instructions
Read the README for instructions to create the conda environment and
create a ipykernel to use that environment in a Jupyter notebook.

### DON'T RUN THE TRAINING LOCALLY. IT'LL TAKE TOO LONG
[Here is a link to a colab notebook](https://colab.research.google.com/drive/1ov-ARfduhLPm-hUbw90GvUV28XLe6_w6?usp=sharing) if you want to train on one of their gpu's
Then you can import the .h5 file and just play. The models are built the same

##### Start off with all of our imports

In [1]:
# Disable TensorFlow Warnings(Because I don't like seeing them)
import tensorflow as tf
tf.logging.set_verbosity(tf.logging.ERROR)

  return f(*args, **kwds)


In [2]:
# Imports for the whole notebook
import gym
import numpy as np

from rl.agents import DQNAgent
from rl.memory import SequentialMemory
from rl.policy import EpsGreedyQPolicy, GreedyQPolicy, LinearAnnealedPolicy

from keras.models import Sequential
from keras.layers import Dense, Flatten, Conv2D, Reshape, MaxPooling2D
from keras.optimizers import Adam

Using TensorFlow backend.


##### In this section, we're going to create both train and play environments and set some "global" (to this notebook) variables

In [3]:
ATARI_ENV = 'Breakout-v4'
train_env = gym.make(ATARI_ENV)
np.random.seed(42)
train_env.seed(42)

play_env = gym.make(ATARI_ENV)
np.random.seed(42)
play_env.seed(42)

[42, 742738649]

In [4]:
nb_train_actions = train_env.action_space.n
nb_play_actions = play_env.action_space.n

In [5]:
# These are basically all of the variables we can play with to try to get it
# to learn better
nb_steps_fit = 750
nb_steps_warmup = nb_steps_fit / 3
update = 0.1
epsilon = 0.25
learning_rate = 1e-3
window_size = 3

In [6]:
train_model = Sequential()
'''
There were some real shenanigans here. The train_dqn.fit was outputing
a tensor of shape (1, 3, 210, 160, 3) and I had no idea what the 1 was
The other shapes were:    3 = num images
                          210 = image height
                          160 = image width
                          3 = color channels
So, I attempted to stack by width to get it to a shape that the Conv2D layer
would accept. Same thing for the play_model below.
'''
train_model.add(Reshape((210, 160 * window_size, 3), input_shape=(window_size, 210, 160, 3)))
train_model.add(Conv2D(filters=(32), kernel_size=(3, 3), activation='relu'))
train_model.add(MaxPooling2D((2, 2)))
train_model.add(Conv2D(filters=(64), kernel_size=(3, 3), activation='relu'))
train_model.add(MaxPooling2D((2, 2)))
train_model.add(MaxPooling2D((2, 2)))
# train_model.add(Conv2D(filters=(128), kernel_size=(3, 3), activation='relu'))
train_model.add(Flatten())
train_model.add(Dense(128, activation='relu'))
train_model.add(Dense(256, activation='relu'))
train_model.add(Dense(nb_train_actions, activation='linear'))

train_model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
reshape_1 (Reshape)          (None, 210, 480, 3)       0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 208, 478, 32)      896       
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 104, 239, 32)      0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 102, 237, 64)      18496     
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 51, 118, 64)       0         
_________________________________________________________________
max_pooling2d_3 (MaxPooling2 (None, 25, 59, 64)        0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 94400)            

In [7]:
memory = SequentialMemory(limit=500, window_length=window_size)
policy = LinearAnnealedPolicy(EpsGreedyQPolicy(eps=epsilon), attr='eps', value_max=1., value_min=.1, value_test=.05,
                              nb_steps=100)
train_dqn = DQNAgent(model=train_model, nb_actions=nb_train_actions,
                     memory=memory, nb_steps_warmup=nb_steps_warmup,
                     target_model_update=update, policy=policy)
train_dqn.compile(Adam(lr=learning_rate), metrics=['mae'])


In [8]:
fit = train_dqn.fit(train_env, nb_steps=nb_steps_fit, visualize=False, verbose=2)


Training for 750 steps ...
 354/750: episode: 1, duration: 268.033s, episode steps: 354, steps per second: 1, episode reward: 2.000, mean reward: 0.006 [0.000, 1.000], mean action: 1.771 [0.000, 3.000], mean observation: 40.430 [0.000, 200.000], loss: 3957.981125, mean_absolute_error: 61.421241, mean_q: -2.893220, mean_eps: 0.100000
 667/750: episode: 2, duration: 786.773s, episode steps: 313, steps per second: 0, episode reward: 2.000, mean reward: 0.006 [0.000, 1.000], mean action: 1.380 [0.000, 3.000], mean observation: 40.527 [0.000, 200.000], loss: 1.242730, mean_absolute_error: 18.724601, mean_q: -23.159751, mean_eps: 0.100000
done, took 1240.089 seconds


In [9]:
train_dqn.save_weights('policy.h5', overwrite=True)

## Play Environment

In [10]:
play_model = Sequential()

play_model.add(Reshape((210, 160 * window_size, 3), input_shape=(window_size, 210, 160, 3)))
play_model.add(Conv2D(filters=(32), kernel_size=(3, 3), activation='relu'))
play_model.add(MaxPooling2D((2, 2)))
play_model.add(Conv2D(filters=(64), kernel_size=(3, 3), activation='relu'))
play_model.add(MaxPooling2D((2, 2)))
play_model.add(MaxPooling2D((2, 2)))
# play_model.add(Conv2D(filters=(128), kernel_size=(3, 3), activation='relu'))
play_model.add(Flatten())
play_model.add(Dense(128, activation='relu'))
play_model.add(Dense(256, activation='relu'))
play_model.add(Dense(nb_train_actions, activation='linear'))

play_model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
reshape_2 (Reshape)          (None, 210, 480, 3)       0         
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 208, 478, 32)      896       
_________________________________________________________________
max_pooling2d_4 (MaxPooling2 (None, 104, 239, 32)      0         
_________________________________________________________________
conv2d_4 (Conv2D)            (None, 102, 237, 64)      18496     
_________________________________________________________________
max_pooling2d_5 (MaxPooling2 (None, 51, 118, 64)       0         
_________________________________________________________________
max_pooling2d_6 (MaxPooling2 (None, 25, 59, 64)        0         
_________________________________________________________________
flatten_2 (Flatten)          (None, 94400)            

In [11]:
memory = SequentialMemory(limit=1000, window_length=window_size)
policy = GreedyQPolicy()
play_dqn = DQNAgent(model=play_model, nb_actions=nb_play_actions,
                    memory=memory, nb_steps_warmup=nb_steps_warmup,
                    target_model_update=update, policy=policy)
play_dqn.compile(Adam(lr=learning_rate), metrics=['mae'])

In [12]:
play_dqn.load_weights('./policy.h5')

In [14]:
play_dqn.test(play_env, nb_episodes=5, visualize=True,
              nb_max_episode_steps=500)

Testing for 5 episodes ...


Episode 1: reward: 2.000, steps: 500
Episode 2: reward: 0.000, steps: 500
Episode 3: reward: 0.000, steps: 500
Episode 4: reward: 0.000, steps: 500
Episode 5: reward: 0.000, steps: 500


<keras.callbacks.History at 0x7f6f3c957a58>