# Imports

In [1]:
import gym
import tensorflow as tf
from tensorflow import keras
import random
import numpy as np
import datetime as dt
import imageio
from tensorflow import summary

# Initial Parameters

In [2]:
MAX_EPSILON = 1
MIN_EPSILON = 0.1
EPSILON_MIN_ITER = 500000
GAMMA = 0.99
BATCH_SIZE = 32
TAU = 0.08
POST_PROCESS_IMAGE_SIZE = (105, 80, 1)
DELAY_TRAINING = 50000
NUM_FRAMES = 4
GIF_RECORDING_FREQ = 100

env = gym.make("SpaceInvaders-v0")
num_actions = env.action_space.n

# Model

In [3]:
class DQModel(keras.Model):
    def __init__(self, hidden_size: int, num_actions: int, dueling: bool):
        super(DQModel, self).__init__()
        self.dueling = dueling
        self.conv1 = keras.layers.Conv2D(16, (8, 8), (4, 4), activation='relu')
        self.conv2 = keras.layers.Conv2D(32, (4, 4), (2, 2), activation='relu')
        self.flatten = keras.layers.Flatten()
        self.adv_dense = keras.layers.Dense(hidden_size, activation='relu', kernel_initializer=keras.initializers.he_normal())
        self.adv_out = keras.layers.Dense(num_actions, kernel_initializer=keras.initializers.he_normal())
        if dueling:
            self.v_dense = keras.layers.Dense(hidden_size, activation='relu', kernel_initializer=keras.initializers.he_normal())
            self.v_out = keras.layers.Dense(1, kernel_initializer=keras.initializers.he_normal())
            self.lambda_layer = keras.layers.Lambda(lambda x: x - tf.reduce_mean(x))
            self.combine = keras.layers.Add()

    def call(self, input):
        x = self.conv1(input)
        x = self.conv2(x)
        x = self.flatten(x)
        adv = self.adv_dense(x)
        adv = self.adv_out(adv)
        if self.dueling:
            v = self.v_dense(x)
            v = self.v_out(v)
            norm_adv = self.lambda_layer(adv)
            combined = self.combine([v, norm_adv])
            return combined
        return adv

In [4]:
primary_network = DQModel(256, num_actions, False)
target_network = DQModel(256, num_actions, False)
primary_network.compile(optimizer=keras.optimizers.Adam(), loss='mse')
# make target_network = primary_network
for t, e in zip(target_network.trainable_variables, primary_network.trainable_variables):
    t.assign(e)

primary_network.compile(optimizer=keras.optimizers.Adam(), loss=tf.keras.losses.Huber())

# Memory

In [5]:
class Memory:
    def __init__(self, max_memory):
        self._max_memory = max_memory
        self._actions = np.zeros(max_memory, dtype=np.int32)
        self._rewards = np.zeros(max_memory, dtype=np.float32)
        self._frames = np.zeros((POST_PROCESS_IMAGE_SIZE[0], POST_PROCESS_IMAGE_SIZE[1], max_memory), dtype=np.float32)
        self._terminal = np.zeros(max_memory, dtype=np.bool)
        self._i = 0

    def add_sample(self, frame, action, reward, terminal):
        self._actions[self._i] = action
        self._rewards[self._i] = reward
        self._frames[:, :, self._i] = frame[:, :, 0]
        self._terminal[self._i] = terminal
        if self._i % (self._max_memory - 1) == 0 and self._i != 0:
            self._i = BATCH_SIZE + NUM_FRAMES + 1
        else:
            self._i += 1

    def sample(self):
        if self._i < BATCH_SIZE + NUM_FRAMES + 1:
            raise ValueError("Not enough memory to extract a batch")
        else:
            rand_idxs = np.random.randint(NUM_FRAMES + 1, self._i, size=BATCH_SIZE)
            states = np.zeros((BATCH_SIZE, POST_PROCESS_IMAGE_SIZE[0], POST_PROCESS_IMAGE_SIZE[1], NUM_FRAMES), dtype=np.float32)
            next_states = np.zeros((BATCH_SIZE, POST_PROCESS_IMAGE_SIZE[0], POST_PROCESS_IMAGE_SIZE[1], NUM_FRAMES), dtype=np.float32)
            for i, idx in enumerate(rand_idxs):
                states[i] = self._frames[:, :, idx - 1 - NUM_FRAMES:idx - 1]
                next_states[i] = self._frames[:, :, idx - NUM_FRAMES:idx]
            return states, self._actions[rand_idxs], self._rewards[rand_idxs], next_states, self._terminal[rand_idxs]

In [6]:
# memory = Memory(500000)
memory = Memory(1000)

# Extra Processing

In [7]:
def image_preprocess(image, new_size=(105, 80)):
    # convert to greyscale, resize and normalize the image
    image = tf.image.rgb_to_grayscale(image)
    image = tf.image.resize(image, new_size)
    image = image / 255
    return image

In [8]:
def choose_action(state, primary_network, eps, step):
    if step < DELAY_TRAINING:
        return random.randint(0, num_actions - 1)
    else:
        if random.random() < eps:
            return random.randint(0, num_actions - 1)
        else:
            return np.argmax(primary_network(tf.reshape(state, (1, POST_PROCESS_IMAGE_SIZE[0], POST_PROCESS_IMAGE_SIZE[1], NUM_FRAMES)).numpy()))

In [9]:
def update_network(primary_network, target_network):
    # update target network parameters slowly from primary network
    for t, e in zip(target_network.trainable_variables, primary_network.trainable_variables):
        t.assign(t * (1 - TAU) + e * TAU)

In [10]:
def process_state_stack(state_stack, state):
    for i in range(1, state_stack.shape[-1]):
        state_stack[:, :, i - 1].assign(state_stack[:, :, i])
    state_stack[:, :, -1].assign(state[:, :, 0])
    return state_stack

In [11]:
def record_gif(frame_list, episode, fps=50):
    imageio.mimsave(f"SPACE_INVADERS_EPISODE-{episode}.gif", frame_list, fps=fps) #duration=duration_per_frame)

# Train Model

In [12]:
def train(primary_network, memory, target_network=None):
    states, actions, rewards, next_states, terminal = memory.sample()
    # predict Q(s,a) given the batch of states
    prim_qt = primary_network(states)
    # predict Q(s',a') from the evaluation network
    prim_qtp1 = primary_network(next_states)
    # copy the prim_qt tensor into the target_q tensor - we then will update one index corresponding to the max action
    target_q = prim_qt.numpy()
    updates = rewards
    valid_idxs = terminal != True
    batch_idxs = np.arange(BATCH_SIZE)
    if target_network is None:
        updates[valid_idxs] += GAMMA * np.amax(prim_qtp1.numpy()[valid_idxs, :], axis=1)
    else:
        prim_action_tp1 = np.argmax(prim_qtp1.numpy(), axis=1)
        q_from_target = target_network(next_states)
        updates[valid_idxs] += GAMMA * q_from_target.numpy()[batch_idxs[valid_idxs], prim_action_tp1[valid_idxs]]
    target_q[batch_idxs, actions] = updates
    loss = primary_network.train_on_batch(states, target_q)
    return loss

In [13]:
def prepare_tensorboard():
    current_time = str(dt.datetime.now().strftime('%d%m%Y%H%M'))
    log_dir = 'logs/dqn/' + current_time
    return summary.create_file_writer(log_dir)

def main():
    num_episodes = 2000
    eps = MAX_EPSILON
    render = False
    train_writer = prepare_tensorboard()
    double_q = False
    steps = 0
    for i in range(num_episodes):
        state = env.reset()
        state = image_preprocess(state)
        state_stack = tf.Variable(np.repeat(state.numpy(), NUM_FRAMES).reshape((POST_PROCESS_IMAGE_SIZE[0], POST_PROCESS_IMAGE_SIZE[1], NUM_FRAMES)))
        cnt = 1
        avg_loss = 0
        tot_reward = 0
        if i % GIF_RECORDING_FREQ == 0:
            frame_list = []
        while True:
            if render:
                env.render()
            action = choose_action(state_stack, primary_network, eps, steps)
            next_state, reward, done, info = env.step(action)
            tot_reward += reward
            if i % GIF_RECORDING_FREQ == 0:
                frame_list.append(tf.cast(tf.image.resize(next_state, (480, 320)), tf.uint8).numpy())
            next_state = image_preprocess(next_state)
            state_stack = process_state_stack(state_stack, next_state)
            # store in memory
            memory.add_sample(next_state, action, reward, done)

            if steps > DELAY_TRAINING:
                loss = train(primary_network, memory, target_network if double_q else None)
                update_network(primary_network, target_network)
            else:
                loss = -1
            avg_loss += loss

            # linearly decay the eps value
            if steps > DELAY_TRAINING:
                eps = MAX_EPSILON - ((steps - DELAY_TRAINING) / EPSILON_MIN_ITER) * \
                    (MAX_EPSILON - MIN_EPSILON) if steps < EPSILON_MIN_ITER else \
                    MIN_EPSILON
            steps += 1

            if done:
                if steps > DELAY_TRAINING:
                    avg_loss /= cnt
                    print(f"Episode: {i}, Reward: {tot_reward}, avg loss: {avg_loss:.5f}, eps: {eps:.3f}")
                    with train_writer.as_default():
                        tf.summary.scalar('reward', tot_reward, step=i)
                        tf.summary.scalar('avg loss', avg_loss, step=i)
                else:
                    print(f"Pre-training...Episode: {i}")
                if i % GIF_RECORDING_FREQ == 0:
                    record_gif(frame_list, i)
                break

            cnt += 1

# Execute

In [14]:
main()

Pre-training...Episode: 0
Pre-training...Episode: 1
Pre-training...Episode: 2
Pre-training...Episode: 3
Pre-training...Episode: 4
Pre-training...Episode: 5
Pre-training...Episode: 6
Pre-training...Episode: 7
Pre-training...Episode: 8
Pre-training...Episode: 9
Pre-training...Episode: 10
Pre-training...Episode: 11
Pre-training...Episode: 12
Pre-training...Episode: 13
Pre-training...Episode: 14
Pre-training...Episode: 15
Pre-training...Episode: 16
Pre-training...Episode: 17
Pre-training...Episode: 18
Pre-training...Episode: 19
Pre-training...Episode: 20
Pre-training...Episode: 21
Pre-training...Episode: 22
Pre-training...Episode: 23
Pre-training...Episode: 24
Pre-training...Episode: 25
Pre-training...Episode: 26
Pre-training...Episode: 27
Pre-training...Episode: 28
Pre-training...Episode: 29
Pre-training...Episode: 30
Pre-training...Episode: 31
Pre-training...Episode: 32
Pre-training...Episode: 33
Pre-training...Episode: 34
Pre-training...Episode: 35
Pre-training...Episode: 36
Pre-trainin

Episode: 181, Reward: 215.0, avg loss: 0.10937, eps: 0.868
Episode: 182, Reward: 150.0, avg loss: 0.10022, eps: 0.867
Episode: 183, Reward: 225.0, avg loss: 0.06534, eps: 0.865
Episode: 184, Reward: 185.0, avg loss: 0.16843, eps: 0.863
Episode: 185, Reward: 125.0, avg loss: 0.24088, eps: 0.862
Episode: 186, Reward: 210.0, avg loss: 0.10590, eps: 0.861
Episode: 187, Reward: 170.0, avg loss: 0.06437, eps: 0.859
Episode: 188, Reward: 105.0, avg loss: 0.04068, eps: 0.859
Episode: 189, Reward: 85.0, avg loss: 0.04222, eps: 0.857
Episode: 190, Reward: 515.0, avg loss: 0.06016, eps: 0.855
Episode: 191, Reward: 135.0, avg loss: 0.08685, eps: 0.854
Episode: 192, Reward: 105.0, avg loss: 0.05043, eps: 0.853
Episode: 193, Reward: 115.0, avg loss: 0.04939, eps: 0.852
Episode: 194, Reward: 80.0, avg loss: 0.04887, eps: 0.851
Episode: 195, Reward: 110.0, avg loss: 0.17549, eps: 0.850
Episode: 196, Reward: 45.0, avg loss: 0.08904, eps: 0.849
Episode: 197, Reward: 105.0, avg loss: 0.07062, eps: 0.848


Episode: 321, Reward: 85.0, avg loss: 0.22279, eps: 0.694
Episode: 322, Reward: 80.0, avg loss: 0.04945, eps: 0.693
Episode: 323, Reward: 110.0, avg loss: 0.04451, eps: 0.692
Episode: 324, Reward: 30.0, avg loss: 0.03332, eps: 0.692
Episode: 325, Reward: 135.0, avg loss: 0.02876, eps: 0.691
Episode: 326, Reward: 135.0, avg loss: 0.03133, eps: 0.689
Episode: 327, Reward: 10.0, avg loss: 0.01915, eps: 0.689
Episode: 328, Reward: 355.0, avg loss: 0.01712, eps: 0.687
Episode: 329, Reward: 30.0, avg loss: 0.01792, eps: 0.687
Episode: 330, Reward: 80.0, avg loss: 0.01026, eps: 0.685
Episode: 331, Reward: 110.0, avg loss: 0.01485, eps: 0.685
Episode: 332, Reward: 105.0, avg loss: 0.02066, eps: 0.683
Episode: 333, Reward: 110.0, avg loss: 0.02104, eps: 0.682
Episode: 334, Reward: 345.0, avg loss: 0.47315, eps: 0.680
Episode: 335, Reward: 335.0, avg loss: 0.87233, eps: 0.679
Episode: 336, Reward: 305.0, avg loss: 0.11363, eps: 0.677
Episode: 337, Reward: 330.0, avg loss: 0.04942, eps: 0.675
Epi

Episode: 461, Reward: 120.0, avg loss: 3.25836, eps: 0.507
Episode: 462, Reward: 65.0, avg loss: 1.13623, eps: 0.506
Episode: 463, Reward: 60.0, avg loss: 0.56421, eps: 0.505
Episode: 464, Reward: 615.0, avg loss: 1.77847, eps: 0.503
Episode: 465, Reward: 50.0, avg loss: 2.34744, eps: 0.502
Episode: 466, Reward: 135.0, avg loss: 1.58656, eps: 0.501
Episode: 467, Reward: 85.0, avg loss: 1.91406, eps: 0.501
Episode: 468, Reward: 110.0, avg loss: 0.56708, eps: 0.500
Episode: 469, Reward: 190.0, avg loss: 0.59049, eps: 0.498
Episode: 470, Reward: 270.0, avg loss: 0.54459, eps: 0.496
Episode: 471, Reward: 155.0, avg loss: 1.44113, eps: 0.495
Episode: 472, Reward: 180.0, avg loss: 0.51643, eps: 0.494
Episode: 473, Reward: 415.0, avg loss: 1.50916, eps: 0.492
Episode: 474, Reward: 155.0, avg loss: 1.68308, eps: 0.491
Episode: 475, Reward: 155.0, avg loss: 0.90256, eps: 0.489
Episode: 476, Reward: 250.0, avg loss: 0.60321, eps: 0.488
Episode: 477, Reward: 485.0, avg loss: 1.06721, eps: 0.486
E

Episode: 601, Reward: 230.0, avg loss: 1.26319, eps: 0.325
Episode: 602, Reward: 230.0, avg loss: 1.38080, eps: 0.324
Episode: 603, Reward: 105.0, avg loss: 1.79633, eps: 0.323
Episode: 604, Reward: 110.0, avg loss: 1.14804, eps: 0.322
Episode: 605, Reward: 40.0, avg loss: 0.84804, eps: 0.321
Episode: 606, Reward: 210.0, avg loss: 0.85105, eps: 0.319
Episode: 607, Reward: 60.0, avg loss: 1.66579, eps: 0.318
Episode: 608, Reward: 140.0, avg loss: 3.74674, eps: 0.317
Episode: 609, Reward: 160.0, avg loss: 3.53752, eps: 0.316
Episode: 610, Reward: 110.0, avg loss: 3.73176, eps: 0.315
Episode: 611, Reward: 160.0, avg loss: 2.46219, eps: 0.314
Episode: 612, Reward: 100.0, avg loss: 2.55712, eps: 0.312
Episode: 613, Reward: 150.0, avg loss: 2.35617, eps: 0.311
Episode: 614, Reward: 120.0, avg loss: 1.83646, eps: 0.310
Episode: 615, Reward: 145.0, avg loss: 0.81105, eps: 0.308
Episode: 616, Reward: 150.0, avg loss: 2.23849, eps: 0.307
Episode: 617, Reward: 20.0, avg loss: 1.44846, eps: 0.306


Episode: 741, Reward: 95.0, avg loss: 0.01610, eps: 0.100
Episode: 742, Reward: 195.0, avg loss: 0.04013, eps: 0.100
Episode: 743, Reward: 120.0, avg loss: 0.04256, eps: 0.100
Episode: 744, Reward: 290.0, avg loss: 3.93087, eps: 0.100
Episode: 745, Reward: 105.0, avg loss: 4.38098, eps: 0.100
Episode: 746, Reward: 245.0, avg loss: 0.81056, eps: 0.100
Episode: 747, Reward: 160.0, avg loss: 0.11135, eps: 0.100
Episode: 748, Reward: 215.0, avg loss: 0.02205, eps: 0.100
Episode: 749, Reward: 210.0, avg loss: 0.03264, eps: 0.100
Episode: 750, Reward: 160.0, avg loss: 0.02881, eps: 0.100
Episode: 751, Reward: 125.0, avg loss: 0.01935, eps: 0.100
Episode: 752, Reward: 75.0, avg loss: 0.02237, eps: 0.100
Episode: 753, Reward: 150.0, avg loss: 1.05290, eps: 0.100
Episode: 754, Reward: 115.0, avg loss: 3.37121, eps: 0.100
Episode: 755, Reward: 20.0, avg loss: 1.52653, eps: 0.100
Episode: 756, Reward: 145.0, avg loss: 1.40881, eps: 0.100
Episode: 757, Reward: 270.0, avg loss: 4.10719, eps: 0.100


Episode: 881, Reward: 110.0, avg loss: 0.03834, eps: 0.100
Episode: 882, Reward: 110.0, avg loss: 0.02432, eps: 0.100
Episode: 883, Reward: 295.0, avg loss: 1.24987, eps: 0.100
Episode: 884, Reward: 180.0, avg loss: 0.20338, eps: 0.100
Episode: 885, Reward: 185.0, avg loss: 0.09677, eps: 0.100
Episode: 886, Reward: 110.0, avg loss: 0.02385, eps: 0.100
Episode: 887, Reward: 175.0, avg loss: 0.01601, eps: 0.100
Episode: 888, Reward: 110.0, avg loss: 0.02879, eps: 0.100
Episode: 889, Reward: 170.0, avg loss: 0.02316, eps: 0.100
Episode: 890, Reward: 200.0, avg loss: 0.06199, eps: 0.100
Episode: 891, Reward: 60.0, avg loss: 0.01604, eps: 0.100
Episode: 892, Reward: 30.0, avg loss: 0.01380, eps: 0.100
Episode: 893, Reward: 55.0, avg loss: 0.01249, eps: 0.100
Episode: 894, Reward: 30.0, avg loss: 0.01561, eps: 0.100
Episode: 895, Reward: 40.0, avg loss: 0.00564, eps: 0.100
Episode: 896, Reward: 220.0, avg loss: 0.02909, eps: 0.100
Episode: 897, Reward: 60.0, avg loss: 0.01599, eps: 0.100
Epi

Episode: 1021, Reward: 245.0, avg loss: 0.04063, eps: 0.100
Episode: 1022, Reward: 195.0, avg loss: 0.02167, eps: 0.100
Episode: 1023, Reward: 200.0, avg loss: 0.04309, eps: 0.100
Episode: 1024, Reward: 140.0, avg loss: 0.03101, eps: 0.100
Episode: 1025, Reward: 185.0, avg loss: 0.04291, eps: 0.100
Episode: 1026, Reward: 215.0, avg loss: 0.05162, eps: 0.100
Episode: 1027, Reward: 160.0, avg loss: 0.01906, eps: 0.100
Episode: 1028, Reward: 110.0, avg loss: 0.00820, eps: 0.100
Episode: 1029, Reward: 105.0, avg loss: 0.01559, eps: 0.100
Episode: 1030, Reward: 175.0, avg loss: 0.02449, eps: 0.100
Episode: 1031, Reward: 405.0, avg loss: 0.03147, eps: 0.100
Episode: 1032, Reward: 150.0, avg loss: 0.03743, eps: 0.100
Episode: 1033, Reward: 365.0, avg loss: 0.04478, eps: 0.100
Episode: 1034, Reward: 205.0, avg loss: 0.03320, eps: 0.100
Episode: 1035, Reward: 115.0, avg loss: 0.02176, eps: 0.100
Episode: 1036, Reward: 105.0, avg loss: 0.00827, eps: 0.100
Episode: 1037, Reward: 80.0, avg loss: 0

Episode: 1158, Reward: 240.0, avg loss: 0.01175, eps: 0.100
Episode: 1159, Reward: 105.0, avg loss: 0.01452, eps: 0.100
Episode: 1160, Reward: 520.0, avg loss: 0.05089, eps: 0.100
Episode: 1161, Reward: 120.0, avg loss: 0.06136, eps: 0.100
Episode: 1162, Reward: 210.0, avg loss: 0.03563, eps: 0.100
Episode: 1163, Reward: 295.0, avg loss: 0.04714, eps: 0.100
Episode: 1164, Reward: 320.0, avg loss: 0.05170, eps: 0.100
Episode: 1165, Reward: 130.0, avg loss: 0.02609, eps: 0.100
Episode: 1166, Reward: 255.0, avg loss: 0.05306, eps: 0.100
Episode: 1167, Reward: 300.0, avg loss: 0.04571, eps: 0.100
Episode: 1168, Reward: 335.0, avg loss: 0.05126, eps: 0.100
Episode: 1169, Reward: 260.0, avg loss: 0.02412, eps: 0.100
Episode: 1170, Reward: 95.0, avg loss: 0.00901, eps: 0.100
Episode: 1171, Reward: 80.0, avg loss: 0.01740, eps: 0.100
Episode: 1172, Reward: 215.0, avg loss: 0.02357, eps: 0.100
Episode: 1173, Reward: 515.0, avg loss: 0.05987, eps: 0.100
Episode: 1174, Reward: 125.0, avg loss: 0.

Episode: 1295, Reward: 210.0, avg loss: 0.01855, eps: 0.100
Episode: 1296, Reward: 420.0, avg loss: 0.03197, eps: 0.100
Episode: 1297, Reward: 175.0, avg loss: 0.02434, eps: 0.100
Episode: 1298, Reward: 195.0, avg loss: 0.01966, eps: 0.100
Episode: 1299, Reward: 210.0, avg loss: 0.03512, eps: 0.100
Episode: 1300, Reward: 115.0, avg loss: 0.03131, eps: 0.100
Episode: 1301, Reward: 175.0, avg loss: 0.02491, eps: 0.100
Episode: 1302, Reward: 145.0, avg loss: 0.02238, eps: 0.100
Episode: 1303, Reward: 75.0, avg loss: 0.02038, eps: 0.100
Episode: 1304, Reward: 85.0, avg loss: 0.01562, eps: 0.100
Episode: 1305, Reward: 170.0, avg loss: 0.02387, eps: 0.100
Episode: 1306, Reward: 245.0, avg loss: 0.02689, eps: 0.100
Episode: 1307, Reward: 125.0, avg loss: 0.04670, eps: 0.100
Episode: 1308, Reward: 180.0, avg loss: 0.03196, eps: 0.100
Episode: 1309, Reward: 290.0, avg loss: 0.02764, eps: 0.100
Episode: 1310, Reward: 160.0, avg loss: 0.02227, eps: 0.100
Episode: 1311, Reward: 185.0, avg loss: 0.

Episode: 1432, Reward: 290.0, avg loss: 0.45420, eps: 0.100
Episode: 1433, Reward: 30.0, avg loss: 10.25178, eps: 0.100
Episode: 1434, Reward: 95.0, avg loss: 0.68872, eps: 0.100
Episode: 1435, Reward: 165.0, avg loss: 0.02547, eps: 0.100
Episode: 1436, Reward: 110.0, avg loss: 0.03365, eps: 0.100
Episode: 1437, Reward: 155.0, avg loss: 0.02992, eps: 0.100
Episode: 1438, Reward: 140.0, avg loss: 0.09073, eps: 0.100
Episode: 1439, Reward: 115.0, avg loss: 0.05640, eps: 0.100
Episode: 1440, Reward: 90.0, avg loss: 0.00719, eps: 0.100
Episode: 1441, Reward: 110.0, avg loss: 0.02389, eps: 0.100
Episode: 1442, Reward: 130.0, avg loss: 0.43770, eps: 0.100
Episode: 1443, Reward: 380.0, avg loss: 0.33302, eps: 0.100
Episode: 1444, Reward: 450.0, avg loss: 0.07219, eps: 0.100
Episode: 1445, Reward: 355.0, avg loss: 0.01947, eps: 0.100
Episode: 1446, Reward: 435.0, avg loss: 0.06637, eps: 0.100
Episode: 1447, Reward: 315.0, avg loss: 0.03902, eps: 0.100
Episode: 1448, Reward: 165.0, avg loss: 0.

Episode: 1569, Reward: 150.0, avg loss: 0.02489, eps: 0.100
Episode: 1570, Reward: 255.0, avg loss: 0.03447, eps: 0.100
Episode: 1571, Reward: 130.0, avg loss: 0.02231, eps: 0.100
Episode: 1572, Reward: 105.0, avg loss: 0.02034, eps: 0.100
Episode: 1573, Reward: 170.0, avg loss: 0.01705, eps: 0.100
Episode: 1574, Reward: 95.0, avg loss: 0.01084, eps: 0.100
Episode: 1575, Reward: 330.0, avg loss: 0.03010, eps: 0.100
Episode: 1576, Reward: 335.0, avg loss: 0.03188, eps: 0.100
Episode: 1577, Reward: 325.0, avg loss: 0.02485, eps: 0.100
Episode: 1578, Reward: 160.0, avg loss: 0.02567, eps: 0.100
Episode: 1579, Reward: 150.0, avg loss: 0.01534, eps: 0.100
Episode: 1580, Reward: 135.0, avg loss: 0.02186, eps: 0.100
Episode: 1581, Reward: 160.0, avg loss: 0.02353, eps: 0.100
Episode: 1582, Reward: 110.0, avg loss: 0.02254, eps: 0.100
Episode: 1583, Reward: 285.0, avg loss: 0.02125, eps: 0.100
Episode: 1584, Reward: 175.0, avg loss: 0.02764, eps: 0.100
Episode: 1585, Reward: 195.0, avg loss: 0

Episode: 1706, Reward: 155.0, avg loss: 0.04241, eps: 0.100
Episode: 1707, Reward: 265.0, avg loss: 0.06491, eps: 0.100
Episode: 1708, Reward: 230.0, avg loss: 0.04623, eps: 0.100
Episode: 1709, Reward: 250.0, avg loss: 0.05944, eps: 0.100
Episode: 1710, Reward: 40.0, avg loss: 0.00941, eps: 0.100
Episode: 1711, Reward: 255.0, avg loss: 0.01660, eps: 0.100
Episode: 1712, Reward: 230.0, avg loss: 0.03518, eps: 0.100
Episode: 1713, Reward: 420.0, avg loss: 0.08523, eps: 0.100
Episode: 1714, Reward: 445.0, avg loss: 0.09465, eps: 0.100
Episode: 1715, Reward: 210.0, avg loss: 0.02046, eps: 0.100
Episode: 1716, Reward: 525.0, avg loss: 0.02811, eps: 0.100
Episode: 1717, Reward: 165.0, avg loss: 0.01975, eps: 0.100
Episode: 1718, Reward: 170.0, avg loss: 0.01668, eps: 0.100
Episode: 1719, Reward: 460.0, avg loss: 0.05703, eps: 0.100
Episode: 1720, Reward: 75.0, avg loss: 0.03273, eps: 0.100
Episode: 1721, Reward: 175.0, avg loss: 0.04149, eps: 0.100
Episode: 1722, Reward: 110.0, avg loss: 0.

Episode: 1843, Reward: 500.0, avg loss: 0.08318, eps: 0.100
Episode: 1844, Reward: 80.0, avg loss: 0.00333, eps: 0.100
Episode: 1845, Reward: 195.0, avg loss: 0.02290, eps: 0.100
Episode: 1846, Reward: 145.0, avg loss: 0.03519, eps: 0.100
Episode: 1847, Reward: 75.0, avg loss: 0.02461, eps: 0.100
Episode: 1848, Reward: 215.0, avg loss: 0.03477, eps: 0.100
Episode: 1849, Reward: 155.0, avg loss: 0.03079, eps: 0.100
Episode: 1850, Reward: 105.0, avg loss: 0.02335, eps: 0.100
Episode: 1851, Reward: 225.0, avg loss: 0.01823, eps: 0.100
Episode: 1852, Reward: 125.0, avg loss: 0.03331, eps: 0.100
Episode: 1853, Reward: 145.0, avg loss: 0.02277, eps: 0.100
Episode: 1854, Reward: 215.0, avg loss: 0.03545, eps: 0.100
Episode: 1855, Reward: 120.0, avg loss: 0.01864, eps: 0.100
Episode: 1856, Reward: 105.0, avg loss: 0.01567, eps: 0.100
Episode: 1857, Reward: 155.0, avg loss: 0.01271, eps: 0.100
Episode: 1858, Reward: 330.0, avg loss: 0.04669, eps: 0.100
Episode: 1859, Reward: 215.0, avg loss: 0.

Episode: 1980, Reward: 180.0, avg loss: 0.03354, eps: 0.100
Episode: 1981, Reward: 445.0, avg loss: 0.02557, eps: 0.100
Episode: 1982, Reward: 200.0, avg loss: 0.02619, eps: 0.100
Episode: 1983, Reward: 225.0, avg loss: 0.05085, eps: 0.100
Episode: 1984, Reward: 210.0, avg loss: 0.05274, eps: 0.100
Episode: 1985, Reward: 295.0, avg loss: 0.04228, eps: 0.100
Episode: 1986, Reward: 125.0, avg loss: 0.01890, eps: 0.100
Episode: 1987, Reward: 255.0, avg loss: 0.03036, eps: 0.100
Episode: 1988, Reward: 375.0, avg loss: 0.04723, eps: 0.100
Episode: 1989, Reward: 145.0, avg loss: 0.07136, eps: 0.100
Episode: 1990, Reward: 215.0, avg loss: 0.02831, eps: 0.100
Episode: 1991, Reward: 135.0, avg loss: 0.04882, eps: 0.100
Episode: 1992, Reward: 495.0, avg loss: 0.05988, eps: 0.100
Episode: 1993, Reward: 40.0, avg loss: 0.04390, eps: 0.100
Episode: 1994, Reward: 140.0, avg loss: 0.01875, eps: 0.100
Episode: 1995, Reward: 235.0, avg loss: 0.02621, eps: 0.100
Episode: 1996, Reward: 325.0, avg loss: 0

In [15]:
primary_network.save('primary_network.tf')

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
INFO:tensorflow:Assets written to: primary_network.tf\assets


In [16]:
try:
    import cPickle as pickle
except ModuleNotFoundError:
    import pickle

In [17]:
with open('memory.pkl', 'wb') as output:
    pickle.dump(memory, output, -1)