In [1]:
import numpy as np


def to_grayscale(img):
    return np.mean(img, axis=2).astype(np.uint8)


def downsample(img):
    return img[::2, ::2]


def preprocess(img):
    return to_grayscale(downsample(img)).astype("float32")

In [2]:
from collections import deque
import random
import numpy as np


class ReplayMemory:
    def __init__(self, max_length):
        self.memory = deque(maxlen=max_length)

    def add(self, state, action, reward, next_state, terminal):
        self.memory.append([state, action, reward, next_state, terminal])

    def get_batch(self, batch_size):
        print(batch_size, len(self.memory))
        sampling = np.array(random.sample(self.memory, batch_size))
        state_batch = np.stack(sampling[:, 0])
        next_state_batch = np.stack(sampling[:, 3])
        return state_batch, sampling[:, 1], sampling[:, 2], next_state_batch, sampling[:, 4]

    def __len__(self):
        return len(self.memory)

In [3]:
from __future__ import print_function

import random

import numpy as np

import tensorflow as tf
import tensorflow.contrib.eager as tfe

# eager execution
tfe.enable_eager_execution(device_policy=tfe.DEVICE_PLACEMENT_SILENT)


In [81]:

# Hyper parameter
INITIAL_EPSILON = 1.0  # initial exploration rate
FINAL_EPSILON = 0.1  # final exploration rate
LEARNING_RATE = 0.001  # learning rate
OBSERVATION_STEPS = 1  # step for observing(not trainig)
EXPLORATION_STEPS = 500000  # step for exploration(epsilon > FINAL_EPSILON)
BATCH_SIZE = 1  # batch size
GAMMA = 0.95  # discount rate


class DQNAgent(tf.keras.Model):
    def __init__(self, state_shape, action_dim, checkpoint_directory, batch_size=BATCH_SIZE, device_name='cpu:0'):
        super(DQNAgent, self).__init__()
        # state's shape , in Atari we will use (-1, 105, 80, 1)
        self.state_shape = state_shape
        # number of actions, in Atari 4
        self.action_dim = action_dim

        # saving checkpoint directory
        self.checkpoint_directory = checkpoint_directory

        # init q layers
        self.conv1 = tf.layers.Conv2D(32, 8, 8, padding='same', activation=tf.nn.relu)
        self.batch1 = tf.layers.BatchNormalization()
        self.conv2 = tf.layers.Conv2D(64, 4, 4, padding='same', activation=tf.nn.relu)
        self.batch2 = tf.layers.BatchNormalization()
        self.conv3 = tf.layers.Conv2D(64, 3, 3, padding='same', activation=tf.nn.relu)
        self.flatten = tf.layers.Flatten()

        self.dense1 = tf.layers.Dense(512, activation=tf.nn.relu)
        self.dense2 = tf.layers.Dense(action_dim, activation=None)

        self.base_layers = [self.conv1, self.batch1, self.conv2, self.batch2, self.conv3, self.flatten, self.dense1,
                            self.dense2]

        # target q layers
        self.conv1_t = tf.layers.Conv2D(32, 8, 8, padding='same', activation=tf.nn.relu)
        self.batch1_t = tf.layers.BatchNormalization()
        self.conv2_t = tf.layers.Conv2D(64, 4, 4, padding='same', activation=tf.nn.relu)
        self.batch2_t = tf.layers.BatchNormalization()
        self.conv3_t = tf.layers.Conv2D(64, 3, 3, padding='same', activation=tf.nn.relu)
        self.flatten_t = tf.layers.Flatten()

        self.dense1_t = tf.layers.Dense(512, activation=tf.nn.relu)
        self.dense2_t = tf.layers.Dense(action_dim, activation=None)

        self.target_layers = [self.conv1_t, self.batch1_t, self.conv2_t, self.batch2_t, self.conv3_t, self.flatten_t,
                              self.dense1_t, self.dense2_t]

        # learning optimizer
        self.optimizer = tf.train.AdamOptimizer(LEARNING_RATE)

        # epsilon-greedy
        self.epsilon = INITIAL_EPSILON
        self.epsilon_step = (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORATION_STEPS

        # replay_memory
        self.replay_memory = ReplayMemory(1000000)
        self.batch_size = batch_size

        # for logging
        self.step_count = 0

        # device configuration
        self.device_name = device_name

    def predict(self, state_batch, training):

        # you can use prediction with numpy array state input
        if isinstance(state_batch, (np.ndarray, np.generic)):
            state_batch = np.reshape(state_batch, self.state_shape)
            state_batch = tf.convert_to_tensor(state_batch)

        x = self.conv1(state_batch)
        x = self.batch1(x, training=training)
        x = self.conv2(x)
        x = self.batch2(x, training=training)
        x = self.conv3(x)
        x = self.flatten(x)
        x = self.dense1(x)
        x = self.dense2(x)

        return x

    def predict_target(self, state_batch, training):

        # you can use prediction with numpy array state input
        if isinstance(state_batch, (np.ndarray, np.generic)):
            state_batch = np.reshape(state_batch, self.state_shape)
            state_batch = tf.convert_to_tensor(state_batch)

        x = self.conv1_t(state_batch)
        x = self.batch1_t(x, training=training)
        x = self.conv2_t(x)
        x = self.batch2_t(x, training=training)
        x = self.conv3_t(x)
        x = self.flatten_t(x)
        x = self.dense1_t(x)
        x = self.dense2_t(x)

        return x

    def copy_base_to_target(self):
        """copy base's weights to target"""
        for idx_layer in range(len(self.base_layers)):
            base = self.base_layers[idx_layer]
            target = self.target_layers[idx_layer]
            for idx_weight in range(len(base.weights)):
                tf.assign(target.weights[idx_weight], base.weights[idx_weight])
            if hasattr(base, "bias"):
                tf.assign(target.bias, base.bias)

    @staticmethod
    def huber_loss(labels, predictions):
        error = labels - predictions
        quadratic_term = error * error / 2
        linear_term = abs(error) - 1 / 2
        use_linear_term = tf.convert_to_tensor((abs(error) > 1.0).numpy().astype("float32"))

        return use_linear_term * linear_term + (1 - use_linear_term) * quadratic_term

    def loss(self, state_batch, target, training):
        predictoins = self.predict(state_batch, training)
        # loss_value = tf.losses.mean_squared_error(labels=target, predictions=predictoins)
        loss_value = tf.reduce_sum(self.huber_loss(labels=target, predictions=predictoins))
        return loss_value

    def grad(self, state_batch, target, training):
        with tfe.GradientTape() as tape:
            loss_value = self.loss(state_batch, target, training)
        return tape.gradient(loss_value, self.variables)

    def get_action(self, state, training=False):
        if training:
            if self.epsilon >= random.random():
                action = tf.convert_to_tensor(random.randrange(self.action_dim))
            else:
                action = tf.argmax(self.predict(state, training=training), 1)

            if self.epsilon > FINAL_EPSILON and self.step_count > OBSERVATION_STEPS:
                self.epsilon -= self.epsilon_step

            return action

        else:
            return tf.argmax(self.predict(state, training=training), 1)

    def step(self, state, action, reward, next_state, terminal):
        if self.step_count <= OBSERVATION_STEPS:
            self.observe(state, action, reward, next_state, terminal)
        else:
            self.fit(state, action, reward, next_state, terminal)

        if self.step_count % 1000 == 0:
            print("STEP %s : EPSILON [%6f]...." % (self.step_count, self.epsilon))
        self.step_count += 1

    def observe(self, state, action, reward, next_state, terminal):
        self.replay_memory.add(state, action, reward, next_state, terminal)

    def fit(self, state, action, reward, next_state, terminal, num_epochs=1):

        self.replay_memory.add(state, action, reward, next_state, terminal)

        state_batch, action_batch, reward_batch, next_state_batch, terminal_batch = self.replay_memory.get_batch(
            self.batch_size)

        
        current_q = self.predict(state_batch, training=False).numpy()

        # method 1 : maintain direction for original q values
        now_q = current_q.copy() * 0.75

        # method 2 : use zero value actions other than selected action
#         now_q = np.zeros((self.batch_size,self.action_dim))

        target_q_batch = self.predict_target(next_state_batch, training=False)

        y_batch = reward_batch + (1 - terminal_batch) * GAMMA * np.max(target_q_batch, axis=1)

        for i in range(self.batch_size):
            now_q[i, action_batch[i]] = y_batch[i]
            
            
        print("original", current_q[0])
        print("target_q", now_q[0])
        print("action", action_batch[0])

        print(self.loss(state_batch, now_q, False))

        with tf.device(self.device_name):
            for i in range(num_epochs):
                grads = self.grad(state_batch, now_q, True)
                self.optimizer.apply_gradients(zip(grads, self.variables))
        
        print(self.loss(state_batch, now_q, False))

                
        print("after", self.predict(state_batch, training=False).numpy()[0])

    def save(self, global_step=0):
        tfe.Saver(self.variables).save(self.checkpoint_directory, global_step=global_step)

    def load_last_checkpoint(self):
        # Run the model once to initialize variables
        dummy_input = tf.constant(tf.zeros(self.state_shape))
        dummy_pred = self.predict(dummy_input, training=False)
        # Restore the variables of the model
        saver = tfe.Saver(self.variables)
        saver.restore(tf.train.latest_checkpoint
                      (self.checkpoint_directory))


In [82]:
import gym
env = gym.make('Breakout-v0')
agent = DQNAgent(state_shape=(-1, 105, 80, 1), action_dim=4,
                 checkpoint_directory="./models_checkpoints/rl/", batch_size=1,
                 device_name="gpu:0")

observation = env.reset()
total_reward = 0



In [145]:
for i in range(500):
    now_state= preprocess(observation)
    action = agent.get_action(now_state, training=True).numpy()
    observation, reward, done, info = env.step(action)
    if(done):
        done = 1
        env.reset()
    else:
        done = 0
    next_state= preprocess(observation)
    agent.step(now_state, action, reward, next_state, done)

(1, 2050)
original [145886.36  119474.266 125141.266  98259.63 ]
target_q [109414.766  89605.7   146612.75   73694.73 ]
action 2
tf.Tensor(112376.55, shape=(), dtype=float32)
tf.Tensor(114945.28, shape=(), dtype=float32)
after [147445.81 120774.57 126470.29  99297.63]
(1, 2051)
original [127056.875 104053.64  108963.85   85558.15 ]
target_q [ 95292.66   78040.234  81722.89  117437.94 ]
action 3
tf.Tensor(116898.375, shape=(), dtype=float32)
tf.Tensor(118348.91, shape=(), dtype=float32)
after [127779.04 104661.22 109574.7   86048.21]
(1, 2052)
original [142994.6   117147.484 122643.2    96303.805]
target_q [107245.945  87860.61   91982.41  132589.58 ]
action 3
tf.Tensor(131982.1, shape=(), dtype=float32)
tf.Tensor(132501.81, shape=(), dtype=float32)
after [143263.12  117380.016 122862.37   96504.305]
(1, 2053)
original [136251.02 111628.15 116842.59  91778.48]
target_q [102188.266  83721.11   87631.945 132483.56 ]
action 3
tf.Tensor(131885.53, shape=(), dtype=float32)
tf.Tensor(131032.0

tf.Tensor(137089.6, shape=(), dtype=float32)
after [147799.98 121207.73 125841.02  99530.96]
(1, 2089)
original [133607.8   109563.836 113748.74   89971.54 ]
target_q [100205.84   82172.875  85311.555 141268.6  ]
action 3
tf.Tensor(140527.16, shape=(), dtype=float32)
tf.Tensor(142636.28, shape=(), dtype=float32)
after [134673.64 110444.27 114627.07  90687.03]
(1, 2090)
original [140539.03 115252.49 119617.26  94636.69]
target_q [105404.27   86439.37  131979.31   70977.516]
action 2
tf.Tensor(99969.11, shape=(), dtype=float32)
tf.Tensor(100915.48, shape=(), dtype=float32)
after [141111.11  115725.23  120095.195  95016.18 ]
(1, 2091)
original [131209.6   107603.53  111665.016  88347.68 ]
target_q [ 98407.195 127990.31   83748.766  66260.76 ]
action 1
tf.Tensor(103192.35, shape=(), dtype=float32)
tf.Tensor(102902.195, shape=(), dtype=float32)
after [131061.01  107500.836 111528.61   88239.82 ]
(1, 2092)
original [142766.94  117113.08  121506.83   96125.055]
target_q [107075.2    87834.81 

after [94901.17  77782.445 80997.99  63823.184]
(1, 2125)
original [114986.71   94268.18   98167.805  77339.99 ]
target_q [130249.08   70701.13   73625.85   58004.992]
action 0
tf.Tensor(82706.37, shape=(), dtype=float32)
tf.Tensor(79864.945, shape=(), dtype=float32)
after [112896.13   92535.24   96378.555  75930.18 ]
(1, 2126)
original [109506.94  89760.15  93487.49  73651.47]
target_q [128490.42  67320.11  70115.62  55238.6 ]
action 0
tf.Tensor(83208.266, shape=(), dtype=float32)
tf.Tensor(82966.94, shape=(), dtype=float32)
after [109337.53  89608.1   93342.93  73537.34]
(1, 2127)
original [101022.586  82792.87   86242.6    67943.78 ]
target_q [ 75766.94   62094.65   64681.953 133349.88 ]
action 3
tf.Tensor(132920.61, shape=(), dtype=float32)
tf.Tensor(132873.23, shape=(), dtype=float32)
after [101009.81   82770.02   86230.945  67943.88 ]
(1, 2128)
original [99593.61 81606.94 85020.41 66990.04]
target_q [130015.16   61205.203  63765.305  50242.53 ]
action 0
tf.Tensor(88825.89, shape=

tf.Tensor(111995.19, shape=(), dtype=float32)
after [98384.7  79981.77 83853.36 66146.7 ]
(1, 2161)
original [96818.42 78691.37 82506.71 65091.29]
target_q [ 72613.81   59018.523  61880.03  134904.36 ]
action 3
tf.Tensor(134317.2, shape=(), dtype=float32)
tf.Tensor(129917.945, shape=(), dtype=float32)
after [94625.61 76897.95 80620.57 63618.18]
(1, 2162)
original [115295.28  93743.27  98266.05  77523.39]
target_q [ 86471.46  70307.45  73699.53 128364.56]
action 3
tf.Tensor(127667.33, shape=(), dtype=float32)
tf.Tensor(121283.98, shape=(), dtype=float32)
after [112116.36   91143.38   95536.17   75398.055]
(1, 2163)
original [109155.13   88745.836  93019.79   73408.51 ]
target_q [ 81866.35  135399.1    69764.84   55056.383]
action 1
tf.Tensor(115549.11, shape=(), dtype=float32)
tf.Tensor(111310.28, shape=(), dtype=float32)
after [106684.61   86737.47   90896.375  71755.25 ]
(1, 2164)
original [91209.21  74106.086 77675.2   61338.387]
target_q [ 68406.91   55579.562 136844.19   46003.79 ]

tf.Tensor(132955.11, shape=(), dtype=float32)
after [73081.89  59545.426 62129.58  49200.473]
(1, 2197)
original [79631.08  64883.688 67700.35  53610.945]
target_q [134369.55   48662.766  50775.266  40208.21 ]
action 0
tf.Tensor(101287.21, shape=(), dtype=float32)
tf.Tensor(102582.36, shape=(), dtype=float32)
after [80587.47 65672.52 68514.02 54259.98]
(1, 2198)
original [68197.24  55552.098 57960.65  45912.465]
target_q [126296.77   41664.074  43470.484  34434.348]
action 0
tf.Tensor(97955.836, shape=(), dtype=float32)
tf.Tensor(98625.805, shape=(), dtype=float32)
after [68702.52  55964.805 58384.137 46251.516]
(1, 2199)
original [68241.22  55578.723 57981.453 45939.266]
target_q [ 51180.914  41684.043  43486.09  132921.14 ]
action 3
tf.Tensor(132432.22, shape=(), dtype=float32)
tf.Tensor(132508.72, shape=(), dtype=float32)
after [68285.76  55613.8   58012.14  45973.074]
(1, 2200)
original [86896.234 70842.31  73887.83  58517.   ]
target_q [ 65172.176  53131.734 133531.12   43887.75 ]

original [75672.22  61468.105 64015.176 50715.56 ]
target_q [ 56754.164  46101.08   48011.383 137008.98 ]
action 3
tf.Tensor(136582.3, shape=(), dtype=float32)
tf.Tensor(137255.1, shape=(), dtype=float32)
after [76008.31  61740.51  64292.145 50928.227]
(1, 2236)
original [55886.074 45326.586 47201.18  37415.61 ]
target_q [ 41914.555 128740.055  35400.883  28061.707]
action 1
tf.Tensor(118539.19, shape=(), dtype=float32)
tf.Tensor(118077.08, shape=(), dtype=float32)
after [55626.957 45118.84  46974.59  37231.457]
(1, 2237)
original [74225.75  60290.54  62769.86  49719.145]
target_q [136857.78   45217.906  47077.395  37289.36 ]
action 0
tf.Tensor(105826.914, shape=(), dtype=float32)
tf.Tensor(106701.39, shape=(), dtype=float32)
after [74889.02  60836.215 63326.953 50154.125]
(1, 2238)
original [68267.25  55437.395 57705.78  45710.11 ]
target_q [132999.16   41578.047  43279.336  34282.582]
action 0
tf.Tensor(104445.22, shape=(), dtype=float32)
tf.Tensor(104740.96, shape=(), dtype=float32)

tf.Tensor(124715.625, shape=(), dtype=float32)
tf.Tensor(125769.22, shape=(), dtype=float32)
after [54166.844 43926.758 45407.29  36023.74 ]
(1, 2273)
original [56184.047 45549.67  47080.848 37356.8  ]
target_q [ 42138.035  34162.254 153255.45   28017.602]
action 2
tf.Tensor(140947.25, shape=(), dtype=float32)
tf.Tensor(143148.03, shape=(), dtype=float32)
after [57525.906 46644.27  48211.68  38251.973]
(1, 2274)
original [63393.113 51431.895 53172.895 42175.297]
target_q [130960.516  38573.92   39879.67   31631.473]
action 0
tf.Tensor(104262.42, shape=(), dtype=float32)
tf.Tensor(105842.94, shape=(), dtype=float32)
after [64594.758 52405.848 54184.76  42971.637]
(1, 2275)
original [66927.03  54306.164 56153.004 44527.61 ]
target_q [ 50195.273  40729.625 146612.75   33395.707]
action 2
tf.Tensor(131899.95, shape=(), dtype=float32)
tf.Tensor(133305.66, shape=(), dtype=float32)
after [67789.375 55004.95  56879.883 45099.074]
(1, 2276)
original [66097.664 53637.844 55467.605 43977.434]
tar

after [47017.812 37941.03  39235.676 31017.84 ]
(1, 2310)
original [47877.6   38631.523 39948.93  31580.879]
target_q [137284.97   28973.643  29961.697  23685.66 ]
action 0
tf.Tensor(116947.7, shape=(), dtype=float32)
tf.Tensor(120310.51, shape=(), dtype=float32)
after [50420.957 40704.48  42092.957 33270.055]
(1, 2311)
original [32635.588 26243.105 27107.727 21450.914]
target_q [ 24476.691  19682.328 127453.14   16088.186]
action 2
tf.Tensor(120427.81, shape=(), dtype=float32)
tf.Tensor(122784.45, shape=(), dtype=float32)
after [34077.457 27418.98  28325.975 22408.053]
(1, 2312)
original [49020.938 39571.047 40918.176 32338.65 ]
target_q [ 36765.703  29678.285  30688.633 127420.8  ]
action 3
tf.Tensor(127459.69, shape=(), dtype=float32)
tf.Tensor(129050.81, shape=(), dtype=float32)
after [49818.17  40222.016 41592.773 32870.332]
(1, 2313)
original [59178.895 47835.05  49483.863 39093.645]
target_q [137011.52   35876.29   37112.9    29320.234]
action 0
tf.Tensor(111935.766, shape=(), d

tf.Tensor(126060.25, shape=(), dtype=float32)
after [18809.832 14791.979 15263.253 11990.048]
(1, 2346)
original [15344.911  12023.7295 12404.234   9749.202 ]
target_q [ 11508.684  126472.195    9303.176    7311.9014]
action 1
tf.Tensor(123823.055, shape=(), dtype=float32)
tf.Tensor(121462.305, shape=(), dtype=float32)
after [13947.983 10888.948 11229.707  8825.131]
(1, 2347)
original [57107.11  45812.633 47457.926 37262.445]
target_q [ 42830.332 135843.88   35593.445  27946.834]
action 1
tf.Tensor(125488.11, shape=(), dtype=float32)
tf.Tensor(125258.18, shape=(), dtype=float32)
after [56975.21  45699.426 47343.684 37165.445]
(1, 2348)
original [48375.63  38746.957 40130.074 31503.662]
target_q [ 36281.723 137579.03   30097.555  23627.746]
action 1
tf.Tensor(128834.42, shape=(), dtype=float32)
tf.Tensor(129470.97, shape=(), dtype=float32)
after [48754.97  39049.65  40446.816 31746.83 ]
(1, 2349)
original [19225.414 15124.868 15615.444 12257.557]
target_q [ 14419.061  11343.651  11711.5

tf.Tensor(121949.445, shape=(), dtype=float32)
after [20130.324 15922.9   16473.441 12876.493]
(1, 2382)
original [14622.159 11498.477 11885.72   9289.318]
target_q [ 10966.619   8623.857   8914.29  124891.8  ]
action 3
tf.Tensor(125104.06, shape=(), dtype=float32)
tf.Tensor(122099.12, shape=(), dtype=float32)
after [13114.361 10278.962 10617.202  8298.434]
(1, 2383)
original [50200.957 40355.6   41864.324 32759.008]
target_q [ 37650.72  135914.05   31398.242  24569.256]
action 1
tf.Tensor(126764.52, shape=(), dtype=float32)
tf.Tensor(123056.516, shape=(), dtype=float32)
after [48012.902 38591.605 40022.832 31316.56 ]
(1, 2384)
original [5072.117  3743.5088 3836.8596 3006.0967]
target_q [  3804.088    2807.6316   2877.6448 133929.06  ]
action 3
tf.Tensor(134086.1, shape=(), dtype=float32)
tf.Tensor(134120.03, shape=(), dtype=float32)
after [3135.8423 2180.92   2225.196  1756.4362]
(1, 2385)
original [44761.406 35960.137 37280.45  29170.943]
target_q [ 33571.055 136863.97   27960.336  2

tf.Tensor(124975.32, shape=(), dtype=float32)
after [54239.32  43774.2   45228.4   35456.785]
(1, 2421)
original [51891.117 41849.28  43228.223 33889.33 ]
target_q [ 38918.336 137769.73   32421.168  25416.996]
action 1
tf.Tensor(128172.625, shape=(), dtype=float32)
tf.Tensor(128934.625, shape=(), dtype=float32)
after [52343.05  42223.746 43610.69  34191.395]
(1, 2422)
original [55882.773 45106.695 46599.203 36533.836]
target_q [ 41912.08  154473.6    34949.402  27400.377]
action 1
tf.Tensor(144120.86, shape=(), dtype=float32)
tf.Tensor(144135.03, shape=(), dtype=float32)
after [55896.406 45129.5   46611.47  36544.926]
(1, 2423)
original [39588.46  31860.617 32860.55  25765.213]
target_q [ 29691.346  23895.463 130951.43   19323.91 ]
action 2
tf.Tensor(122394.45, shape=(), dtype=float32)
tf.Tensor(122032.7, shape=(), dtype=float32)
after [39363.918 31685.559 32674.932 25617.445]
(1, 2424)
original [25955.92  20793.676 21396.35  16775.838]
target_q [ 19466.94   15595.257 130929.195  12581

tf.Tensor(132158.38, shape=(), dtype=float32)
after [-128.48712 -327.10886 -412.4831  -329.89606]
(1, 2460)
original [69353.61  56256.56  58137.535 45510.816]
target_q [ 52015.207 123821.68   43603.152  34133.113]
action 1
tf.Tensor(110815.62, shape=(), dtype=float32)
tf.Tensor(109157.22, shape=(), dtype=float32)
after [68369.04  55458.957 57314.125 44862.81 ]
(1, 2461)
original [-135.65324 -348.45532 -430.3803  -346.4371 ]
target_q [135202.42      -261.3415    -322.78522   -259.82782]
action 0
tf.Tensor(135619.39, shape=(), dtype=float32)
tf.Tensor(135648.17, shape=(), dtype=float32)
after [-144.2337  -353.8742  -437.75134 -353.84134]
(1, 2462)
original [39075.504 31589.396 32592.344 25501.771]
target_q [ 29306.629 141060.08   24444.258  19126.328]
action 1
tf.Tensor(133763.08, shape=(), dtype=float32)
tf.Tensor(133891.45, shape=(), dtype=float32)
after [39150.637 31652.396 32658.639 25551.71 ]
(1, 2463)
original [34114.184 27539.715 28394.291 22212.09 ]
target_q [ 25585.637  20654.78

tf.Tensor(108834.79, shape=(), dtype=float32)
after [58645.375 47707.656 49153.242 38596.645]
(1, 2497)
original [ -42.467636 -249.64383  -379.7105   -285.44458 ]
target_q [-3.1850727e+01  1.2673389e+05 -2.8478290e+02 -2.1408344e+02]
action 1
tf.Tensor(127160.44, shape=(), dtype=float32)
tf.Tensor(127576.125, shape=(), dtype=float32)
after [-165.04407 -348.1892  -487.648   -372.07086]
(1, 2498)
original [52735.3   42876.75  44159.195 34676.754]
target_q [ 39551.477 133386.75   33119.4    26007.566]
action 1
tf.Tensor(123402.81, shape=(), dtype=float32)
tf.Tensor(119518.625, shape=(), dtype=float32)
after [50446.426 41006.16  42217.664 33152.383]
(1, 2499)
original [19261.715 15511.387 15880.641 12474.724]
target_q [132957.1    11633.54   11910.48    9356.043]
action 0
tf.Tensor(124662.06, shape=(), dtype=float32)
tf.Tensor(122046.42, shape=(), dtype=float32)
after [17301.848  13906.1875 14216.715  11168.329 ]
(1, 2500)
original [1004.0895   591.4397   481.1286   412.92026]
target_q [12

tf.Tensor(141390.17, shape=(), dtype=float32)
after [46652.895 37873.664 39038.94  30546.676]
(1, 2534)
original [6915.5806 5420.095  5472.661  4268.3945]
target_q [  5186.6855   4065.0713   4104.496  130207.79  ]
action 3
tf.Tensor(130391.48, shape=(), dtype=float32)
tf.Tensor(132720.89, shape=(), dtype=float32)
after [8080.57  6371.689 6452.903 5035.804]
(1, 2535)
original [1288.5483   844.7026   780.67566  613.50256]
target_q [   966.41125 126836.414      585.5067     460.12692]
action 1
tf.Tensor(126662.4, shape=(), dtype=float32)
tf.Tensor(127456.53, shape=(), dtype=float32)
after [1764.9172  1235.4711  1177.1621   925.55334]
(1, 2536)
original [37064.69  30032.01  30914.533 24186.264]
target_q [ 27798.52   22524.008 131940.14   18139.697]
action 2
tf.Tensor(123846.34, shape=(), dtype=float32)
tf.Tensor(123461.58, shape=(), dtype=float32)
after [36828.652 29842.605 30719.63  24032.037]
(1, 2537)
original [7991.885  6298.9546 6359.672  4962.9844]
target_q [  5993.9136   4724.216  1

In [142]:
now_state= preprocess(observation)
action = agent.get_action(now_state, training=True).numpy()
observation, reward, done, info = env.step(action)
if(done):
    done = 1
    env.reset()
else:
    done = 0
next_state= preprocess(observation)
agent.step(now_state, action, reward, next_state, done)

(1, 2049)
original [139702.53  114390.664 119849.75   94112.21 ]
target_q [152190.31  85793.    89887.31  70584.16]
action 0
tf.Tensor(94575.94, shape=(), dtype=float32)
tf.Tensor(95747.09, shape=(), dtype=float32)
after [140576.61 115127.32 120587.15  94683.39]


In [151]:
agent.conv1.bias

<tf.Variable 'conv2d_6/bias:0' shape=(32,) dtype=float32, numpy=
array([-0.32756832, -0.1950604 , -0.21648353, -0.36179173,  0.10574724,
        0.21295096, -0.16353884, -0.13122225, -0.0934623 , -0.25070706,
       -0.20482333, -0.10474483, -0.17917714, -0.14940403, -0.20668572,
       -0.10768495, -0.04763309, -0.27938056,  0.21363269, -0.2580154 ,
        0.27371955,  0.09452855, -0.21097869, -0.30777   ,  0.1121348 ,
       -0.22739276, -0.17256896,  0.1737689 ,  0.08141409, -0.24401872,
        0.24518709,  0.3837577 ], dtype=float32)>

In [152]:
agent.conv1_t.bias

<tf.Variable 'conv2d_9/bias:0' shape=(32,) dtype=float32, numpy=
array([-0.32756832, -0.1950604 , -0.21648353, -0.36179173,  0.10574724,
        0.21295096, -0.16353884, -0.13122225, -0.0934623 , -0.25070706,
       -0.20482333, -0.10474483, -0.17917714, -0.14940403, -0.20668572,
       -0.10768495, -0.04763309, -0.27938056,  0.21363269, -0.2580154 ,
        0.27371955,  0.09452855, -0.21097869, -0.30777   ,  0.1121348 ,
       -0.22739276, -0.17256896,  0.1737689 ,  0.08141409, -0.24401872,
        0.24518709,  0.3837577 ], dtype=float32)>

In [153]:
agent.conv1.weights

[<tf.Variable 'conv2d_6/kernel:0' shape=(8, 8, 1, 32) dtype=float32, numpy=
 array([[[[-0.06322311, -0.05104713, -0.01655124, ..., -0.10289399,
           -0.12089036,  0.05278938]],
 
         [[-0.08422735, -0.08830922,  0.00339418, ..., -0.0574326 ,
            0.02315141,  0.20206378]],
 
         [[-0.17067887, -0.10021568, -0.04156078, ..., -0.15047398,
            0.01072003,  0.18293093]],
 
         ...,
 
         [[-0.07143264, -0.08862118, -0.15066926, ..., -0.06335142,
           -0.14472537, -0.12822069]],
 
         [[-0.01529066, -0.08579887, -0.15929799, ..., -0.0941621 ,
           -0.20622651, -0.10355239]],
 
         [[-0.03972084, -0.09820331, -0.1540581 , ..., -0.12341182,
           -0.1485434 , -0.09861366]]],
 
 
        [[[-0.05029557, -0.10313509, -0.01036505, ..., -0.07386825,
           -0.05761741,  0.08043072]],
 
         [[-0.09710483, -0.05559687, -0.02628546, ..., -0.14889568,
            0.00399434,  0.19434099]],
 
         [[-0.11637334, -0.074298

In [154]:
agent.conv1_t.weights

[<tf.Variable 'conv2d_9/kernel:0' shape=(8, 8, 1, 32) dtype=float32, numpy=
 array([[[[-0.06322311, -0.05104713, -0.01655124, ..., -0.10289399,
           -0.12089036,  0.05278938]],
 
         [[-0.08422735, -0.08830922,  0.00339418, ..., -0.0574326 ,
            0.02315141,  0.20206378]],
 
         [[-0.17067887, -0.10021568, -0.04156078, ..., -0.15047398,
            0.01072003,  0.18293093]],
 
         ...,
 
         [[-0.07143264, -0.08862118, -0.15066926, ..., -0.06335142,
           -0.14472537, -0.12822069]],
 
         [[-0.01529066, -0.08579887, -0.15929799, ..., -0.0941621 ,
           -0.20622651, -0.10355239]],
 
         [[-0.03972084, -0.09820331, -0.1540581 , ..., -0.12341182,
           -0.1485434 , -0.09861366]]],
 
 
        [[[-0.05029557, -0.10313509, -0.01036505, ..., -0.07386825,
           -0.05761741,  0.08043072]],
 
         [[-0.09710483, -0.05559687, -0.02628546, ..., -0.14889568,
            0.00399434,  0.19434099]],
 
         [[-0.11637334, -0.074298

In [150]:
agent.copy_base_to_target()

In [8]:

#             return action

# for i_episode in range(10000):
#     observation = env.reset()
#     total_reward = 0
#     for t in range(10000000):
# #         env.render()
#         now_state= preprocess(observation)
#         action = agent.get_action(now_state, training=True).numpy()
#         observation, reward, done, info = env.step(action)
#         if(done):
#             done = 1
#         else:
#             done = 0
#         next_state= preprocess(observation)
#         agent.step(now_state, action, reward, next_state, done)
#         total_reward += reward
#         if done:
#             if agent.step_count > 5000:
#                 agent.copy_base_to_target()

#             if i_episode % 50 == 0:
#                 print("Episode {} finished after {} timesteps".format(i_episode,t+1))
#                 print("reward: %d" % total_reward)
#                 print("epsilon: %s"% agent.epsilon)
#                 if agent.step_count > OBSERVATION_STEPS:
#                     agent.save(i_episode)
#             break