In [15]:
import tensorflow as tf
from tensorflow import keras

In [16]:
input_shape = (80, 80, 4)
inputs = keras.layers.Input(input_shape)
inputs

<KerasTensor: shape=(None, 80, 80, 4) dtype=float32 (created by layer 'input_6')>

In [17]:
from model import CNNBlock, MLPBlock, DQN

In [18]:
cnn_block = CNNBlock(padding='same')
inputs = keras.layers.Input(shape=(80,80,4))
x_conv1 = cnn_block.conv1(inputs)
x_conv1

<KerasTensor: shape=(None, 20, 20, 32) dtype=float32 (created by layer 'conv2d_12')>

In [19]:
x_maxpool1 = cnn_block.maxpool1(x_conv1)
x_maxpool1

<KerasTensor: shape=(None, 10, 10, 32) dtype=float32 (created by layer 'max_pooling2d_12')>

In [20]:
x_conv2 = cnn_block.conv2(x_maxpool1)
x_conv2

<KerasTensor: shape=(None, 5, 5, 64) dtype=float32 (created by layer 'conv2d_13')>

In [21]:
x_conv3 = cnn_block.conv3(x_conv2)
x_conv3

<KerasTensor: shape=(None, 5, 5, 64) dtype=float32 (created by layer 'conv2d_14')>

In [22]:
mlp_block = MLPBlock()
x_mlp = mlp_block(x_conv3)
x_mlp

<KerasTensor: shape=(None, 2) dtype=float32 (created by layer 'mlp_block')>

In [6]:
optimizer = tf.optimizers.Adam
loss_fn = tf.keras.losses.MeanSquaredError
learning_rate = 0.001

layers = keras.Sequential()
layers.add(CNNBlock())
layers.add(MLPBlock())
layers.compile(optimizer=optimizer(learning_rate=learning_rate),
               loss=loss_fn())

In [7]:
import numpy as np

In [8]:
nparray = np.ones(shape=(80, 80, 4))
tftensor = tf.convert_to_tensor(nparray[None, :], dtype=tf.float32)
output = layers(tftensor)
output

<tf.Tensor: shape=(1, 2), dtype=float32, numpy=array([[0.01399432, 0.08301523]], dtype=float32)>

In [9]:
action = np.argmax(output.numpy()[0], axis=0)
action

1

In [10]:
samples = np.random.normal(size=(128, 80, 80, 4))
samples = tf.convert_to_tensor(samples, dtype=tf.float32)
outputs = layers(samples)
outputs

<tf.Tensor: shape=(128, 2), dtype=float32, numpy=
array([[ 0.01523719, -0.13569796],
       [-0.07555529, -0.05862848],
       [-0.08316247, -0.18791926],
       [-0.01443708, -0.13372532],
       [-0.16749619,  0.10279229],
       [-0.17586109, -0.04526759],
       [ 0.08472159,  0.20577912],
       [-0.05181425, -0.18839629],
       [-0.12569016, -0.29265094],
       [-0.10964687, -0.16069165],
       [-0.0908488 ,  0.11569434],
       [ 0.1150196 , -0.23111098],
       [-0.07088578, -0.02443665],
       [ 0.06878222,  0.03014604],
       [ 0.10169072, -0.1299534 ],
       [ 0.07117836, -0.24407434],
       [ 0.1360977 , -0.21770853],
       [-0.02557518, -0.06842843],
       [-0.14017436, -0.00423495],
       [ 0.07761571, -0.08022447],
       [ 0.11206621, -0.04884676],
       [ 0.10684249, -0.20514256],
       [ 0.06636943,  0.01582746],
       [-0.05694761, -0.09902373],
       [-0.06459805, -0.21052144],
       [-0.19375464,  0.05235211],
       [-0.11603114, -0.04398154],
     

In [11]:
actions = np.zeros_like(outputs)
actions[np.arange(actions.shape[0]), np.argmax(outputs, axis=1)] = 1
actions


array([[1., 0.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.

In [12]:
a = np.zeros((80, 80))
np.expand_dims(a, axis=-1).shape

(80, 80, 1)

In [13]:
import random

In [14]:
all_actions = [np.array([0, 1]), np.array([1, 0])]
choice = random.choice(all_actions)
choice


array([1, 0])

In [15]:
from collections import deque

In [16]:
dq = deque(maxlen=10)
for i in range(20):
    dq.append(i)
print(dq)

deque([10, 11, 12, 13, 14, 15, 16, 17, 18, 19], maxlen=10)


In [17]:
import os

In [18]:
path_checkpoint = "training_1/cp.ckpt"
print(os.path.dirname(path_checkpoint))

training_1


In [19]:
a = np.array(1)
print(a) # 1
max = tf.maximum(tf.cast(1, dtype=tf.float32), tf.cast(2, dtype=tf.float32))
print(max) # ()

1
tf.Tensor(2.0, shape=(), dtype=float32)


In [20]:
action_samples = np.array([[0, 1],
                           [1, 0], 
                           [0, 1], 
                           [1, 0], 
                           [1, 0]])

In [21]:

np.argmax(action_samples[1])

0

In [22]:
q_target = np.random.normal(size=(32, 2))

In [23]:
print(q_target[0][np.argmax([0, 1])])
print(q_target[0][[0, 1]])

0.32285670942897227
[-0.29489507  0.32285671]


In [24]:
q_target[0][[0, 1]] = 1
q_target

array([[ 1.        ,  1.        ],
       [-0.86551256, -1.03837869],
       [ 2.10839018,  0.2444458 ],
       [-0.73294962,  1.82672399],
       [ 0.55823877,  0.33659599],
       [ 1.37722289, -2.15086112],
       [-0.57151783, -0.12494877],
       [-0.55767321, -0.38480572],
       [ 0.3065331 ,  1.04543576],
       [-0.13046008, -2.92437294],
       [-0.50838919,  1.54502128],
       [ 0.04229048, -0.76800025],
       [-0.32318091,  0.28778455],
       [-0.2020337 , -0.42049578],
       [-1.50354077, -0.01762514],
       [-0.11562003,  0.61341493],
       [ 1.018556  ,  1.17389629],
       [ 0.57250868, -0.25930633],
       [ 1.158289  , -0.57246622],
       [ 0.25055596,  1.02759259],
       [ 0.88643104,  0.73369819],
       [-1.06529218,  0.61260403],
       [ 0.73809522, -0.11050408],
       [-0.02174249, -0.47437929],
       [ 0.036627  , -0.26103482],
       [ 0.32694828,  1.41344059],
       [-2.72428514, -0.12167045],
       [-1.31191741, -1.01607194],
       [-0.31609811,

In [25]:
np.zeros([2])

array([0., 0.])

In [53]:
x = np.random.normal(size=(80, 80, 4))
x_2 =  np.random.normal(size=(1, 80, 80, 4))

In [59]:
model = DQN()
model.net(x)

<tf.Tensor: shape=(1, 2), dtype=float32, numpy=array([[-0.10924925, -0.17382137]], dtype=float32)>

In [55]:
model.net(x_2)

<tf.Tensor: shape=(1, 2), dtype=float32, numpy=array([[-0.03543302, -0.22861898]], dtype=float32)>

In [63]:
x = np.random.normal(size=(1, 80, 80, 4))
model.net(x)
np.max(x)

4.04029761188

In [65]:
model.net.predict(x)



array([[-0.09924804, -0.2676801 ]], dtype=float32)

In [66]:
import argparse

In [72]:
parser = argparse.ArgumentParser(description='Ya - DQN')
parser.add_argument(
    '-p', '--path', help='The path of check point', required=True)
args = parser.parse_args()

usage: ipykernel_launcher.py [-h] -p PATH
ipykernel_launcher.py: error: the following arguments are required: -p/--path


SystemExit: 2

In [120]:
D = deque(maxlen=10)
for i in range(1, 21):
    D.append(i)
D

deque([11, 12, 13, 14, 15, 16, 17, 18, 19, 20])

In [79]:
D.append(22)


In [80]:
D

deque([13, 14, 15, 16, 17, 18, 19, 20, 21, 22])

In [111]:
import random
state_samples = np.random.normal(size=(32, 80, 80, 4))
next_state_samples = np.random.normal(size=(32, 80, 80, 4))
terminate_samples = np.random.choice([True, False], size=32)
action_samples = random.choices([np.array([1, 0]), np.array([0, 1])], k=32)
reward_samples = np.random.normal(size=32)
q_target = model.net(state_samples).numpy()
q_next = model.target_net(next_state_samples)
best_q_next = np.amax(q_next, axis=1)
for i in range(32):
    if terminate_samples[i]:
        q_target[i][np.argmax(action_samples[i])] = reward_samples[i]
    else:
        q_target[i][np.argmax(action_samples[i])] = reward_samples[i] + \
            0.99 * best_q_next[i]


In [115]:
y_pred = model.net(state_samples).numpy()
y_pred

array([[-0.1088215 , -0.18107672],
       [-0.20373952, -0.23932509],
       [-0.09412878,  0.08383213],
       [-0.1655603 ,  0.00148106],
       [-0.22586009, -0.14517355],
       [-0.25561622,  0.1106342 ],
       [-0.09309048, -0.05221125],
       [-0.04660101, -0.30940244],
       [ 0.00956449, -0.15089881],
       [-0.07583976, -0.12498897],
       [-0.04903087, -0.1042817 ],
       [-0.11799715,  0.0734737 ],
       [-0.028555  , -0.05537489],
       [ 0.0299738 , -0.0333958 ],
       [-0.14466742, -0.15505812],
       [ 0.03845222,  0.04587642],
       [ 0.12974994, -0.07707621],
       [-0.03463696, -0.0891631 ],
       [ 0.11360303, -0.09744302],
       [-0.05348741, -0.1091511 ],
       [-0.02692798, -0.1875858 ],
       [-0.23575926, -0.28892064],
       [-0.25076753, -0.13161373],
       [-0.02588608, -0.21660034],
       [ 0.00436256, -0.08090474],
       [-0.2747726 , -0.14395438],
       [-0.13032614,  0.05268465],
       [-0.11031736, -0.1338849 ],
       [-0.2083251 ,

In [116]:
q_target

array([[-1.08821496e-01, -7.84860075e-01],
       [ 1.20184338e+00, -2.39325091e-01],
       [-9.41287801e-02, -1.96499377e-01],
       [-1.65560305e-01,  7.09628820e-01],
       [-1.45232069e+00, -1.45173550e-01],
       [ 1.34915495e+00,  1.10634200e-01],
       [-9.30904821e-02,  1.55284119e+00],
       [-4.66010123e-02, -4.76558208e-01],
       [ 9.56449285e-03, -8.12821567e-01],
       [-4.96379882e-01, -1.24988973e-01],
       [-4.90308665e-02,  9.52343166e-01],
       [ 1.81898940e+00,  7.34736994e-02],
       [-2.85550021e-02,  5.45779884e-01],
       [ 2.99738050e-02, -2.57204890e-01],
       [ 8.70795488e-01, -1.55058116e-01],
       [-4.70444143e-01,  4.58764173e-02],
       [-6.61427557e-01, -7.70762116e-02],
       [ 8.52594793e-01, -8.91631022e-02],
       [ 1.13603026e-01, -4.42830503e-01],
       [ 9.71015096e-01, -1.09151103e-01],
       [-7.66771317e-01, -1.87585801e-01],
       [ 3.52894038e-01, -2.88920641e-01],
       [-2.50767529e-01,  1.11419952e+00],
       [ 3.

In [117]:
y_pred - q_target

array([[ 0.        ,  0.60378337],
       [-1.4055829 ,  0.        ],
       [ 0.        ,  0.2803315 ],
       [ 0.        , -0.70814776],
       [ 1.2264606 ,  0.        ],
       [-1.6047711 ,  0.        ],
       [ 0.        , -1.6050525 ],
       [ 0.        ,  0.16715577],
       [ 0.        ,  0.66192275],
       [ 0.42054012,  0.        ],
       [ 0.        , -1.0566249 ],
       [-1.9369866 ,  0.        ],
       [ 0.        , -0.6011548 ],
       [ 0.        ,  0.2238091 ],
       [-1.0154629 ,  0.        ],
       [ 0.50889635,  0.        ],
       [ 0.7911775 ,  0.        ],
       [-0.88723177,  0.        ],
       [ 0.        ,  0.3453875 ],
       [-1.0245025 ,  0.        ],
       [ 0.73984337,  0.        ],
       [-0.5886533 ,  0.        ],
       [ 0.        , -1.2458133 ],
       [-0.0561659 ,  0.        ],
       [ 0.9218134 ,  0.        ],
       [ 0.        , -0.631935  ],
       [ 0.        ,  1.4412682 ],
       [ 0.        ,  1.4406378 ],
       [ 0.09359698,

In [118]:
mse = tf.keras.losses.MeanSquaredError()
mse(q_target, y_pred).numpy()

0.46834016

In [123]:
mse_2 = tf.keras.losses.MeanSquaredError(reduction=tf.keras.losses.Reduction.SUM)
mse_2(q_target, y_pred).numpy()

14.986885

In [161]:
model_2 = DQN()
model_2.net(state_samples)
model_2.target_net(state_samples)

model_2.update_target_net()

In [166]:
tf.reduce_all(tf.math.equal(model_2.net(state_samples), model_2.target_net(state_samples)))

tf.random.set_seed(42)
loss = model_2.net.train_on_batch(state_samples, q_target)
target_loss = model_2.target_net.train_on_batch(state_samples, q_target)

In [168]:
loss

2.1912753582000732

In [169]:
target_loss

2.1912753582000732

In [170]:
tf.math.equal(model_2.net(state_samples), model_2.target_net(state_samples))

<tf.Tensor: shape=(32, 2), dtype=bool, numpy=
array([[ True,  True],
       [ True,  True],
       [ True,  True],
       [ True,  True],
       [ True,  True],
       [ True,  True],
       [ True,  True],
       [ True,  True],
       [ True,  True],
       [ True,  True],
       [ True,  True],
       [ True,  True],
       [ True,  True],
       [ True,  True],
       [ True,  True],
       [ True,  True],
       [ True,  True],
       [ True,  True],
       [ True,  True],
       [ True,  True],
       [ True,  True],
       [ True,  True],
       [ True,  True],
       [ True,  True],
       [ True,  True],
       [ True,  True],
       [ True,  True],
       [ True,  True],
       [ True,  True],
       [ True,  True],
       [ True,  True],
       [ True,  True]])>

In [173]:
tfvar = tf.Variable(0.1)
init_e = 0.6

if tfvar < 0.6:
    print('sss')

sss


In [180]:
t = tf.Variable(0)
ckpt = tf.train.Checkpoint(
    t=t, 
    model=model
)
t.assign_add(1000000)
print(ckpt.t.numpy())

1000000


In [193]:
state = np.random.normal(size=(3, 4))
next_img = np.random.normal(size=3)
next_img = np.expand_dims(next_img, axis=-1)
next_state = np.append(next_img, state[:, :3], axis=1)

In [194]:
state

array([[ 0.8107364 ,  1.2389503 , -0.53519395,  0.84183891],
       [-0.00627737, -1.55269877,  0.12937113,  0.71980087],
       [ 0.80914968, -0.35635918,  0.20724393, -1.70961252]])

In [195]:
next_state

array([[-0.83243669,  0.8107364 ,  1.2389503 , -0.53519395],
       [ 0.66368603, -0.00627737, -1.55269877,  0.12937113],
       [-0.48064242,  0.80914968, -0.35635918,  0.20724393]])