In [1]:
import tensorflow as tf
import tensorflow.contrib.eager as tfe
tfe.enable_eager_execution(device_policy=tfe.DEVICE_PLACEMENT_SILENT)

In [2]:
import numpy as np
from collections import deque
import random

In [3]:
## Hyper parameter
INITIAL_EPSILON = 1
FINAL_EPSILON = 0.05
LEARNING_RATE = 5e-1
EXPLORATION_STEPS = 10000
BATCH_SIZE = 32
GAMMA = 0.95

In [4]:
ttt=[]

In [5]:
class ReplayMemory:
    def __init__(self, max_length):
        self.memory = deque(maxlen=max_length)
        
    def add(self, state, action, reward, next_state, terminal):
        self.memory.append([state,action,reward, next_state, terminal])
        
    def get_batch(self, batch_size):
        sampling = random.sample(self.memory, batch_size)
        temp = tf.concat([sampling[i][0] for i in range(batch_size)], axis=0)
        global ttt
        ttt= sampling
        dq = np.array(sampling[1:])
        return temp,tf.convert_to_tensor(dq[:,0]),tf.convert_to_tensor(dq[:,1]), tf.convert_to_tensor(dq[:,2]), tf.convert_to_tensor(dq[:,3])
        
    def __len__(self):
        return len(self.memory)

In [6]:
class DQNAgent(tf.keras.Model):
    def __init__(self, state_shape, action_dim, checkpoint_directory, batch_size=32):
        super(DQNAgent, self).__init__()
        self.state_shape = state_shape
        self.action_dim = action_dim

        self.checkpoint_directory = checkpoint_directory

        
        # init q layers
        self.conv1 = tf.layers.Conv2D(32, 8, 8, padding='same', activation=tf.nn.relu)
        self.batch1 = tf.layers.BatchNormalization()
        self.conv2 = tf.layers.Conv2D(64, 4, 4, padding='same', activation=tf.nn.relu)
        self.batch2 = tf.layers.BatchNormalization()
        self.conv3 = tf.layers.Conv2D(64, 3, 3, padding='same', activation=tf.nn.relu)
        self.flatten = tf.layers.Flatten()
        
        self.dense1 = tf.layers.Dense(512, activation=tf.nn.relu)
        self.dense2 = tf.layers.Dense(action_dim, activation=None)
        
        
        # learning optimizer
        self.optimizer = tf.train.GradientDescentOptimizer(LEARNING_RATE)

        # epsilon-greedy
        self.epsilon = INITIAL_EPSILON
        self.epsilon_step = (INITIAL_EPSILON - FINAL_EPSILON)/EXPLORATION_STEPS
        
        # replay_memory
        self.replay_memory = ReplayMemory(400)
        
        self.batch_size = batch_size
        
    def predict(self, state_batch, training):
        x = self.conv1(state_batch)
        x = self.batch1(x, training=training)        
        x = self.conv2(x)
        x = self.batch2(x, training=training)
        x = self.conv3(x)
        x = self.flatten(x)
        x = self.dense1(x)
        x = self.dense2(x)
        
        return x
    
    def loss(self, state_batch, target, training):
        preds = self.predict(state_batch, training)
        loss_value = tf.losses.sparse_softmax_cross_entropy(labels=target, logits=preds)
        return loss_value
    
    def grad(self, state_batch, target, training):
        with tfe.GradientTape() as tape:
            loss_value = self.loss(state_batch, target, training)
        return tape.gradient(loss_value, self.variables)
        
    
    def get_action(self, state, training=False):
        if training:
            if self.epsilon >= random.random():
                action = tf.convert_to_tensor(random.randrange(self.action_dim))
            else:
                action = tf.argmax(self.predict(state, training=training),0)    
            
            if self.epsilon > FINAL_EPSILON:
                self.epsilon -= self.epsilon_step    
            
            print("here", action.numpy())
            return action 
        
        else:
            return tf.argmax(self.predict(state, training=training),0)    
        
    
    def fit(self, state, action, reward, next_state, terminal, num_epochs=10):
        
        self.replay_memory.add(state,action,reward,next_state,terminal)
        
        if len(self.replay_memory)<self.batch_size:
            return
        
        state_batch, action_batch, reward_batch, next_state_batch, terminal_batch = self.replay_memory.get_batch(self.batch_size)
        
        target_q_batch = self.predict(next_state_batch, training=False)
        
        y_batch = reward_batch + (1-terminal_batch) * GAMMA * np.max(target_q_batch, axis = 1)
        
        
        for i in range(num_epochs):
            grads = self.grad(state_batch, y_batch, True)
            self.optimizer.apply_gradients(zip(grads, self.variables))
                    
    
    def save(self, global_step=0):
        tfe.Saver(self.variables).save(self.checkpoint_directory, global_step=global_step)
        
        
    def load(self):
        # Run the model once to initialize variables
        dummy_input = tf.constant(tf.zeros(state_shape))
        dummy_pred = self.predict(dummy_input, training=False)
        # Restore the variables of the model
        saver = tfe.Saver(self.variables)
        saver.restore(tf.train.latest_checkpoint
                      (self.checkpoint_directory))

In [7]:
import gym
env = gym.make('Breakout-v0')
agent = DQNAgent(state_shape=(210, 160, 3), action_dim=4, checkpoint_directory="./models_checkpoints/", batch_size=32)
with tf.device("gpu:0"):
    for i_episode in range(20):
        observation = env.reset()
        total_reward = 0
        for t in range(100):
            env.render()
            now_state= tf.convert_to_tensor(observation.astype("float32").reshape(-1,210,160,3))
            action = agent.get_action(now_state, training=True).numpy()
            observation, reward, done, info = env.step(action)
            done = 1 if done == True else 0
            next_state= tf.convert_to_tensor(observation.astype("float32").reshape(-1,210,160,3))
            agent.fit(now_state, action, reward, next_state, done)
            total_reward += reward
            if done:
                print("Episode finished after {} timesteps".format(t+1))
                print("reward: %d" % reward)
                break

('here', 1)
('here', 2)
('here', 2)
('here', 0)
('here', 1)
('here', 3)
('here', 2)
('here', 0)
('here', 0)
('here', 2)
('here', 3)
('here', 2)
('here', 3)
('here', 3)
('here', 1)
('here', 2)
('here', 2)
('here', 1)
('here', 3)
('here', 0)
('here', 0)
('here', 1)
('here', 3)
('here', 1)
('here', 3)
('here', 2)
('here', 0)
('here', 0)
('here', 2)
('here', 0)
('here', 0)
('here', 0)


ValueError: setting an array element with a sequence.

In [None]:
tf.concat((temp[0][0], temp[1][0]), axis=0)

In [None]:
agent.predict(temp, training=True).numpy()

In [15]:
np.array(ttt)

ValueError: setting an array element with a sequence.