In [1]:
from __future__ import print_function

import tensorflow as tf
import tensorflow.contrib.eager as tfe
tfe.enable_eager_execution(device_policy=tfe.DEVICE_PLACEMENT_SILENT)

In [2]:
import numpy as np
from collections import deque
import random

In [3]:
## Hyper parameter
INITIAL_EPSILON = 1.0
FINAL_EPSILON = 0.05
LEARNING_RATE = 0.001
EXPLORATION_STEPS = 1000
BATCH_SIZE = 32
GAMMA = 0.95

In [4]:
ttt=[]

In [5]:
class ReplayMemory:
    def __init__(self, max_length):
        self.memory = deque(maxlen=max_length)
        
    def add(self, state, action, reward, next_state, terminal):
        self.memory.append([state,action,reward, next_state, terminal])
        
    def get_batch(self, batch_size):
        sampling = np.array(random.sample(self.memory, batch_size))
        state_batch = np.stack(sampling[:,0])
        next_state_batch = np.stack(sampling[:,3])
        return state_batch,sampling[:,1], sampling[:,2], next_state_batch, sampling[:,4]
        
    def __len__(self):
        return len(self.memory)

In [6]:
class DQNAgent(tf.keras.Model):
    def __init__(self, state_shape, action_dim, checkpoint_directory, batch_size=32):
        super(DQNAgent, self).__init__()
        self.state_shape = state_shape
        self.action_dim = action_dim

        self.checkpoint_directory = checkpoint_directory

        
        # init q layers
        self.conv1 = tf.layers.Conv2D(32, 8, 8, padding='same', activation=tf.nn.relu)
        self.batch1 = tf.layers.BatchNormalization()
        self.conv2 = tf.layers.Conv2D(64, 4, 4, padding='same', activation=tf.nn.relu)
        self.batch2 = tf.layers.BatchNormalization()
        self.conv3 = tf.layers.Conv2D(64, 3, 3, padding='same', activation=tf.nn.relu)
        self.flatten = tf.layers.Flatten()
        
        self.dense1 = tf.layers.Dense(512, activation=tf.nn.relu)
        self.dense2 = tf.layers.Dense(action_dim, activation=None)
        
        
        # learning optimizer
        self.optimizer = tf.train.GradientDescentOptimizer(LEARNING_RATE)

        # epsilon-greedy
        self.epsilon = tfe.Variable(INITIAL_EPSILON)
        self.epsilon_step = (INITIAL_EPSILON - FINAL_EPSILON)/EXPLORATION_STEPS
        
        # replay_memory
        self.replay_memory = ReplayMemory(10000)
        
        self.batch_size = batch_size
        
    def predict(self, state_batch, training):
        
        if isinstance(state_batch, (np.ndarray, np.generic)):
            state_batch = tf.convert_to_tensor(state_batch)

        
        x = self.conv1(state_batch)
        x = self.batch1(x, training=training)        
        x = self.conv2(x)
        x = self.batch2(x, training=training)
        x = self.conv3(x)
        x = self.flatten(x)
        x = self.dense1(x)
        x = self.dense2(x)
        
        return x
    
    def loss(self, state_batch, target, training):
        preds = self.predict(state_batch, training)
        loss_value = tf.losses.mean_squared_error(labels=target, predictions=preds)
        return loss_value
    
    def grad(self, state_batch, target, training):
        with tfe.GradientTape() as tape:
            loss_value = self.loss(state_batch, target, training)
        return tape.gradient(loss_value, self.variables)
        
    
    def get_action(self, state, training=False):
        if training:
            if self.epsilon >= random.random():
                action = tf.convert_to_tensor(random.randrange(self.action_dim))
            else:
                action = tf.argmax(self.predict(state.reshape(-1,105,80,1), training=training),1)    
            
            if self.epsilon > FINAL_EPSILON:
                self.epsilon.assign_sub(self.epsilon_step)
            
            return action 
        
        else:
            return tf.argmax(self.predict(state.reshape(-1,105,80,1), training=training),1)    
        
    
    def fit(self, state, action, reward, next_state, terminal, num_epochs=1):
        
        self.replay_memory.add(state,action,reward,next_state,terminal)
        
        if len(self.replay_memory)<self.batch_size:
            return
        
        state_batch, action_batch, reward_batch, next_state_batch, terminal_batch = self.replay_memory.get_batch(self.batch_size)
        

        current_q = self.predict(state_batch, training= False).numpy()
        now_q = current_q.copy() * 0.75
        
        target_q_batch = self.predict(next_state_batch, training=False)
        
        y_batch = reward_batch + (1-terminal_batch) * GAMMA * np.max(target_q_batch, axis = 1)
                
        for i in range(self.batch_size):
            now_q[i,action_batch[i]] = y_batch[i]
        
#         if(terminal_batch[0]):
#             print("r" , reward_batch[0])
#             print("t" , terminal_batch[0])
#             print("q" , np.max(target_q_batch, axis = 1)[0])
#             print("s" , current_q[0])
#             print("y" , now_q[0])
        
        for i in range(num_epochs):
            grads = self.grad(state_batch, now_q, True)
            self.optimizer.apply_gradients(zip(grads, self.variables))
                    
    
    def save(self, global_step=0):
#         print("saving...%i........." % global_step , end='')
        tfe.Saver(self.variables).save(self.checkpoint_directory, global_step=global_step)
#         print("saved")
        
    def load(self):
        # Run the model once to initialize variables
        dummy_input = tf.constant(tf.zeros(self.state_shape))
        dummy_pred = self.predict(dummy_input, training=False)
        # Restore the variables of the model
        saver = tfe.Saver(self.variables)
        saver.restore(tf.train.latest_checkpoint
                      (self.checkpoint_directory))

In [7]:
def to_grayscale(img):
    return np.mean(img, axis=2).astype(np.uint8)

def downsample(img):
    return img[::2, ::2]

def preprocess(img):
    return to_grayscale(downsample(img)).astype("float32")

In [8]:
import gym
env = gym.make('Breakout-v0')
agent = DQNAgent(state_shape=(1, 105, 80, 1), action_dim=4, checkpoint_directory="./models_checkpoints/rl/", batch_size=32)
agent.load()
with tf.device("gpu:0"):
    for i_episode in range(10000):
        observation = env.reset()
        total_reward = 0
        for t in range(10000000):
            env.render()
            now_state= preprocess(observation).reshape(105,80,1)
            action = agent.get_action(now_state, training=True).numpy()
            observation, reward, done, info = env.step(action)
            if(done):
                done = 1
            else:
                done = 0
            next_state= preprocess(observation).reshape(105,80,1)
            agent.fit(now_state, action, reward, next_state, done)
            total_reward += reward
            if done:
#                 if i_episode % 10 == 0:
#                     print("%d...." % i_episode)
                if i_episode % 50 == 0:
                    print("Episode {} finished after {} timesteps".format(i_episode,t+1))
                    print("reward: %d" % total_reward)
                    print("epsilon: %s"% agent.epsilon.numpy())
                    agent.save(i_episode)
                break

INFO:tensorflow:Restoring parameters from ./models_checkpoints/rl/-600
Episode 0 finished after 189 timesteps
reward: 0
epsilon: 0.820454
Episode 50 finished after 837 timesteps
reward: 0
epsilon: 0.0490596


KeyboardInterrupt: 

In [None]:
agent.epsilon.numpy()

In [None]:
tf.concat((temp[0][0], temp[1][0]), axis=0)

In [None]:
agent.predict(temp, training=True).numpy()

In [None]:
ttt

In [None]:
type(ttt[0])

In [None]:
tf.convert_to_tensor(ttt.astype("float32"))