In [53]:
import tensorflow as tf
import numpy as np
import retro
import matplotlib.pyplot as plt
from skimage.color import rgb2gray
from skimage.transform import resize
from collections import deque
%matplotlib inline

In [2]:
from keras.utils import to_categorical
# Create the Gym Retro environment for the Atari 2600 version of Space Invaders.
env = retro.make(game = 'SpaceInvaders-Atari2600')
# One row per action index in [0, env.action_space.n): row i is the one-hot
# encoding of i. Rows of this matrix are what later cells pass to env.step()
# and feed to the network as the "action taken" mask.
actions = to_categorical(range(env.action_space.n))

Using TensorFlow backend.


In [85]:
def img_preprocess(frame):
    """Convert a raw emulator frame into the image the network consumes.

    The frame is converted with ``rgb2gray``, the first 20 and the last 10
    rows are cut away, and what remains is resized to 110 x 84.
    """
    gray = rgb2gray(frame)
    cropped = gray[20:-10, :]
    return resize(cropped, (110, 84))

# Number of frames kept in a stack: 4
def stack_frames(stack,frame,is_new_episode = False):
    """Preprocess ``frame`` and push it onto the frame stack.

    Args:
        stack: current frame stack; only used (and mutated in place) when
            ``is_new_episode`` is false.
        frame: raw frame, handed to ``img_preprocess``.
        is_new_episode: when true, ``stack`` is ignored and a fresh
            ``deque(maxlen=4)`` holding the processed frame 4 times is returned.

    Returns:
        The stack containing the newly processed frame.
    """
    processed = img_preprocess(frame)

    if not is_new_episode:
        stack.append(processed)
        return stack

    # A new episode has no history yet, so the first frame fills every slot.
    return deque([processed] * 4, maxlen=4)


In [140]:
# --- Network ---------------------------------------------------------------
# Preprocessed frame height and width, and the number of stacked frames.
state_size = [110, 84, 4]
action_size = env.action_space.n
learning_rate = 2.5e-4

# --- Training loop ---------------------------------------------------------
num_episodes = 50
max_steps = 50000
batch_size = 64

# --- Epsilon-greedy exploration --------------------------------------------
# Epsilon decays exponentially from max_epsilon towards min_epsilon.
max_epsilon = 1.0
min_epsilon = 0.01
decay = 1e-5

# --- Q-learning ------------------------------------------------------------
# Discount factor applied to the next state's best Q-value.
gamma = 0.9

# --- Replay memory ---------------------------------------------------------
# Number of random transitions stored before training starts.
first_time_size = batch_size
# Maximum number of transitions the replay memory keeps.
max_mem_len = 20000 * batch_size

In [141]:
class DQNet:
    """Convolutional Q-network assembled as a TensorFlow 1.x graph.

    Three strided convolutions feed a 512-unit dense layer followed by a linear
    output layer with one unit per action. The graph also contains the loss
    (mean squared difference between the fed target and the output selected by
    the fed action mask) and an Adam training op.
    """
    def __init__(self,state_size,action_size,learning_rate,name="DQnet"):
        """Create placeholders, layers, loss and training op inside scope ``name``.

        Args:
            state_size: shape of a single state, without the batch dimension.
            action_size: number of output units and width of the action placeholder.
            learning_rate: learning rate handed to the Adam optimizer.
            name: variable scope wrapping everything created here.
        """
        self.learning_rate = learning_rate
        self.action_size = action_size
        self.state_size = state_size
        self.name = name
        
        with tf.variable_scope(self.name):
            # Batch of states: [batch, *state_size].
            self.input_ = tf.placeholder(tf.float32,shape=[None,*self.state_size], name="input")
            # Per-sample action mask: [batch, action_size]. The training cell
            # feeds rows of the one-hot `actions` matrix here.
            self.action_ = tf.placeholder(tf.float32, shape=[None,self.action_size], name = "actions")
            # Target values for the loss. No static shape is declared, so the
            # caller is responsible for feeding something compatible with Qpred.
            self.Q = tf.placeholder(tf.float32,name="Q")
            
            # Convolutional Layer #1
            self.conv1 = tf.layers.conv2d(
                inputs=self.input_,
                filters=32,
                kernel_size=[8, 8],
                strides=[2,2],
                padding="valid",
                activation=tf.nn.elu,
                kernel_initializer=tf.contrib.layers.xavier_initializer_conv2d(),
                name = "conv1")
            
            # Convolutional Layer #2
            self.conv2 = tf.layers.conv2d(
                inputs=self.conv1,
                filters=32,
                kernel_size=[4,4],
                strides=[2,2],
                padding="valid",
                activation=tf.nn.elu,
                kernel_initializer=tf.contrib.layers.xavier_initializer_conv2d(),
                name = "conv2")
            
            # Convolutional Layer #3
            self.conv3 = tf.layers.conv2d(
                inputs=self.conv2,
                filters=32,
                kernel_size=[3,3],
                strides=[2,2],
                padding="valid",
                activation=tf.nn.elu,
                kernel_initializer=tf.contrib.layers.xavier_initializer_conv2d(),
                name = "conv3")
            
            # Collapse the conv feature maps into one vector per sample.
            self.flat = tf.layers.flatten(self.conv3,name = "flatten")
            
            # Fully connected hidden layer.
            self.fc = tf.layers.dense(inputs=self.flat,
                                      units=512,
                                      activation=tf.nn.elu,
                                      name = "fc1")
            # Linear output layer: one value per action. No explicit name is
            # passed, so TensorFlow assigns a default layer name.
            self.out = tf.layers.dense(inputs=self.fc,
                                      units = self.action_size,
                                      activation=None,
                                      )
            
            # Multiply the outputs by the action mask and sum over the action
            # axis: with a one-hot mask this keeps only the chosen action's value.
            self.Qpred = tf.reduce_sum(tf.multiply(self.out, self.action_), axis=1)
            
            # Mean squared error between fed targets and predicted values.
            self.loss = tf.reduce_mean(tf.square(self.Q - self.Qpred))
            
            # Despite the attribute name, this holds the training op returned by
            # minimize(), not the optimizer object itself.
            self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate,).minimize(self.loss)
            
            

In [142]:
# Clear the default graph before building the network, so this cell starts
# from an empty graph each time it is executed.
tf.reset_default_graph()

net = DQNet(
    state_size=state_size,
    action_size=action_size,
    learning_rate=learning_rate,
)

In [143]:
class Memory:
    """Bounded first-in-first-out store with uniform random sampling."""

    def __init__(self,size):
        """Create an empty store that keeps at most ``size`` entries."""
        self.mem = deque(maxlen = size)

    def add(self,data):
        """Append one entry; when the store is full the oldest entry is dropped."""
        self.mem.append(data)

    def sample(self,batch_size):
        """Return a list of ``batch_size`` distinct entries drawn uniformly at random.

        NumPy raises ``ValueError`` when ``batch_size`` is larger than the
        number of stored entries, because sampling is without replacement.
        """
        candidates = np.arange(len(self.mem))
        picked = np.random.choice(candidates,
                                  size = batch_size,
                                  replace = False)

        batch = []
        for position in picked:
            batch.append(self.mem[position])
        return batch
        

In [144]:
# Pre-fill the replay memory with `first_time_size` transitions collected by a
# uniformly random policy, so the first training step can already sample a
# full mini-batch.
stacked_frames = []
mem = Memory(max_mem_len)

# Start the first episode: the stack is initialised with 4 copies of frame 0.
state = env.reset()
stacked_frames = stack_frames(stacked_frames, state, True)
state = np.stack(stacked_frames,axis=2)

for i in range(first_time_size):
    # Draw uniformly over *every* action. np.random.randint excludes its upper
    # bound, so randint(len(actions)) covers indices 0 .. len(actions)-1.
    action = actions[np.random.randint(len(actions))]
    next_state, reward, done, _ = env.step(action)

    if done:
        # Terminal transition: there is no real next state, store zeros.
        next_state = np.zeros(state.shape)
        mem.add((state,action,reward,next_state,done))

        # Start a fresh episode with a freshly initialised frame stack.
        state = env.reset()
        stacked_frames = stack_frames(stacked_frames, state, True)
        state = np.stack(stacked_frames,axis=2)
    else:
        # Mid-episode: slide the new frame into the existing stack
        # (is_new_episode=False) so the state keeps the 3 previous frames.
        stacked_frames = stack_frames(stacked_frames, next_state, False)
        next_state = np.stack(stacked_frames,axis=2)

        mem.add((state,action,reward,next_state,done))
        state = next_state

In [145]:
# Setup TensorBoard Writer
# NOTE(review): the log directory is an absolute path at the filesystem root;
# confirm that is intended and writable, or switch to a project-relative path.
writer = tf.summary.FileWriter("/tensorboard/dqn/1")

## Losses
# Register the network's training loss as a scalar summary tagged "Loss".
tf.summary.scalar("Loss", net.loss)

# Single op that evaluates every summary registered so far; the training cell
# runs it and hands the result to `writer`.
write_op = tf.summary.merge_all()



In [146]:
"""
This function implements the epsilon-greedy action selection step:
with probability ϵ select a random action a_t, otherwise select a_t = argmax_a Q(s_t, a).
"""
def predict_action(explore_start, explore_stop, decay_rate, decay_step, state, actions):
    """Choose an action with an exponentially decaying epsilon-greedy policy.

    Epsilon is ``explore_stop + (explore_start - explore_stop) * exp(-decay_rate * decay_step)``.
    With probability epsilon a uniformly random row of ``actions`` is returned;
    otherwise the row whose index maximises the network output for ``state``.

    Args:
        explore_start: epsilon at ``decay_step == 0``.
        explore_stop: value epsilon decays towards.
        decay_rate: exponential decay rate.
        decay_step: number of steps taken so far.
        state: a single state (no batch dimension); only read when exploiting.
        actions: indexable collection of actions, one entry per network output.

    Returns:
        ``(action, explore_probability)``.

    Note:
        The exploitation branch reads the module-level ``sess`` and ``net``.
    """
    # Draw the random number first, then compute the current epsilon.
    exp_exp_tradeoff = np.random.rand()
    explore_probability = explore_stop + (explore_start - explore_stop) * np.exp(-decay_rate * decay_step)

    if explore_probability > exp_exp_tradeoff:
        # Exploration: np.random.randint excludes its upper bound, so this
        # covers every index 0 .. len(actions)-1, including the last action.
        choice = np.random.randint(len(actions))
    else:
        # Exploitation: evaluate the Q-values for this state (a batch of one)
        # and take the index of the largest.
        Qs = sess.run(net.out, feed_dict = {net.input_: state.reshape((1, *state.shape))})
        choice = np.argmax(Qs)

    return actions[choice], explore_probability

In [147]:
# Train the DQN: play episodes with an epsilon-greedy policy, store every
# transition in the replay memory and fit the network on a random mini-batch
# after each environment step.
training = True          # set to False to skip training
episode_render = False
rewards_list = []
# Saver will help us to save our model
saver = tf.train.Saver()

if training:
    with tf.Session() as sess:
        # Initialize the variables
        sess.run(tf.global_variables_initializer())

        # Global step counter driving the epsilon decay (never reset).
        decay_step = 0

        # Defined up front so the end-of-episode report cannot hit an unbound
        # name if an episode finishes before the first training step.
        loss = float("nan")

        for episode in range(num_episodes):
            step = 0
            episode_rewards = []

            # Make a new episode and observe the first state.
            # stack_frames also runs the frame preprocessing.
            state = env.reset()
            stacked_frames = stack_frames(stacked_frames, state, True)
            state = np.stack(stacked_frames, axis=2)

            while step < max_steps:
                step += 1
                decay_step += 1

                # Predict the action to take and take it
                action, explore_probability = predict_action(max_epsilon, min_epsilon, decay, decay_step, state, actions)
                next_state, reward, done, _ = env.step(action)

                if episode_render:
                    env.render()

                episode_rewards.append(reward)

                if done:
                    # The episode ended, so there is no real next frame: push an
                    # all-zero frame of the preprocessed size straight onto the
                    # stack. It does not go through img_preprocess, because the
                    # frame is already in its final 2-D form.
                    stacked_frames.append(np.zeros(state.shape[:2]))
                    next_state = np.stack(stacked_frames, axis=2)

                    # Setting step = max_steps ends the episode after this
                    # iteration's learning step.
                    step = max_steps

                    total_reward = np.sum(episode_rewards)

                    print('Episode: {}'.format(episode),
                                  'Total reward: {}'.format(total_reward),
                                  'Explore P: {:.4f}'.format(explore_probability),
                                'Training Loss {:.4f}'.format(loss))

                    rewards_list.append((episode, total_reward))

                    # Store transition <st,at,rt+1,st+1> in memory D
                    mem.add((state, action, reward, next_state, done))

                else:
                    # Slide the new frame into the stack and store the transition.
                    stacked_frames = stack_frames(stacked_frames, next_state, False)
                    next_state = np.stack(stacked_frames, axis=2)
                    mem.add((state, action, reward, next_state, done))

                    # st+1 is now our current state
                    state = next_state

                ### LEARNING PART
                # Obtain random mini-batch from memory
                batch = mem.sample(batch_size)
                states_mb = np.array([each[0] for each in batch], ndmin=3)
                actions_mb = np.array([each[1] for each in batch])
                rewards_mb = np.array([each[2] for each in batch])
                next_states_mb = np.array([each[3] for each in batch], ndmin=3)
                dones_mb = np.array([each[4] for each in batch])

                # Get Q values for next_state
                Qs_next_state = sess.run(net.out, feed_dict = {net.input_: next_states_mb})

                # Q_target = r                          if the episode ended at s'
                #          = r + gamma * max_a' Q(s',a') otherwise
                targets_mb = np.where(dones_mb,
                                      rewards_mb,
                                      rewards_mb + gamma * np.max(Qs_next_state, axis=1))

                # One run fetches the loss, applies the update and evaluates the
                # summaries, so the logged loss is the one that was optimised
                # and no second forward pass is needed.
                loss, _, summary = sess.run([net.loss, net.optimizer, write_op],
                                            feed_dict={net.input_: states_mb,
                                                       net.Q: targets_mb,
                                                       net.action_: actions_mb})

                # Write TF Summaries. They are indexed by the global step:
                # a summary is written every step, so indexing by episode would
                # place many points on the same x value.
                writer.add_summary(summary, decay_step)
                writer.flush()

            # Save model every 5 episodes
            if episode % 5 == 0:
                save_path = saver.save(sess, "./models/model.ckpt")
                print("Model Saved")

Episode: 0 Total reward: 270.0 Explore P: 0.9706 Training Loss 2.1497
Model Saved
Episode: 1 Total reward: 350.0 Explore P: 0.9397 Training Loss 2.3392
Episode: 2 Total reward: 155.0 Explore P: 0.9174 Training Loss 4.9128
Episode: 3 Total reward: 210.0 Explore P: 0.8954 Training Loss 0.7245
Episode: 4 Total reward: 415.0 Explore P: 0.8696 Training Loss 3.6712
Episode: 5 Total reward: 210.0 Explore P: 0.8489 Training Loss 9.7303
Model Saved


KeyboardInterrupt: 