In [1]:
import gym
import sys
import pylab
import random
import os
import operator
from collections import deque

from skimage import io, color, transform

import numpy as np
import tensorflow as tf
from keras.models import Model,Sequential
from keras.layers import Flatten,Conv2D,Input,Dense,MaxPooling2D,Activation
import keras.backend as K
from keras.optimizers import RMSprop

# Placeholder setting; not referenced by any of the visible cells.
GAME_TYPE = ''
# Module-level Gym environment shared by the hyperparameter cell and the
# simulation loop below.
env = gym.make("MsPacman-v0")

Using TensorFlow backend.


In [2]:
# Hyperparameters

# --- Environment parameters ---
NUM_EPISODES = 80000000
# Number of consecutive preprocessed frames stacked into one network input
PHI_LENGTH = 4

# --- Agent parameters ---
EPSILON = 1                        # initial exploration rate (fully random)
EXPERIENCE_REPLAY_CAPACITY = 2000  # max transitions kept in replay memory
MINIBATCH_SIZE = 100               # transitions sampled per learning call
LEARNING_RATE = 0.01               # handed to Agent, which stores it
ACTION_SIZE = env.action_space.n   # number of discrete actions in the env
EXPLORE = 3000000                  # divisor used by the agent's epsilon decay
UPDATE_RATE = 10000                # NOTE(review): not referenced in the visible cells

PREPROCESS_IMAGE_DIM = 84  # We downsize the atari frame to 84 x 84
# Network input shape: height x width x stacked frames. The channel count is
# tied to PHI_LENGTH so the two cannot silently drift apart.
STATE_SIZE = (PREPROCESS_IMAGE_DIM, PREPROCESS_IMAGE_DIM, PHI_LENGTH)
#STATE_SIZE=(88,80,1)
#Adding a parameter loss function to experiment with different losses
print(ACTION_SIZE)

9


In [0]:
class Agent:
    """
    Deep Q-learning agent with a replay memory, an online network and a
    separate target network.

    The online network (`self.model`) picks actions and is the one trained;
    the target network (`self.target_model`) supplies the bootstrapped
    Q-values and is synchronised via `update_target_model`.
    """

    #
    #Initialization
    #
    def __init__(self, state_size,epsilon , experience_replay_capacity , minibatch_size , learning_rate ,action_size, img_dim,explore):
        """
        Args:
            state_size: input shape passed to the first conv layer.
            epsilon: initial probability of taking a random action.
            experience_replay_capacity: maximum number of stored transitions.
            minibatch_size: stored as `batch_size`.
            learning_rate: stored as `learning_rate`. NOTE: `build_model`
                compiles the optimizer with a fixed rate of 0.00025 and does
                not read this value.
            action_size: number of outputs of the Q-network.
            img_dim: side length of the square preprocessed frame.
            explore: divisor used when decaying epsilon in `learn`.
        """
        #self.loss_func=loss_func
        self.state_size = state_size
        self.action_size = action_size
        self.discount_factor = 0.90
        self.learning_rate = learning_rate
        self.epsilon = epsilon
        self.epsilon_min = 0.05
        self.batch_size = minibatch_size
        self.train_start = 1000  # NOTE(review): not read anywhere in this class
        self.explore = explore
        self.img_channels = 4 #phi_length  #coz we feed in 4 stacked b&w imgs instead of 1 rbg img
        self.processed_image_dim = img_dim

        # create replay memory using deque; the oldest entries fall off
        # automatically once maxlen is reached
        self.D = deque(maxlen=experience_replay_capacity)
        # create main model and target model
        self.model = self.build_model()
        self.target_model = self.build_model()
        # initialize target model with the online network's weights
        self.update_target_model()

    def build_model(self):
        """
        Build and compile the convolutional Q-network.

        Three strided conv layers followed by three dense layers and a linear
        output with one unit per action. Compiled with MSE loss and RMSprop.

        Returns:
            The compiled Keras `Sequential` model.
        """
        model = Sequential()

        model.add(Conv2D(32,(8,8),strides=4,padding="same",input_shape=self.state_size))
        model.add(Activation("relu"))

        model.add(Conv2D(64,(4,4),strides=2,padding="same"))
        model.add(Activation("relu"))

        model.add(Conv2D(64,(3,3),strides=1,padding="same"))
        model.add(Activation("relu"))
        model.add(Flatten())

        model.add(Dense(128, activation='relu'))
        model.add(Dense(128, activation='relu'))
        model.add(Dense(64, activation='relu'))
        model.add(Dense(self.action_size, activation='linear'))

        # The optimizer rate is fixed here; self.learning_rate is not used.
        model.compile(loss='mse', optimizer=RMSprop(lr=0.00025, rho=0.95, epsilon=None, decay=0.0))

        print("finish building the model")
        # summary() prints by itself and returns None, so it is not wrapped
        # in print() (which would emit a stray "None" line).
        model.summary()

        return model

    def update_target_model(self):
        """Copy the online network's weights into the target network."""
        self.target_model.set_weights(self.model.get_weights())

    def append_experience_replay_example(self,s_t,a_t,r_t,s_t1,done):
        """
        Add an experience replay example to our agent's replay memory. If
        memory is full, overwrite previous examples, starting with the oldest
        """
        #D is a memory cell
        #Records State,Action,Reward,Next State and the boolean done
        self.D.append((s_t, a_t, r_t, s_t1, done))

    def preprocess_observation(self, observation, prediction=False):
        """
        Helper function for preprocessing an observation for consumption by our
        deep learning network.

        The frame is converted to grayscale and resized to a square of side
        `processed_image_dim`.

        Args:
            observation: RGB frame accepted by `skimage.color.rgb2gray`.
            prediction: when True, one more leading axis is added.

        Returns:
            float32 array of shape (1, dim, dim), or (1, 1, dim, dim) when
            `prediction` is True.
        """
        side = self.processed_image_dim
        grayscale_observation = color.rgb2gray(observation)
        # Resize the 2-D frame to (side, side) first and add the leading axis
        # afterwards. Asking resize() for a 3-D output_shape on a 2-D image
        # makes skimage append a trailing axis to the input, so the rows would
        # be squashed to 1 and the singleton axis stretched to `side`.
        resized_frame = transform.resize(grayscale_observation, (side, side)).astype('float32')
        resized_observation = resized_frame[np.newaxis, ...]
        if prediction:
            resized_observation = np.expand_dims(resized_observation,0)
        return resized_observation

    #This idea is copied from https://github.com/ageron/tiny-dqn/blob/master/tiny_dqn.py
    #def preprocess_observation(self,frame):
        #mspacman_color = np.array([210, 164, 74]).mean()
        #img = frame[1:176:2, ::2]    # Crop and downsize
        #img = img.mean(axis=2)       # Convert to greyscale
        #img[img==mspacman_color] = 0 # Improve contrast by making pacman white
        #img = (img - 128) / 128 - 1  # Normalize from -1 to 1.
        #return np.expand_dims(img.reshape(88, 80, 1), axis=0)

    def take_action(self, s_t):
        """
        Given an observation, the model attempts to take an action
        according to its q-function approximation.

        With probability `epsilon` a uniformly random action index is
        returned instead (epsilon-greedy exploration).
        """
        if np.random.rand()<=self.epsilon:
            return random.randrange(self.action_size)
        actions = self.model.predict(s_t)
        return np.argmax(actions[0])

    def learn(self, batch_size):
        """
        Sample `batch_size` transitions and fit the online network on each one
        individually, then decay epsilon.

        The target for a non-terminal transition is
            r + discount * Q_target(s', argmax_a Q_online(s', a))
        and just `r` for a terminal one.

        Does nothing (and leaves epsilon untouched) while the replay memory
        holds fewer than `batch_size` transitions, because sampling would
        otherwise raise ValueError.
        """
        if len(self.D) < batch_size:
            return

        minibatch = random.sample(self.D, batch_size)

        for s_t, a_t, r_t, s_t1, done in minibatch:
            if not done:
                # action chosen by the online net, valued by the target net
                max_action = np.argmax(self.model.predict(s_t1)[0])
                target = (r_t + self.discount_factor * self.target_model.predict(s_t1)[0][max_action])
            else:
                target = r_t
            #The idea of target vector taken from a medium post:
            # 1. Use the current model to output the Q-value predictions
            target_f = self.model.predict(s_t)
            # 2. Rewrite the chosen action value with the computed target
            target_f[0][a_t] = target
            # 3. Use vectors in the objective computation
            self.model.fit(s_t, target_f, epochs=1, verbose=0)

        # Move epsilon a fraction 1/explore of the way towards epsilon_min.
        if self.epsilon > self.epsilon_min:
            self.epsilon -= (self.epsilon - self.epsilon_min) /self.explore

    def train_minibatch(self,batch_size):
        """
        Batched training step: sample a minibatch and fit the online network
        on it with a single `fit` call.

        Targets match the ones used by `learn`:
            target = reward + discount * Q_target(s', argmax_a Q_online(s', a))
                     or reward if the transition is terminal
        Only the entry of the taken action is overwritten in the predicted
        Q-vector, so the MSE loss is [target - Q(s, a)]^2 for that action.

        Unlike `learn`, this method does not decay epsilon.

        Args:
            batch_size: number of transitions to sample from replay memory.
                Each stored element is a tuple (s, a, r, s', done).
        Returns:
            The loss reported by `fit` for this step, or None when the replay
            memory holds fewer than `batch_size` transitions.
        """
        if len(self.D) < batch_size:
            return None

        train_batch = random.sample(self.D, batch_size)
        state_array = np.vstack([x[0] for x in train_batch])
        action_array = np.array([x[1] for x in train_batch])
        reward_array = np.array([x[2] for x in train_batch], dtype=float)
        next_state_array = np.vstack([x[3] for x in train_batch])
        # Force a boolean dtype so negation is a logical NOT even if the
        # stored flags are ints.
        done_array = np.array([x[4] for x in train_batch], dtype=bool)

        X_batch = state_array
        y_batch = self.model.predict(state_array)
        rows = np.arange(len(X_batch))

        # Bootstrap from the target network, using the online network only to
        # choose the action (same rule as learn()).
        best_next_actions = np.argmax(self.model.predict(next_state_array), axis=1)
        next_q = self.target_model.predict(next_state_array)[rows, best_next_actions]
        Q_target = reward_array + self.discount_factor * next_q * np.logical_not(done_array)
        y_batch[rows, action_array] = Q_target

        # Train our network using target and predicted Q values
        history = self.model.fit(X_batch, y_batch, epochs=1, verbose=0)
        return history.history['loss'][0]

           

In [0]:
def run_simulation():
    """
    Entry-point for running env simulation.

    Prints the configuration, builds an Agent from the module-level
    hyperparameters and plays NUM_EPISODES episodes on the module-level
    `env`. Every step is stored in the agent's replay memory; at the end of
    each episode the target network is synchronised and one `learn` call is
    made.

    Returns:
        (scores, episodes): the total reward of each finished episode and the
        matching episode indices.
    """

    #print game parameters
    print ("~~~Environment Parameters~~~")
    print ("Num episodes: %s" % NUM_EPISODES)
    print ("Action space: %s" % env.action_space)
    print()
    print ("~~~Agent Parameters~~~")
    print ("Epsilon: %s" % EPSILON)
    print ("Experience Replay Capacity: %s" % EXPERIENCE_REPLAY_CAPACITY)
    print ("Minibatch Size: %s" % MINIBATCH_SIZE)
    print ("Learning Rate: %s" % LEARNING_RATE)

    #initialize agent
    agent = Agent(state_size=STATE_SIZE,epsilon=EPSILON,
                experience_replay_capacity=EXPERIENCE_REPLAY_CAPACITY,
                minibatch_size=MINIBATCH_SIZE,
                learning_rate=LEARNING_RATE, action_size =ACTION_SIZE, img_dim =PREPROCESS_IMAGE_DIM ,explore =EXPLORE)

    scores, episodes = [], []

    for i_episode in range(NUM_EPISODES):
        print ("Episode: %s" % i_episode)
        done = False
        score = 0
        x_t = agent.preprocess_observation(env.reset())
        # Initial state: the first frame repeated PHI_LENGTH times along a new
        # trailing (channel) axis.
        s_t = np.stack((x_t,) * PHI_LENGTH, axis=3)

        while not done:
            #env.render()
            # get action for the current state and go one step in environment
            a_t = agent.take_action(s_t)
            x_t1, r_t, done, _ = env.step(a_t)
            score += r_t

            # give the new frame a trailing channel axis so it can be
            # concatenated with the existing stack
            x_t1 = agent.preprocess_observation(x_t1)
            x_t1 = x_t1.reshape(x_t1.shape[0], x_t1.shape[1], x_t1.shape[2], 1)

            # newest frame goes first; the last frame of the old stack is dropped
            s_t1 = np.append(x_t1, s_t[:, :, :, :PHI_LENGTH - 1], axis=3)
            agent.append_experience_replay_example(s_t, a_t, r_t, s_t1, done)

            s_t = s_t1

        # The loop above only exits once the episode is done.
        # every episode update the target model to be same with model
        agent.update_target_model()
        scores.append(score)
        episodes.append(i_episode)

        print( "  score:", score, "  epsilon:", agent.epsilon)

        agent.learn(MINIBATCH_SIZE)

    return scores, episodes

In [39]:
if __name__ == "__main__":
    # Rebind the module-level environment to a fresh instance, then start
    # the training loop.
    env = gym.make("MsPacman-v0")
    run_simulation()

~~~Environment Parameters~~~
Num episodes: 100
Action space: Discrete(9)

~~~Agent Parameters~~~
Epsilon: 1
Experience Replay Capacity: 2000
Minibatch Size: 100
Learning Rate: 0.01
finish building the model
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_67 (Conv2D)           (None, 21, 21, 32)        8224      
_________________________________________________________________
activation_67 (Activation)   (None, 21, 21, 32)        0         
_________________________________________________________________
conv2d_68 (Conv2D)           (None, 11, 11, 64)        32832     
_________________________________________________________________
activation_68 (Activation)   (None, 11, 11, 64)        0         
_________________________________________________________________
conv2d_69 (Conv2D)           (None, 11, 11, 64)        36928     
_________________________________________________________________
a

KeyboardInterrupt: ignored