In [0]:
import gym
import sys
import pylab
import random
import os
import operator
from collections import deque

from skimage import io, color, transform

import numpy as np
import torch
import torch.nn.functional as F
import torch.nn as nn
# Game-type tag; nothing in the visible notebook reads this value.
GAME_TYPE = ""
# Module-level Gym environment used by the hyperparameter cell and run_simulation().
env = gym.make("MsPacman-v0")

In [5]:
# ---------------------------------------------------------------------------
# Hyperparameters
# ---------------------------------------------------------------------------

# --- Environment ---
NUM_EPISODES = 80000000
# Number of consecutive frames stacked into one model input.
PHI_LENGTH = 4

# --- Agent ---
EPSILON = 1                        # initial exploration probability
EXPERIENCE_REPLAY_CAPACITY = 2000  # max transitions kept in replay memory
MINIBATCH_SIZE = 100               # transitions sampled per learn() call
# Passed to Agent and printed by run_simulation(); the visible Agent.learn
# configures Adam with its own fixed step size rather than this value.
LEARNING_RATE = 0.01
ACTION_SIZE = env.action_space.n
# Divisor of the per-learn() epsilon decrement inside Agent.
EXPLORE = 3000000
# Not referenced anywhere else in the visible notebook.
UPDATE_RATE = 10000

# Atari frames are downsized to PREPROCESS_IMAGE_DIM x PREPROCESS_IMAGE_DIM.
PREPROCESS_IMAGE_DIM = 84
STATE_SIZE = (PREPROCESS_IMAGE_DIM, PREPROCESS_IMAGE_DIM, PHI_LENGTH)

print(ACTION_SIZE)

9


In [0]:
class Agent:
    """
    Double-DQN style agent: an online network chooses the best next action and
    a periodically synchronised target network evaluates it.

    States handed to `take_action` / stored in replay memory are numpy arrays
    laid out channel-last as (1, height, width, channels); they are converted
    to the channel-first layout PyTorch convolutions expect internally.
    """

    # Adam step size used by `learn`.
    # NOTE(review): the `learning_rate` constructor argument is stored on the
    # instance but was never wired to the optimizer; the original fixed value
    # is kept here so training dynamics do not silently change — confirm
    # which of the two is intended.
    OPTIMIZER_LR = 0.00015

    # (out_channels, kernel_size, stride, padding) of the three conv layers.
    _CONV_SPECS = ((32, 8, 4, 3), (64, 4, 2, 1), (64, 3, 1, 1))

    #
    #Initialization
    #
    def __init__(self, state_size,epsilon , experience_replay_capacity , minibatch_size , learning_rate ,action_size, img_dim,explore):
        """
        Args:
            state_size: shape descriptor of a stacked state (stored only).
            epsilon: initial probability of taking a random action.
            experience_replay_capacity: max number of transitions kept.
            minibatch_size: stored as `batch_size`.
            learning_rate: stored as `learning_rate` (see OPTIMIZER_LR note).
            action_size: number of discrete actions (network output width).
            img_dim: side length of the square preprocessed frame.
            explore: divisor of the per-`learn` epsilon decrement.
        """
        self.state_size = state_size
        self.action_size = action_size
        self.discount_factor = 0.90
        self.learning_rate = learning_rate
        self.epsilon = epsilon
        self.epsilon_min = 0.05
        self.batch_size = minibatch_size
        self.train_start = 1000
        self.explore = explore
        # Four stacked grayscale frames are fed in place of one RGB image.
        self.img_channels = 4
        self.processed_image_dim = img_dim

        # Replay memory; the deque drops the oldest transition when full.
        self.D = deque(maxlen=experience_replay_capacity)

        # Online network and target network start with identical weights.
        self.model = self.build_model()
        self.target_model = self.build_model()
        self.update_target_model()

        # Created once so Adam's running moment estimates persist across
        # learn() calls instead of being reset on every call.
        self.criterion = nn.SmoothL1Loss()
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=self.OPTIMIZER_LR)

    @classmethod
    def _conv_output_side(cls, side):
        """Side length of the square feature map after the conv stack."""
        for _, kernel, stride, padding in cls._CONV_SPECS:
            side = (side + 2 * padding - kernel) // stride + 1
        return side

    def build_model(self):
        """
        Build the Q-network: three conv layers followed by three linear
        layers producing one Q-value per action.

        Layer sizes follow `img_channels`, `processed_image_dim` and
        `action_size`; for 84x84 frames the flattened conv output is 6400,
        matching the previously hard-coded widths.
        """
        in_channels = self.img_channels
        n_actions = self.action_size
        conv_specs = self._CONV_SPECS
        flat_features = conv_specs[-1][0] * self._conv_output_side(self.processed_image_dim) ** 2

        class QNetwork(nn.Module):
            def __init__(self):
                super().__init__()
                (c1, k1, s1, p1), (c2, k2, s2, p2), (c3, k3, s3, p3) = conv_specs
                self.conv1 = nn.Conv2d(in_channels, c1, kernel_size=k1, stride=s1, padding=p1)
                self.conv2 = nn.Conv2d(c1, c2, kernel_size=k2, stride=s2, padding=p2)
                self.conv3 = nn.Conv2d(c2, c3, kernel_size=k3, stride=s3, padding=p3)
                # First hidden layer keeps the flattened width (6400 -> 6400 for 84x84).
                self.fc1 = nn.Linear(flat_features, flat_features)
                self.fc2 = nn.Linear(flat_features, 1024)
                self.fc3 = nn.Linear(1024, n_actions)

            def forward(self, x):
                x = F.relu(self.conv1(x))
                x = F.relu(self.conv2(x))
                x = F.relu(self.conv3(x))
                x = x.view(x.size(0), -1)
                x = F.relu(self.fc1(x))
                x = F.relu(self.fc2(x))
                return self.fc3(x)

        network = QNetwork()

        print("finish building the model")
        print(network)

        return network

    def update_target_model(self):
        """Copy the online network's weights into the target network."""
        self.target_model.load_state_dict(self.model.state_dict())

    def append_experience_replay_example(self,s_t,a_t,r_t,s_t1,done):
        """
        Add an experience replay example to our agent's replay memory. If
        memory is full, overwrite previous examples, starting with the oldest
        """
        # Stored as (state, action, reward, next state, done flag).
        self.D.append((s_t, a_t, r_t, s_t1, done))

    def preprocess_observation(self, observation, prediction=False):
        """
        Convert an RGB frame to a float32 grayscale image of shape
        (1, dim, dim), or (1, 1, dim, dim) when `prediction` is True, where
        dim is `processed_image_dim`.
        """
        grayscale_observation = color.rgb2gray(observation)
        side = self.processed_image_dim
        # Resize in 2-D first and add the leading axis afterwards. Requesting
        # a 3-D output shape from a 2-D image makes skimage pad the input
        # with a trailing singleton axis and interpolate rows -> 1, which
        # destroys the picture.
        resized_observation = transform.resize(grayscale_observation, (side, side)).astype('float32')
        resized_observation = np.expand_dims(resized_observation, 0)
        if prediction:
            resized_observation = np.expand_dims(resized_observation, 0)
        return resized_observation

    def _to_tensor(self, state):
        """
        Convert a channel-last (1, H, W, C) numpy state into a contiguous
        channel-first (1, C, H, W) float32 tensor.

        A plain reshape would reinterpret memory and interleave pixels from
        different frames; permute moves the channel axis without mixing data.
        """
        array = np.ascontiguousarray(state, dtype=np.float32)
        return torch.from_numpy(array).permute(0, 3, 1, 2).contiguous()

    def take_action(self, s_t):
        """
        Epsilon-greedy action selection.

        With probability `epsilon` return a uniformly random action index;
        otherwise return the index of the largest Q-value the online network
        predicts for `s_t` (a (1, H, W, C) numpy array).
        """
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        print("Taking action based on our current epsilon value")
        # Inference only: no autograd graph is needed here.
        with torch.no_grad():
            q_values = self.model(self._to_tensor(s_t))
        return int(q_values.argmax(dim=1).item())

    def learn(self, batch_size):
        """
        Sample `batch_size` transitions from replay memory and take one
        optimizer step per transition, then decay epsilon towards
        `epsilon_min`.

        Does nothing when the memory holds fewer than `batch_size`
        transitions (sampling would otherwise raise ValueError).
        """
        if len(self.D) < batch_size:
            return

        minibatch = random.sample(self.D, batch_size)

        for s_t, a_t, r_t, s_t1, done in minibatch:
            # The target is a constant with respect to the online weights.
            with torch.no_grad():
                if done:
                    target = torch.tensor(float(r_t), dtype=torch.float32)
                else:
                    next_state = self._to_tensor(s_t1)
                    # Online network picks the action, target network scores it.
                    best_action = int(self.model(next_state).argmax(dim=1).item())
                    next_q = self.target_model(next_state)[0, best_action]
                    target = float(r_t) + self.discount_factor * next_q

            # Q-value of the action actually taken. It must stay attached to
            # the graph: overwriting it with a detached copy (as before)
            # zeroes every gradient and the network never updates.
            predicted = self.model(self._to_tensor(s_t))[0, int(a_t)]

            loss = self.criterion(predicted, target)
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

        if self.epsilon > self.epsilon_min:
            self.epsilon -= (self.epsilon - self.epsilon_min) /self.explore


           

In [0]:
def _reset_env():
    """Reset the module-level env and return only the first observation.

    Accepts both Gym conventions: a bare observation, or an
    (observation, info) tuple.
    """
    result = env.reset()
    if isinstance(result, tuple):
        return result[0]
    return result


def _step_env(action):
    """Step the module-level env and return (observation, reward, done).

    Accepts both Gym conventions: the 4-tuple (obs, reward, done, info) and
    the 5-tuple (obs, reward, terminated, truncated, info).
    """
    result = env.step(action)
    if len(result) == 5:
        observation, reward, terminated, truncated, _ = result
        return observation, reward, bool(terminated or truncated)
    observation, reward, done, _ = result
    return observation, reward, done


def run_simulation():
    """
    Entry-point for running env simulation.

    Plays NUM_EPISODES episodes with an epsilon-greedy Agent. Every step's
    transition is stored in the agent's replay memory; after each episode the
    target network is synchronised and, once the memory holds at least
    MINIBATCH_SIZE transitions, one learn() pass is run.

    Returns:
        (scores, episodes): per-episode total reward and episode index lists.
    """

    #print game parameters
    print ("~~~Environment Parameters~~~")
    print ("Num episodes: %s" % NUM_EPISODES)
    print ("Action space: %s" % env.action_space)
    print()
    print ("~~~Agent Parameters~~~")
    print ("Epsilon: %s" % EPSILON)
    print ("Experience Replay Capacity: %s" % EXPERIENCE_REPLAY_CAPACITY)
    print ("Minibatch Size: %s" % MINIBATCH_SIZE)
    print ("Learning Rate: %s" % LEARNING_RATE)

    #initialize agent
    agent = Agent(state_size=STATE_SIZE,epsilon=EPSILON,
                experience_replay_capacity=EXPERIENCE_REPLAY_CAPACITY,
                minibatch_size=MINIBATCH_SIZE,
                learning_rate=LEARNING_RATE, action_size =ACTION_SIZE, img_dim =PREPROCESS_IMAGE_DIM ,explore =EXPLORE)

    scores, episodes = [], []

    for i_episode in range(NUM_EPISODES):
        print ("Episode: %s" % i_episode)
        done = False
        score = 0

        # The first state repeats the initial frame PHI_LENGTH times along a
        # new trailing axis: (1, H, W) -> (1, H, W, PHI_LENGTH).
        x_t = agent.preprocess_observation(_reset_env())
        s_t = np.stack([x_t] * PHI_LENGTH, axis=3)

        while not done:
            # Pick an action for the current state and advance one step.
            a_t = agent.take_action(s_t)
            x_t1, r_t, done = _step_env(a_t)
            score += r_t

            # Newest frame goes in front; the oldest stacked frame is dropped.
            x_t1 = agent.preprocess_observation(x_t1)[..., np.newaxis]
            s_t1 = np.concatenate((x_t1, s_t[..., :PHI_LENGTH - 1]), axis=3)
            agent.append_experience_replay_example(s_t, a_t, r_t, s_t1, done)

            s_t = s_t1

        # The loop above only exits once the episode is done, so the
        # end-of-episode bookkeeping is unconditional.
        agent.update_target_model()
        scores.append(score)
        episodes.append(i_episode)

        print( "  score:", score, "  epsilon:", agent.epsilon)

        # Sampling more transitions than are stored raises ValueError, so
        # skip learning until a full minibatch is available.
        if len(agent.D) >= MINIBATCH_SIZE:
            agent.learn(MINIBATCH_SIZE)

    return scores, episodes

In [11]:
if __name__ == "__main__":
    # Rebind the module-level env that run_simulation() reads, then start training.
    env = gym.make("MsPacman-v0")
    run_simulation()

~~~Environment Parameters~~~
Num episodes: 80000000
Action space: Discrete(9)

~~~Agent Parameters~~~
Epsilon: 1
Experience Replay Capacity: 2000
Minibatch Size: 100
Learning Rate: 0.01
finish building the model
model(
  (conv1): Conv2d(4, 32, kernel_size=(8, 8), stride=(4, 4), padding=(3, 3))
  (conv2): Conv2d(32, 64, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1))
  (conv3): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (fc1): Linear(in_features=6400, out_features=6400, bias=True)
  (fc2): Linear(in_features=6400, out_features=1024, bias=True)
  (fc3): Linear(in_features=1024, out_features=9, bias=True)
)
finish building the model
model(
  (conv1): Conv2d(4, 32, kernel_size=(8, 8), stride=(4, 4), padding=(3, 3))
  (conv2): Conv2d(32, 64, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1))
  (conv3): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (fc1): Linear(in_features=6400, out_features=6400, bias=True)
  (fc2): Linear(in_features=6400,

KeyboardInterrupt: ignored