<a href="https://colab.research.google.com/github/anselmo-pitombeira/Notebooks/blob/master/GridWorld_Q_Learning_Pygame.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# GridWorld com Q-learning

Este notebook ilustra uma aplicação do algoritmo Q-learning em um ambiente GridWorld (labirinto).

Um agente (por exemplo, um robô) deve aprender uma política de decisão para ir de uma origem a um destino dentro de um labirinto, evitando as paredes. A cada passo que o robô dá, ele recebe uma recompensa negativa igual a -1 (que pode ser entendida como o consumo de sua bateria). O objetivo é treinar o robô de forma que ele atinja o destino com o menor número possível de passos e, portanto, consumindo o mínimo de bateria.

Os estados possíveis são os locais em que o robô pode estar dentro do labirinto. Em cada local, o robô deve escolher se vai para baixo, cima, esquerda ou direita. Se houver uma parede na direção que ele escolhe ir, o robô permanece no mesmo lugar e recebe uma recompensa de -1.

Ademais, o ambiente é estocástico, de forma que o robô tem uma probabilidade de 10% de não ir para a direção desejada (por exemplo, porque pode haver interferência nos sensores do robô). Logo, o robô pode escolher ir para cima, mas acabar indo para uma das outras direções com 10% de chance.

O ambiente é implementado com auxílio da biblioteca PyGame.

---



In [8]:
import numpy as np
import pygame
import numpy.random as rd
from itertools import product
import matplotlib.pyplot as plt

In [5]:
##Function definitions

def step(s,a,slip_prob=0.1):
    """
    Sample one transition of the stochastic GridWorld.

    With probability ``1 - slip_prob`` the agent moves in the requested
    direction; otherwise it "slips" and moves in one of the other
    directions, chosen uniformly at random. A move that would leave the
    grid or enter a wall cell leaves the agent where it is.

    Relies on the module-level ``states`` (valid cells) and ``wall``
    (blocked cells) collections.

    Parameters
    ----------
    s : tuple
        Current state as an ``(x, y)`` pair.
    a : int
        Requested action: 0 = left (x-1), 1 = above (y+1),
        2 = right (x+1), anything else = below (y-1).
    slip_prob : float, optional
        Probability of not moving in the requested direction
        (10% by default, as stated in the notebook description).

    Returns
    -------
    tuple
        ``(newstate, reward)`` where ``reward`` is always -1.
    """
    (x,y) = s

    ##Sample the direction actually taken
    if rd.uniform() >= slip_prob:
        move = a    ##Go in the desired direction
    else:
        ##Slip: pick one of the directions that was NOT requested
        move = rd.choice([d for d in (0,1,2,3) if d != a])

    ##Displacement for each action; any other value means "below"
    offsets = {0: (-1,0), 1: (0,1), 2: (1,0)}
    dx, dy = offsets.get(move, (0,-1))

    newstate = (x+dx, y+dy)

    ##Invalid moves (outside the grid or into a wall) keep the agent in place
    if newstate not in states or newstate in wall:
        newstate = s

    ##Every step costs one unit, whether or not the agent moved
    reward = -1

    return newstate, reward

def draw_grid(screen, grid, wall, terminal):
    """
    Paint the grid onto the pygame surface ``screen``.

    ``grid`` is modified in place: wall cells are marked with 2 and the
    terminal cell with 3 before drawing. Cells equal to 1 are drawn green,
    2 red, 3 yellow, and everything else in the background cell color.
    The first array index is drawn as the screen row (vertical position)
    and the second as the screen column (horizontal position). The grid
    is treated as square, using its first dimension as the side length.
    """
    n_cells = grid.shape[0]

    ##Cell code -> fill color; codes not listed here fall back to WHITE
    palette = ((1, GREEN), (2, RED), (3, YELLOW))

    ##Clear the whole surface first
    screen.fill(BLACK)

    ##Mark the wall cells on the grid
    for wall_row, wall_col in wall:
        grid[wall_row, wall_col] = 2

    ##Mark the terminal cell on the grid
    goal_row, goal_col = terminal[0], terminal[1]
    grid[goal_row, goal_col] = 3

    ##Draw one rectangle per cell, leaving MARGIN pixels between cells
    for row in range(n_cells):
        top = (MARGIN + HEIGHT) * row + MARGIN
        for column in range(n_cells):
            left = (MARGIN + WIDTH) * column + MARGIN
            cell = grid[row][column]
            color = next((shade for code, shade in palette if cell == code),
                         WHITE)
            pygame.draw.rect(screen, color, [left, top, WIDTH, HEIGHT])


def render(screen,grid_size,wall,terminal,state):
    """
    Draw one frame showing the agent at ``state``.

    Builds a fresh ``grid_size`` x ``grid_size`` array, marks the agent's
    cell with 1, lets ``draw_grid`` paint it (together with the walls and
    the terminal cell) and then shows the frame. Uses the module-level
    ``clock`` to throttle the frame rate.
    """
    agent_row, agent_col = state

    ##Blank board with only the agent's position marked
    board = np.zeros((grid_size,grid_size))
    board[agent_row,agent_col] = 1

    draw_grid(screen, board, wall, terminal)

    ##Cap the animation at 20 frames per second
    clock.tick(20)

    ##Show what has just been drawn
    pygame.display.flip()

    ##Let pygame process its internal event queue
    pygame.event.pump()

In [6]:
##Pygame and experiment definitions

##RGB colors used when drawing the grid
BLACK = (0, 0, 0)
WHITE = (200, 200, 200)
YELLOW = (255,255,0)
GREEN = (0, 255, 0)
RED = (255, 0, 0)

##Kept for compatibility; the window is actually sized by WINDOW_SIZE below
WINDOW_HEIGHT = 400
WINDOW_WIDTH = 400

##WIDTH and HEIGHT (in pixels) of each grid cell
WIDTH = 20
HEIGHT = 20

##Margin (in pixels) between cells
MARGIN = 5

##Number of cells along each side of the (square) grid
grid_size = 10

##Initialize pygame
pygame.init()

##Size the window so that it fits the grid exactly:
##grid_size cells plus the margins around them (255 x 255 for the values above)
WINDOW_SIZE = [(MARGIN + WIDTH) * grid_size + MARGIN,
               (MARGIN + HEIGHT) * grid_size + MARGIN]
screen = pygame.display.set_mode(WINDOW_SIZE)

##Set title of the window
pygame.display.set_caption("GridWorld")

##Flag kept from the pygame template; not read by the code in this cell
done = False

##Used to manage how fast the screen updates
clock = pygame.time.Clock()

actions = [0,1,2,3]    ##left, above, right, below

action_size = len(actions)

##Every (x,y) cell of the grid
states = list(product(range(grid_size),range(grid_size)))

##Cells the agent cannot enter
wall = [(2,3),(2,4),(2,5),(2,6),(2,7),(2,8),(2,9),
        (3,3),(4,3),(5,3)]

##The goal is the last cell of the grid
terminal_state = (grid_size-1,grid_size-1)

##Prize of terminal state (boundary condition).
##A positive value such as 100 can be used instead.
terminal_prize = 0

##Array which will store q values, indexed as [x, y, action]
q_value = np.zeros((grid_size, grid_size, action_size))

##Set Q-value of terminal state:
q_value[terminal_state[0],terminal_state[1],:] = terminal_prize

n_episodes = 10
alpha = 0.3      ##Learning rate
epsilon = 1.0    ##Initial exploration probability (fully random)
##Discount factor. Can be equal to one since there is a terminal state
##(0.99 is a common alternative)
gamma = 1.0
s0 = (0,0)    ##Initial state

t = 0    ##Step counter

##Total reward collected in each episode
total_rewards = []

##Q-learning training loop.
##Each episode starts at s0 and runs until the terminal state is reached.
##The try/finally guarantees that the pygame window is closed even if the
##loop is interrupted or raises.
try:
    for i in range(n_episodes):

        total_reward = 0

        s = s0

        print("Episode #", i)

        while True:

            ##Render GridWorld
            render(screen,grid_size,wall,terminal_state,s)

            (x,y) = s

            ##Determine epsilon-greedy action
            if rd.uniform() < epsilon:
                a = rd.choice(actions)    ##Explore
            else:
                a = np.argmax(q_value[x,y,:])    ##Exploit

            newstate, reward = step(s,a)

            total_reward+=reward

            ##Update q-value. This is done for EVERY transition, including
            ##the one that enters the terminal state: there the bootstrap
            ##term is the terminal prize stored in q_value, so the last
            ##action of the episode is learned as well.
            x_new, y_new = newstate
            td_target = reward+gamma*np.max(q_value[x_new, y_new,:])
            q_value[x,y,a] = q_value[x,y,a]+alpha*(td_target-q_value[x,y,a])

            ##Test if is terminal
            if newstate == terminal_state:
                print("Terminal")
                break

            ##Update state
            s = newstate

            ##Update counter
            t+=1

        ##Epsilon annealing: explore less after each episode
        epsilon = epsilon*0.9
        if epsilon < 0.001:
            epsilon = 0.0           ##Fully greedy

        print("Total reward = ", total_reward)
        print("Alpha = ", alpha)
        print("Epsilon = ",epsilon)
        total_rewards.append(total_reward)
finally:
    pygame.quit()

##Learning curve: total reward per episode
plt.plot(total_rewards)



error: ignored

In [None]:
##Display the learned Q-table (array indexed as [x, y, action])
q_value