# Implementation of GridWorld

As described in Chapter 3 of Sutton & Barto Book on Reinforcement Learning.

In [1]:
import numpy as np
from gamelearner import *

In [2]:
class GridWorldGame:
    """Simulates the game of Gridworld.

    Class attributes:
        GridWorldGame.name (str): The game's name ('Tic Tac Toe').
        GridWorldGame.size (int): Width (and height) of grid.
        roles [int]: The player role (there is only one).
        GridWorldGame.possible_n_players (list): List of allowed
            numbers of players ([1]).
        TicTacToeGame.marks (list): The characters used to represent
            each role's move on the board (['X']).
        TicTacToeGame.help_text (dict): Various messages (strings)
            to help user.
    """

    name = 'Grid World'
    SIZE = 5
    default_shape = (SIZE, SIZE)
    roles = [1]
    possible_n_players = [1]
    marks = ['X']

    help_text = {
        'Move format': "l, r, u, or d",
        'Move not available': "That move is not possible.",
        'Number of players': "This game is only for 1 player."
    }

    A_POS = (0, 1)
    A_PRIME_POS = (4, 1)
    B_POS = (0, 3)
    B_PRIME_POS = (2, 3)

    def __init__(self, start_pos=(0, 0), moves=None, shape=default_shape,
                 a_pos=A_POS, a_prime_pos=A_PRIME_POS, b_pos=B_POS, 
                 b_prime_pos=B_PRIME_POS, discount=0.9):
        """Initialize a game.

        Args:
            moves (list): This is optional. Provide a list of completed
                moves. Each move should be a char indicating the move 
                ('l', 'r', 'u', 'd').
        """
        
        self.n_players = 1
        self.start_pos = start_pos
        self.shape = shape
        self.a_pos = a_pos
        self.a_prime_pos = a_prime_pos
        self.b_pos = b_pos
        self.b_prime_pos = b_prime_pos
        self.actions = {
            'l': np.array([0, -1]),
            'u': np.array([-1, 0]),
            'r': np.array([0, 1]),
            'd': np.array([1, 0])
        }
        self.reverse_actions = {
            'l': 'r',
            'u': 'd',
            'r': 'l',
            'd': 'u'           
        }
        self.discount = discount
        self.start_time = None
        self.end_time = None
        self.game_over = False
        self.reset()
        if moves is not None:
            for move in moves:
                self.make_move(move)
            self.start()

    def start(self):
        """Record start time (self.start_time)."""

        self.start_time = datetime.datetime.now()

    def stop(self):
        """Record end time (self.end_time)."""

        self.end_time = datetime.datetime.now()
            
    def reset(self):
        """Set the state of the game to the beginning (no moves).
        """

        self.moves = []
        self.state = np.array(self.start_pos)
        self.game_over = False
        self.start_time = None
        self.end_time = None
        self.turn = self.roles[0]

    def show_state(self):
        """Display the current state of the gridworld."""

        chars = '_abAB' + self.marks[0]
        
        states = np.zeros(self.shape, dtype='b')
        states[self.a_pos] = 1
        states[self.b_pos] = 2
        states[self.a_prime_pos] = 3
        states[self.b_prime_pos] = 4
        states[tuple(self.state)] = 5
        
        for row in states:
            print(" ".join(list(chars[i] for i in row)))

    def available_moves(self, state=None):
        """Returns list of available moves ('l', 'r', 'u', 'd').
        """

        return list(self.actions.keys())

    def update_state(self, move, state=None, set_rewards=False):
        """Updates the game state with the move to be taken.

        Args:
            move (string): Player's move ('l', 'r', 'u', or 'd').
            state (np.ndarray): Array (size (3, 3)) of game state or if
                not provided the current game state will be used.
            set_rewards (bool): If True, self.rewards will be set after
                based on the move made.

        Raises:
            ValueError if the position is out of bounds or if
            there is already a move in that position.
        """

        if state is None:
            state = self.state

        if np.all(self.state == self.a_pos):
            state[:] = self.a_prime_pos
            reward = 10.0
        elif np.all(self.state == self.b_pos):
            state[:] = self.b_prime_pos
            reward = 5.0
        else:
            state += self.actions[move[1]]
            x, y = state
            if x < 0 or x >= self.shape[0] or y < 0 or y >= self.shape[1]:
                reward = -1.0
                state -= self.actions[move[1]]
            else:
                reward = 0.0
        
        if set_rewards:
            self.reward = reward

    def next_state(self, move, state=None):
        """Returns the next state of the game if move were to be
        taken from current game state or from state if provided.

        Args:
            move (tuple): Tuple containing player role (1) and a move
                ('l', 'r', 'u', or 'd').
            state (np.ndarray): Array (size (2,)) of game state or if
                                not provided the current game state will
                                be used.

        Returns:
            next_state (np.ndarray): copy of state after move made.
        """

        if state is None:
            state = self.state
        next_state = state.copy()
        self.update_state(move, state=next_state)

        return next_state

    def make_move(self, move, show=False):
        """Update the world with a new move.

        Args:
            move (tuple): Tuple containing player role (1) and a move
                ('l', 'r', 'u', or 'd').
            show (bool): Print a message if True.
        """

        if move[1] not in self.available_moves():
            raise ValueError(self.help_text['Move not available'])

        self.update_state(move, set_rewards=True)
        self.moves.append(move)
        if show:
            print("Player made move %s" % move)

    def reverse_move(self, show=False):
        """Reverse the last move made.

        Args:
            show (bool): Print a message if True.
        """

        # TODO: This reverses the state but does not restore the 
        # previous reward
        last_move = self.moves.pop()
        if np.all(self.state == self.a_prime_pos):
            self.state[:] = self.a_pos
        elif np.all(self.state == self.b_prime_pos):
            self.state[:] = self.b_pos
        else:
            reverse_move = (last_move[0], self.reverse_actions[last_move[1]])
            self.state += self.actions[reverse_move[1]]

        if show:
            print("Last move reversed")

    def get_rewards(self):
        """Returns the reward for the player at the current time step
        as a dictionary.
        """

        return {self.turn: self.reward}

    def generate_action_key(self, state, move):
        """Converts a game state in the form of an array into an
        integer.  This is used by TDLearner to create unique 
        hashable keys for storing values in a dictionary.

        Args:
            state (np.ndarray): Game state array (shape may depend
                on the game) of type int.
            move (role, action): Tuple containing player role
                and move ('l', 'r', 'u', or 'd').

        Returns:
            key (int): unique integer representation of game state.
        """

        assert 0 <= state[0] < game.shape[0]
        assert 0 <= state[1] < game.shape[1]

        role, action = move
        assert action in self.actions.keys()

        return tuple(state), action

    def __repr__(self):

            params = []
            if self.start_pos != (0, 0):
                params.append("start_pos=%s" % self.start_pos.__repr__())
            if self.moves:
                params.append("moves=%s" % self.moves.__repr__())
            if self.shape != self.default_shape:
                params.append("shape=%s" % self.shape.__repr__())
            return "GridWorldGame(%s)" % ', '.join(params)


In [3]:
state = np.array([1, 1], dtype='b')
state

array([1, 1], dtype=int8)

In [4]:
game = GridWorldGame()
game

GridWorldGame()

In [5]:
game.roles, game.turn

([1], 1)

In [6]:
game.show_state()

X a _ b _
_ _ _ _ _
_ _ _ B _
_ _ _ _ _
_ A _ _ _


In [7]:
game.state

array([0, 0])

In [8]:
game.available_moves()

['l', 'u', 'r', 'd']

In [9]:
game.make_move((1, 'd'))

In [10]:
game.get_rewards()

{1: 0.0}

In [11]:
game.show_state()

_ a _ b _
X _ _ _ _
_ _ _ B _
_ _ _ _ _
_ A _ _ _


In [12]:
game.generate_action_key(game.state, (1, 'r'))

((1, 0), 'r')

In [13]:
game.make_move((1, 'l'))
game.show_state()

_ a _ b _
X _ _ _ _
_ _ _ B _
_ _ _ _ _
_ A _ _ _


In [14]:
game.get_rewards()

{1: -1.0}

In [15]:
game.make_move((1, 'r'))
game.show_state()

_ a _ b _
_ X _ _ _
_ _ _ B _
_ _ _ _ _
_ A _ _ _


In [16]:
game.make_move((1, 'u'))
game.show_state()

_ X _ b _
_ _ _ _ _
_ _ _ B _
_ _ _ _ _
_ A _ _ _


In [17]:
random_move = np.random.choice(list(game.actions.keys()))
game.make_move((1, random_move))
game.show_state()

_ a _ b _
_ _ _ _ _
_ _ _ B _
_ _ _ _ _
_ X _ _ _


In [18]:
game.get_rewards()

{1: 10.0}

In [19]:
game.moves

[(1, 'd'), (1, 'l'), (1, 'r'), (1, 'u'), (1, 'l')]

In [20]:
game.make_move((1, 'u'))
game.show_state()

_ a _ b _
_ _ _ _ _
_ _ _ B _
_ X _ _ _
_ A _ _ _


In [21]:
game.reverse_move()
game.show_state()

_ a _ b _
_ _ _ _ _
_ _ _ B _
_ _ _ _ _
_ X _ _ _


In [22]:
game.moves

[(1, 'd'), (1, 'l'), (1, 'r'), (1, 'u'), (1, 'l')]

In [23]:
game.reset()

In [24]:
game.show_state()

X a _ b _
_ _ _ _ _
_ _ _ B _
_ _ _ _ _
_ A _ _ _


### Play a game with a computer player

In [25]:
players = [TDLearner("TD1"), TDLearner("TD2")]
try:
    ctrl = GameController(game, players)
except AssertionError as err:
    print(err)

This game is only for 1 player.


In [26]:
td = TDLearner("TD1")
players = [td]
ctrl = GameController(game, players)

In [27]:
ctrl.play(10)

Game of Grid World with 1 players ['TD1']
X a _ b _
_ _ _ _ _
_ _ _ B _
_ _ _ _ _
_ A _ _ _
TD1's turn (l, r, u, or d): d
_ a _ b _
X _ _ _ _
_ _ _ B _
_ _ _ _ _
_ A _ _ _
TD1's turn (l, r, u, or d): d
_ a _ b _
_ _ _ _ _
X _ _ B _
_ _ _ _ _
_ A _ _ _
TD1's turn (l, r, u, or d): l
_ a _ b _
_ _ _ _ _
X _ _ B _
_ _ _ _ _
_ A _ _ _
TD1's turn (l, r, u, or d): l
_ a _ b _
_ _ _ _ _
X _ _ B _
_ _ _ _ _
_ A _ _ _
TD1's turn (l, r, u, or d): r
_ a _ b _
_ _ _ _ _
_ X _ B _
_ _ _ _ _
_ A _ _ _
TD1's turn (l, r, u, or d): l
_ a _ b _
_ _ _ _ _
X _ _ B _
_ _ _ _ _
_ A _ _ _
TD1's turn (l, r, u, or d): d
_ a _ b _
_ _ _ _ _
_ _ _ B _
X _ _ _ _
_ A _ _ _
TD1's turn (l, r, u, or d): l
_ a _ b _
_ _ _ _ _
_ _ _ B _
X _ _ _ _
_ A _ _ _
TD1's turn (l, r, u, or d): u
_ a _ b _
_ _ _ _ _
X _ _ B _
_ _ _ _ _
_ A _ _ _
TD1's turn (l, r, u, or d): r


In [28]:
td.value_function

{((0, 0), 'l'): 0.5,
 ((0, 0), 'u'): 0.5,
 ((0, 0), 'r'): 0.5,
 ((0, 0), 'd'): 0.5,
 ((1, 0), 'l'): 0.5,
 ((1, 0), 'u'): 0.5,
 ((1, 0), 'r'): 0.5,
 ((1, 0), 'd'): -0.5,
 ((2, 0), 'l'): -0.25,
 ((2, 0), 'u'): 0.5,
 ((2, 0), 'r'): 0.5,
 ((2, 0), 'd'): -0.5,
 ((2, 1), 'l'): 0.5,
 ((2, 1), 'u'): 0.5,
 ((2, 1), 'r'): 0.5,
 ((2, 1), 'd'): 0.5,
 ((3, 0), 'l'): 0.5,
 ((3, 0), 'u'): 0.5,
 ((3, 0), 'r'): 0.5,
 ((3, 0), 'd'): 0.5}

In [29]:
list(td.saved_game_actions.values())[0]

[((0, 0), 'd'),
 ((1, 0), 'd'),
 ((2, 0), 'l'),
 ((2, 0), 'l'),
 ((2, 0), 'r'),
 ((2, 1), 'l'),
 ((2, 0), 'd'),
 ((3, 0), 'l'),
 ((3, 0), 'u'),
 ((2, 0), 'r')]

### Average rewards in each position

In [30]:
values = np.zeros(game.shape[0]*game.shape[1]).reshape(game.shape)
role = 1

for row in range(game.shape[0]):
    for col in range(game.shape[1]):
        game = GridWorldGame(start_pos=(row, col))
        rewards = []
        for a in game.available_moves():
            game.make_move((role, a))
            rewards.append(game.get_rewards()[role])
            game.reverse_move()
        assert len(rewards) == 4
        values[row, col] = sum(rewards)/len(rewards)

for row in values:
    print("%5.2f "*len(row) % tuple(row))

 7.25 10.00 -0.25  5.00 -0.50 
-0.25  0.00  0.00  0.00 -0.25 
-0.25  0.00  1.25  0.00  3.75 
-0.25  0.00  0.00  2.50 -0.25 
-0.50 -0.25  7.50 -0.25 -0.50 


### Find discounted values of each position by random movement

In [31]:
td = TDLearner(learning_rate=0.25, off_policy_rate=1.0)  # always random moves
role = game.roles[0]

for i in range(15000):
    random_start_pos = tuple(random.randint(0, s-1) for s in game.shape)
    game = GridWorldGame(start_pos=random_start_pos)
    td.make_move(game=game, role=role)
    td.make_move(game=game, role=role)  # Only get reward on second move
    reward = game.get_rewards()[role]
    td.on_policy = True   # Fool it so that it updates value function
    td.reward(game=game, role=role, reward=reward)
    if i == 5000:
        td.learning_rate = 0.1

In [32]:
list(td.value_function.items())[0:10]

[(((2, 3), 'd'), -3.551507096538684),
 (((3, 3), 'l'), -5.8193113737861495),
 (((2, 3), 'l'), 10.428411354510283),
 (((2, 2), 'r'), 10.421755484956817),
 (((4, 4), 'd'), -24.2977437991157),
 (((2, 4), 'l'), 4.207747438584605),
 (((2, 3), 'r'), -1.2410274912674102),
 (((0, 3), 'r'), 7.560383047216166),
 (((0, 3), 'u'), 9.039087834897899),
 (((1, 4), 'd'), -9.159707457392146)]

In [33]:
len(td.value_function), game.shape[0]*game.shape[1]*len(game.actions)

(100, 100)

In [34]:
values = np.zeros(game.shape[0]*game.shape[1]).reshape(game.shape)

for row in range(game.shape[0]):
    for col in range(game.shape[1]):
        x = [td.get_value(((row, col), a)) for a in game.actions.keys()]
        max_value = max(x)
        values[row, col] = max_value

for row in values:
    print("%5.1f "*len(row) % tuple(row))

 86.4 -12.2  85.6   9.0  54.3 
 43.9  86.3  60.0  55.4  26.1 
 22.5  25.4  32.5  26.8   9.2 
  5.5  11.0  15.2  11.4   0.6 
 -7.9   0.4  -4.6  -7.7 -14.8 


In [35]:
GridWorldGame().show_state()

X a _ b _
_ _ _ _ _
_ _ _ B _
_ _ _ _ _
_ A _ _ _
