In [1]:
''' Game Parameters '''

# board dimensions
# Used together as the shape of the board array created in State.__init__.
BOARD_LENGTH = 5
BOARD_WIDTH = 5

# game key positions
# Positions are 2-tuples of integer coordinates.
# START is the default position of a newly created State.
START = (1, 1)
# Reaching WIN ends the game with reward 1.
WIN = (2, 3)
# Reaching LOSE ends the game with reward -1.
LOSE = (3, 2)
# NOTE(review): nextPosition does not check OBSTACLES yet (its docstring lists this as pending).
OBSTACLES = [(1, 3), (3, 1)]

# options
# INPUT_STEP is stored on each State as `inputStep`; no visible code reads it back yet.
INPUT_STEP = True
# NOTE(review): the three values below are not referenced by name in the visible cells;
# presumably intended for the learning agent -- confirm before relying on them.
EXPLORATION_RATE = 0.3
LEARNING_RATE = 0.3
NUMBER_ROUNDS = 10

In [6]:
''' State '''

# import libraries and variables
import numpy
from parameters import *

# define State class
class State:
    """Position of the player on the board plus end-of-game bookkeeping.

    Attributes:
        board: zero-filled array of shape (BOARD_LENGTH, BOARD_WIDTH).
        state: current position as a 2-tuple of integer coordinates.
        endGame: set to True by valueReward() once WIN or LOSE is reached.
        inputStep: the ``input_step`` option passed to the constructor.
    """

    # Offset added to the current position for each supported action.
    # 'right' increases the first coordinate and 'up' increases the second.
    _MOVES = {
        'up': (0, 1),
        'down': (0, -1),
        'right': (1, 0),
        'left': (-1, 0),
        'up-right': (1, 1),
        'up-left': (-1, 1),
        'down-right': (1, -1),
        'down-left': (-1, -1),
    }

    def __init__(self, state=START, input_step=INPUT_STEP):
        """Create a state located at ``state`` (defaults to START)."""
        # numpy.zeros takes the shape as ONE argument; a second positional
        # argument would be interpreted as the dtype.
        self.board = numpy.zeros((BOARD_LENGTH, BOARD_WIDTH))
        self.state = state
        self.endGame = False
        # Honour the constructor argument instead of always using the global.
        self.inputStep = input_step

    def valueReward(self):
        """Return the reward of the current position and flag terminal states.

        Returns:
            1 at WIN, -1 at LOSE (both also set ``endGame``), otherwise 0.
        """
        if self.state == WIN:
            self.endGame = True
            return 1
        elif self.state == LOSE:
            self.endGame = True
            return -1
        else:
            return 0

    def nextPosition(self, action):
        """Return the position reached by applying ``action`` to the current one.

        Args:
            action: one of 'up', 'down', 'right', 'left', 'up-right',
                'up-left', 'down-right', 'down-left'.

        Returns:
            The new position as a 2-tuple. ``self.state`` is not modified.

        Raises:
            ValueError: if ``action`` is not one of the supported names.
        """
        # TODO: still missing the checks against OBSTACLES, the board limits
        # and the inputStep option.
        if action not in self._MOVES:
            raise ValueError("unknown action: {!r}".format(action))
        dx, dy = self._MOVES[action]
        return (self.state[0] + dx, self.state[1] + dy)

    def plotBoard(self):
        """Placeholder: the board plotting function is not implemented yet."""



In [31]:
import numpy as np

# global variables
# Grid size: states are (row, column) tuples with 0 <= row < BOARD_ROWS
# and 0 <= column < BOARD_COLS.
BOARD_ROWS = 5
BOARD_COLS = 7
# Terminal cells: State.giveReward returns 1 at WIN_STATE and -1 at LOSE_STATE.
WIN_STATE = (0, 3)
LOSE_STATE = (1, 3)
# Default position of a new State. This rebinds the START name that the
# earlier game-parameters cell also defines.
START = (2, 0)
# Stored on each State as `determine`; nxtPosition only computes a move when it is True.
DETERMINISTIC = True


class State:
    """Grid-world position with reward, termination and movement rules.

    Positions are (row, column) tuples; row 0 is the top of the board.

    Attributes:
        board: BOARD_ROWS x BOARD_COLS array; -1 marks the blocked cell and
            1 marks the current position once showBoard() has been called.
        state: current (row, column) position.
        isEnd: True after isEndFunc() has seen a terminal position.
        determine: copy of DETERMINISTIC taken at construction time.
    """

    # Blocked cell: marked -1 on the board and never entered by nxtPosition.
    OBSTACLE = (1, 1)

    def __init__(self, state=START):
        """Create a state located at ``state`` (defaults to START)."""
        self.board = np.zeros([BOARD_ROWS, BOARD_COLS])
        self.board[self.OBSTACLE] = -1
        self.state = state
        self.isEnd = False
        self.determine = DETERMINISTIC

    def giveReward(self):
        """Return 1 at WIN_STATE, -1 at LOSE_STATE and 0 everywhere else."""
        if self.state == WIN_STATE:
            return 1
        elif self.state == LOSE_STATE:
            return -1
        else:
            return 0

    def isEndFunc(self):
        """Set ``isEnd`` when the current position is WIN_STATE or LOSE_STATE."""
        if (self.state == WIN_STATE) or (self.state == LOSE_STATE):
            self.isEnd = True

    def nxtPosition(self, action):
        """Return the position reached by taking ``action`` from ``self.state``.

        action: up, down, left, right
        -------------
        0 | 1 | 2| 3|
        1 |
        2 |

        Any value other than "up", "down" or "left" is treated as "right";
        this fallback is kept for backward compatibility with callers that
        pass other strings.

        Returns:
            The neighbouring (row, column) when it lies on the board and is
            not the blocked cell; otherwise the unchanged current position.

        Raises:
            NotImplementedError: if the state is not deterministic. Only the
                deterministic transition is implemented; previously this case
                silently returned None.
        """
        if not self.determine:
            raise NotImplementedError(
                "non-deterministic transitions are not implemented")

        row, col = self.state[0], self.state[1]
        if action == "up":
            candidate = (row - 1, col)
        elif action == "down":
            candidate = (row + 1, col)
        elif action == "left":
            candidate = (row, col - 1)
        else:
            candidate = (row, col + 1)

        # A move is legal when it stays on the board and avoids the obstacle.
        on_board = (0 <= candidate[0] <= BOARD_ROWS - 1
                    and 0 <= candidate[1] <= BOARD_COLS - 1)
        if on_board and candidate != self.OBSTACLE:
            return candidate
        return self.state

    def showBoard(self):
        """Print the board: '*' current position, 'z' blocked cell, '0' free.

        Note that this writes 1 into ``self.board`` at the current position.
        """
        self.board[self.state] = 1
        for i in range(0, BOARD_ROWS):
            print('-----------------')
            out = '| '
            for j in range(0, BOARD_COLS):
                cell = self.board[i, j]
                if cell == 1:
                    token = '*'
                elif cell == -1:
                    token = 'z'
                elif cell == 0:
                    token = '0'
                else:
                    # Not produced by this class; avoids an unbound/stale token.
                    token = '?'
                out += token + ' | '
            print(out)
        print('-----------------')


# Agent of player

class Agent:
    """Epsilon-greedy agent that learns a value for every grid cell.

    Attributes:
        states: positions visited during the current game, in order.
        actions: the action names handed to ``State.nxtPosition``.
        State: the current ``State`` object.
        lr: learning rate used when propagating the final reward backwards.
        exp_rate: probability of taking a random action instead of the
            greedy one.
        state_values: dict mapping (row, column) to the learned value.
    """

    def __init__(self, lr=0.2, exp_rate=0.3):
        """Create an agent at the default start state with all values at 0.

        Args:
            lr: learning rate (default 0.2).
            exp_rate: exploration probability (default 0.3).
        """
        self.states = []
        self.actions = ["up", "down", "left", "right"]
        self.State = State()
        self.lr = lr
        self.exp_rate = exp_rate

        # initial state reward: every cell starts at 0
        self.state_values = {}
        for i in range(BOARD_ROWS):
            for j in range(BOARD_COLS):
                self.state_values[(i, j)] = 0

    def chooseAction(self):
        """Pick the next action: random with probability ``exp_rate``, else greedy.

        The greedy choice is the action whose resulting position has the
        highest learned value; ties go to the action listed last.
        """
        if np.random.uniform(0, 1) <= self.exp_rate:
            return np.random.choice(self.actions)

        # Start from -inf rather than 0: with a 0 baseline no action was
        # selected when every neighbouring value was negative, and the empty
        # action string was returned instead.
        best_action = ""
        best_value = float("-inf")
        for a in self.actions:
            value = self.state_values[self.State.nxtPosition(a)]
            if value >= best_value:
                best_action = a
                best_value = value
        return best_action

    def takeAction(self, action):
        """Return a new ``State`` located where ``action`` leads."""
        position = self.State.nxtPosition(action)
        return State(state=position)

    def reset(self):
        """Forget the current trace and go back to a fresh start state."""
        self.states = []
        self.State = State()

    def play(self, rounds=10):
        """Play ``rounds`` games, updating ``state_values`` after each one.

        When a game ends the terminal reward is propagated backwards through
        the visited positions: each value moves towards the value of its
        successor by a factor ``lr``. One summary line is printed per game.
        """
        finished = 0
        moves = 0
        while finished < rounds:
            if self.State.isEnd:
                # back propagate from the end of the game
                reward = self.State.giveReward()
                # explicitly assign end state to reward values (optional)
                self.state_values[self.State.state] = reward
                print("Game {} End Reward {}: #movements {}".format(finished, reward, moves))
                for s in reversed(self.states):
                    # The unrounded value is carried to the previous position;
                    # only the stored value is rounded.
                    reward = self.state_values[s] + self.lr * (reward - self.state_values[s])
                    self.state_values[s] = round(reward, 3)
                self.reset()
                finished += 1
                moves = 0
            else:
                action = self.chooseAction()
                # Move once and record where we landed (no second nxtPosition call).
                self.State = self.takeAction(action)
                self.states.append(self.State.state)
                # mark is end
                self.State.isEndFunc()
                moves += 1

    def showValues(self):
        """Print the learned value of every cell as a table; returns None."""
        for i in range(0, BOARD_ROWS):
            print('-------------------------------------')
            out = '| '
            for j in range(0, BOARD_COLS):
                out += str(self.state_values[(i, j)]).ljust(6) + ' | '
            print(out)
        print('-------------------------------------')

In [32]:
# Create the agent: it starts at the default State and every cell value is 0.
ag = Agent()

In [33]:
# Train for 100 games; play() prints one summary line per finished game.
# NOTE(review): no random seed is set in the visible cells, so the movement
# counts printed below are expected to differ between runs.
ag.play(100)

Game 0 End Reward -1: #movements 5
Game 1 End Reward 1: #movements 826
Game 2 End Reward 1: #movements 38
Game 3 End Reward 1: #movements 19
Game 4 End Reward 1: #movements 21
Game 5 End Reward 1: #movements 27
Game 6 End Reward 1: #movements 30
Game 7 End Reward 1: #movements 33
Game 8 End Reward 1: #movements 8
Game 9 End Reward 1: #movements 22
Game 10 End Reward 1: #movements 9
Game 11 End Reward 1: #movements 13
Game 12 End Reward 1: #movements 6
Game 13 End Reward 1: #movements 10
Game 14 End Reward 1: #movements 7
Game 15 End Reward 1: #movements 8
Game 16 End Reward 1: #movements 5
Game 17 End Reward 1: #movements 7
Game 18 End Reward 1: #movements 8
Game 19 End Reward 1: #movements 6
Game 20 End Reward 1: #movements 5
Game 21 End Reward 1: #movements 6
Game 22 End Reward 1: #movements 5
Game 23 End Reward 1: #movements 8
Game 24 End Reward 1: #movements 6
Game 25 End Reward 1: #movements 6
Game 26 End Reward 1: #movements 13
Game 27 End Reward 1: #movements 7
Game 28 End Rewar

In [34]:
# showValues() prints the table itself and returns None, so call it directly
# instead of wrapping it in print(), which emitted a stray "None" line.
ag.showValues()

-------------------------------------
| 0.977  | 0.979  | 0.985  | 1.0    | 0.775  | 0.431  | 0.084  | 
-------------------------------------
| 0.951  | 0      | 0.833  | -1.0   | 0.016  | 0.208  | 0.024  | 
-------------------------------------
| 0.889  | 0.636  | 0.212  | -0.119 | 0.043  | 0.108  | 0.01   | 
-------------------------------------
| 0.777  | 0.205  | -0.014 | -0.049 | 0.0    | 0.023  | 0.009  | 
-------------------------------------
| 0.049  | 0.049  | -0.001 | 0.0    | 0.0    | 0.002  | 0.004  | 
-------------------------------------
None


In [35]:
# Render a fresh board: '*' is the agent at START, 'z' the blocked cell, '0' a free cell.
State().showBoard()

-----------------
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 
-----------------
| 0 | z | 0 | 0 | 0 | 0 | 0 | 
-----------------
| * | 0 | 0 | 0 | 0 | 0 | 0 | 
-----------------
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 
-----------------
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 
-----------------
