Question 1: In this code we want to implement value iteration for two gridworlds with an arbitrary but prescribed
number of terminal states (and their associated rewards) and inaccessible states in Python.
At the begining I have specified the number of rows and columns.
I consider a list of inaccessible states as a list with name bad and terminal states with their associated values with as a list with "Good". All
non-terminal transitions incur a reward of −1.

At first I have defined a class state. I have defined some functions as follows:
giveReward: assigns a reward in a way that problem wants
for each cell in grid we can go "up","down","left", "right" that are defined in the next position value. it show us till we are in the chart of grid we can change our position.
In class agent by considering the learning rate, we assign initial value reward. In choose action, we choose action with most expected value, in this action we have used greedy algorithm because we are looking forward to have shotest path. In function Play if the position is not equals to the position with the highest reward it will research for any possible position with the best reward.

In [68]:
import numpy as np

# global variables
BOARD_ROWS = 4
BOARD_COLS = 4
Good = [(0, 3), (3,3)]
bad = [(1, 1), (1, 2), (2, 3)]
START = (3, 0)
DETERMINISTIC = True


class State:
    def __init__(self, state=START):
        self.board = np.zeros([BOARD_ROWS, BOARD_COLS])
        self.state = state
        self.isEnd = False
        self.determine = DETERMINISTIC

    def giveReward(self):
        if self.state in Good:
            return 0
        elif self.state in bad :
            return -np.inf
        else:
            return -1

    def isEndFunc(self):
        if (self.state in Good):
            self.isEnd = True

    def nxtPosition(self, action):
        """
        action: up, down, left, right
        -------------
        0 | 1 | 2| 3|
        1 |
        2 |
        return next position
        """
        if self.determine:
            if action == "up":
                nxtState = (self.state[0] - 1, self.state[1])
            elif action == "down":
                nxtState = (self.state[0] + 1, self.state[1])
            elif action == "left":
                nxtState = (self.state[0], self.state[1] - 1)
            else:
                nxtState = (self.state[0], self.state[1] + 1)
            # if next state legal
            if (nxtState[0] >= 0) and (nxtState[0] <= (BOARD_ROWS -1)):
                if (nxtState[1] >= 0) and (nxtState[1] <= (BOARD_COLS -1)):
                    if nxtState not in bad :
                        return nxtState
            return self.state

    def showBoard(self):
        self.board[self.state] = 1
        for i in range(0, BOARD_ROWS):
            print('-----------------')
            out = '| '
            for j in range(0, BOARD_COLS):
                if self.board[i, j] == 10:
                    token = 'W'
                if self.board[i, j] == -10:
                    token = 'L'
                if self.board[i, j] == -1:
                    token = '-1'
                out += token + ' | '
            print(out)
        print('-----------------')


# Agent of player

class Agent:

    def __init__(self):
        self.states = []
        self.actions = ["up", "down", "left", "right"]
        self.State = State()
        self.lr = 0.2
        self.exp_rate = 0.3

        # initial state reward
        self.state_values = {}
        for i in range(BOARD_ROWS):
            for j in range(BOARD_COLS):
                self.state_values[(i, j)] = 0  # set initial value to 0


    def chooseAction(self):
        # choose action with most expected value
        mx_nxt_reward = 0
        action = ""

        if np.random.uniform(0, 1) <= self.exp_rate:
            action = np.random.choice(self.actions)
            while action in bad:
              action = np.random.choice(self.actions)

        else:
            # greedy action
            for a in self.actions:
                # if the action is deterministic
                nxt_reward = self.state_values[self.State.nxtPosition(a)]
                if nxt_reward >= mx_nxt_reward:
                    action = a
                    mx_nxt_reward = nxt_reward
        return action

    def takeAction(self, action):
        position = self.State.nxtPosition(action)
        return State(state=position)

    def reset(self):
        self.states = []
        self.State = State()

    def play(self, rounds=10):
        i = 0
        while i < rounds:
            # to the end of game back propagate reward
            if self.State.isEnd:
                # back propagate
                reward = self.State.giveReward()
                # explicitly assign end state to reward values
                self.state_values[self.State.state] = -1  # this is optional
                print("Game End Reward", reward)
                for s in reversed(self.states):
                    reward = self.state_values[s] + self.lr * (reward - self.state_values[s])
                    self.state_values[s] = round(reward, 3)
                for w in bad:
                  self.state_values[w] = 0.2
                self.reset()
                i += 1
            else:
                action = self.chooseAction()
                # append trace
                self.states.append(self.State.nxtPosition(action))
                print("current position {} action {}".format(self.State.state, action))
                # by taking the action, it reaches the next state
                self.State = self.takeAction(action)
                # mark is end
                self.State.isEndFunc()
                print("nxt state", self.State.state)
                print("---------------------")

    def showValues(self):
        for i in range(0, BOARD_ROWS):
            print('----------------------------------')
            out = '| '
            for j in range(0, BOARD_COLS):
                out += str(round(-0.8-self.state_values[(i, j)],3)).ljust(6) + ' | '
            print(out)
        print('----------------------------------')


ag = Agent()
ag.play(50)
ag.showValues()

current position (3, 0) action up
nxt state (2, 0)
---------------------
current position (2, 0) action right
nxt state (2, 1)
---------------------
current position (2, 1) action right
nxt state (2, 2)
---------------------
current position (2, 2) action down
nxt state (3, 2)
---------------------
current position (3, 2) action right
nxt state (3, 3)
---------------------
Game End Reward 0
current position (3, 0) action right
nxt state (3, 1)
---------------------
current position (3, 1) action up
nxt state (2, 1)
---------------------
current position (2, 1) action down
nxt state (3, 1)
---------------------
current position (3, 1) action left
nxt state (3, 0)
---------------------
current position (3, 0) action up
nxt state (2, 0)
---------------------
current position (2, 0) action down
nxt state (3, 0)
---------------------
current position (3, 0) action right
nxt state (3, 1)
---------------------
current position (3, 1) action left
nxt state (3, 0)
---------------------
current 

Question 2: the grid is a bit bigger in comparison to question 1 and it has a win_state and lose_state and some bad positions that are inaccesible.
All other things are some how like question 1.
At first I have defined a class state. I have defined some functions as follows:
giveReward: assigns a reward in a way that problem wants
for each cell in grid we can go "up","down","left", "right" that are defined in the next position value. it show us till we are in the chart of grid we can change our position.
In class agent by considering the learning rate, we assign initial value reward. In choose action, we choose action with most expected value, in this action we have used greedy algorithm because we are looking forward to have shotest path. In function Play if the position is not equals to the position with the highest reward it will research for any possible position with the best reward.

In [50]:
import numpy as np

# global variables
BOARD_ROWS = 5
BOARD_COLS = 6
WIN_STATE = (0,5)
LOSE_STATE = (4, 5)
bad = [(0, 1), (1, 1), (2, 1), (3, 1), (2, 3), (2, 4), (2, 5), (4,5)]
START = (4, 0)
DETERMINISTIC = True


class State:
    def __init__(self, state=START):
        self.board = np.zeros([BOARD_ROWS, BOARD_COLS])
        self.state = state
        self.isEnd = False
        self.determine = DETERMINISTIC

    def giveReward(self):
        if self.state == WIN_STATE:
            return 10
        elif self.state in bad :
            return -np.inf
        elif self.state == LOSE_STATE:
            return -10
        else:
            return -1

    def isEndFunc(self):
        if (self.state == WIN_STATE) or (self.state == LOSE_STATE):
            self.isEnd = True

    def nxtPosition(self, action):
        """
        action: up, down, left, right
        -------------
        0 | 1 | 2| 3|
        1 |
        2 |
        return next position
        """
        if self.determine:
            if action == "up":
                nxtState = (self.state[0] - 1, self.state[1])
            elif action == "down":
                nxtState = (self.state[0] + 1, self.state[1])
            elif action == "left":
                nxtState = (self.state[0], self.state[1] - 1)
            else:
                nxtState = (self.state[0], self.state[1] + 1)
            # if next state legal
            if (nxtState[0] >= 0) and (nxtState[0] <= (BOARD_ROWS -1)):
                if (nxtState[1] >= 0) and (nxtState[1] <= (BOARD_COLS -1)):
                    if nxtState not in bad :
                        return nxtState
            return self.state

    def showBoard(self):
        self.board[self.state] = 1
        for i in range(0, BOARD_ROWS):
            print('-----------------')
            out = '| '
            for j in range(0, BOARD_COLS):
                if self.board[i, j] == 10:
                    token = 'W'
                if self.board[i, j] == -10:
                    token = 'L'
                if self.board[i, j] == -1:
                    token = '-1'
                out += token + ' | '
            print(out)
        print('-----------------')


# Agent of player

class Agent:

    def __init__(self):
        self.states = []
        self.actions = ["up", "down", "left", "right"]
        self.State = State()
        self.lr = 0.2
        self.exp_rate = 0.3

        # initial state reward
        self.state_values = {}
        for i in range(BOARD_ROWS):
            for j in range(BOARD_COLS):
                self.state_values[(i, j)] = 0  # set initial value to 0


    def chooseAction(self):
        # choose action with most expected value
        mx_nxt_reward = 0
        action = ""

        if np.random.uniform(0, 1) <= self.exp_rate:
            action = np.random.choice(self.actions)
            while action in bad:
              action = np.random.choice(self.actions)

        else:
            # greedy action
            for a in self.actions:
                # if the action is deterministic
                nxt_reward = self.state_values[self.State.nxtPosition(a)]
                if nxt_reward >= mx_nxt_reward:
                    action = a
                    mx_nxt_reward = nxt_reward
        return action

    def takeAction(self, action):
        position = self.State.nxtPosition(action)
        return State(state=position)

    def reset(self):
        self.states = []
        self.State = State()

    def play(self, rounds=10):
        i = 0
        while i < rounds:
            # to the end of game back propagate reward
            if self.State.isEnd:
                # back propagate
                reward = self.State.giveReward()
                # explicitly assign end state to reward values
                self.state_values[self.State.state] = reward  # this is optional
                print("Game End Reward", reward)
                for s in reversed(self.states):
                    reward = self.state_values[s] + self.lr * (reward - self.state_values[s])
                    self.state_values[s] = round(reward, 3)
                self.reset()
                i += 1
            else:
                action = self.chooseAction()
                # append trace
                self.states.append(self.State.nxtPosition(action))
                print("current position {} action {}".format(self.State.state, action))
                # by taking the action, it reaches the next state
                self.State = self.takeAction(action)
                # mark is end
                self.State.isEndFunc()
                print("nxt state", self.State.state)
                print("---------------------")

    def showValues(self):
        for i in range(0, BOARD_ROWS):
            print('----------------------------------')
            out = '| '
            for j in range(0, BOARD_COLS):
                out += str(self.state_values[(i, j)]).ljust(6) + ' | '
            print(out)
        print('----------------------------------')


ag = Agent()
ag.play(50)
ag.showValues()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
---------------------
current position (3, 5) action right
nxt state (3, 5)
---------------------
current position (3, 5) action right
nxt state (3, 5)
---------------------
current position (3, 5) action right
nxt state (3, 5)
---------------------
current position (3, 5) action right
nxt state (3, 5)
---------------------
current position (3, 5) action left
nxt state (3, 4)
---------------------
current position (3, 4) action down
nxt state (4, 4)
---------------------
current position (4, 4) action right
nxt state (4, 4)
---------------------
current position (4, 4) action right
nxt state (4, 4)
---------------------
current position (4, 4) action right
nxt state (4, 4)
---------------------
current position (4, 4) action down
nxt state (4, 4)
---------------------
current position (4, 4) action right
nxt state (4, 4)
---------------------
current position (4, 4) action right
nxt state (4, 4)
---------------------
curr