https://github.com/MohammadAsadolahi/Reinforcement-Learning-solving-a-simple-4by4-Gridworld-using-policy-iteration-in-python/blob/main/README.md

In [1]:
import numpy as np
class GridWorld:
    """A 4x4 grid world used for a sampled policy-iteration experiment.

    States are ``(row, column)`` tuples.  The agent starts at ``(0, 0)``;
    ``(3, 3)`` has no entry in ``self.actions`` and is therefore terminal.
    Entering a cell listed in ``self.rewards`` yields that reward, any other
    cell yields 0.
    """

    # Layout (S = start, T = terminal, * = cell with a negative reward):
    #   S O O O
    #   O O O *
    #   O * O O
    #   O * O T

    # (row delta, column delta) for every action symbol.
    _MOVES = {'U': (-1, 0), 'D': (1, 0), 'L': (0, -1), 'R': (0, 1)}

    # Number of cells printed per row by the print helpers.
    _GRID_WIDTH = 4

    def __init__(self):
        self.actionSpace = ('U', 'D', 'L', 'R')
        # Legal actions per non-terminal state; moves that would leave the
        # grid are simply not listed.
        self.actions = {
            (0, 0): ('D', 'R'),
            (0, 1): ('L', 'D', 'R'),
            (0, 2): ('L', 'D', 'R'),
            (0, 3): ('L', 'D'),
            (1, 0): ('U', 'D', 'R'),
            (1, 1): ('U', 'L', 'D', 'R'),
            (1, 2): ('U', 'L', 'D', 'R'),
            (1, 3): ('U', 'L', 'D'),
            (2, 0): ('U', 'D', 'R'),
            (2, 1): ('U', 'L', 'D', 'R'),
            (2, 2): ('U', 'L', 'D', 'R'),
            (2, 3): ('U', 'L', 'D'),
            (3, 0): ('U', 'R'),
            (3, 1): ('U', 'L', 'R'),
            (3, 2): ('U', 'L', 'R'),
        }
        # Reward received when the agent *enters* the given cell.
        self.rewards = {(3, 3): 0.03, (1, 3): -0.01, (2, 1): -0.011, (3, 1): -0.01}
        # Counters updated by chooseAction().
        self.explored = 0
        self.exploited = 0

    def getRandomPolicy(self):
        """Return a dict mapping every non-terminal state to a random legal action."""
        return {
            state: np.random.choice(legal)
            for state, legal in self.actions.items()
        }

    def reset(self):
        """Return the start state ``(0, 0)``."""
        return (0, 0)

    def is_terminal(self, s):
        """Return True when ``s`` has no entry in ``self.actions``."""
        return s not in self.actions

    def getNewState(self, state, action):
        """Return the ``(row, column)`` reached from ``state`` by ``action``.

        No bounds checking is done here; callers are expected to pass an
        action that is legal for ``state``.  An action symbol other than
        'U', 'D', 'L' or 'R' leaves the position unchanged.
        """
        row, column = state
        dRow, dColumn = self._MOVES.get(action, (0, 0))
        return int(row) + dRow, int(column) + dColumn

    def chooseAction(self, state, policy, exploreRate):
        """Pick an action for ``state`` epsilon-greedily.

        With probability ``exploreRate`` a uniformly random legal action is
        returned and ``self.explored`` is incremented; otherwise
        ``policy[state]`` is returned and ``self.exploited`` is incremented.
        """
        if np.random.rand() < exploreRate:
            self.explored += 1
            return np.random.choice(self.actions[state])
        self.exploited += 1
        return policy[state]

    def greedyChoose(self, state, values):
        """Return the legal action whose successor state has the highest value.

        Successor states that are missing from ``values`` are skipped.  Ties
        are resolved in favour of the action listed first for ``state``.

        Raises:
            KeyError: if ``state`` has no legal actions (e.g. it is terminal).
            ValueError: if none of the successor states appears in ``values``.
        """
        # Keep every action paired with its successor value so that skipping
        # an unknown successor can never shift the argmax index onto a
        # different action.
        candidates = []
        for act in self.actions[state]:
            successor = self.getNewState(state, act)
            if successor in values:
                candidates.append((act, values[successor]))
        if not candidates:
            raise ValueError(
                f"no successor of state {state} has a known value")
        best = int(np.argmax([value for _, value in candidates]))
        return candidates[best][0]

    def move(self, state, policy, exploreRate):
        """Take one epsilon-greedy step from ``state``.

        Returns:
            ``(newState, reward)`` where ``reward`` is the entry of
            ``self.rewards`` for ``newState`` or 0 when there is none.
        """
        action = self.chooseAction(state, policy, exploreRate)
        newState = self.getNewState(state, action)
        return newState, self.rewards.get(newState, 0)

    def _printGrid(self, cells, rowSeparator):
        """Print ``cells`` four per row, each complete row followed by ``rowSeparator``.

        The (possibly empty) remainder row and a closing rule are always
        printed last.
        """
        rendered = [f" | {cell} | " for cell in cells]
        width = self._GRID_WIDTH
        completed = len(rendered) // width * width
        for start in range(0, completed, width):
            print("".join(rendered[start:start + width]))
            print(rowSeparator)
        print("".join(rendered[completed:]))
        print("----------------------------")

    def printVaues(self, values):
        """Print the entries of ``values`` in insertion order as a 4-column grid."""
        self._printGrid(values.values(), "--------------------------------")

    # Correctly spelled alias; the original (misspelled) name is kept so
    # existing callers continue to work.
    printValues = printVaues

    def printPolicy(self, policy):
        """Print the actions of ``policy`` in insertion order as a 4-column grid."""
        self._printGrid(policy.values(), "----------------------------")


In [2]:
# Build the environment and start from a randomly chosen policy.
enviroment = GridWorld()
policy = enviroment.getRandomPolicy()
# enviroment.printPolicy(policy)

#example optimal policy = {(0, 0): 'R', (0, 1): 'R', (0, 2): 'D', (0, 3): 'D', (1, 0): 'R', (1, 1): 'D', (1, 2): 'D', (1, 3): 'D',
#           (2, 0): 'R', (2, 1): 'D', (2, 2): 'R', (2, 3): 'D', (3, 0): 'R', (3, 1): 'R', (3, 2): 'R'}

for i in range(1001):
    # Evaluation restarts from zero on every outer iteration; only the
    # terminal cell (3, 3) is seeded with a fixed value.
    values = dict.fromkeys(policy, 0)
    values[(3, 3)] = 5

    # Evaluate the current policy from 1000 sampled episodes, each capped
    # at 50 steps.
    for j in range(1000):
        state = enviroment.reset()
        stepCounts = 0
        while stepCounts < 50 and not enviroment.is_terminal(state):
            nextState, reward = enviroment.move(state, policy, exploreRate=0.05)
            # One-step backup with a discount factor of 0.1.
            values[state] = reward + 0.1 * values[nextState]
            state = nextState
            stepCounts += 1

    # Improvement step: act greedily with respect to the sampled values.
    for item in policy:
        policy[item] = enviroment.greedyChoose(item, values)

    # Report the policy every 100 outer iterations.
    if not i % 100:
        print(f"\n\n\n step:{i}")
        # enviroment.printVaues(values)
        enviroment.printPolicy(policy)

print(f"exploited:{enviroment.exploited}  explored:{enviroment.explored}")




 step:0
 | D |  | L |  | L |  | L | 
----------------------------
 | U |  | U |  | U |  | D | 
----------------------------
 | U |  | U |  | R |  | D | 
----------------------------
 | U |  | U |  | R | 
----------------------------



 step:100
 | D |  | R |  | D |  | D | 
----------------------------
 | U |  | D |  | D |  | D | 
----------------------------
 | R |  | D |  | D |  | D | 
----------------------------
 | R |  | R |  | R | 
----------------------------



 step:200
 | R |  | R |  | D |  | D | 
----------------------------
 | R |  | R |  | D |  | D | 
----------------------------
 | U |  | D |  | D |  | D | 
----------------------------
 | R |  | R |  | R | 
----------------------------



 step:300
 | R |  | D |  | D |  | D | 
----------------------------
 | R |  | R |  | D |  | D | 
----------------------------
 | U |  | D |  | D |  | D | 
----------------------------
 | R |  | R |  | R | 
----------------------------



 step:400
 | R |  | R |  | D |  | L | 
--------