In [1]:
import numpy as np
import gym

class MazeEnv(gym.Env):
    
    def __init__(self, point_holes, row, col):
        self.nrow = row
        self.ncol = col
        self.col = 0
        self.row = 0
        self.grids = self.__set_points(point_holes)
    
    def __set_points(self, point_holes):
        grids = [["F" for _ in range(self.ncol)] for _ in range(self.nrow)]
        grids[self.col][self.row] = "S"   # Set start point
        grids[-1][-1] = "G"  # Set goal point
        for point_hole in point_holes:   # Set hole points
            row, col = point_hole
            grids[row][col] = "H"
        final_grids = ["".join(grid_row) for grid_row in grids]
        return final_grids
    
    def step(self, action):
        self.transition(action)
        new_state = self.get_state()
        reward = 0
        done = False
        grid = self.grids[self.row][self.col]
        if grid == "H":
            # done = True
            reward = -10
        elif grid == "G":
            done = True   # Stop step when agent come only goal
            reward = 100
        return new_state, reward, done, ""
    
    def reset(self):
        self.col=0
        self.row=0
        return 0
    
    def transition(self, action):
        if action == 0: #LEFT
            self.co = max(0, self.col-1)
        elif action == 1: #DOWN
            self.row = min(self.row+1,self.nrow-1)
        elif action == 2: #RIGHT
            self.col = min(self.col+1,self.ncol-1)
        elif action == 3: #UP
            self.row = max(self.row-1,0)
    
    def get_state(self):
        return self.row*self.ncol+self.col

In [2]:
def epsilon_greedy(Q, state, action_num, epsilon):
    if np.random.random() < epsilon:
        return np.random.randint(action_num)
    else:
        return np.random.choice(np.where(Q[state,] == Q[state,].max())[0])

In [3]:
num_episodes = 1000
alpha = 0.2
gamma = 0.9
epsilon = 0.2
col, row = 5, 5
point_holes = [(0, 3), (1, 1), (2, 1), (2, 3), (3, 3), (4, 1)]
rewards=[]
env=MazeEnv(point_holes, col, row)
state_num, action_num = col * row, 4
Q=np.zeros((state_num, action_num))

In [4]:
print("\n".join(env.grids))

SFFHF
FHFFF
FHFHF
FFFHF
FHFFG


In [5]:
for i in range(num_episodes):
    s = env.reset()
    episode_reward = 0
    while True:
        a = epsilon_greedy(Q, s, action_num, epsilon)
        s1, r, d, _ = env.step(a)
        episode_reward += r
        Q[s,a] += alpha * (r + gamma * Q[s1,].max() - Q[s,a])
        s = s1
        if d:
            break
    rewards.append(episode_reward)
print(rewards[:10], rewards[-10:], sep='\n')  # Show rewards of head & tail

[10, 100, 100, 100, 100, 100, 80, 90, 100, 100]
[90, 100, 100, 100, 100, 100, 100, 90, 100, 100]


In [6]:
action_to_arrow = {
    0: "←",
    1: "↓",
    2: "→",
    3: "↑"
}
for row in range(5):
    for col in range(5):
        print(action_to_arrow[Q[row*5+col,].argmax()], end="")
    print()

→→↓↓↓
↑→→→↓
↑→↑↑↓
↑←←↓↓
←→→→←


In [7]:
print(np.round(Q.reshape([5, 5, 4]), 2))  # separate by col & row

[[[ 43.05  38.74  47.83  43.05]
  [ 47.83  43.07  53.14  47.83]
  [ 53.14  59.05  49.05  53.14]
  [ 42.28  65.61  51.7   43.45]
  [ 21.01  72.9   37.72  32.02]]

 [[ 18.48   9.14  19.66  43.05]
  [ 23.34  -3.95  59.04  23.34]
  [ 59.05  53.12  65.61  53.14]
  [ 65.61  49.05  72.9   49.05]
  [ 72.9   81.    72.9   65.61]]

 [[  0.     0.    -2.    26.02]
  [ -2.     0.    14.78  -2.  ]
  [ 10.51   0.    28.68  59.05]
  [ 17.66   0.36  16.2   65.61]
  [ 81.    90.    81.    72.9 ]]

 [[  0.     0.     0.     4.11]
  [  0.    -1.88   0.     0.  ]
  [  0.     0.    -2.     0.  ]
  [  0.    29.76   0.     0.  ]
  [ 90.   100.    90.    81.  ]]

 [[  0.     0.    -2.     0.  ]
  [  0.     0.     2.2    0.  ]
  [  0.     0.    16.27   0.  ]
  [ 13.28   0.    79.03   0.  ]
  [  0.     0.     0.     0.  ]]]
