# Levelling Up the Model

Now we introduce barriers that the agent is not allowed to pass.

In [140]:
import numpy as np
import matplotlib.pyplot as plt
import types

In [141]:
agent_map = {
    0 : (-1, 0),
    1 : (1, 0),
    2 : (0,-1),
    3 : (0,1)}

In [142]:
space = 0
obstacle = 1
grid_size = 25

grid = np.zeros(shape=(grid_size, grid_size), dtype=int)

grid[0, :]  = obstacle
grid[-1, :] = obstacle
grid[:, 0]  = obstacle
grid[:, -1] = obstacle

grid[:12, 5]  = obstacle
grid[15, :16] = obstacle
grid[20, 15:21] = obstacle
grid[5:20, 20]  = obstacle

grid[15:20, 10] = obstacle
grid[6, 5:15] = obstacle

start = (20, 5)
end   = (2, 10)

In [143]:
reward_goal = 100
reward_step = -1
reward_obstacle = -100

In [144]:
alpha = 0.1
gamma = 0.9

In [149]:
class Agent:
    def __init__(self, curr_pos:tuple = start):
        self.start = start
        self.curr_pos = curr_pos
        self.Q_table = np.zeros(shape=(grid_size, grid_size, len(agent_map)))
        self.path = []

    def calc_new_pos(self, action):
        row_change, col_change = agent_map[action]
        new_row = self.curr_pos[0] + row_change
        new_col = self.curr_pos[1] + col_change
        return (new_row, new_col)
    
    def choose_action(self, epsilon):
        crit = np.random.uniform(0, 1)

        if crit < epsilon:
            action = np.random.choice([0, 1, 2, 3])
            return int(action)
        else:
            q_vals = self.Q_table[self.curr_pos[0], self.curr_pos[1]]
            action = np.argmax(q_vals)
            return int(action)
        
    def update_q_table(self, action, reward, new_pos):
        old_q = self.Q_table[self.curr_pos[0], self.curr_pos[1], action]
        max_q = np.max(self.Q_table[new_pos[0], new_pos[1]])
        new_q = old_q + alpha * (reward + gamma*(max_q) - old_q)
        self.Q_table[self.curr_pos[0], self.curr_pos[1], action] = new_q

    def distance(self, goal:tuple):

        print(self.start, " to ",self.path[-1])
        return ((self.curr_pos[0] - goal[0]) ** 2 + (self.curr_pos[1] - goal[1]) ** 2) ** (1/2)

In [153]:
epsilon = 1.0
epislon_decay = 0.99995
episodes = 40000

james = Agent()

for e in range(episodes):
    while james.curr_pos != end:
        action  = james.choose_action(epsilon)
        tent_pos = james.calc_new_pos(action)

        row_check  = tent_pos[0] < 0 or tent_pos[0] >= grid_size
        col_check = tent_pos[1] < 0 or tent_pos[1] >= grid_size

        if row_check or col_check:
            reward = reward_obstacle
            new_pos = james.curr_pos
        else:
            if grid[tent_pos[0], tent_pos[1]] == obstacle:
                reward = reward_obstacle
                new_pos = james.curr_pos
        
            elif tent_pos == end:
                reward = reward_goal
                new_pos = tent_pos
            else:
                reward = reward_step
                new_pos = tent_pos
        
        james.update_q_table(action, reward, new_pos)
        james.curr_pos = new_pos

    epsilon *= epislon_decay

In [154]:
james.curr_pos = start
path = [start]
max_steps = grid_size ** 2
step_count = 0
while james.curr_pos != end and step_count < max_steps:
    step_count += 1
    action = james.choose_action(0.0)
    tent_pos = james.calc_new_pos(action)

    row_check  = tent_pos[0] < 0 or tent_pos[0] >= grid_size
    col_check = tent_pos[1] < 0 or tent_pos[1] >= grid_size

    if row_check or col_check:
        new_pos = james.curr_pos
    else:
        if grid[tent_pos[0], tent_pos[1]] == obstacle:
            new_pos = james.curr_pos
        else:
            new_pos = tent_pos

    path.append(new_pos)
    james.path = path
    james.curr_pos = new_pos

In [155]:
james.distance(end)

(20, 5)  to  (16, 5)


14.866068747318506