In [1]:
import numpy as np
import random

class GridWorld:
    # call to initialize grid world enviroment, default will be a 5x5 grid with a start point at the top left corner and goal at
    # the bottom right corner, no obsticals, if probablity of obsticals is given, these will be placed in enviroment randomly
    def __init__(self, grid_size=(5, 5), start_point=(0, 0), goal_point=None, prob_obstacle=0):
        self.grid_size = grid_size
        self.start_point = start_point
        self.goal_point = goal_point if goal_point is not None else (grid_size[0] - 1, grid_size[1] - 1)
        self.prob_obstacle = prob_obstacle

        self.action_map = ['up', 'down', 'left', 'right']
        self.grid = np.zeros(self.grid_size, dtype=int)
        self._generate_obstacles()
        self.reset()


    # obstacles created based on probability argument passed
    # idealy a small number (> .8) is passed so path to goal is still achivable
    def _generate_obstacles(self):
        self.grid = np.zeros(self.grid_size, dtype=int)
        for r in range(self.grid_size[0]):
            for c in range(self.grid_size[1]):
                if random.random() < self.prob_obstacle and (r, c) != self.start_point and (r, c) != self.goal_point:
                    self.grid[r, c] = 1

    def reset(self):
        self.agent_point = self.start_point

        return self.agent_point

    def step(self, action_idx):
        action = self.action_map[action_idx]
        x, y = self.agent_point
        next_x, next_y = x, y

        if action == 'up' and x > 0:
            next_x -= 1
        elif action == 'down' and x < self.grid_size[0] - 1:
            next_x += 1
        elif action == 'left' and y > 0:
            next_y -= 1
        elif action == 'right' and y < self.grid_size[1] - 1:
            next_y += 1

        is_obstacle = self.grid[next_x, next_y] == 1

        if is_obstacle:                              # Penalty applied if agent bumps into obstacle
            next_x, next_y = x, y
            reward = -5
            done = False
        else:
            self.agent_point = (next_x, next_y)     # For non obstacle states, reward ig goal, otherwise penalize
            done = self.is_at_goal()
            reward = 1000 if done else -1

        return self.agent_point, reward, done, {}

    def is_at_goal(self):
        return self.agent_point == self.goal_point

    # For debugging
    def display_env(self):
        world = np.array(self.grid, dtype=object)
        x, y = self.agent_point
        gx, gy = self.goal_point
        world[x, y] = 'A'          # Agent
        world[gx, gy] = 'G'        # Goal
        print(world)

In [2]:
import os
import time
from matplotlib.colors import ListedColormap
from matplotlib.patches import Patch

class QLearningAgent:
    def __init__(self, grid_size, action_size=4, alpha=0.1, gamma=0, epsilon=0, epsilon_min=0.01, epsilon_decay=0):

        self.grid_size = grid_size                # Size of the grid (grid_size x grid_size)
        self.action_size = action_size            # Size of the action space (default 4: up, right, down, left)
        self.alpha = alpha                        # Learning rate
        self.gamma = gamma                        # Discount factor
        self.epsilon = epsilon                    # Initial exploration rate
        self.epsilon_min = epsilon_min            # Minimum exploration rate, allows for agent to continue exploring
        self.epsilon_decay = epsilon_decay        # Decay rate for exploration

        # Initialize Q-table with zeros
        # Q-table shape: (grid_size, grid_size, action_size)
        self.q_table = np.zeros((grid_size[0], grid_size[1], action_size))

    def get_action(self, state, training=True):
        # Get the current state, determine if exploration or exploitation
        x, y = state
        if training and np.random.rand() < self.epsilon:
            return np.random.randint(self.action_size)  # Exploration: choose a random action
        else:
            return np.argmax(self.q_table[x, y])        # Exploitation: choose the best action from Q-table

    def update(self, state, action, reward, next_state, done):
        x, y = state                   # Current state
        next_x, next_y = next_state

        # Q-learning update rule
        # Q(s,a) = Q(s,a) + alpha * [r + gamma * max(Q(s',a')) - Q(s,a)]

        if not done:
            target = reward + self.gamma * np.max(self.q_table[next_x, next_y])
        else:
            target = reward

        current = self.q_table[x, y, action]
        self.q_table[x, y, action] = current + self.alpha * (target - current)

    def decay_epsilon(self):
        # Decay exploration rate
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

def train(env, agent, episodes=500, max_steps=100, log_interval=100):
    rewards = []                     # Contains an array of reward totals accumulated in episodes
    steps_list = []                  # Contains an array of step totals accumulated in episodes
    q_changes = []                   # Contains an array of how the q table changed in between episodes
    start_time = time.time()
    total_reward_accumulated = 0

    for episode in range(1, episodes + 1):
        state = env.reset()
        total_reward = 0
        total_q_change = 0
        done = False

        for step in range(max_steps):
            action = agent.get_action(state)                         # Get action
            old_q = agent.q_table[state[0], state[1], action]        # Get info for old Q table
            next_state, reward, done, _ = env.step(action)           # Take step
            agent.update(state, action, reward, next_state, done)    # Update Q-table

            new_q = agent.q_table[state[0], state[1], action]        # Get info for new Q table
            total_q_change += abs(new_q - old_q)                     # Q-value delta

            # Update state and total reward
            state = next_state
            total_reward += reward

            if done:
                break

        # Decay exploration rate
        agent.decay_epsilon()

        # Save rewards, steps and q table change
        rewards.append(total_reward)
        total_reward_accumulated += total_reward
        steps_list.append(step + 1)
        q_changes.append(total_q_change)

        # Log progress
        if episode % log_interval == 0:
            avg_reward = np.mean(rewards[-log_interval:])
            avg_steps = np.mean(steps_list[-log_interval:])
            avg_q_change = np.mean(q_changes[-log_interval:])
            print(f"Episode {episode}/{episodes} | Avg Reward: {avg_reward:.2f} | Avg Steps: {avg_steps:.2f} | QΔ: {avg_q_change:.4f} | Epsilon: {agent.epsilon:.4f}")

    print(f"\nTraining completed in {time.time() - start_time:.2f} seconds.")
    print(f"\nTotal Reward Accumulated: {total_reward_accumulated}.\n")
    return rewards, q_changes

def evaluate(env, agent, episodes=10, max_steps=100, render=True):
      rewards = []                # Contains an array of reward totals accumulated in episodes
      success_count = 0

      for ep in range(episodes):
          state = env.reset()
          total_reward = 0
          done = False   # clear for episode

          path = [state]

          # let agent find goal within max number of steps
          for step in range(max_steps):
              action = agent.get_action(state, training=False)
              next_state, reward, done, _ = env.step(action)
              state = next_state
              path.append(state)
              total_reward += reward
              if done:
                  if reward > 0: success_count += 1
                  break

          rewards.append(total_reward)

          print(f"Episode {ep+1}/{episodes} | Reward: {total_reward} | Steps: {step + 1}")

      print(f"\nSuccess Rate: {success_count / episodes * 100:.2f}%\n")

      return rewards

import matplotlib.pyplot as plt

def plot_convergence(rewards, q_changes):
    # Plot Average Reward per Episode
    plt.figure(figsize=(12, 6))
    plt.subplot(1, 2, 1)
    plt.plot(rewards)
    plt.title('Average Reward per Episode')
    plt.xlabel('Episode')
    plt.ylabel('Average Reward')

    plt.subplot(1, 2, 2)
    plt.plot(q_changes)
    plt.title(f'Average QΔ per Episode')
    plt.xlabel('Episode')
    plt.ylabel('Q-value Change')

In [3]:
import numpy as np
import random
import torch
import torch.nn as nn
import torch.optim as optim
from collections import deque

# Network for DQ Agent
class DQNetwork(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(DQNetwork, self).__init__()
        self.fc = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Dropout(p=0.2),
            nn.Linear(64, 64),
            nn.ReLU(),
            nn.Dropout(p=0.2),
            nn.Linear(64, output_dim)
        )

    def forward(self, x):
        return self.fc(x)

class DQNAgent:
    def __init__(self, grid_size, action_size=4, gamma=0.99, epsilon=1.0, epsilon_min=0.01, epsilon_decay=0.995, lr=1e-4, batch_size=64, memory_size=50000):
        self.grid_size = grid_size                # Size of the grid (grid_size x grid_size)
        self.state_size = 2                       # Size of state size (x, y)
        self.action_size = action_size            # Size of the action space (default 4: up, right, down, left)
        self.gamma = gamma                        # Discount factor
        self.epsilon = epsilon                    # Initial exploration rate
        self.epsilon_min = epsilon_min            # Minimum exploration rate, allows for agent to continue exploring
        self.epsilon_decay = epsilon_decay        # Decay rate for exploration
        self.batch_size = batch_size

        self.memory = deque(maxlen=memory_size)

        self.model = DQNetwork(self.state_size, action_size)
        self.target_model = DQNetwork(self.state_size, self.action_size)  # Target network
        self.target_model.load_state_dict(self.model.state_dict())  # Initialize target model with the same weights

        self.optimizer = optim.Adam(self.model.parameters(), lr=lr)
        self.criterion = nn.MSELoss()

    def update(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def update_target_network(self):
        # Update the target network weights from the Q-network
        self.target_model.load_state_dict(self.model.state_dict())

    def get_action(self, state, training=True):
        # Get the current state, determine if exploration or exploitation
        if training and np.random.rand() < self.epsilon:
            return np.random.randint(self.action_size)

        # Predict Q values
        state = torch.FloatTensor(state).unsqueeze(0)
        with torch.no_grad():
            return torch.argmax(self.model(state)).item()

    def replay(self):
        if len(self.memory) < self.batch_size:
            return

        minibatch = random.sample(self.memory, self.batch_size)
        states, actions, rewards, next_states, dones = zip(*minibatch)

        # Convert to PyTorch tensors for quicker computation
        states = torch.FloatTensor(states)                       # Shape: [batch, state_dim]
        actions = torch.LongTensor(actions).unsqueeze(1)         # Shape: [batch, 1]
        rewards = torch.FloatTensor(rewards).unsqueeze(1)        # Shape: [batch, 1]
        next_states = torch.FloatTensor(next_states)             # Shape: [batch, state_dim]
        dones = torch.FloatTensor(dones).unsqueeze(1)            # Shape: [batch, 1]

        # Predict Q(s,a) using current model
        q_values = self.model(states).gather(1, actions)

        # Predict max Q(s',a') for next state using target model
        with torch.no_grad():
            next_q_values = self.target_model(next_states).max(1)[0].unsqueeze(1)

        # Compute target Q-values
        targets = rewards + (1 - dones) * self.gamma * next_q_values

        # Compute loss
        loss = self.criterion(q_values, targets)

        # Backpropagation
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

def train_dqn(env, agent, episodes=500, max_steps=100, log_interval=100):
    rewards = []
    steps_list = []
    start_time = time.time()
    total_reward_accumulated = 0

    for episode in range(episodes):
        state = np.array(env.reset())
        total_reward = 0

        for step in range(max_steps):
            action = agent.get_action(state)                         # Get action
            next_state, reward, done, _ = env.step(action)           # Take step
            #reward = np.clip(reward, -1, 1)                          # clip reward
            next_state = np.array(next_state)
            agent.update(state, action, reward, next_state, done)    # Update state

            # Update state and total reward
            state = next_state
            total_reward += reward

            if done:
                break

        agent.replay()
        rewards.append(total_reward)
        total_reward_accumulated += total_reward
        steps_list.append(step + 1)

        # Decay exploration rate
        if agent.epsilon > agent.epsilon_min:
            agent.epsilon *= agent.epsilon_decay

        # Update target network every 10 episodes
        if episode % 10 == 0:
            agent.update_target_network()

        if episode % log_interval == 0:
            avg_reward = np.mean(rewards[-log_interval:])
            avg_steps = np.mean(steps_list[-log_interval:])
            print(f"Episode {episode}/{episodes} | Avg Reward: {avg_reward:.2f} | Avg Steps: {avg_steps:.2f} | Epsilon: {agent.epsilon:.4f}")


    print(f"\nTraining completed in {time.time() - start_time:.2f} seconds.")
    print(f"\nTotal Reward Accumulated: {total_reward_accumulated}.")
    return rewards

def evaluate_dqn(env, agent, episodes=10, max_steps=100):
    rewards = []
    success_count = 0

    for ep in range(episodes):
        state = np.array(env.reset())
        total_reward = 0
        done = False

        for step in range(max_steps):
            action = agent.get_action(state, training=False)  # Use greedy policy
            next_state, reward, done, _ = env.step(action)
            total_reward += reward
            state = np.array(next_state)

            if done:
                if reward > 0:  # Assuming reaching goal gives +10
                    success_count += 1
                break

        rewards.append(total_reward)
        print(f"Episode {ep + 1}/{episodes} | Reward: {total_reward:.2f} | Steps: {step + 1}")

    avg_reward = np.mean(rewards)
    success_rate = success_count / episodes

    print(f"\nEvaluation over {episodes} episodes - Average Reward: {avg_reward:.2f} | Success Rate: {success_rate * 100:.2f}%")

    return rewards

import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from matplotlib.patches import Patch

def plot_reward_rate(rewards):
    # Plot Average Reward per Episode
    '''plt.plot(rewards)
    plt.title('Average Reward per Episode')
    plt.xlabel('Episode')
    plt.ylabel('Average Reward')'''

    window = 100
    moving_avg = [np.mean(rewards[i:i+window]) for i in range(len(rewards)-window)]
    plt.plot(moving_avg)
    plt.title('Moving Average Reward (Per 100 Episodes)')


In [31]:
env = GridWorld(grid_size=(5,5), prob_obstacle=0.2)
env.display_env()

[['A' 0 0 0 0]
 [0 0 1 0 1]
 [0 0 0 1 1]
 [0 0 0 0 0]
 [0 0 0 0 'G']]


In [32]:
# Use the same hyperparameters to test the modles

q_agent = QLearningAgent(grid_size=(5,5), alpha=.1, gamma=.99, epsilon=1.0, epsilon_min=0.001, epsilon_decay=0.999)
dqn_agent =DQNAgent(grid_size=(5,5), gamma=.99, epsilon=1.0, epsilon_min=0.001, epsilon_decay=0.999)

rewards, q_changes = train(env, q_agent, episodes=500, max_steps=25, log_interval=50)
r = train_dqn(env, dqn_agent, episodes=5000, max_steps=50, log_interval=500)

Episode 50/500 | Avg Reward: 6.56 | Avg Steps: 24.92 | QΔ: 10.2683 | Epsilon: 0.9512
Episode 100/500 | Avg Reward: 67.60 | Avg Steps: 24.26 | QΔ: 17.8702 | Epsilon: 0.9048
Episode 150/500 | Avg Reward: 210.28 | Avg Steps: 23.40 | QΔ: 71.4253 | Epsilon: 0.8606
Episode 200/500 | Avg Reward: 272.68 | Avg Steps: 22.82 | QΔ: 192.2578 | Epsilon: 0.8186
Episode 250/500 | Avg Reward: 313.96 | Avg Steps: 22.54 | QΔ: 302.6838 | Epsilon: 0.7787
Episode 300/500 | Avg Reward: 455.38 | Avg Steps: 21.34 | QΔ: 257.5810 | Epsilon: 0.7407
Episode 350/500 | Avg Reward: 617.02 | Avg Steps: 19.70 | QΔ: 181.1049 | Epsilon: 0.7046
Episode 400/500 | Avg Reward: 394.80 | Avg Steps: 21.54 | QΔ: 120.2530 | Epsilon: 0.6702
Episode 450/500 | Avg Reward: 616.18 | Avg Steps: 20.14 | QΔ: 62.8999 | Epsilon: 0.6375
Episode 500/500 | Avg Reward: 717.70 | Avg Steps: 19.36 | QΔ: 29.3507 | Epsilon: 0.6064

Training completed in 0.40 seconds.

Total Reward Accumulated: 183608.

Episode 0/5000 | Avg Reward: -62.00 | Avg Step

In [33]:
ev_reward = evaluate(env, q_agent, episodes=10)
r = evaluate_dqn(env, dqn_agent, max_steps=50)

Episode 1/10 | Reward: 993 | Steps: 8
Episode 2/10 | Reward: 993 | Steps: 8
Episode 3/10 | Reward: 993 | Steps: 8
Episode 4/10 | Reward: 993 | Steps: 8
Episode 5/10 | Reward: 993 | Steps: 8
Episode 6/10 | Reward: 993 | Steps: 8
Episode 7/10 | Reward: 993 | Steps: 8
Episode 8/10 | Reward: 993 | Steps: 8
Episode 9/10 | Reward: 993 | Steps: 8
Episode 10/10 | Reward: 993 | Steps: 8

Success Rate: 100.00%

Episode 1/10 | Reward: 977.00 | Steps: 24
Episode 2/10 | Reward: 980.00 | Steps: 21
Episode 3/10 | Reward: 964.00 | Steps: 37
Episode 4/10 | Reward: 971.00 | Steps: 30
Episode 5/10 | Reward: 958.00 | Steps: 43
Episode 6/10 | Reward: 955.00 | Steps: 46
Episode 7/10 | Reward: 984.00 | Steps: 17
Episode 8/10 | Reward: 959.00 | Steps: 42
Episode 9/10 | Reward: 957.00 | Steps: 44
Episode 10/10 | Reward: 982.00 | Steps: 19

Evaluation over 10 episodes - Average Reward: 968.70 | Success Rate: 100.00%


In [39]:
env = GridWorld(grid_size=(15,15), prob_obstacle=0.2)
env.display_env()

[['A' 0 1 0 0 0 0 0 0 0 0 0 0 0 1]
 [0 0 0 0 0 0 0 0 1 0 0 1 0 1 0]
 [0 1 0 0 0 1 0 0 0 0 1 0 0 0 0]
 [0 0 1 0 0 1 0 0 0 0 0 0 0 0 0]
 [0 0 0 1 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 1 0 0 0]
 [0 0 0 0 0 0 0 0 0 1 0 0 0 1 0]
 [0 1 0 0 0 0 0 0 0 0 0 1 0 0 1]
 [0 0 0 0 0 0 0 0 0 0 1 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 1 0 1 0]
 [0 0 0 0 0 0 0 0 1 0 1 0 0 0 0]
 [1 0 0 0 0 0 1 0 0 0 0 1 0 0 1]
 [1 0 1 1 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 1 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 1 0 0 1 0 0 0 1 0 0 'G']]


In [45]:
q_agent = QLearningAgent(grid_size=(15,15), alpha=.1, gamma=.97, epsilon=1.0, epsilon_min=0.001, epsilon_decay=0.999)
dqn_agent = DQNAgent(grid_size=(15,15), gamma=.98, epsilon=1.0, epsilon_min=0.001, epsilon_decay=0.999)

rewards, q_changes = train(env, q_agent, episodes=1000, max_steps=250, log_interval=100)
r = train_dqn(env, dqn_agent, episodes=3000, max_steps=250, log_interval=200)

Episode 100/1000 | Avg Reward: -333.17 | Avg Steps: 246.61 | QΔ: 25.6496 | Epsilon: 0.9048
Episode 200/1000 | Avg Reward: 83.06 | Avg Steps: 214.66 | QΔ: 179.8769 | Epsilon: 0.8186
Episode 300/1000 | Avg Reward: 592.84 | Avg Steps: 160.78 | QΔ: 644.5393 | Epsilon: 0.7407
Episode 400/1000 | Avg Reward: 837.34 | Avg Steps: 112.97 | QΔ: 569.3860 | Epsilon: 0.6702
Episode 500/1000 | Avg Reward: 896.65 | Avg Steps: 80.75 | QΔ: 405.3540 | Epsilon: 0.6064
Episode 600/1000 | Avg Reward: 908.81 | Avg Steps: 74.11 | QΔ: 350.3008 | Epsilon: 0.5486
Episode 700/1000 | Avg Reward: 928.94 | Avg Steps: 61.34 | QΔ: 200.5112 | Epsilon: 0.4964
Episode 800/1000 | Avg Reward: 936.77 | Avg Steps: 55.39 | QΔ: 87.8012 | Epsilon: 0.4491
Episode 900/1000 | Avg Reward: 938.77 | Avg Steps: 53.71 | QΔ: 49.3134 | Epsilon: 0.4064
Episode 1000/1000 | Avg Reward: 945.42 | Avg Steps: 48.70 | QΔ: 22.3255 | Epsilon: 0.3677

Training completed in 3.92 seconds.

Total Reward Accumulated: 673543.

Episode 0/3000 | Avg Rewar

In [47]:
ev_reward = evaluate(env, q_agent, episodes=10, max_steps=1250)
r = evaluate_dqn(env, dqn_agent, max_steps=1500)

Episode 1/10 | Reward: 973 | Steps: 28
Episode 2/10 | Reward: 973 | Steps: 28
Episode 3/10 | Reward: 973 | Steps: 28
Episode 4/10 | Reward: 973 | Steps: 28
Episode 5/10 | Reward: 973 | Steps: 28
Episode 6/10 | Reward: 973 | Steps: 28
Episode 7/10 | Reward: 973 | Steps: 28
Episode 8/10 | Reward: 973 | Steps: 28
Episode 9/10 | Reward: 973 | Steps: 28
Episode 10/10 | Reward: 973 | Steps: 28

Success Rate: 100.00%

Episode 1/10 | Reward: -4556.00 | Steps: 1500
Episode 2/10 | Reward: 915.00 | Steps: 42
Episode 3/10 | Reward: -1277.00 | Steps: 482
Episode 4/10 | Reward: -5128.00 | Steps: 1500
Episode 5/10 | Reward: 697.00 | Steps: 116
Episode 6/10 | Reward: -4760.00 | Steps: 1181
Episode 7/10 | Reward: 240.00 | Steps: 181
Episode 8/10 | Reward: 934.00 | Steps: 43
Episode 9/10 | Reward: -1260.00 | Steps: 481
Episode 10/10 | Reward: 951.00 | Steps: 34

Evaluation over 10 episodes - Average Reward: -1324.40 | Success Rate: 80.00%


In [48]:
env = GridWorld(grid_size=(30,30), prob_obstacle=0.1)
env.display_env()

[['A' 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0]
 [0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0]
 [0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0]
 [0 0 0 0 1 0 0 1 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 1 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0]
 [0 0 0 0 1 0 0 0 0 0 0 1 1 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0

In [21]:
# Use the same hyperparameters to test the modles

q_agent = QLearningAgent(grid_size=(30,30), alpha=.1, gamma=.97, epsilon=1.0, epsilon_min=0.001, epsilon_decay=0.999)
dqn_agent = DQNAgent(grid_size=(30,30), gamma=.97, epsilon=0.5, epsilon_min=0.001, epsilon_decay=0.999)

rewards, q_changes = train(env, q_agent, episodes=2000, max_steps=450, log_interval=100)
rewards = train_dqn(env, dqn_agent, episodes=1500, max_steps=900, log_interval=100)

Episode 100/2000 | Avg Reward: -640.96 | Avg Steps: 450.00 | QΔ: 39.8694 | Epsilon: 0.9048
Episode 200/2000 | Avg Reward: -608.76 | Avg Steps: 450.00 | QΔ: 37.7453 | Epsilon: 0.8186
Episode 300/2000 | Avg Reward: -481.41 | Avg Steps: 444.62 | QΔ: 48.3495 | Epsilon: 0.7407
Episode 400/2000 | Avg Reward: -197.12 | Avg Steps: 417.24 | QΔ: 101.8138 | Epsilon: 0.6702
Episode 500/2000 | Avg Reward: -4.61 | Avg Steps: 387.72 | QΔ: 186.6828 | Epsilon: 0.6064
Episode 600/2000 | Avg Reward: 227.29 | Avg Steps: 353.68 | QΔ: 247.6442 | Epsilon: 0.5486
Episode 700/2000 | Avg Reward: 435.74 | Avg Steps: 312.18 | QΔ: 302.4905 | Epsilon: 0.4964
Episode 800/2000 | Avg Reward: 572.56 | Avg Steps: 267.36 | QΔ: 284.4716 | Epsilon: 0.4491
Episode 900/2000 | Avg Reward: 703.85 | Avg Steps: 243.01 | QΔ: 299.3070 | Epsilon: 0.4064
Episode 1000/2000 | Avg Reward: 762.89 | Avg Steps: 210.43 | QΔ: 274.7605 | Epsilon: 0.3677
Episode 1100/2000 | Avg Reward: 784.67 | Avg Steps: 185.80 | QΔ: 239.2158 | Epsilon: 0.33

In [24]:
ev_reward = evaluate(env, q_agent, episodes=10, max_steps=500)
r = evaluate_dqn(env, dqn_agent, max_steps=500)

Episode 1/10 | Reward: 943 | Steps: 58
Episode 2/10 | Reward: 943 | Steps: 58
Episode 3/10 | Reward: 943 | Steps: 58
Episode 4/10 | Reward: 943 | Steps: 58
Episode 5/10 | Reward: 943 | Steps: 58
Episode 6/10 | Reward: 943 | Steps: 58
Episode 7/10 | Reward: 943 | Steps: 58
Episode 8/10 | Reward: 943 | Steps: 58
Episode 9/10 | Reward: 943 | Steps: 58
Episode 10/10 | Reward: 943 | Steps: 58

Success Rate: 100.00%

Episode 1/10 | Reward: -1452.00 | Steps: 500
Episode 2/10 | Reward: 774.00 | Steps: 107
Episode 3/10 | Reward: 680.00 | Steps: 129
Episode 4/10 | Reward: 884.00 | Steps: 89
Episode 5/10 | Reward: 800.00 | Steps: 101
Episode 6/10 | Reward: 877.00 | Steps: 84
Episode 7/10 | Reward: 757.00 | Steps: 116
Episode 8/10 | Reward: 194.00 | Steps: 315
Episode 9/10 | Reward: 811.00 | Steps: 102
Episode 10/10 | Reward: 812.00 | Steps: 93

Evaluation over 10 episodes - Average Reward: 513.70 | Success Rate: 90.00%
