In [1]:
import numpy as np
from wumpusworld.simplified_wumpus_world import SimplifiedWumpusWorld

In [2]:
# Single environment instance reused by every later cell (Q-table sizing,
# the training loop, and the greedy replay that renders the GIF).
env = SimplifiedWumpusWorld()

In [3]:
# Table dimensions come straight from the environment's discrete spaces.
state_space_size = env.observation_space.n
action_space_size = env.action_space.n

# Action-value table: one row per state, one column per action, all zeros
# until the training cell fills it in.
q_table = np.zeros(shape=(state_space_size, action_space_size))

In [4]:
# --- Training budget -------------------------------------------------------
num_episodes = 10_000
max_steps_per_episode = 100  # hard cap on steps in a single episode

# --- Q-value update --------------------------------------------------------
learning_rate = 0.1    # weight given to the new estimate in each update
discount_rate = 0.99   # weight given to the best next-state value

# --- Exploration schedule --------------------------------------------------
# The rate starts at the maximum and the training loop moves it toward the
# minimum using an exponential in the episode index.
max_exploration_rate = 1.0
min_exploration_rate = 0.1
exploration_decay_rate = 0.001
exploration_rate = max_exploration_rate

In [5]:
# Tabular Q-learning: run `num_episodes` episodes, updating `q_table` in place
# after every step and recording each episode's total reward.
rewards_all_episodes = []

for episode in range(num_episodes):
    state = env.reset()
    rewards_current_episode = 0

    for step in range(max_steps_per_episode):
        # Epsilon-greedy choice: one uniform draw decides between a random
        # action (explore) and the current best-known action (exploit).
        explore = np.random.uniform(0, 1) <= exploration_rate
        if explore:
            action = env.action_space.sample()
        else:
            action = np.argmax(q_table[state, :])

        new_state, reward, done = env.step(action)

        # Blend the old estimate with the bootstrapped target
        # (immediate reward + discounted best value of the next state).
        td_target = reward + discount_rate * np.max(q_table[new_state, :])
        q_table[state, action] = (
            (1 - learning_rate) * q_table[state, action]
            + learning_rate * td_target
        )

        rewards_current_episode += reward
        state = new_state

        if done:
            break

    # Exponentially move the exploration rate from its maximum toward its
    # minimum as the episode index grows.
    exploration_span = max_exploration_rate - min_exploration_rate
    exploration_rate = min_exploration_rate + exploration_span * np.exp(-exploration_decay_rate * episode)

    rewards_all_episodes.append(rewards_current_episode)

In [6]:
# Report the mean episode reward for each consecutive window of 1000 episodes,
# then dump the learned Q-table.
#
# The windows are built by slicing rather than `np.split(..., num_episodes/1000)`:
# that call received a float section count, which NumPy truncates, so episode
# counts that are not a multiple of 1000 either raised or were silently cut
# into wrong-sized chunks that were still labelled and averaged as if they
# held exactly 1000 episodes.
episodes_per_bucket = 1000
rewards_array = np.asarray(rewards_all_episodes)
rewards_per_thousand_episodes = [
    rewards_array[start:start + episodes_per_bucket]
    for start in range(0, len(rewards_array), episodes_per_bucket)
]

count = 0
print("~~~~~~~Average Rewards Per Thousand Episodes~~~~~~")
for r in rewards_per_thousand_episodes:
    # Label each row with the index of the last episode it covers; a trailing
    # partial window is averaged over its real length.
    count += len(r)
    print(f"{count: <5}: {np.mean(r)}")

print()
print("~~~~~~~~~~~~~~~~~~~~~~Q-Table~~~~~~~~~~~~~~~~~~~~~~")
print(q_table)

~~~~~~~Average Rewards Per Thousand Episodes~~~~~~
1000 : -727.623
2000 : -15.073999999999963
3000 : 402.93899999999996
4000 : 582.944
5000 : 644.962
6000 : 632.9649999999999
7000 : 668.8629999999999
8000 : 624.946
9000 : 646.9449999999999
10000: 614.9369999999999

~~~~~~~~~~~~~~~~~~~~~~Q-Table~~~~~~~~~~~~~~~~~~~~~~
[[  946.08905489   935.62816434   935.62816434   946.08905489]
 [  956.655611     942.50277018   934.00947455  -999.99999116]
 [    0.             0.             0.             0.        ]
 [    0.             0.             0.             0.        ]
 [-1000.           935.62816434   946.08905489   956.655611  ]
 [-1000.           946.08905489   946.08905489   967.3289    ]
 [  978.11       -1000.           956.655611   -1000.        ]
 [    0.             0.             0.             0.        ]
 [    0.             0.             0.             0.        ]
 [    0.             0.             0.             0.        ]
 [-1000.           967.3289     -1000.           989

In [7]:
from PIL import Image

In [8]:
# Open the two source images used by the replay cell: `canvas` is copied for
# every frame and `agent` is pasted onto each copy (also serving as its own
# paste mask).
canvas, agent = (
    Image.open(image_path)
    for image_path in ("images/world.png", "images/rebort-scaled.png")
)

In [9]:
def _paste_origin(col, row):
    """Return the (left, top) pixel offset at which the agent is pasted for grid cell (col, row).

    Each step along either axis is 520 px (500 + 20) after a 20 px lead-in;
    the vertical offset is subtracted from 1600, so higher rows map to
    smaller pixel y values.
    """
    left = 20 * (col + 1) + 500 * col
    top = 1600 - (20 * (row + 1) + 500 * row)
    return (left, top)


# Lookup from (x, y) grid coordinates on the 4x4 board to paste offsets.
locations = {
    (col, row): _paste_origin(col, row)
    for row in range(4)
    for col in range(4)
}

In [10]:
def _frame_at(x, y):
    """Return a fresh copy of the board with the agent sprite pasted at grid cell (x, y)."""
    frame = canvas.copy()
    frame.paste(agent, locations[(x, y)], agent)
    return frame


# Replay one episode using only the greedy action from `q_table`, collecting
# one frame per step, then write the frames out as an animated GIF.
agentX, agentY = 0, 0
images = [_frame_at(agentX, agentY)]

state = env.reset()
rewards_current_episode = 0

# Grid displacement (dx, dy) drawn for each action index; any action not
# listed here is drawn as a step in +x. The drawn position is tracked locally
# and clamped to the 0..3 range on both axes.
action_deltas = {0: (0, 1), 1: (0, -1), 2: (-1, 0)}

for step in range(max_steps_per_episode):
    action = np.argmax(q_table[state, :])

    dx, dy = action_deltas.get(action, (1, 0))
    agentX = min(3, max(0, agentX + dx))
    agentY = min(3, max(0, agentY + dy))
    images.append(_frame_at(agentX, agentY))

    state, reward, done = env.step(action)
    rewards_current_episode += reward

    if done:
        break

print(f"Reward: {rewards_current_episode}")

first_frame, remaining_frames = images[0], images[1:]
first_frame.save(
    'images/anitest.gif',
    save_all=True,
    append_images=remaining_frames,
    duration=500,
    loop=0,
)

Reward: 995


<img src="images/anitest.gif" width="50%">