# Wumpus World - RL Training

Train a PPO agent with frame stacking (implicit memory) on a partially observable Wumpus World.

In [1]:
# Notebook-wide setup: warning filter, matplotlib backend selection, and the
# shared imports used by the cells below.
import warnings
# Suppresses every warning category for the rest of the kernel session.
# NOTE(review): this also hides matplotlib's own warnings (for example about
# showing figures on a non-interactive backend) - confirm a blanket filter is
# really wanted rather than a targeted one.
warnings.filterwarnings('ignore')

import matplotlib
# The backend is selected before pyplot is imported.
# NOTE(review): 'Agg' is a non-interactive, render-to-buffer backend, so the
# plt.show() calls in later cells may display nothing inline - confirm this is
# intentional (e.g. a headless run) or drop this line for interactive use.
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import numpy as np
# `Image` is used later to display GIFs; `display`, `HTML` and `PILImage` are
# not referenced by any cell visible in this notebook export.
from IPython.display import display, Image, HTML
from PIL import Image as PILImage

# Simple marker that the setup cell finished without raising.
print("Ready!")

Ready!


## 1. Test Environment

In [2]:
from wumpus_env import WumpusWorldEnv

# Smoke-test the environment: reset it once, report the observation shape and
# render the initial frame. The environment is closed in a `finally` block so
# it is released even if reset/render/plotting raises part-way through.
env = WumpusWorldEnv(render_mode='rgb_array')
try:
    obs, _ = env.reset()

    print(f"Observation shape: {obs.shape}")
    print("Channels: Agent, Visited, Breeze, Stench, Safe, Danger, Gold, Walls")

    # Render the initial state as an RGB frame and show it without axes.
    frame = env.render()
    plt.figure(figsize=(6, 7))
    plt.imshow(frame)
    plt.title("Initial State (partial observability)")
    plt.axis('off')
    plt.tight_layout()
    plt.show()
finally:
    env.close()

Observation shape: (8, 5, 5)
Channels: Agent, Visited, Breeze, Stench, Safe, Danger, Gold, Walls


## 2. Train Agent

In [None]:
from train import train

# Training settings gathered in one place so they are easy to tune.
# Adjust `total_timesteps` based on your machine (50k-100k recommended).
train_settings = {
    'total_timesteps': 80_000,
    'n_envs': 4,
    'frame_stack': 4,
}

# Train with frame stacking.
# NOTE(review): the recorded output of this cell reports a stacked observation
# shape of (8, 5, 20) for a single-frame shape of (8, 5, 5), i.e. the frames
# appear to be concatenated along the last axis rather than the channel axis -
# confirm inside train() that this is the intended stacking axis.
model, history = train(**train_settings)

Gym has been unmaintained since 2022 and does not support NumPy 2.0 amongst other critical functionality.
Please upgrade to Gymnasium, the maintained drop-in replacement of Gym, or contact the authors of your software and request that they upgrade.
Users of this version of Gym should be able to simply replace 'import gym' with 'import gymnasium as gym' in the vast majority of cases.
See the migration guide at https://gymnasium.farama.org/introduction/migration_guide/ for additional information.


Wumpus World - PPO with Frame Stacking

Envs: 4 parallel, 4 frames stacked


Output()

Observation shape: (8, 5, 20)
Training for 80,000 steps...
--------------------------------------------------


KeyboardInterrupt: 

## 3. Training Results

In [None]:
import os

# Number of episodes used for every rolling statistic in this figure.
ROLLING_WINDOW = 50


def _trailing_mean_pct(values, window=ROLLING_WINDOW):
    """Return the trailing mean of `values`, scaled to percent.

    Element ``i`` is ``100 * mean(values[i - window + 1 : i + 1])``, so each
    point averages at most `window` samples; near the start of the series the
    window is truncated to the samples available so far.
    """
    return [
        np.mean(values[max(0, i - window + 1):i + 1]) * 100
        for i in range(len(values))
    ]


# 2x2 summary of the training run stored in `history`.
fig, axes = plt.subplots(2, 2, figsize=(12, 9))

# Rewards: raw per-episode curve plus a moving average once enough episodes exist.
ax = axes[0, 0]
rewards = history['episode_rewards']
ax.plot(rewards, alpha=0.3, color='steelblue')
if len(rewards) > ROLLING_WINDOW:
    ma = np.convolve(rewards, np.ones(ROLLING_WINDOW) / ROLLING_WINDOW, mode='valid')
    # 'valid' drops the first window-1 points, so the x axis starts there.
    ax.plot(range(ROLLING_WINDOW - 1, len(rewards)), ma,
            color='darkblue', lw=2, label='MA(50)')
ax.axhline(100, color='green', ls='--', alpha=0.5, label='Gold (+100)')
ax.axhline(-50, color='red', ls='--', alpha=0.5, label='Death (-50)')
ax.set_xlabel('Episode')
ax.set_ylabel('Reward')
ax.set_title('Episode Rewards')
ax.legend()
ax.grid(alpha=0.3)

# Win rate: trailing mean over exactly ROLLING_WINDOW episodes (the previous
# slice `i-50 : i+1` covered 51 episodes, contradicting the "Rolling 50" title).
ax = axes[0, 1]
wins = history['wins']
if len(wins) > 20:
    win_rate = _trailing_mean_pct(wins)
    ax.plot(win_rate, color='forestgreen', lw=2)
    ax.fill_between(range(len(win_rate)), win_rate, alpha=0.3, color='green')
ax.set_xlabel('Episode')
ax.set_ylabel('Win Rate (%)')
ax.set_title('Win Rate (Rolling 50)')
ax.set_ylim(0, 100)
ax.grid(alpha=0.3)

# Exploration: same trailing window as the win rate.
# NOTE(review): the *100 scaling assumes exploration rates are fractions in
# [0, 1] - confirm against how train() records them.
ax = axes[1, 0]
expl = history['exploration_rates']
if len(expl) > 20:
    expl_ma = _trailing_mean_pct(expl)
    ax.plot(expl_ma, color='purple', lw=2)
ax.set_xlabel('Episode')
ax.set_ylabel('Exploration (%)')
ax.set_title('Map Exploration')
ax.set_ylim(0, 100)
ax.grid(alpha=0.3)

# Eval rewards: only drawn when at least one evaluation was recorded.
ax = axes[1, 1]
if history['eval_rewards']:
    ax.plot(history['eval_timesteps'], history['eval_rewards'], 'o-', color='coral', lw=2, ms=8)
ax.set_xlabel('Timesteps')
ax.set_ylabel('Eval Reward')
ax.set_title('Evaluation Performance')
ax.grid(alpha=0.3)

plt.tight_layout()
# savefig does not create missing directories, so make sure the target exists.
os.makedirs('plots', exist_ok=True)
plt.savefig('plots/training_results.png', dpi=150)
plt.show()

print(f"\nFinal Win Rate: {history.get('final_win_rate', 0)*100:.1f}%")

## 4. Watch Agent Play

In [None]:
from train import record_episode

# Record one episode with the saved model.
# (presumably the arguments are the model path and the output directory for
# the frames - verify against train.record_episode)
frames, info = record_episode('models/ppo_wumpus', 'episode_frames')

# Summarise the outcome using the episode's info dict.
outcome = 'WON!' if info.get('win') else 'LOST'
cells_visited = info.get('visited_cells', 0)
print(f"\nResult: {outcome}")
print(f"Explored: {cells_visited}/25 cells")

In [None]:
# Display the episode GIF (the last expression of the cell is rendered inline).
episode_gif_path = 'episode_frames/episode.gif'
Image(filename=episode_gif_path)

In [None]:
# Show up to eight key frames, evenly spaced across the recorded episode.
n_show = min(8, len(frames))
indices = np.linspace(0, len(frames)-1, n_show, dtype=int)

fig, axes = plt.subplots(2, 4, figsize=(16, 8))
panels = axes.ravel()
for ax, idx in zip(panels, indices):
    ax.imshow(frames[idx])
    ax.set_title(f"Step {idx}")
    ax.axis('off')
# The grid always has eight panels; blank the ones left unused by short
# episodes so they do not show up as empty tick-framed boxes.
for ax in panels[n_show:]:
    ax.axis('off')
plt.suptitle(f"Episode: {'WON!' if info.get('win') else 'LOST'}", fontsize=14)
plt.tight_layout()
plt.show()

## 5. Full Evaluation

In [None]:
from train import evaluate_and_record

# Run multiple evaluation episodes with the saved model and keep the
# per-episode results for the summary plots below.
saved_model_path = 'models/ppo_wumpus'
results = evaluate_and_record(saved_model_path, n_episodes=20)

In [None]:
# Show best episode (the last expression of the cell is rendered inline).
best_episode_gif_path = 'recordings/best_episode.gif'
Image(filename=best_episode_gif_path)

In [None]:
# Results summary: per-episode reward, cells explored, and overall win rate
# for the evaluation episodes in `results`.
fig, axes = plt.subplots(1, 3, figsize=(12, 4))

rewards = [r['reward'] for r in results]
explored = [r['explored'] for r in results]
wins = [1 if r['win'] else 0 for r in results]

# Reward per episode; bars are green for wins and salmon for losses.
axes[0].bar(range(len(rewards)), rewards, color=['green' if w else 'salmon' for w in wins])
axes[0].axhline(0, color='black', lw=0.5)
axes[0].set_xlabel('Episode')
axes[0].set_ylabel('Reward')
axes[0].set_title('Rewards by Episode')

# Cells explored per episode; the dashed line at 25 marks the reference level
# used by the original plot (the episode printout above also reports "/25 cells").
axes[1].bar(range(len(explored)), explored, color='steelblue')
axes[1].axhline(25, color='green', ls='--', alpha=0.5)
axes[1].set_xlabel('Episode')
axes[1].set_ylabel('Cells')
axes[1].set_title('Exploration by Episode')

# Overall win rate. An empty `results` list would otherwise raise
# ZeroDivisionError, so fall back to a placeholder panel in that case.
if wins:
    win_rate = sum(wins) / len(wins) * 100
    axes[2].pie([win_rate, 100-win_rate], labels=['Win', 'Loss'],
                colors=['forestgreen', 'salmon'], autopct='%.1f%%', startangle=90)
    axes[2].set_title(f'Win Rate: {win_rate:.1f}%')
else:
    win_rate = 0.0
    axes[2].text(0.5, 0.5, 'No episodes', ha='center', va='center',
                 transform=axes[2].transAxes)
    axes[2].axis('off')
    axes[2].set_title('Win Rate: n/a')

plt.tight_layout()
plt.show()

In [None]:
# End-of-notebook marker: reaching this line means every cell above ran
# without raising.
print("Done!")