# 🤖 Language-Conditioned RL Agent Training

This notebook trains a language-conditioned agent on BabyAI environments.

## Setup Instructions

1. **Runtime** → **Change runtime type** → Select **T4 GPU**
2. Run all cells in order
3. Download the trained model weights at the end

**Expected Time:**
- PPO Training (200 iterations): ~30-45 minutes
- PPO Training (500 iterations): ~1-2 hours

## 1. Install Dependencies

In [None]:
# Install required packages
# Each line is a notebook shell escape (leading "!"); -q keeps pip's output quiet.
# No versions are pinned, so pip resolves whatever release is current at run time.
!pip install minigrid gymnasium numpy pandas tqdm -q
!pip install torch -q
!pip install 'ray[rllib]' -q

print("\nâœ… Dependencies installed!")

## 2. Create Environment Wrapper

**Key Fix**: RLlib requires a simple `Box` observation space, not `Dict`. We flatten the 7x7x3 image + direction into a single vector.

In [None]:
# Create RLlib-compatible environment with FLATTENED observation space
import gymnasium as gym
from gymnasium import spaces
import numpy as np
import minigrid

class MiniGridRLlibEnv(gym.Env):
    """
    MiniGrid wrapper with FLAT Box observation space for RLlib compatibility.

    Observation: Box(N,) = flattened image + one-hot direction (4).
        For the default 7x7x3 MiniGrid view this is 147 + 4 = 151.
    Action: Discrete(7) = left, right, forward, pickup, drop, toggle, done

    The mission string of the current episode is exposed through the
    ``instruction`` property; it is NOT part of the observation vector.

    Config keys (all optional):
        env_name:  Gymnasium id of the wrapped environment
                   (default "BabyAI-GoToObj-v0").
        max_steps: Value written to the wrapped env's ``max_steps``
                   attribute (default 64).
    """

    # Divisor used to bring the encoded image into [0, 1].
    # NOTE(review): assumes MiniGrid's encoded cell values never exceed 10 —
    # confirm against the installed minigrid version.
    _IMAGE_SCALE = 10.0
    # MiniGrid reports the agent heading as an integer in 0..3.
    _NUM_DIRECTIONS = 4

    def __init__(self, config=None):
        super().__init__()
        config = config or {}
        env_name = config.get("env_name", "BabyAI-GoToObj-v0")
        max_steps = config.get("max_steps", 64)

        # Create base environment
        self.env = gym.make(env_name, render_mode="rgb_array")
        self.env.unwrapped.max_steps = max_steps

        # Derive the flat size from the wrapped env's image space instead of
        # hard-coding 151, so the declared space always matches what
        # _flatten_obs() produces (7*7*3 + 4 = 151 for the default view).
        image_shape = self.env.observation_space["image"].shape
        flat_size = int(np.prod(image_shape)) + self._NUM_DIRECTIONS
        self.observation_space = spaces.Box(
            low=0.0, high=1.0, shape=(flat_size,), dtype=np.float32
        )
        self.action_space = self.env.action_space  # Discrete(7)

        self._current_instruction = ""

    def _flatten_obs(self, obs):
        """Convert a MiniGrid observation dict to a flat float32 vector.

        Reads ``obs["image"]`` and ``obs["direction"]``; any other keys are
        ignored.
        """
        # Scale the encoded image into [0, 1].
        image = obs["image"].flatten().astype(np.float32) / self._IMAGE_SCALE

        # One-hot encode direction (0-3)
        direction = np.zeros(self._NUM_DIRECTIONS, dtype=np.float32)
        direction[obs["direction"]] = 1.0

        return np.concatenate([image, direction])

    def reset(self, *, seed=None, options=None):
        """Reset the wrapped env and return ``(flat_obs, info)``."""
        obs, info = self.env.reset(seed=seed, options=options)
        # Cache the mission text so callers can read it via `instruction`.
        self._current_instruction = self.env.unwrapped.mission
        return self._flatten_obs(obs), info

    def step(self, action):
        """Step the wrapped env; only the observation is transformed."""
        obs, reward, terminated, truncated, info = self.env.step(action)
        return self._flatten_obs(obs), reward, terminated, truncated, info

    def render(self):
        """Return whatever the wrapped env renders (built with "rgb_array")."""
        return self.env.render()

    def close(self):
        """Close the wrapped environment so its resources are released."""
        self.env.close()

    @property
    def instruction(self):
        """Mission string captured at the most recent ``reset()``."""
        return self._current_instruction

# Test it
# Smoke test: build the wrapper with its default config, reset once with a
# fixed seed, and print the spaces, the observation shape and the mission text.
env = MiniGridRLlibEnv()
obs, info = env.reset(seed=42)
print(f"Observation space: {env.observation_space}")
print(f"Observation shape: {obs.shape}")
print(f"Action space: {env.action_space}")
print(f"Instruction: {env.instruction}")
print("\nâœ… Environment created!")

## 3. Configure and Build PPO

In [None]:
import ray
from ray.rllib.algorithms.ppo import PPOConfig
import torch

# Initialize Ray
# ignore_reinit_error lets this cell be re-run without restarting the runtime.
ray.init(ignore_reinit_error=True)

# This notebook relies on RLlib's old API stack: the `model=` dict,
# `num_sgd_iter`, `resources(num_gpus=...)` and, later on,
# `algo.compute_single_action()`. Ray is installed unpinned, and recent
# releases enable the new stack by default, so opt out explicitly. The
# hasattr() guard keeps the cell working on releases that predate api_stack().
base_config = PPOConfig()
if hasattr(base_config, "api_stack"):
    base_config = base_config.api_stack(
        enable_rl_module_and_learner=False,
        enable_env_runner_and_connector_v2=False,
    )

# Configure PPO with the flat observation environment
config = (
    base_config
    .environment(
        env=MiniGridRLlibEnv,
        env_config={
            "env_name": "BabyAI-GoToObj-v0",
            "max_steps": 64,
        },
    )
    .framework("torch")
    # 2 runners x 4 envs per runner = 8 environments sampled in parallel.
    .env_runners(
        num_env_runners=2,
        num_envs_per_env_runner=4,
    )
    .training(
        train_batch_size=2048,
        lr=3e-4,
        gamma=0.99,
        clip_param=0.2,
        num_sgd_iter=10,
        entropy_coeff=0.01,
        model={
            "fcnet_hiddens": [256, 256],
            "fcnet_activation": "relu",
        },
    )
    # Request one GPU only when CUDA is actually available.
    .resources(num_gpus=1 if torch.cuda.is_available() else 0)
)

print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
print("\nâœ… PPO config created!")

In [None]:
# Build the algorithm
# Turns the `config` object from the previous cell into the `algo` instance
# that the training, evaluation and save cells below operate on.
print("Building PPO algorithm...")
algo = config.build()
print("âœ… PPO algorithm built!")

## 4. Train!

In [None]:
import os

# NOTE(review): an absolute path is used because newer Ray releases appear to
# resolve checkpoint locations through pyarrow filesystem URIs, which do not
# accept bare relative paths — confirm against the installed Ray version.
# It still points at ./experiments/checkpoints, which the zip step expects.
CHECKPOINT_DIR = os.path.abspath('experiments/checkpoints')
os.makedirs(CHECKPOINT_DIR, exist_ok=True)

# Training parameters
NUM_ITERATIONS = 200  # Increase to 500+ for better results


def _read_metric(result, *keys):
    """Return the first non-None value stored under any of `keys`.

    Looks in result["env_runners"] first and then at the top level of
    `result`, matching the two layouts the original lookup handled.
    Returns 0.0 when none of the keys holds a value, so callers can always
    format the result as a number.
    """
    runner_metrics = result.get("env_runners") or {}
    for source in (runner_metrics, result):
        for key in keys:
            value = source.get(key)
            if value is not None:
                return value
    return 0.0


results = []
best_reward = float('-inf')

print(f"Starting training for {NUM_ITERATIONS} iterations...\n")

for i in range(NUM_ITERATIONS):
    result = algo.train()

    # Extract metrics. "episode_return_mean" is the new-API-stack name of
    # "episode_reward_mean"; accept either so the curve is not silently all 0.
    reward_mean = _read_metric(result, "episode_reward_mean", "episode_return_mean")
    episode_len = _read_metric(result, "episode_len_mean")
    timesteps = result.get("timesteps_total")
    if timesteps is None:
        timesteps = result.get("num_env_steps_sampled_lifetime", 0)

    results.append({
        "iteration": i + 1,
        "reward_mean": reward_mean,
        "episode_len": episode_len,
        "timesteps": timesteps,
    })

    # Progress update every 10 iterations
    if (i + 1) % 10 == 0:
        print(f"Iter {i+1:3d}/{NUM_ITERATIONS}: reward={reward_mean:6.2f}, len={episode_len:5.1f}, steps={timesteps}")

    # Save checkpoint every 50 iterations (each save reuses the same directory)
    if (i + 1) % 50 == 0:
        checkpoint = algo.save(CHECKPOINT_DIR)
        print(f"  ðŸ’¾ Checkpoint saved!")

    # Track best. A plain comparison lets a mean reward of exactly 0.0 count
    # (the old truthiness test skipped it, leaving best_reward at -inf), while
    # NaN still compares False and is ignored.
    if reward_mean > best_reward:
        best_reward = reward_mean

print(f"\nâœ… Training complete!")
print(f"Best reward: {best_reward:.2f}")

## 5. Visualize Results

In [None]:
import matplotlib.pyplot as plt

# Pull the three per-iteration series out of the training log.
iterations = [entry["iteration"] for entry in results]
rewards = [entry["reward_mean"] for entry in results]
ep_lens = [entry["episode_len"] for entry in results]

plt.figure(figsize=(12, 5))

# Left panel: raw mean reward, plus a moving average once there is enough data.
plt.subplot(1, 2, 1)
plt.plot(iterations, rewards, 'b-', alpha=0.7)
window = 10
if len(rewards) > window:
    # 'valid' mode yields len(rewards) - 9 points, drawn at x = 10 .. len(rewards).
    kernel = np.ones(window) / window
    smooth_rewards = np.convolve(rewards, kernel, mode='valid')
    plt.plot(
        range(window, len(rewards) + 1),
        smooth_rewards,
        'r-',
        linewidth=2,
        label='Smoothed',
    )
plt.title("PPO Training Curve")
plt.xlabel("Iteration")
plt.ylabel("Mean Episode Reward")
plt.grid(True, alpha=0.3)
plt.legend()

# Right panel: mean episode length per iteration.
plt.subplot(1, 2, 2)
plt.plot(iterations, ep_lens, 'g-', alpha=0.7)
plt.title("Episode Length (lower = faster completion)")
plt.xlabel("Iteration")
plt.ylabel("Mean Episode Length")
plt.grid(True, alpha=0.3)

# Write the figure to disk before showing it; the download cell needs the file.
plt.tight_layout()
plt.savefig("training_curve.png", dpi=150)
plt.show()

print(f"\nFinal reward: {rewards[-1]:.2f}")
print(f"Final episode length: {ep_lens[-1]:.1f}")

## 6. Evaluate

In [None]:
# Evaluate the trained agent
from tqdm import tqdm

def evaluate(algo, num_episodes=100, env=None):
    """Roll out `algo` for `num_episodes` episodes and summarise the outcome.

    Args:
        algo: Object exposing ``compute_single_action(obs)``.
        num_episodes: Number of episodes to run. Values <= 0 run nothing and
            yield all-zero metrics instead of dividing by zero.
        env: Optional environment with ``reset(seed=...)`` / ``step(action)``.
            When omitted, a ``MiniGridRLlibEnv`` on "BabyAI-GoToObj-v0" with
            ``max_steps=64`` is created and closed again before returning.
            An environment passed in by the caller is left open.

    Returns:
        dict with ``success_rate``, ``mean_reward`` and ``mean_steps``.
        An episode counts as a success when it ends with ``terminated`` set
        and a positive accumulated reward.
    """
    if num_episodes <= 0:
        return {"success_rate": 0.0, "mean_reward": 0.0, "mean_steps": 0.0}

    # The progress bar is cosmetic: fall back to a plain iterator if tqdm is
    # not importable so evaluation itself never depends on it.
    try:
        from tqdm import tqdm as progress
    except ImportError:
        def progress(iterable, **_kwargs):
            return iterable

    owns_env = env is None
    if owns_env:
        env = MiniGridRLlibEnv({"env_name": "BabyAI-GoToObj-v0", "max_steps": 64})

    successes = 0
    total_reward = 0.0
    total_steps = 0

    try:
        for ep in progress(range(num_episodes), desc="Evaluating"):
            # Fixed per-episode seeds (1000, 1001, ...) make runs repeatable.
            obs, info = env.reset(seed=1000 + ep)
            terminated = truncated = False
            ep_reward = 0.0
            steps = 0

            while not (terminated or truncated):
                action = algo.compute_single_action(obs)
                obs, reward, terminated, truncated, info = env.step(action)
                ep_reward += reward
                steps += 1

            if terminated and ep_reward > 0:
                successes += 1
            total_reward += ep_reward
            total_steps += steps
    finally:
        # Only release an environment this function created itself.
        if owns_env:
            env.close()

    return {
        "success_rate": successes / num_episodes,
        "mean_reward": float(total_reward / num_episodes),
        "mean_steps": total_steps / num_episodes,
    }

# Run 100 evaluation episodes with the trained `algo` and print a summary table.
print("\nðŸ“Š Evaluating trained agent...\n")
eval_results = evaluate(algo, num_episodes=100)

# success_rate is formatted as a percentage; the other two are plain floats.
print(f"\n{'='*40}")
print(f"ðŸ“Š EVALUATION RESULTS")
print(f"{'='*40}")
print(f"Success Rate: {eval_results['success_rate']:.1%}")
print(f"Mean Reward:  {eval_results['mean_reward']:.3f}")
print(f"Mean Steps:   {eval_results['mean_steps']:.1f}")
print(f"{'='*40}")

## 7. Save & Download

In [None]:
# Save final checkpoint
import json

final_checkpoint = algo.save("experiments/checkpoints/final")
print(f"ðŸ’¾ Final checkpoint saved: {final_checkpoint}")

# Save evaluation results
with open("eval_results.json", "w") as f:
    json.dump(eval_results, f, indent=2)

# Save training results
with open("training_results.json", "w") as f:
    json.dump(results, f, indent=2)

# Zip for download
!zip -r trained_model.zip experiments/checkpoints/ eval_results.json training_results.json
print("\nðŸ“¦ Model zipped: trained_model.zip")

In [None]:
# Download files
# google.colab is imported here, so this cell only works where that package
# is importable; both files must already exist in the working directory.
from google.colab import files

print("Downloading files...")
files.download('trained_model.zip')
files.download('training_curve.png')
print("\nâœ… Download complete!")

In [None]:
# Cleanup
# Stop the algorithm first, then shut Ray down. Run this last: `algo` cannot
# be used for training or evaluation afterwards.
algo.stop()
ray.shutdown()
print("\nâœ… Done! Your trained model is in trained_model.zip")