# 🤖 Language-Conditioned RL Agent Training

## Setup
1. **Runtime** → **Change runtime type** → **T4 GPU**
2. Run all cells in order
3. Download trained model at the end

**Time**: ~45-90 minutes for 200 iterations

In [None]:
!pip install minigrid gymnasium numpy torch tqdm matplotlib -q
!pip install 'ray[rllib]' -q
print("âœ… Dependencies installed!")
print("âœ… Dependencies installed!")

In [None]:
# Write the wrapper environment to disk as the importable package
# `minigrid_env` (the original note marks this as REQUIRED for workers).
import os

os.makedirs('minigrid_env', exist_ok=True)

# Source text of minigrid_env/flat_env.py. It defines MiniGridFlatEnv, which
# wraps a gym.make() environment and returns a flat float32 vector: the
# flattened "image" observation divided by 10 plus a 4-way one-hot direction.
env_code = '''
import gymnasium as gym
from gymnasium import spaces
import numpy as np
import minigrid

class MiniGridFlatEnv(gym.Env):
    def __init__(self, config=None):
        super().__init__()
        config = config or {}
        env_name = config.get("env_name", "BabyAI-GoToObj-v0")
        max_steps = config.get("max_steps", 64)
        self.env = gym.make(env_name, render_mode="rgb_array")
        self.env.unwrapped.max_steps = max_steps
        self.observation_space = spaces.Box(low=0.0, high=1.0, shape=(151,), dtype=np.float32)
        self.action_space = self.env.action_space
        self.instruction = ""
    
    def _flatten_obs(self, obs):
        image = obs["image"].flatten().astype(np.float32) / 10.0
        direction = np.zeros(4, dtype=np.float32)
        direction[obs["direction"]] = 1.0
        return np.concatenate([image, direction])
    
    def reset(self, *, seed=None, options=None):
        obs, info = self.env.reset(seed=seed, options=options)
        self.instruction = self.env.unwrapped.mission
        return self._flatten_obs(obs), info
    
    def step(self, action):
        obs, reward, term, trunc, info = self.env.step(action)
        return self._flatten_obs(obs), reward, term, trunc, info
'''

# (path, contents) pairs for the package, written in this order:
# first the __init__ that re-exports the class, then the module itself.
package_files = (
    ('minigrid_env/__init__.py', 'from minigrid_env.flat_env import MiniGridFlatEnv\n'),
    ('minigrid_env/flat_env.py', env_code),
)
for file_path, file_text in package_files:
    with open(file_path, 'w') as f:
        f.write(file_text)
print("âœ… Environment module created!")

In [None]:
# Smoke-test the generated package: make the working directory importable,
# build one environment, reset it with a fixed seed, and report the result.
import sys

sys.path.insert(0, '.')
from minigrid_env import MiniGridFlatEnv

env = MiniGridFlatEnv(dict(env_name="BabyAI-GoToObj-v0"))
obs, _ = env.reset(seed=42)
print(f"Obs shape: {obs.shape}, Instruction: {env.instruction}")
print("âœ… Environment works!")

In [None]:
import ray
from ray.rllib.algorithms.ppo import PPOConfig
from ray.tune.registry import register_env
import torch

# Shut down any Ray runtime that is already initialized (e.g. from an earlier
# run of this cell) before starting a new one.
if ray.is_initialized():
    ray.shutdown()
ray.init(ignore_reinit_error=True)

def env_creator(config):
    """Return a new MiniGridFlatEnv built from the given env config.

    The package is imported inside the function rather than at cell level --
    presumably so each process that calls the creator imports it itself;
    confirm against the worker setup.
    """
    import minigrid_env
    return minigrid_env.MiniGridFlatEnv(config)

# Register the creator under the name the PPO config refers to.
register_env("MiniGridFlat-v0", env_creator)

# Report whether a CUDA device is visible to torch, and which one.
cuda_ok = torch.cuda.is_available()
print(f"CUDA: {cuda_ok}")
if cuda_ok:
    print(f"GPU: {torch.cuda.get_device_name(0)}")
print("âœ… Ray initialized!")

In [None]:
# PPO Config - OLD API for stability
# The settings are gathered into plain dicts first and then applied to the
# PPOConfig builder one section at a time.
ppo_env_config = {"env_name": "BabyAI-GoToObj-v0", "max_steps": 64}
ppo_model = {"fcnet_hiddens": [256, 256], "fcnet_activation": "relu"}
ppo_training_args = dict(
    train_batch_size=2048,
    sgd_minibatch_size=256,
    lr=3e-4,
    gamma=0.99,
    clip_param=0.2,
    num_sgd_iter=10,
    entropy_coeff=0.01,
    model=ppo_model,
)
# Request one GPU only when torch can see a CUDA device.
gpu_count = 1 if torch.cuda.is_available() else 0

config = PPOConfig()
# Both new-stack switches are turned off (the "OLD API" noted above).
config = config.api_stack(
    enable_rl_module_and_learner=False,
    enable_env_runner_and_connector_v2=False,
)
config = config.environment(env="MiniGridFlat-v0", env_config=ppo_env_config)
config = config.framework("torch")
config = config.env_runners(num_env_runners=2, num_envs_per_env_runner=4)
config = config.training(**ppo_training_args)
config = config.resources(num_gpus=gpu_count)
print("âœ… Config created!")

In [None]:
# Build the algorithm object from `config`; `algo` is what the training,
# evaluation, and checkpointing cells below operate on.
print("Building PPO...")
algo = config.build()
print("âœ… PPO built!")

In [None]:
import os
os.makedirs('checkpoints', exist_ok=True)

NUM_ITERATIONS = 200
results = []


def _episode_metric(result, key):
    """Return an episode metric from an RLlib train() result, or 0 if absent.

    The metric is looked up under result["env_runners"] first, which is where
    recent RLlib releases report episode statistics, and then at the top level
    of the result dict for older releases. Reading only the top-level key
    silently yields 0 for every iteration on releases that nest the stats.

    Args:
        result: The dict returned by algo.train().
        key: Metric name, e.g. "episode_reward_mean".

    Returns:
        The metric value, or 0 when it is missing or None.
    """
    runner_stats = result.get("env_runners") or {}
    value = runner_stats.get(key)
    if value is None:
        value = result.get(key)
    return value or 0


print(f"Training for {NUM_ITERATIONS} iterations...")
print("="*50)

for i in range(NUM_ITERATIONS):
    result = algo.train()
    reward = _episode_metric(result, "episode_reward_mean")
    ep_len = _episode_metric(result, "episode_len_mean")
    # One record per iteration; the plotting and export cells read this list.
    results.append({"iter": i+1, "reward": reward, "len": ep_len})

    # Log every 10th iteration and checkpoint every 50th.
    if (i+1) % 10 == 0:
        print(f"Iter {i+1:3d}: reward={reward:7.3f}, len={ep_len:5.1f}")
    if (i+1) % 50 == 0:
        algo.save("checkpoints")
        print("  ðŸ’¾ Saved")

print("="*50)
print("âœ… Training complete!")

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Width of the moving-average window used for the smoothed curve.
window = 10
rewards = [entry["reward"] for entry in results]

plt.figure(figsize=(10, 4))
# Raw per-iteration reward, drawn faintly.
plt.plot(rewards, alpha=0.5)
if len(rewards) > window:
    # 'valid' mode drops the partial windows, so the smoothed series is
    # window - 1 points shorter; its x-range is shifted to match that length.
    kernel = np.ones(window) / window
    smooth = np.convolve(rewards, kernel, mode='valid')
    smooth_x = range(window // 2, len(rewards) - window // 2 + 1)
    plt.plot(smooth_x, smooth, 'r-', lw=2)
plt.xlabel("Iteration")
plt.ylabel("Reward")
plt.title("Training Curve")
plt.grid(True, alpha=0.3)
# Save before show() so the file is written from the populated figure.
plt.savefig("training_curve.png")
plt.show()
print(f"Final reward: {rewards[-1]:.3f}")

In [None]:
from tqdm import tqdm
from minigrid_env import MiniGridFlatEnv

# Number of evaluation episodes and the seed of the first one. Episode `ep`
# is reset with seed EVAL_SEED_OFFSET + ep.
NUM_EVAL_EPISODES = 100
EVAL_SEED_OFFSET = 1000

env = MiniGridFlatEnv({"env_name": "BabyAI-GoToObj-v0", "max_steps": 64})
successes = 0
for ep in tqdm(range(NUM_EVAL_EPISODES), desc="Evaluating"):
    obs, _ = env.reset(seed=EVAL_SEED_OFFSET + ep)
    done = False
    while not done:
        # NOTE(review): called with default arguments; whether the action is
        # sampled or greedy depends on the algorithm's exploration settings --
        # confirm this is what is wanted for evaluation.
        action = algo.compute_single_action(obs)
        obs, reward, term, trunc, _ = env.step(action)
        done = term or trunc
    # An episode counts as a success when it terminated (rather than being
    # truncated) and its final step returned a positive reward.
    if term and reward > 0:
        successes += 1

# Convert the count to a real percentage so the figure stays correct when
# NUM_EVAL_EPISODES is not 100; ":g" prints whole numbers without ".0".
success_rate = 100.0 * successes / NUM_EVAL_EPISODES if NUM_EVAL_EPISODES else 0.0
print(f"\nðŸ“Š Success Rate: {success_rate:g}%")

In [None]:
import json
algo.save("checkpoints/final")
with open("results.json", "w") as f:
    json.dump(results, f)
!zip -r trained_model.zip checkpoints/ results.json minigrid_env/
print("ðŸ“¦ Created trained_model.zip")

In [None]:
from google.colab import files

# Trigger a download for each artifact, archive first, then the plot.
for artifact in ('trained_model.zip', 'training_curve.png'):
    files.download(artifact)

In [None]:
# Stop the algorithm first, then shut down the Ray runtime it was built on.
algo.stop()
ray.shutdown()
print("âœ… Done!")