In [2]:
import gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from typing import List, Callable, Dict, Any

class MAMLWrapper:
    """
    First-order meta-learning wrapper around a Stable-Baselines3 PPO policy.

    ``PPO.learn`` is not differentiable with respect to the weights it started
    from, so the second-order MAML gradient is not available here. The
    meta-update therefore uses a first-order (Reptile-style) approximation:
    after adapting a clone of the base policy to a task, the difference
    ``base - adapted`` is handed to the meta-optimizer as the gradient, which
    moves the base weights towards the adapted ones.

    All tasks must share the same observation and action spaces, because a
    single policy network is reused for every task.
    NOTE(review): SB3 is expected to reject mismatching spaces when an
    environment is attached to an existing model - confirm for the installed
    version.
    """

    def __init__(
        self,
        env_generator: Callable[[], gym.Env],
        num_tasks: int = 5,
        meta_lr: float = 1e-3,
        inner_lr: float = 1e-4,
        inner_steps: int = 1
    ):
        """
        Model-Agnostic Meta-Learning (MAML) Wrapper

        Args:
            env_generator: Function that generates different gym environments
            num_tasks: Number of tasks for meta-training (must be >= 1)
            meta_lr: Learning rate for meta-optimization
            inner_lr: Learning rate for task-specific adaptation
            inner_steps: Number of inner loop learn() calls (must be >= 1)

        Raises:
            ValueError: If ``num_tasks`` or ``inner_steps`` is smaller than 1.
        """
        # Fail early with a clear message instead of an IndexError on
        # tasks[0] or a ZeroDivisionError in meta_train later on.
        if num_tasks < 1:
            raise ValueError(f"num_tasks must be >= 1, got {num_tasks}")
        if inner_steps < 1:
            raise ValueError(f"inner_steps must be >= 1, got {inner_steps}")

        self.env_generator = env_generator
        self.num_tasks = num_tasks
        self.meta_lr = meta_lr
        self.inner_lr = inner_lr
        self.inner_steps = inner_steps

        # Generate tasks
        self.tasks = [self.env_generator() for _ in range(num_tasks)]

        # Initialize base PPO policy
        self.base_policy = self._create_base_policy()

    @staticmethod
    def _vectorize(env: gym.Env) -> DummyVecEnv:
        """Wrap a single gym environment in a one-element DummyVecEnv."""
        return DummyVecEnv([lambda: env])

    def _create_base_policy(self) -> PPO:
        """
        Create a base PPO policy that can be adapted across tasks

        Returns:
            Initialized PPO policy
        """
        # Use the first environment to set up base policy
        return PPO(
            "MlpPolicy",
            self._vectorize(self.tasks[0]),
            verbose=1,
            learning_rate=self.meta_lr
        )

    def _clone_policy(self, policy: PPO, env=None) -> PPO:
        """
        Create an independent copy of the policy for task-specific adaptation

        Args:
            policy: Original PPO policy
            env: Environment the clone should train on. Defaults to the
                environment already attached to ``policy``.

        Returns:
            Cloned PPO policy using ``inner_lr`` as its learning rate
        """
        cloned_policy = PPO(
            "MlpPolicy",
            policy.env if env is None else env,
            verbose=0,
            learning_rate=self.inner_lr
        )

        # Copy the network weights; the clone keeps its own fresh optimizer.
        cloned_policy.policy.load_state_dict(policy.policy.state_dict())

        return cloned_policy

    def meta_train(self, total_timesteps: int = 10000) -> PPO:
        """
        Perform meta-training across multiple tasks

        For every task a clone of the base policy is adapted on that task's
        own environment, then the base policy takes one first-order meta-step
        towards the adapted weights.

        Args:
            total_timesteps: Total training steps across all tasks

        Returns:
            The meta-trained base policy
        """
        # Track meta-parameters
        meta_params = list(self.base_policy.policy.parameters())
        meta_optimizer = optim.Adam(meta_params, lr=self.meta_lr)

        # Split the budget evenly over tasks and inner steps; never request
        # zero timesteps from learn().
        steps_per_call = max(
            1, total_timesteps // (self.num_tasks * self.inner_steps)
        )

        for task_idx, task_env in enumerate(self.tasks):
            # Clone base policy and attach the task's own environment so the
            # inner loop adapts to this task rather than to the base env.
            task_policy = self._clone_policy(
                self.base_policy, env=self._vectorize(task_env)
            )

            # Inner loop: Task-specific adaptation
            for _ in range(self.inner_steps):
                task_policy.learn(steps_per_call, reset_num_timesteps=False)

            # Meta-update: learn() returns the model, not a loss tensor, so
            # there is nothing to backpropagate. Use (base - adapted) as the
            # gradient instead, which steps the base towards the adapted
            # weights.
            meta_optimizer.zero_grad()
            for meta_param, adapted_param in zip(
                meta_params, task_policy.policy.parameters()
            ):
                meta_param.grad = (
                    meta_param.detach() - adapted_param.detach()
                ).clone()
            meta_optimizer.step()

            print(f"Completed meta-training for Task {task_idx + 1}")

        return self.base_policy

    def adapt(self, env: gym.Env, adaptation_steps: int = 1000) -> PPO:
        """
        Adapt base policy to a specific environment

        Args:
            env: Target environment
            adaptation_steps: Number of steps for adaptation

        Returns:
            Task-adapted PPO policy (the base policy itself is not modified)
        """
        # Attach the target environment to the clone; otherwise learn() would
        # keep training on the base policy's environment.
        adapted_policy = self._clone_policy(
            self.base_policy, env=self._vectorize(env)
        )

        # Perform task-specific adaptation
        adapted_policy.learn(adaptation_steps)

        return adapted_policy

# Example Usage with Different Gym Environments
def create_car_env():
    """Build and return a new CarRacing-v2 gym environment."""
    env_id = 'CarRacing-v2'
    car_env = gym.make(env_id)
    return car_env

def create_lunar_env():
    """Build and return a new LunarLander-v2 gym environment."""
    env_id = 'LunarLander-v2'
    lunar_env = gym.make(env_id)
    return lunar_env

# Demonstration of MAML with diverse environments
def main():
    """
    Demonstrate meta-training, adaptation and evaluation on LunarLander.

    Every task is a separate instance of the same environment so that all
    tasks share one observation space and one action space. A single
    MlpPolicy cannot be shared between CarRacing and LunarLander because
    their spaces differ.
    """
    # Local import: only this demo needs the evaluation helper.
    from stable_baselines3.common.evaluation import evaluate_policy

    # Initialize the wrapper with tasks that share observation/action spaces
    maml = MAMLWrapper(
        env_generator=create_lunar_env,
        num_tasks=2,
        meta_lr=1e-3,
        inner_lr=1e-4,
        inner_steps=2
    )

    # Meta-train the policy
    maml.meta_train(total_timesteps=20000)

    # Adapt to a specific environment
    test_env = create_lunar_env()
    adapted_policy = maml.adapt(test_env, adaptation_steps=5000)

    # Evaluate the adapted policy by running episodes, without further
    # training.
    vec_test_env = DummyVecEnv([lambda: test_env])
    adapted_policy.set_env(vec_test_env)
    mean_reward, std_reward = evaluate_policy(
        adapted_policy, vec_test_env, n_eval_episodes=10
    )
    print(f"Adapted policy reward: {mean_reward:.2f} +/- {std_reward:.2f}")

# Run the demonstration only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

DependencyNotInstalled: box2D is not installed, run `pip install gym[box2d]`