# exploration.py

Auto-generated implementation from the Agentic RL PhD codebase.

### Original Implementations & References
The following links point to the official or high-quality reference implementations for the papers covered in this notebook:

- ICM: https://github.com/pathak22/noreward-rl
- RND: https://github.com/openai/random-network-distillation

*Note: The code below is a simplified pedagogical implementation.*

In [None]:
import torch
import torch.nn as nn

# Papers:
# 1. "Curiosity-driven Exploration by Self-supervised Prediction" (ICM)
# 2. "Exploration by Random Network Distillation" (RND)
# 3. "Beyond-Expert Performance with Limited Demonstrations: Efficient
#    Imitation Learning with Double Exploration" (ILDE)

class ICM(nn.Module):
    """Intrinsic Curiosity Module (Pathak et al., 2017), simplified.

    The curiosity signal is the forward-dynamics prediction error measured in
    a learned feature space: a transition whose next-state features are
    poorly predicted yields a large intrinsic reward.

    Args:
        state_dim: Size of a flat state vector.
        action_dim: Width of the action vector fed to the forward model
            (the number of actions when actions are one-hot encoded).
        feature_dim: Width of the feature space ``phi`` and of the hidden
            layers. Defaults to 128.

    Attributes:
        action_dim: Stored so integer action indices can be one-hot encoded.
        encoder: Linear map from a state to ``feature_dim`` features.
        forward_model: Predicts ``phi(s_next)`` from ``[phi(s), action]``.
        inverse_model: Maps ``[phi(s), phi(s_next)]`` to ``action_dim``
            outputs. It is only constructed here; no method of this class
            calls it.
    """

    def __init__(self, state_dim, action_dim, *, feature_dim=128):
        super().__init__()
        self.action_dim = action_dim
        self.encoder = nn.Linear(state_dim, feature_dim)

        # Forward model: predict the next state's features.
        # NOTE(review): two Linear layers with no activation in between
        # compose to a single affine map. The layout is left untouched so
        # that existing state_dict keys keep loading.
        self.forward_model = nn.Sequential(
            nn.Linear(feature_dim + action_dim, feature_dim),
            nn.Linear(feature_dim, feature_dim),  # Predicts phi(s_next)
        )

        # Inverse model: predict the action taken between two states.
        self.inverse_model = nn.Sequential(
            nn.Linear(2 * feature_dim, feature_dim),
            nn.Linear(feature_dim, action_dim),
        )

    def intrinsic_reward(self, state, next_state, action):
        """Return the per-sample forward-model error used as the reward.

        Args:
            state: Batch of states, shape ``(batch, state_dim)``.
            next_state: Batch of successor states, same shape as ``state``.
            action: Either a ``(batch, action_dim)`` tensor, which is
                concatenated to the features as-is, or a 1-D integer tensor
                of ``batch`` action indices, which is one-hot encoded to
                ``action_dim`` columns first.

        Returns:
            Tensor of shape ``(batch,)`` holding the mean squared error
            between predicted and actual next-state features. The result is
            not detached from the autograd graph.
        """
        phi_s = self.encoder(state)
        phi_next = self.encoder(next_state)

        # Discrete actions usually arrive as indices; the forward model
        # needs a (batch, action_dim) float block to concatenate.
        if action.dim() == 1 and not action.is_floating_point():
            action = nn.functional.one_hot(
                action.long(), num_classes=self.action_dim
            ).to(phi_s.dtype)

        # Predict next-state features from current features and action.
        pred_phi_next = self.forward_model(torch.cat([phi_s, action], dim=1))

        # Reward is the MSE between predicted and actual features.
        reward = (pred_phi_next - phi_next).pow(2).mean(dim=1)
        return reward

class RND(nn.Module):
    """Random Network Distillation (Burda et al., 2018), simplified.

    A frozen, randomly initialised target network and a trainable predictor
    with the same architecture both embed the state. The predictor's error
    on the target's output is the exploration bonus; the intent is that
    novel states give a high error and therefore a high reward.

    Args:
        state_dim: Size of a flat state vector.
        hidden_dim: Width of the hidden layer in both networks.
            Defaults to 128.
        output_dim: Width of the embedding both networks produce.
            Defaults to 128.

    Attributes:
        target: Fixed network; its parameters have ``requires_grad=False``.
        predictor: Trainable network meant to be fitted to ``target``.
    """

    def __init__(self, state_dim, *, hidden_dim=128, output_dim=128):
        super().__init__()
        # Target is built first and predictor second, so seeded
        # initialisation matches the previous hard-coded layout.
        self.target = self._build_net(state_dim, hidden_dim, output_dim)
        # Target network stays at its random initial weights.
        self.target.requires_grad_(False)

        # Predictor network (trained to match the target).
        self.predictor = self._build_net(state_dim, hidden_dim, output_dim)

    @staticmethod
    def _build_net(state_dim, hidden_dim, output_dim):
        """Return the two-layer ReLU MLP shared by target and predictor."""
        return nn.Sequential(
            nn.Linear(state_dim, hidden_dim), nn.ReLU(),
            nn.Linear(hidden_dim, output_dim),
        )

    def intrinsic_reward(self, state):
        """Return the per-sample prediction error used as the reward.

        Args:
            state: Batch of states, shape ``(batch, state_dim)``.

        Returns:
            Tensor of shape ``(batch,)`` holding the mean squared difference
            between the target and predictor outputs.
        """
        target_out = self.target(state)
        pred_out = self.predictor(state)

        # Reward = prediction error.
        # Novel states -> high error -> high reward.
        reward = (target_out - pred_out).pow(2).mean(dim=1)
        return reward

class ILDE(nn.Module):
    """Placeholder for ILDE ("Beyond-Expert Performance...", 2025).

    The innovation named for the paper is double exploration: an optimistic
    bonus together with a curiosity bonus. No layers or methods are defined
    yet; constructing the class only initialises the ``nn.Module`` base.
    """

    def __init__(self):
        # Intended design: combine an exploration bonus (like RND) with an
        # imitation objective. Nothing is implemented so far.
        super().__init__()
