<a href="https://colab.research.google.com/github/akhil4674/Imitation-Learning/blob/main/Nvidia_GTC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np

# **Assumptions**:
# - States and actions are continuous and normalized.
# - Expert trajectories are available as `(state, action, next_state)` tuples.

### 1. Dataset Class for Expert Trajectories
class ExpertTrajectoryDataset(Dataset):
    def __init__(self, states, actions, next_states):
        self.states = torch.tensor(states, dtype=torch.float32)
        self.actions = torch.tensor(actions, dtype=torch.float32)
        self.next_states = torch.tensor(next_states, dtype=torch.float32)

    def __len__(self):
        return len(self.states)

    def __getitem__(self, idx):
        return self.states[idx], self.actions[idx], self.next_states[idx]

### 2. Forward (Transition) Model
class TransitionModel(nn.Module):
    def __init__(self, state_dim, action_dim, hidden_dim=64):
        super(TransitionModel, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(state_dim + action_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, state_dim)
        )

    def forward(self, state_action):
        return self.model(state_action)

### 3. Reverse (Policy) Model
class PolicyModel(nn.Module):
    def __init__(self, state_dim, action_dim, hidden_dim=64):
        super(PolicyModel, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(state_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, action_dim)
        )

    def forward(self, state):
        return self.model(state)

### 4. Training
def train(models, device, loader, optimizers, epochs=100):
    transition_model, policy_model = models
    transition_optimizer, policy_optimizer = optimizers

    for epoch in range(epochs):
        for batch in loader:
            states, actions, next_states = [b.to(device) for b in batch]

            # Forward (Transition) Model Training
            transition_optimizer.zero_grad()
            pred_next_states = transition_model(torch.cat((states, actions), dim=1))
            loss_transition = ((pred_next_states - next_states) ** 2).mean()
            loss_transition.backward()
            transition_optimizer.step()

            # Reverse (Policy) Model Training
            # **Simplified Objective**: Minimize the difference between predicted actions and actual actions.
            policy_optimizer.zero_grad()
            pred_actions = policy_model(states)
            loss_policy = ((pred_actions - actions) ** 2).mean()
            loss_policy.backward()
            policy_optimizer.step()

        print(f"Epoch {epoch+1}, Transition Loss: {loss_transition.item():.4f}, Policy Loss: {loss_policy.item():.4f}")

### Example Usage
if __name__ == "__main__":
    # **Mock Data**
    np.random.seed(0)
    states = np.random.rand(100, 5)  # Assuming 5D state space
    actions = np.random.rand(100, 3)  # Assuming 3D action space
    next_states = np.random.rand(100, 5)

    # Setup
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    dataset = ExpertTrajectoryDataset(states, actions, next_states)
    loader = DataLoader(dataset, batch_size=32, shuffle=True)

    transition_model = TransitionModel(state_dim=5, action_dim=3).to(device)

    policy_model = PolicyModel(state_dim=5, action_dim=3).to(device)

    transition_optimizer = optim.Adam(transition_model.parameters(), lr=0.001)
    policy_optimizer = optim.Adam(policy_model.parameters(), lr=0.001)

    train(models=(transition_model, policy_model),
          device=device,
          loader=loader,
          optimizers=(transition_optimizer, policy_optimizer),
          epochs=500)
### Evaluation (Simplified)
def evaluate_policy(policy_model, eval_env, device, num_episodes=100):
    total_reward = 0
    for _ in range(num_episodes):
        state = eval_env.reset()
        done = False
        episode_reward = 0
        while not done:
            action = policy_model(torch.tensor(state, dtype=torch.float32).to(device))
            action = action.cpu().detach().numpy()
            state, reward, done, _ = eval_env.step(action)
            episode_reward += reward
        total_reward += episode_reward
    avg_reward = total_reward / num_episodes
    return avg_reward

### Deployment (Conceptual)
def deploy_policy(policy_model, deployment_env, device, episodes=1000):
    for episode in range(episodes):
        state = deployment_env.reset()
        done = False
        while not done:
            action = policy_model(torch.tensor(state, dtype=torch.float32).to(device))
            action = action.cpu().detach().numpy()
            state, _, done, _ = deployment_env.step(action)
            # Optionally, log or display the environment's response to the action
        # Optionally, log or display episode end status

### Example Usage for Evaluation and Deployment
if __name__ == "__main__":
    # Assuming eval_env and deployment_env are set up appropriately
    avg_reward = evaluate_policy(policy_model, eval_env, device)
    print(f"Average Reward in Evaluation: {avg_reward}")

    deploy_policy(policy_model, deployment_env, device)



Epoch 1, Transition Loss: 0.3361, Policy Loss: 0.2376
Epoch 2, Transition Loss: 0.4149, Policy Loss: 0.1259
Epoch 3, Transition Loss: 0.2296, Policy Loss: 0.2165
Epoch 4, Transition Loss: 0.2078, Policy Loss: 0.1064
Epoch 5, Transition Loss: 0.1527, Policy Loss: 0.0908
Epoch 6, Transition Loss: 0.1745, Policy Loss: 0.1179
Epoch 7, Transition Loss: 0.1574, Policy Loss: 0.0963
Epoch 8, Transition Loss: 0.1265, Policy Loss: 0.0722
Epoch 9, Transition Loss: 0.1014, Policy Loss: 0.0874
Epoch 10, Transition Loss: 0.1416, Policy Loss: 0.1390
Epoch 11, Transition Loss: 0.0693, Policy Loss: 0.1076
Epoch 12, Transition Loss: 0.1392, Policy Loss: 0.0734
Epoch 13, Transition Loss: 0.0914, Policy Loss: 0.1302
Epoch 14, Transition Loss: 0.0704, Policy Loss: 0.0914
Epoch 15, Transition Loss: 0.0976, Policy Loss: 0.0632
Epoch 16, Transition Loss: 0.1229, Policy Loss: 0.0577
Epoch 17, Transition Loss: 0.0680, Policy Loss: 0.0805
Epoch 18, Transition Loss: 0.1024, Policy Loss: 0.0776
Epoch 19, Transitio

NameError: name 'eval_env' is not defined