## Imitation Learning (Behavior Cloning)

In [38]:
import gymnasium as gym
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde
import random


# Define the environment and expert policy
env = gym.make("MountainCar-v0")


# Function to collect expert trajectories
def collect_expert_data(env, policy, num_episodes=100):
    """Roll out ``policy`` in ``env`` and record every state it acts on.

    Args:
        env: Environment following the Gymnasium API: ``reset()`` returns
            ``(state, info)`` and ``step(action)`` returns
            ``(state, reward, terminated, truncated, info)``.
        policy: Callable mapping a state to an action.
        num_episodes: Number of episodes to roll out.

    Returns:
        ``np.ndarray`` of the visited states in visiting order, one row per
        environment step. The state produced by the last step of an episode
        is not recorded because the policy never acts on it.
    """
    states = []
    for _ in range(num_episodes):
        state, _ = env.reset()  # Gymnasium reset returns (state, info)
        done = False
        while not done:
            action = policy(state)
            states.append(state)
            state, _, terminated, truncated, _ = env.step(action)
            # The episode is over when the task ends (terminated) OR when it
            # is cut off, e.g. by a time limit (truncated). Ignoring
            # `truncated` would keep stepping a finished episode, possibly
            # forever if the policy never reaches a terminal state.
            done = terminated or truncated
    return np.array(states)




## State visitation histogram under the expert policy

In [None]:

def expert_policy(state):
    """Example expert: push right (2) while state[1] is positive, else push left (0)."""
    return 2 if state[1] > 0 else 0


# Roll out the expert for the collector's default number of episodes (100)
states = collect_expert_data(env, expert_policy)

# The two state columns are plotted as position (x) and velocity (y)
x = states[:, 0]
y = states[:, 1]

# Fit a 2-D Gaussian KDE to the samples and evaluate it on a 100 x 100 grid
# that spans the observed range of both coordinates
kde = gaussian_kde([x, y])
grid_x = np.linspace(x.min(), x.max(), 100)
grid_y = np.linspace(y.min(), y.max(), 100)
xx, yy = np.meshgrid(grid_x, grid_y)
grid_points = np.vstack([xx.ravel(), yy.ravel()])
zz = kde(grid_points).reshape(xx.shape)

# Filled density contours with the raw samples scattered on top
fig, ax = plt.subplots(figsize=(10, 6))
density = ax.contourf(xx, yy, zz, levels=50, cmap="Blues")
fig.colorbar(density, ax=ax, label="Density")
ax.scatter(x, y, s=1, color="red", label="State Samples")
ax.set_xlabel("Position")
ax.set_ylabel("Velocity")
ax.set_title("Density Contour of Expert Policy Trajectory")
ax.legend()
ax.grid(True)
plt.show()

# One recorded state per environment step, so this is the transition count
num_data_points = states.shape[0]
print(f"the number of data points is: {num_data_points}")


## State visitation histogram under a noisy version of the expert policy

In [None]:

# Define the noisy expert policy
epsilon = 0.2  # Probability of choosing a random action
actions = [0, 1, 2]  # Possible actions: 0 (left), 1 (no push), 2 (right)


def noisy_expert_policy(state, _epsilon=epsilon, _actions=tuple(actions)):
    """Epsilon-greedy version of the expert rule.

    With probability ``_epsilon`` a uniformly random action from ``_actions``
    is returned; otherwise the expert rule applies (2 if ``state[1] > 0``,
    else 0).

    ``_epsilon`` and ``_actions`` are bound as defaults when the function is
    defined, so the policy keeps sampling from {0, 1, 2} even after the
    global name ``actions`` is rebound (later cells reuse that name for
    arrays of recorded actions). Callers pass ``state`` only.
    """
    if random.random() < _epsilon:
        return random.choice(_actions)
    return 2 if state[1] > 0 else 0


# Collect trajectories using the noisy expert policy
states_noisy = collect_expert_data(env, noisy_expert_policy)

# Calculate the number of transition data points for the noisy policy
num_data_points_noisy = states_noisy.shape[0]

# Extract position and velocity
x_noisy, y_noisy = states_noisy[:, 0], states_noisy[:, 1]

# Create a density plot for the noisy policy.
# The density is evaluated on the existing grid (xx, yy), i.e. the grid built
# from the expert samples, so noisy samples outside that range get no contour.
kde_noisy = gaussian_kde([x_noisy, y_noisy])
zz_noisy = kde_noisy(np.vstack([xx.ravel(), yy.ravel()])).reshape(xx.shape)

# Plot the density contour for the noisy expert policy
plt.figure(figsize=(10, 6))
plt.contourf(xx, yy, zz_noisy, levels=50, cmap="Blues")
plt.colorbar(label="Density (Noisy Expert Policy)")
plt.scatter(x_noisy, y_noisy, s=1, color="red", label="State Samples (Noisy)")
plt.xlabel("Position")
plt.ylabel("Velocity")
plt.title("Density Contour of Noisy Expert Policy Trajectory")
plt.legend()
plt.grid(True)
plt.show()

print(f"the number of data points is: {num_data_points_noisy}")


# Now let's collect data and train a policy using behavior cloning

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, random_split
import matplotlib.pyplot as plt
import numpy as np

# Run on the GPU when PyTorch can see one, otherwise stay on the CPU
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(f"Using device: {device}")

# Dataset class
class ExpertDataset(Dataset):
    """Dataset of (state, action) pairs for supervised policy training.

    Args:
        states: Array-like of states, one row per sample; stored as a
            ``float32`` tensor.
        actions: Array-like of integer action labels, one per state; stored
            as a ``long`` tensor.

    Raises:
        ValueError: If ``states`` and ``actions`` differ in length.
    """

    def __init__(self, states, actions):
        self.states = torch.tensor(states, dtype=torch.float32)
        self.actions = torch.tensor(actions, dtype=torch.long)
        # Fail here with a clear message; a mismatch would otherwise only
        # show up as an IndexError while batches are being drawn.
        if len(self.states) != len(self.actions):
            raise ValueError(
                f"states and actions must have the same length, "
                f"got {len(self.states)} and {len(self.actions)}"
            )

    def __len__(self):
        return len(self.states)

    def __getitem__(self, idx):
        return self.states[idx], self.actions[idx]

# Collect expert data
def collect_expert_data_state_action(env, policy, num_episodes=10, max_steps=1000):
    """Roll out ``policy`` in ``env`` and record (state, action) pairs.

    Args:
        env: Environment following the Gymnasium API: ``reset()`` returns
            ``(state, info)`` and ``step(action)`` returns
            ``(state, reward, terminated, truncated, info)``.
        policy: Callable mapping a state to an action.
        num_episodes: Number of episodes to roll out.
        max_steps: Upper bound on the steps recorded per episode, applied in
            addition to the environment's own termination/truncation.

    Returns:
        Tuple ``(states, actions)`` of ``np.ndarray``; ``actions[i]`` is the
        action ``policy`` chose in ``states[i]``.
    """
    states, actions = [], []
    for _ in range(num_episodes):
        state, _ = env.reset()
        for _step in range(max_steps):
            action = policy(state)
            states.append(state)
            actions.append(action)
            state, _, terminated, truncated, _ = env.step(action)
            # Stop on either end-of-episode signal; stepping an environment
            # after it reported truncation is outside the Gymnasium contract.
            if terminated or truncated:
                break
    return np.array(states), np.array(actions)
 


### Collect a dataset and split into training and validation datasets

In [42]:


# Roll out the expert for 20 episodes to obtain (state, action) training pairs
states, actions = collect_expert_data_state_action(env, expert_policy, num_episodes=20)

# Wrap the pairs in a dataset and hold out the samples beyond the first 80% share
dataset = ExpertDataset(states, actions)
num_samples = len(dataset)
train_size = int(0.8 * num_samples)
val_size = num_samples - train_size
train_dataset, val_dataset = random_split(dataset, lengths=[train_size, val_size])

# Only the training batches are reshuffled each epoch
train_loader = DataLoader(dataset=train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=64, shuffle=False)


### Define policy network

In [43]:

# Define a policy network
class PolicyNetwork(nn.Module):
    """Fully connected network mapping a state vector to one score per action.

    Two hidden layers of 64 units with ReLU activations, followed by a linear
    output layer of size ``action_dim``. No softmax is applied in ``forward``.
    """

    def __init__(self, state_dim, action_dim):
        super().__init__()
        hidden_dim = 64
        layers = [
            nn.Linear(state_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, action_dim),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        scores = self.net(x)
        return scores

# Size the network from the data: one input per state column, one output per action
action_dim = 3  # For MountainCar-v0, actions are 0, 1, 2
state_dim = states.shape[1]
policy_net = PolicyNetwork(state_dim, action_dim)
policy_net = policy_net.to(device)




### Define loss function and optimization method

In [None]:

# Cross-entropy over the network outputs, optimised with Adam
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(policy_net.parameters(), lr=1e-3)

# Per-epoch mean batch losses, kept for the loss curves
num_epochs = 30
train_losses, val_losses = [], []

for epoch in range(num_epochs):
    # ---- training pass ----
    policy_net.train()
    train_loss = 0
    for states_batch, actions_batch in train_loader:
        states_batch = states_batch.to(device)
        actions_batch = actions_batch.to(device)
        # Labels must be valid class indices for the output layer
        assert actions_batch.max() < action_dim, "Target out of bounds for the output layer"
        assert actions_batch.min() >= 0, "Target contains negative values"

        optimizer.zero_grad()
        loss = criterion(policy_net(states_batch), actions_batch)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
    train_losses.append(train_loss / len(train_loader))

    # ---- validation pass (no gradient tracking, no parameter updates) ----
    policy_net.eval()
    with torch.no_grad():
        val_batch_losses = [
            criterion(policy_net(states_batch.to(device)), actions_batch.to(device)).item()
            for states_batch, actions_batch in val_loader
        ]
    val_loss = sum(val_batch_losses)
    val_losses.append(val_loss / len(val_loader))

    # Report on the first epoch and on every fifth epoch
    is_first_epoch = epoch == 0
    is_fifth_epoch = (epoch + 1) % 5 == 0
    if is_first_epoch or is_fifth_epoch:
        print(f"Epoch {epoch + 1}/{num_epochs}:")
        print(f"  Training Loss: {train_losses[-1]:.4f}")
        print(f"  Validation Loss: {val_losses[-1]:.4f}")


In [None]:

# Loss curves: one point per epoch for each split
epoch_axis = range(1, num_epochs + 1)
fig, ax = plt.subplots(figsize=(10, 6))
for curve, curve_label in ((train_losses, "Training Loss"), (val_losses, "Validation Loss")):
    ax.plot(epoch_axis, curve, label=curve_label)
ax.set_xlabel("Epochs")
ax.set_ylabel("Loss")
ax.set_title("Training and Validation Loss Curves")
ax.legend()
ax.grid(True)
plt.show()


In [None]:
import matplotlib.pyplot as plt

 

# Define the learned policy: greedy action of the trained network
def learned_policy(state):
    """Return the index of the highest-scoring output of ``policy_net`` for ``state``.

    ``policy_net`` and ``device`` are looked up when the function is called,
    so the policy always uses whichever network the name ``policy_net``
    currently refers to.
    """
    # Inference only: avoid building an autograd graph on every environment step
    with torch.no_grad():
        state_tensor = torch.tensor(state, dtype=torch.float32).to(device)
        return torch.argmax(policy_net(state_tensor)).item()



# Adjusted evaluation function
def evaluate_policy(env, policy, num_episodes=20, max_steps=1000):
    """Run ``policy`` in ``env`` and return the total reward of each episode.

    Args:
        env: Environment following the Gymnasium API: ``reset()`` returns
            ``(state, info)`` and ``step(action)`` returns
            ``(state, reward, terminated, truncated, info)``.
        policy: Callable mapping a state to an action.
        num_episodes: Number of evaluation episodes.
        max_steps: Safety cap on the steps per episode, applied in addition
            to the environment's own termination/truncation.

    Returns:
        List with one summed reward per episode.
    """
    total_rewards = []
    for _ in range(num_episodes):
        state, _ = env.reset()  # Reset the environment
        total_reward = 0
        for _step in range(max_steps):
            action = policy(state)
            state, reward, terminated, truncated, _ = env.step(action)
            total_reward += reward
            # End on either signal: rewards gathered after the environment
            # reported truncation would not belong to a valid episode.
            if terminated or truncated:
                break
        total_rewards.append(total_reward)
    return total_rewards

# Roll out both policies for the same number of evaluation episodes
expert_rewards = evaluate_policy(env, expert_policy, num_episodes=20)
learned_rewards = evaluate_policy(env, learned_policy, num_episodes=20)

# Overlay the two per-episode reward distributions
fig, ax = plt.subplots(figsize=(10, 6))
for episode_rewards, series_label in (
    (expert_rewards, "Expert Policy"),
    (learned_rewards, "Learned Policy"),
):
    ax.hist(episode_rewards, bins=10, alpha=0.7, label=series_label, edgecolor='black')
ax.set_xlabel("Total Rewards")
ax.set_ylabel("Frequency")
ax.set_title("Histogram of Rewards for Expert and Learned Policies")
ax.legend()
ax.grid(True)
plt.show()

# Average total reward per policy
mean_expert_rewards = np.mean(expert_rewards)
mean_learned_rewards = np.mean(learned_rewards)

print(f"Mean Total Reward:\n"
      f"Expert Policy: {mean_expert_rewards:.2f}\n"
      f"Learned Policy: {mean_learned_rewards:.2f}")


## Let's run the same pipeline on a noisy version of the expert policy and see if it is better

In [47]:


# Roll out the noisy expert for 50 episodes to obtain (state, action) training pairs
states, actions = collect_expert_data_state_action(env, noisy_expert_policy, num_episodes=50)

# Wrap the pairs in a dataset and hold out the samples beyond the first 80% share
dataset = ExpertDataset(states, actions)
num_samples = len(dataset)
train_size = int(0.8 * num_samples)
val_size = num_samples - train_size
train_dataset, val_dataset = random_split(dataset, lengths=[train_size, val_size])

# Only the training batches are reshuffled each epoch
train_loader = DataLoader(dataset=train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=64, shuffle=False)


In [None]:
# Fresh, untrained network sized from the current dataset
action_dim = 3  # For MountainCar-v0, actions are 0, 1, 2
state_dim = states.shape[1]
policy_net = PolicyNetwork(state_dim, action_dim)
policy_net = policy_net.to(device)

# Cross-entropy over the network outputs, optimised with Adam
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(policy_net.parameters(), lr=1e-3)

# Per-epoch mean batch losses, kept for the loss curves
num_epochs = 30
train_losses, val_losses = [], []

for epoch in range(num_epochs):
    # ---- training pass ----
    policy_net.train()
    train_loss = 0
    for states_batch, actions_batch in train_loader:
        states_batch = states_batch.to(device)
        actions_batch = actions_batch.to(device)
        # Labels must be valid class indices for the output layer
        assert actions_batch.max() < action_dim, "Target out of bounds for the output layer"
        assert actions_batch.min() >= 0, "Target contains negative values"

        optimizer.zero_grad()
        loss = criterion(policy_net(states_batch), actions_batch)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
    train_losses.append(train_loss / len(train_loader))

    # ---- validation pass (no gradient tracking, no parameter updates) ----
    policy_net.eval()
    with torch.no_grad():
        val_batch_losses = [
            criterion(policy_net(states_batch.to(device)), actions_batch.to(device)).item()
            for states_batch, actions_batch in val_loader
        ]
    val_loss = sum(val_batch_losses)
    val_losses.append(val_loss / len(val_loader))

    # Report on the first epoch and on every fifth epoch
    is_first_epoch = epoch == 0
    is_fifth_epoch = (epoch + 1) % 5 == 0
    if is_first_epoch or is_fifth_epoch:
        print(f"Epoch {epoch + 1}/{num_epochs}:")
        print(f"  Training Loss: {train_losses[-1]:.4f}")
        print(f"  Validation Loss: {val_losses[-1]:.4f}")



In [None]:

# Loss curves: one point per epoch for each split
epoch_axis = range(1, num_epochs + 1)
fig, ax = plt.subplots(figsize=(10, 6))
for curve, curve_label in ((train_losses, "Training Loss"), (val_losses, "Validation Loss")):
    ax.plot(epoch_axis, curve, label=curve_label)
ax.set_xlabel("Epochs")
ax.set_ylabel("Loss")
ax.set_title("Training and Validation Loss Curves")
ax.legend()
ax.grid(True)
plt.show()
 


In [None]:


# Define the policy learned from the noisy demonstrations
def learned_policy(state):
    """Return the index of the highest-scoring output of ``policy_net`` for ``state``.

    ``policy_net`` and ``device`` are looked up when the function is called.
    """
    # Inference only: avoid building an autograd graph on every environment step
    with torch.no_grad():
        state_tensor = torch.tensor(state, dtype=torch.float32).to(device)
        return torch.argmax(policy_net(state_tensor)).item()


# Evaluate the noisy expert policy
noisy_expert_rewards = evaluate_policy(env, noisy_expert_policy, num_episodes=20)
# Evaluate the policy cloned from the noisy data as well. `learned_rewards`
# still holds the results of the policy trained on the clean expert data, so
# without this rollout the newly trained policy would never be measured.
noisy_learned_rewards = evaluate_policy(env, learned_policy, num_episodes=20)

# Plot histograms of rewards for expert, learned, and noisy expert policies
plt.figure(figsize=(10, 6))
plt.hist(expert_rewards, bins=10, alpha=0.7, label="Expert Policy", edgecolor='black')
plt.hist(learned_rewards, bins=10, alpha=0.7, label="Learned Policy", edgecolor='black')
plt.hist(noisy_expert_rewards, bins=10, alpha=0.7, label="Noisy Expert Policy", edgecolor='black')
plt.hist(noisy_learned_rewards, bins=10, alpha=0.7, label="Learned Policy (Noisy Data)", edgecolor='black')
plt.xlabel("Total Rewards")
plt.ylabel("Frequency")
plt.title("Histogram of Rewards for Expert, Learned, and Noisy Expert Policies")
plt.legend()
plt.grid(True)
plt.show()

# Calculate and print the mean rewards
mean_noisy_expert_rewards = np.mean(noisy_expert_rewards)
mean_noisy_learned_rewards = np.mean(noisy_learned_rewards)

print(f"Mean Total Reward:\n"
      f"Expert Policy: {mean_expert_rewards:.2f}\n"
      f"Learned Policy: {mean_learned_rewards:.2f}\n"
      f"Noisy Expert Policy: {mean_noisy_expert_rewards:.2f}\n"
      f"Learned Policy (Noisy Data): {mean_noisy_learned_rewards:.2f}")


## Let's try DAgger

In the previous experiment, we collected 20 episodes of expert data. Now let's collect only 10 episodes of expert data, use the previously learned policy to collect another 10 episodes, and let the expert provide the action labels for the states visited in those episodes. We then use this aggregated dataset (**still 20 episodes**) to train a policy from scratch.

In [51]:
# Step 1: ten episodes of pure expert demonstrations
expert_states, expert_actions = collect_expert_data_state_action(env, expert_policy, num_episodes=10)

# Step 2: ten episodes driven by the learned policy; its own actions are discarded
# NOTE(review): `learned_policy` is whichever definition ran most recently in the
# notebook. Confirm it is the behavior-cloned policy this experiment intends.
learned_states, _ = collect_expert_data_state_action(env, learned_policy, num_episodes=10)
# Relabel every state the learner visited with the action the expert would take there
learned_actions = np.array([expert_policy(visited_state) for visited_state in learned_states])

# Step 3: aggregate the expert rollouts and the relabelled learner rollouts
dagger_states = np.concatenate([expert_states, learned_states], axis=0)
dagger_actions = np.concatenate([expert_actions, learned_actions])

# Wrap the aggregated data for training; batches are reshuffled every epoch
dagger_dataset = ExpertDataset(dagger_states, dagger_actions)
dagger_loader = DataLoader(dataset=dagger_dataset, batch_size=64, shuffle=True)


In [None]:
# Size the network from the aggregated DAgger data itself rather than from
# the unrelated global `states` left over from an earlier experiment
state_dim = dagger_states.shape[1]
action_dim = 3  # For MountainCar-v0, actions are 0, 1, 2
# A brand-new network: the policy is trained from scratch, not fine-tuned
policy_net = PolicyNetwork(state_dim, action_dim).to(device)

# Define loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(policy_net.parameters(), lr=1e-3)

# Step 1: Train the policy network from scratch on the aggregated dataset (DAgger)
dagger_epochs = 30
dagger_losses = []  # Mean batch loss of each epoch

for epoch in range(dagger_epochs):
    policy_net.train()
    dagger_loss = 0
    for states_batch, actions_batch in dagger_loader:
        states_batch, actions_batch = states_batch.to(device), actions_batch.to(device)
        optimizer.zero_grad()
        outputs = policy_net(states_batch)
        loss = criterion(outputs, actions_batch)
        loss.backward()
        optimizer.step()
        dagger_loss += loss.item()
    dagger_losses.append(dagger_loss / len(dagger_loader))

# Step 2: Training-loss curve of the DAgger run, one point per epoch
epoch_axis = range(1, dagger_epochs + 1)
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(epoch_axis, dagger_losses, label="DAgger Loss")
ax.set_xlabel("Epochs")
ax.set_ylabel("Loss")
ax.set_title("Training Loss with DAgger")
ax.legend()
ax.grid(True)
plt.show()

# Step 3: Evaluate the DAgger-trained policy
def dagger_policy(state):
    """Return the index of the highest-scoring output of ``policy_net`` for ``state``.

    ``policy_net`` and ``device`` are looked up when the function is called.
    """
    # Inference only: avoid building an autograd graph on every environment step
    with torch.no_grad():
        state_tensor = torch.tensor(state, dtype=torch.float32).to(device)
        return torch.argmax(policy_net(state_tensor)).item()


dagger_policy_rewards = evaluate_policy(env, dagger_policy, num_episodes=20)

# Step 4: Plot histograms of rewards for Expert, Learned, and DAgger policies
# (`expert_rewards` and `learned_rewards` are reused from the earlier evaluation)
plt.figure(figsize=(10, 6))
plt.hist(expert_rewards, bins=10, alpha=0.7, label="Expert Policy", edgecolor='black')
plt.hist(learned_rewards, bins=10, alpha=0.7, label="Learned Policy", edgecolor='black')
plt.hist(dagger_policy_rewards, bins=10, alpha=0.7, label="DAgger Policy", edgecolor='black')
plt.xlabel("Total Rewards")
plt.ylabel("Frequency")
plt.title("Histogram of Rewards for Expert, Learned, and DAgger Policies")
plt.legend()
plt.grid(True)
plt.show()

# Step 5: Print the performance comparison
mean_expert_rewards = np.mean(expert_rewards)
mean_learned_rewards = np.mean(learned_rewards)
mean_dagger_rewards = np.mean(dagger_policy_rewards)

print(f"Performance Comparison:\n"
      f"-----------------------\n"
      f"Expert Policy Performance: {mean_expert_rewards:.2f}\n"
      f"Initial Learned Policy Performance: {mean_learned_rewards:.2f}\n"
      f"DAgger Policy Performance: {mean_dagger_rewards:.2f}\n")
