In [None]:
import gym
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Policy network: one hidden layer followed by a softmax over the outputs
class PolicyNetwork(nn.Module):
    """Two-layer perceptron whose output is a probability vector.

    Args:
        input_dim: Size of the input feature vector.
        output_dim: Number of output entries (softmax is taken over the last axis).
        hidden_dim: Width of the single hidden ReLU layer.
    """

    def __init__(self, input_dim, output_dim, hidden_dim):
        super().__init__()
        # Layers are listed in application order and wrapped under the name
        # `fc`, so parameter names are fc.0.* and fc.2.*.
        layers = [
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim),
            nn.Softmax(dim=-1),
        ]
        self.fc = nn.Sequential(*layers)

    def forward(self, x):
        """Return the softmax output of the network for input `x`."""
        action_probs = self.fc(x)
        return action_probs

In [None]:
# MC REINFORCE with and without baseline
class ReinforceAgent:
    """Monte Carlo REINFORCE agent, optionally with a learned state-value baseline.

    The agent uses the classic Gym calling convention: ``env.reset()`` returns
    the observation and ``env.step(action)`` returns a 4-tuple
    ``(next_state, reward, done, info)``.

    Args:
        env: Environment exposing ``observation_space.shape``, ``action_space.n``,
            ``reset()`` and ``step(action)``.
        baseline: If True, a separate value network V(s) is trained on the
            Monte Carlo returns and ``G_t - V(s_t)`` weights the policy gradient;
            otherwise the raw return ``G_t`` is used.
        gamma: Discount factor used when computing returns.
        lr: Adam learning rate (shared by the policy and the value network).
        hidden_dim: Hidden layer width of the policy (and value) network.
        num_runs: Stored on the instance; not used by the agent itself.
        num_episodes: Stored on the instance; not used by the agent itself.
    """

    def __init__(self, env, baseline=False, gamma=0.99, lr=0.01, hidden_dim=64, num_runs=5, num_episodes=1000):
        self.env = env
        self.baseline = baseline
        obs_dim = env.observation_space.shape[0]
        self.policy_network = PolicyNetwork(obs_dim, env.action_space.n, hidden_dim)
        self.optimizer = optim.Adam(self.policy_network.parameters(), lr=lr)
        # The value network only exists when the baseline is requested.
        self.value_network = None
        self.value_optimizer = None
        if baseline:
            self.value_network = nn.Sequential(
                nn.Linear(obs_dim, hidden_dim),
                nn.ReLU(),
                nn.Linear(hidden_dim, 1),
            )
            self.value_optimizer = optim.Adam(self.value_network.parameters(), lr=lr)
        self.gamma = gamma
        self.num_runs = num_runs
        self.num_episodes = num_episodes

    def select_action(self, state):
        """Sample an action index from the current policy for a single state."""
        state = torch.as_tensor(state, dtype=torch.float32).unsqueeze(0)
        # Only the sampled index is returned, so no autograd graph is needed.
        with torch.no_grad():
            action_probs = self.policy_network(state)
        return torch.multinomial(action_probs, 1).item()

    def _discounted_returns(self, rewards):
        """Return the list of discounted returns G_t for a reward sequence."""
        returns = []
        G = 0.0
        # Accumulate from the last step backwards, then flip once (linear time).
        for r in reversed(rewards):
            G = r + self.gamma * G
            returns.append(G)
        returns.reverse()
        return returns

    def update(self, episode):
        """Roll out one episode with the current policy and apply one update.

        Args:
            episode: Episode index supplied by the caller; unused here.

        Returns:
            The undiscounted sum of rewards collected in the rollout.
        """
        states, log_probs, rewards = [], [], []
        state = self.env.reset()

        # Collect trajectory
        while True:
            state_t = torch.as_tensor(state, dtype=torch.float32)
            # A single forward pass provides both the sample and its log-prob.
            probs = self.policy_network(state_t)
            action = torch.multinomial(probs, 1).item()
            next_state, reward, done, _ = self.env.step(action)
            states.append(state_t)
            log_probs.append(torch.log(probs[action]))
            rewards.append(reward)
            state = next_state
            if done:
                break

        # Calculate returns
        returns = torch.tensor(self._discounted_returns(rewards), dtype=torch.float32)
        log_probs = torch.stack(log_probs)

        if self.baseline:
            values = self.value_network(torch.stack(states)).squeeze(-1)
            # Detach so the policy loss does not back-propagate into V(s).
            advantages = returns - values.detach()
            # Fit V(s_t) to the observed Monte Carlo return G_t.
            value_loss = ((values - returns) ** 2).mean()
            self.value_optimizer.zero_grad()
            value_loss.backward()
            self.value_optimizer.step()
        else:
            advantages = returns

        # Update policy parameters with one backward pass over the summed loss.
        policy_loss = -(log_probs * advantages).sum()
        self.optimizer.zero_grad()
        policy_loss.backward()
        self.optimizer.step()

        return float(sum(rewards))

In [None]:
# Function to run experiments and aggregate per-episode returns across runs
def run_experiment(env_name, baseline=False, gamma=0.99, lr=0.01, hidden_dim=64, num_runs=5, num_episodes=1000):
    """Train REINFORCE for several independent runs and summarise the returns.

    For every episode the agent performs one training rollout/update and then
    one separate rollout whose undiscounted reward sum is recorded.

    Args:
        env_name: Gym environment id passed to ``gym.make``.
        baseline: Forwarded to ``ReinforceAgent``.
        gamma: Discount factor forwarded to ``ReinforceAgent``.
        lr: Learning rate forwarded to ``ReinforceAgent``.
        hidden_dim: Hidden layer width forwarded to ``ReinforceAgent``.
        num_runs: Number of independent training runs.
        num_episodes: Number of episodes per run.

    Returns:
        Tuple ``(mean_returns, std_returns)``: per-episode mean and standard
        deviation of the recorded returns, taken across runs.
    """
    env = gym.make(env_name)
    returns = []

    try:
        for run in range(num_runs):
            print(f"Experiment Run: {run+1}")
            # A fresh agent per run keeps the runs independent; reusing one agent
            # would let later runs start from an already-trained policy and
            # invalidate the across-run mean/std.
            agent = ReinforceAgent(env, baseline=baseline, gamma=gamma, lr=lr, hidden_dim=hidden_dim, num_runs=num_runs, num_episodes=num_episodes)
            episode_returns = []
            for episode in tqdm(range(num_episodes)):
                agent.update(episode)
                # Separate rollout used only to record the episode's return.
                total_reward = 0
                state = env.reset()
                while True:
                    action = agent.select_action(state)
                    next_state, reward, done, _ = env.step(action)
                    total_reward += reward
                    state = next_state
                    if done:
                        break
                episode_returns.append(total_reward)
            returns.append(episode_returns)
    finally:
        # Release environment resources even if training is interrupted.
        env.close()

    returns = np.array(returns)
    mean_returns = np.mean(returns, axis=0)
    std_returns = np.std(returns, axis=0)

    return mean_returns, std_returns

In [None]:
# Keyword arguments forwarded to run_experiment for every environment/method.
hyperparameters = dict(
    gamma=0.99,
    lr=0.001,
    hidden_dim=64,
    num_runs=5,
    num_episodes=500,
)
# Plot labels; position 1 is the variant trained with the baseline.
methods = [
    'Without Baseline',
    'With Baseline',
]

In [None]:
# Train both variants on Acrobot and draw mean +/- one std of the returns.
env_name = 'Acrobot-v1'
fig, ax = plt.subplots()
for i, method in enumerate(methods):
    print(f"Training on {env_name} {method}".upper())
    # The second entry of `methods` (index 1) is the baseline variant.
    use_baseline = (i == 1)
    mean_returns, std_returns = run_experiment(env_name, baseline=use_baseline, **hyperparameters)
    episodes = range(len(mean_returns))
    lower = mean_returns - std_returns
    upper = mean_returns + std_returns
    ax.plot(mean_returns, label=method)
    ax.fill_between(episodes, lower, upper, alpha=0.3)

ax.set(
    title=f'Monte Carlo REINFORCE - {env_name}',
    xlabel='Episodes',
    ylabel='Episodic Returns',
)
ax.legend()
plt.show()

In [None]:
# Train both variants on CartPole and draw mean +/- one std of the returns.
env_name = 'CartPole-v1'
fig, ax = plt.subplots()
for i, method in enumerate(methods):
    print(f"Training on {env_name} {method}".upper())
    # The second entry of `methods` (index 1) is the baseline variant.
    use_baseline = (i == 1)
    mean_returns, std_returns = run_experiment(env_name, baseline=use_baseline, **hyperparameters)
    episodes = range(len(mean_returns))
    lower = mean_returns - std_returns
    upper = mean_returns + std_returns
    ax.plot(mean_returns, label=method)
    ax.fill_between(episodes, lower, upper, alpha=0.3)

ax.set(
    title=f'Monte Carlo REINFORCE - {env_name}',
    xlabel='Episodes',
    ylabel='Episodic Returns',
)
ax.legend()
plt.show()