In [5]:
import gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical
import matplotlib.pyplot as plt
from tqdm import tqdm

# Preprocessing function
def preprocess(image):
    """Convert a 210x160x3 uint8 frame into an 80x80 binary float array.

    The frame is cropped to rows 35..194, downsampled by a factor of two in
    both spatial dimensions, and reduced to its first colour channel.  Pixels
    whose value is 0, 144 or 109 (the two background shades) map to 0.0;
    every other pixel (paddles, ball) maps to 1.0.

    Args:
        image: Array indexable as ``image[row, col, channel]`` whose cropped,
            downsampled first channel contains exactly 6400 values.

    Returns:
        A new ``float64`` array of shape ``(80, 80)``.  ``image`` itself is
        left untouched.
    """
    # Crop and downsample in a single slice; this is only a view of `image`.
    frame = image[35:195:2, ::2, 0]
    # Build the mask out-of-place so the caller's frame is never modified
    # (in-place assignment on the view would write through to `image`).
    foreground = (frame != 0) & (frame != 144) & (frame != 109)
    # `np.float` was removed in NumPy 1.24; `np.float64` is what it aliased.
    return foreground.astype(np.float64).reshape(80, 80)

# Policy network
class PolicyNetwork(nn.Module):
    """Small fully connected network: Linear -> ReLU -> Linear.

    Maps an input of size ``input_dim`` to ``output_dim`` raw (unnormalised)
    scores through a single hidden layer of 128 units.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        hidden_units = 128
        layers = [
            nn.Linear(input_dim, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, output_dim),
        ]
        # Kept under the attribute name `fc` so parameter names are unchanged.
        self.fc = nn.Sequential(*layers)

    def forward(self, x):
        """Return the raw output scores for input ``x``."""
        scores = self.fc(x)
        return scores

# Training function
def _discounted_returns(rewards, discount_factor):
    """Return the discounted reward-to-go for every step of one episode."""
    returns = []
    running_return = 0.0
    # Accumulate from the last step backwards, then flip once; this avoids
    # the quadratic cost of repeatedly inserting at the front of a list.
    for reward in reversed(rewards):
        running_return = reward + discount_factor * running_return
        returns.append(running_return)
    returns.reverse()
    return returns


def train_policy_gradient(env_name, num_episodes, discount_factor, learning_rate):
    """Train a policy network with a Monte Carlo policy-gradient update.

    For every episode the policy is sampled until the environment reports
    termination or truncation, the per-step discounted returns are computed
    and normalised, and one Adam step is taken on
    ``-sum(log_prob * return)``.

    Args:
        env_name: Environment id passed to ``gym.make``.
        num_episodes: Number of training episodes to run.
        discount_factor: Discount applied to future rewards.
        learning_rate: Learning rate for the Adam optimizer.

    Returns:
        A tuple ``(policy, episode_rewards, moving_average_rewards)`` where
        ``episode_rewards`` holds the total reward of each episode and
        ``moving_average_rewards`` holds the mean of the latest 100 episode
        rewards, recorded from the 100th episode onwards.
    """
    env = gym.make(env_name)
    try:
        input_dim = env.observation_space.shape[0]
        output_dim = env.action_space.n

        policy = PolicyNetwork(input_dim, output_dim)
        optimizer = optim.Adam(policy.parameters(), lr=learning_rate)

        episode_rewards = []
        moving_average_rewards = []

        for episode in tqdm(range(num_episodes), desc="Training Progress"):
            state, _ = env.reset()
            log_probs = []
            rewards = []

            while True:
                state_tensor = torch.as_tensor(state, dtype=torch.float32)
                # Passing logits directly describes the same distribution as
                # softmax + probs, without the extra normalisation round trip.
                dist = Categorical(logits=policy(state_tensor))
                action = dist.sample()

                next_state, reward, terminated, truncated, _ = env.step(action.item())

                log_probs.append(dist.log_prob(action))
                rewards.append(reward)

                state = next_state

                # Truncation (e.g. a time limit) must end the episode too;
                # otherwise a policy that never fails keeps stepping forever.
                if terminated or truncated:
                    break

            episode_rewards.append(sum(rewards))

            # Calculate moving average
            if len(episode_rewards) >= 100:
                moving_average_rewards.append(np.mean(episode_rewards[-100:]))

            # Calculate discounted rewards
            discounted_rewards = torch.tensor(
                _discounted_returns(rewards, discount_factor), dtype=torch.float32
            )

            # Normalize discounted rewards.  The standard deviation of a
            # single value is NaN, so one-step episodes are left unnormalised
            # rather than propagating NaN into the parameters.
            if discounted_rewards.numel() > 1:
                discounted_rewards = (discounted_rewards - discounted_rewards.mean()) / (
                    discounted_rewards.std() + 1e-9
                )

            # Calculate loss and update policy
            loss = -(torch.stack(log_probs) * discounted_rewards).sum()

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if (episode + 1) % 100 == 0:
                tqdm.write(f"Episode {episode + 1}, Average Reward: {np.mean(episode_rewards[-100:]):.2f}")
    finally:
        # Release the environment even if training is interrupted.
        env.close()

    return policy, episode_rewards, moving_average_rewards

# Hyperparameters and launch of the policy-gradient training run
env_name = "CartPole-v1"
num_episodes = 1000
discount_factor = 0.95
learning_rate = 0.01

trained_policy, episode_rewards, moving_average_rewards = train_policy_gradient(
    env_name=env_name,
    num_episodes=num_episodes,
    discount_factor=discount_factor,
    learning_rate=learning_rate,
)

# Learning curve: raw episode rewards plus their 100-episode moving average
plt.figure(figsize=(10, 5))
episode_axis = range(len(episode_rewards))
plt.plot(episode_axis, episode_rewards, label='Episode Reward')
# The first moving-average value belongs to episode index 99.
plt.plot(
    episode_axis[99:],
    moving_average_rewards,
    label='Moving Average (100 episodes)',
)
plt.title(f'Policy Gradient Learning Curve - {env_name}')
plt.xlabel('Episode')
plt.ylabel('Reward')
plt.legend()
plt.show()

# Evaluation and Visualization
def evaluate_and_visualize_policy(env_name, policy, num_episodes=5):
    """Run the policy greedily in a rendered environment and report rewards.

    At every step the action with the highest policy score is taken.  Each
    episode ends when the environment reports termination or truncation.

    Args:
        env_name: Environment id passed to ``gym.make``.
        policy: Callable mapping a float state tensor to one score per action.
        num_episodes: Number of evaluation episodes to run.

    Returns:
        A list with the total reward collected in each episode.
    """
    # With render_mode="human" the environment draws itself on reset/step,
    # so no explicit env.render() call is made inside the loop.
    env = gym.make(env_name, render_mode="human")
    rewards = []

    try:
        for episode in range(num_episodes):
            state, _ = env.reset()
            episode_reward = 0
            done = False

            while not done:
                state_tensor = torch.as_tensor(state, dtype=torch.float32)
                # Evaluation needs no gradients.
                with torch.no_grad():
                    scores = policy(state_tensor)
                # Softmax is monotonic, so the argmax of the raw scores picks
                # the same action as the argmax of the probabilities.
                action = torch.argmax(scores).item()
                state, reward, terminated, truncated, _ = env.step(action)
                # Truncation (e.g. a time limit) must end the episode too;
                # otherwise a policy that never fails would loop forever.
                done = terminated or truncated
                episode_reward += reward

            rewards.append(episode_reward)
            print(f"Episode {episode + 1} Reward: {episode_reward}")
    finally:
        # Close the render window even if evaluation is interrupted.
        env.close()

    return rewards

print("\nVisualizing trained policy:")
evaluation_rewards = evaluate_and_visualize_policy(
    env_name,
    trained_policy,
)

# Summarise the evaluation episodes by their mean and spread
mean_reward, std_reward = np.mean(evaluation_rewards), np.std(evaluation_rewards)
print(f"\nMean Reward: {mean_reward:.2f}")
print(f"Standard Deviation of Reward: {std_reward:.2f}")

Training Progress:  10%|█         | 101/1000 [00:05<00:45, 19.61it/s]

Episode 100, Average Reward: 58.54


Training Progress:  20%|██        | 201/1000 [00:27<01:09, 11.50it/s]

Episode 200, Average Reward: 185.37


Training Progress:  22%|██▏       | 221/1000 [01:20<04:43,  2.75it/s]


KeyboardInterrupt: 