In [None]:
!pip install torch torch.nn torch.optim
!pip install gym

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
[31mERROR: Could not find a version that satisfies the requirement torch.nn (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for torch.nn[0m[31m
[0mLooking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import gym
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical

# Definition of the policy model
class Policy(nn.Module):
    """Two-layer perceptron that maps a state vector to one raw score per action.

    The network is ``Linear(state_dim, 64) -> ReLU -> Linear(64, action_dim)``.
    The output is unnormalised; callers apply softmax (or similar) themselves.
    """

    def __init__(self, state_dim, action_dim):
        """Create the two linear layers.

        Args:
            state_dim: Size of the input feature dimension.
            action_dim: Number of output scores.
        """
        super().__init__()
        hidden_units = 64
        self.fc1 = nn.Linear(state_dim, hidden_units)
        self.fc2 = nn.Linear(hidden_units, action_dim)

    def forward(self, x):
        """Return the per-action scores for the input ``x``."""
        hidden = self.fc1(x).relu()
        return self.fc2(hidden)

# A function to execute the PPO algorithm
def ppo(env, num_episodes, max_steps, learning_rate, clip_epsilon):
    """Train a freshly created ``Policy`` on ``env``, one update per episode.

    For every episode the function rolls out at most ``max_steps`` steps,
    records rewards, action log-probabilities and value estimates, then calls
    ``calculate_returns``, ``calculate_advantages`` and ``update_policy`` and
    prints the two losses.

    Args:
        env: Gym-style environment exposing ``observation_space.shape``,
            ``action_space.n``, ``reset()`` and ``step(action)``. Both the
            4-tuple ``step`` result and the 5-tuple
            ``(obs, reward, terminated, truncated, info)`` result are accepted,
            as are ``reset()`` results of the form ``obs`` or ``(obs, info)``.
        num_episodes: Number of episodes to run.
        max_steps: Maximum number of environment steps per episode.
        learning_rate: Learning rate passed to the Adam optimizer.
        clip_epsilon: Clipping range forwarded to ``update_policy``.

    Returns:
        None. Progress is reported with ``print``; the trained network is not
        returned.
    """
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.n
    policy = Policy(state_dim, action_dim)
    optimizer = optim.Adam(policy.parameters(), lr=learning_rate)

    for episode in range(num_episodes):
        reset_result = env.reset()
        # Newer gym releases return (observation, info); older ones return
        # just the observation.
        state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
        rewards = []
        log_probs = []
        values = []

        for _ in range(max_steps):
            state_tensor = torch.as_tensor(state, dtype=torch.float32).unsqueeze(0)
            # One forward pass serves both the action distribution and the
            # value estimate (the original ran the network twice per step).
            logits = policy(state_tensor)
            # Building the distribution from logits avoids an explicit softmax
            # followed by a log inside log_prob.
            dist = Categorical(logits=logits)
            action = dist.sample()
            log_prob = dist.log_prob(action)
            # NOTE(review): Policy has no separate value head, so the largest
            # action score is used as the state-value estimate.
            value = logits.max(1)[0]

            step_result = env.step(action.item())
            if len(step_result) == 5:
                next_state, reward, terminated, truncated, _ = step_result
                done = terminated or truncated
            else:
                next_state, reward, done, _ = step_result

            rewards.append(reward)
            log_probs.append(log_prob)
            values.append(value)

            state = next_state

            if done:
                break

        returns = calculate_returns(rewards)
        advantages = calculate_advantages(rewards, values)

        actor_loss, critic_loss = update_policy(policy, optimizer, returns, log_probs, values, advantages, clip_epsilon)

        print(f"Episode {episode + 1}: Actor Loss: {actor_loss}, Critic Loss: {critic_loss}")

# A function to calculate returns
def calculate_returns(rewards, discount_factor=0.99):
    """Compute the discounted return-to-go for every step of an episode.

    ``returns[t] = rewards[t] + discount_factor * returns[t + 1]``, with the
    return after the final step taken to be zero.

    Args:
        rewards: Sequence of per-step rewards in chronological order.
        discount_factor: Multiplicative discount applied per step.

    Returns:
        A list with one entry per reward, in chronological order. Entries are
        torch tensors because the running total starts as ``torch.tensor(0.0)``.
        An empty ``rewards`` yields an empty list.
    """
    returns = []
    running_return = torch.tensor(0.0)

    # Walk backwards, append, and reverse once at the end: O(n) overall,
    # whereas inserting at the front of the list on every step is O(n^2).
    for r in reversed(rewards):
        running_return = r + discount_factor * running_return
        returns.append(running_return)

    returns.reverse()
    return returns

# A function to calculate advantages
def calculate_advantages(rewards, values, discount_factor=0.99, gae_lambda=0.95, last_value=None):
    """Compute generalized-advantage estimates for one episode.

    For each step the TD error is
    ``rewards[i] + discount_factor * values[i + 1] - values[i]`` and the
    advantage is the sum of later TD errors, each discounted by
    ``discount_factor * gae_lambda`` per step.

    Args:
        rewards: Per-step rewards in chronological order.
        values: Per-step value estimates, aligned with ``rewards``.
        discount_factor: Reward discount per step.
        gae_lambda: Extra decay applied to later TD errors.
        last_value: Value estimate for the state reached after the final step
            (use ``0.0`` when the episode terminated). With the default
            ``None`` the final transition is skipped because there is no
            ``values[i + 1]`` for it, so only ``len(rewards) - 1`` advantages
            are produced (the original behaviour).

    Returns:
        A list of advantages in chronological order. Its length is
        ``len(rewards) - 1`` when ``last_value`` is ``None`` and
        ``len(rewards)`` otherwise; empty input yields an empty list.
    """
    num_rewards = len(rewards)
    horizon = num_rewards - 1 if last_value is None else num_rewards

    td_errors = []
    for i in range(horizon):
        # Only the very last transition needs the caller-supplied bootstrap.
        next_value = values[i + 1] if i + 1 < num_rewards else last_value
        td_errors.append(rewards[i] + discount_factor * next_value - values[i])

    # Accumulate backwards with append + one reverse (O(n)) instead of
    # inserting at the front of the list on every step (O(n^2)).
    advantages = []
    gae = 0
    for td_error in reversed(td_errors):
        gae = td_error + discount_factor * gae_lambda * gae
        advantages.append(gae)

    advantages.reverse()
    return advantages

# A function to update the policy model
def update_policy(policy, optimizer, returns, log_probs, values, advantages, clip_epsilon):
    """Take one optimizer step on the clipped-surrogate actor loss plus an MSE critic loss.

    Losses are summed over the steps of the episode, then a single
    ``zero_grad`` / ``backward`` / ``step`` cycle is run.

    Args:
        policy: The network being trained. It is not called here; gradients
            reach its parameters through ``log_probs`` and ``values``.
        optimizer: Optimizer holding the parameters to update.
        returns: Per-step regression targets for ``values``.
        log_probs: Per-step log-probability tensors of the sampled actions,
            still attached to the autograd graph.
        values: Per-step value-estimate tensors, still attached to the graph.
        advantages: Per-step advantages; may be shorter than the other lists.
        clip_epsilon: The probability ratio is clamped to
            ``[1 - clip_epsilon, 1 + clip_epsilon]``.

    Returns:
        ``(actor_loss, critic_loss)`` as Python floats. ``(0.0, 0.0)`` is
        returned, and no optimizer step is taken, when there is no step for
        which all four quantities are available.
    """
    # Use only the steps for which every list has an entry. With advantages
    # one element shorter than the rest this equals len(returns) - 1.
    num_steps = min(len(returns), len(log_probs), len(values), len(advantages))
    if num_steps == 0:
        # Nothing to differentiate; previously this crashed on int.backward().
        return 0.0, 0.0

    mse = nn.MSELoss()
    actor_loss = 0
    critic_loss = 0

    for i in range(num_steps):
        log_prob = log_probs[i]
        value = values[i]
        # Advantages and returns are fixed targets: detach them so the actor
        # loss does not backpropagate into the value estimates.
        advantage = torch.as_tensor(advantages[i]).detach()
        target = torch.as_tensor(returns[i], dtype=value.dtype).detach()

        # The "old" log-prob is the same number but cut from the graph.
        # Subtracting the attached tensor from itself (as before) gives a
        # ratio whose gradient is identically zero, so the policy never learns.
        ratio = torch.exp(log_prob - log_prob.detach())
        surrogate1 = ratio * advantage
        surrogate2 = torch.clamp(ratio, 1 - clip_epsilon, 1 + clip_epsilon) * advantage
        actor_loss = actor_loss - torch.min(surrogate1, surrogate2).mean()

        # Align shapes explicitly so MSELoss does not warn about broadcasting.
        value, target = torch.broadcast_tensors(value, target)
        critic_loss = critic_loss + mse(value, target)

    optimizer.zero_grad()
    loss = actor_loss + critic_loss
    loss.backward()
    optimizer.step()

    return actor_loss.item(), critic_loss.item()


In [None]:
# Build the environment
env = gym.make('CartPole-v1')
# Set parameters
num_episodes = 1000
max_steps = 200
learning_rate = 0.001
clip_epsilon = 0.2
# Run the PPO training loop; close the environment even if training fails
try:
    ppo(env, num_episodes, max_steps, learning_rate, clip_epsilon)
finally:
    env.close()

Episode 1: Actor Loss: -326.3397216796875, Critic Loss: 10728.4794921875
Episode 2: Actor Loss: -358.7444763183594, Critic Loss: 12456.3232421875
Episode 3: Actor Loss: -164.85311889648438, Critic Loss: 3219.8681640625
Episode 4: Actor Loss: -68.06311798095703, Critic Loss: 722.6948852539062
Episode 5: Actor Loss: -571.7117919921875, Critic Loss: 30044.025390625
Episode 6: Actor Loss: -164.46051025390625, Critic Loss: 3169.760986328125
Episode 7: Actor Loss: -168.4051513671875, Critic Loss: 3212.189208984375
Episode 8: Actor Loss: -125.94105529785156, Critic Loss: 2082.86962890625
Episode 9: Actor Loss: -92.25568389892578, Critic Loss: 1291.636474609375
Episode 10: Actor Loss: -226.39810180664062, Critic Loss: 5629.7802734375
Episode 11: Actor Loss: -78.27005767822266, Critic Loss: 888.5401000976562
Episode 12: Actor Loss: -40.3082275390625, Critic Loss: 316.61163330078125
Episode 13: Actor Loss: -173.802734375, Critic Loss: 3627.556884765625
Episode 14: Actor Loss: -796.4843139648438,