In [4]:
!pip install stable-baselines3[extra] gym

Collecting stable-baselines3[extra]
  Downloading stable_baselines3-2.6.0-py3-none-any.whl.metadata (4.8 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch<3.0,>=2.3->stable-baselines3[extra])
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch<3.0,>=2.3->stable-baselines3[extra])
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch<3.0,>=2.3->stable-baselines3[extra])
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch<3.0,>=2.3->stable-baselines3[extra])
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch<3.0,>=2.3->stable-baselines3[extra])
  Downloading nvidia_cublas_cu12-12.4.5.8-py

In [5]:
import gymnasium as gym
from stable_baselines3 import A2C

# Create the training environment (no rendering while learning)
env = gym.make('CartPole-v1')

# Rendering must be requested when the environment is created; calling
# env.render() on an env made without a render_mode only emits a warning.
# Set to "human" to watch the agent in a window (needs a display); with
# "human" the environment draws itself on every step/reset.
RENDER_MODE = None
eval_env = gym.make('CartPole-v1', render_mode=RENDER_MODE)

try:
    # Initialize the A2C model
    model = A2C('MlpPolicy', env, verbose=1)

    # Train the model
    model.learn(total_timesteps=10000)

    # Save the model
    model.save("a2c_cartpole")

    # Load the model (if needed)
    # model = A2C.load("a2c_cartpole")

    # Evaluate the model for a fixed budget of 1000 environment steps,
    # recording the return of every episode that finishes within the budget.
    episode_returns = []
    episode_return = 0.0
    obs, _ = eval_env.reset()  # reset() returns (obs, info); keep only obs
    for _ in range(1000):
        # deterministic=True evaluates the policy's most likely action rather
        # than sampling, so the score reflects the learned policy itself.
        action, _states = model.predict(obs, deterministic=True)
        obs, reward, terminated, truncated, info = eval_env.step(int(action))
        episode_return += float(reward)
        if terminated or truncated:
            episode_returns.append(episode_return)
            episode_return = 0.0
            obs, _ = eval_env.reset()
finally:
    # Release both environments even if training or evaluation raised.
    eval_env.close()
    env.close()

if episode_returns:
    mean_return = sum(episode_returns) / len(episode_returns)
    print(f"Evaluated {len(episode_returns)} episodes, mean return: {mean_return:.1f}")


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 28.4     |
|    ep_rew_mean        | 28.4     |
| time/                 |          |
|    fps                | 650      |
|    iterations         | 100      |
|    time_elapsed       | 0        |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -0.689   |
|    explained_variance | 0.0778   |
|    learning_rate      | 0.0007   |
|    n_updates          | 99       |
|    policy_loss        | 1.7      |
|    value_loss         | 9.35     |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 36.5     |
|    ep_rew_mean        | 36.5     |
| time/                 |          |
|    fps                | 643      |
|    iterations         | 200      |
|    time_elapsed 

In [9]:
import numpy as np
import gymnasium as gym  # Use gymnasium instead of gym
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical

# Define the Actor model
class Actor(nn.Module):
    """Policy network: one 128-unit ReLU hidden layer followed by a softmax head.

    Args:
        input_dim: Size of the last dimension of the input tensor.
        output_dim: Number of entries in the returned probability vector.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        # Layers are created in the same order (fc, then actor) so that
        # seeded parameter initialisation is unchanged.
        self.fc = nn.Linear(in_features=input_dim, out_features=128)
        self.actor = nn.Linear(in_features=128, out_features=output_dim)

    def forward(self, x):
        """Return probabilities over the last dimension (each row sums to 1)."""
        hidden = self.fc(x).relu()
        logits = self.actor(hidden)
        return logits.softmax(dim=-1)

# Define the Critic model
class Critic(nn.Module):
    """Value network: one 128-unit ReLU hidden layer followed by a scalar head.

    Args:
        input_dim: Size of the last dimension of the input tensor.
    """

    def __init__(self, input_dim):
        super().__init__()
        # Layers are created in the same order (fc, then critic) so that
        # seeded parameter initialisation is unchanged.
        self.fc = nn.Linear(in_features=input_dim, out_features=128)
        self.critic = nn.Linear(in_features=128, out_features=1)

    def forward(self, x):
        """Return one unbounded value per input row, shape (..., 1)."""
        hidden = self.fc(x).relu()
        return self.critic(hidden)

# A2C algorithm
def a2c(env, num_episodes=1000, gamma=0.99, learning_rate=0.001, seed=None):
    """Train an actor and a critic with episodic (Monte-Carlo) advantage actor-critic.

    One full episode is collected per iteration, discounted returns are
    computed backwards over the episode, and a single Adam step is taken on
    the sum of the policy-gradient loss and the squared-error value loss.

    Args:
        env: Environment following the Gymnasium API: ``reset()`` returns
            ``(obs, info)`` and ``step(action)`` returns
            ``(obs, reward, terminated, truncated, info)``. It must expose
            ``observation_space.shape`` and ``action_space.n``.
        num_episodes: Number of episodes (and optimizer steps) to run.
        gamma: Discount factor applied to future rewards.
        learning_rate: Adam learning rate shared by actor and critic.
        seed: Optional integer. When given, seeds torch's global RNG and the
            environment's first reset so a run can be reproduced. ``None``
            (the default) leaves all RNG state untouched.

    Returns:
        Tuple ``(actor, critic)`` of the trained networks.
    """
    if seed is not None:
        # Seed before the networks are built so their initial weights are
        # reproducible too, not just the sampled actions.
        torch.manual_seed(seed)

    input_dim = env.observation_space.shape[0]
    output_dim = env.action_space.n

    actor = Actor(input_dim, output_dim)
    critic = Critic(input_dim)
    optimizer = optim.Adam(
        list(actor.parameters()) + list(critic.parameters()), lr=learning_rate
    )

    for episode in range(num_episodes):
        # Seeding only the first reset fixes the env's RNG stream for the
        # whole run; re-seeding every episode would replay the same start.
        if episode == 0 and seed is not None:
            state, _ = env.reset(seed=seed)
        else:
            state, _ = env.reset()

        rewards = []
        log_probs = []
        values = []
        terminated = False
        truncated = False

        while not (terminated or truncated):
            state_tensor = torch.as_tensor(state, dtype=torch.float32).unsqueeze(0)
            action_probs = actor(state_tensor)
            value = critic(state_tensor)

            dist = Categorical(action_probs)
            action = dist.sample()

            log_prob = dist.log_prob(action)
            next_state, reward, terminated, truncated, _ = env.step(action.item())

            rewards.append(reward)
            log_probs.append(log_prob)
            values.append(value)

            state = next_state

        # A truncated episode (e.g. a time limit) did not really end, so the
        # return beyond the cut-off is estimated with the critic instead of
        # being treated as zero. A genuinely terminated episode has no future
        # reward. An empty episode list cannot occur: the loop runs >= 1 step.
        if truncated and not terminated:
            with torch.no_grad():
                final_tensor = torch.as_tensor(state, dtype=torch.float32).unsqueeze(0)
                G = critic(final_tensor).item()
        else:
            G = 0.0

        # Discounted returns, accumulated back-to-front then reversed once
        # (appending avoids the quadratic cost of inserting at index 0).
        returns = []
        for r in reversed(rewards):
            G = r + gamma * G
            returns.append(G)
        returns.reverse()

        returns = torch.tensor(returns, dtype=torch.float32)
        log_probs = torch.cat(log_probs)
        # squeeze(-1) keeps shape (T,) even when the episode has one step;
        # a bare squeeze() would collapse that case to a 0-d tensor.
        values = torch.cat(values).squeeze(-1)

        # Advantages are detached so the policy loss does not back-propagate
        # into the critic.
        advantages = returns - values.detach()

        # Update the model
        actor_loss = -(log_probs * advantages).mean()
        critic_loss = (returns - values).pow(2).mean()
        loss = actor_loss + critic_loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if episode % 100 == 0:
            print(f'Episode {episode}, Total Reward: {sum(rewards)}')

    return actor, critic

# Main function to run the A2C algorithm
if __name__ == "__main__":
    # Train the hand-written A2C on CartPole with the default hyperparameters.
    env = gym.make('CartPole-v1')
    try:
        actor, critic = a2c(env)
    finally:
        # Close the environment even if training raises or is interrupted.
        env.close()


Episode 0, Total Reward: 20.0
Episode 100, Total Reward: 31.0
Episode 200, Total Reward: 27.0
Episode 300, Total Reward: 119.0
Episode 400, Total Reward: 500.0
Episode 500, Total Reward: 500.0
Episode 600, Total Reward: 500.0
Episode 700, Total Reward: 500.0
Episode 800, Total Reward: 500.0
Episode 900, Total Reward: 500.0
