In [15]:
!pip install -q gym[classic_control] torch numpy matplotlib moviepy


[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [17]:
!pip install numpy==1.23.5 --upgrade --force-reinstall

Collecting numpy==1.23.5
  Downloading numpy-1.23.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.3 kB)
Downloading numpy-1.23.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.1/17.1 MB[0m [31m93.9 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 2.2.6
    Uninstalling numpy-2.2.6:
      Successfully uninstalled numpy-2.2.6
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
moviepy 2.2.1 requires numpy>=1.25.0, but you have numpy 1.23.5 which is incompatible.[0m[31m
[0mSuccessfully installed numpy-1.23.5
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[3

In [18]:
!pip install imageio imageio-ffmpeg


[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


# Load the environment

In [4]:
import gym
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

# ✅ Set device to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# 🎮 Initialize environment
env = gym.make("CartPole-v1")

state_dim = env.observation_space.shape[0]  # 4
action_dim = env.action_space.n             # 2
print(state_dim)
print(action_dim)

Using device: cuda
4
2


# Define a network

In [None]:
import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pyplot as plt

# Device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
env = gym.make("CartPole-v1")
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n

# Layers
W_shared = nn.Linear(state_dim, 128).to(device)
W_policy = nn.Linear(128, action_dim).to(device)
W_value = nn.Linear(128, 1).to(device)

# Optimizer
optimizer = optim.Adam(list(W_shared.parameters()) +
                       list(W_policy.parameters()) +
                       list(W_value.parameters()), lr=1e-3)

# Hyperparameters
gamma = 0.99
episodes = 1000
episode_rewards = []

for episode in range(episodes):
    state, _ = env.reset()
    done = False

    log_probs = []
    values = []
    rewards = []

    while not done:
        state_tensor = torch.tensor(state, dtype=torch.float32).unsqueeze(0).to(device)

        # Forward pass
        x = F.relu(W_shared(state_tensor))
        logits = W_policy(x)
        probs = F.softmax(logits, dim=-1)
        value = W_value(x)

        dist = torch.distributions.Categorical(probs)
        action = dist.sample()
        log_prob = dist.log_prob(action)

        next_state, reward, terminated, truncated, _ = env.step(action.item())
        done = terminated or truncated

        # Store
        log_probs.append(log_prob)
        values.append(value)
        rewards.append(reward)

        state = next_state

    # Compute returns
    returns = []
    G = 0
    for r in reversed(rewards):
        G = r + gamma * G
        returns.insert(0, G)

    returns = torch.tensor(returns, dtype=torch.float32).to(device)
    values = torch.cat(values)
    log_probs = torch.stack(log_probs)
    advantage = returns - values.squeeze()

    # Losses
    actor_loss = -(log_probs * advantage.detach()).mean()
    critic_loss = advantage.pow(2).mean()
    loss = actor_loss + 0.5 * critic_loss

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    total_reward = sum(rewards)
    episode_rewards.append(total_reward)

    if (episode + 1) % 10 == 0:
        print(f"Episode {episode+1}, Reward: {total_reward:.2f}")

# Plot results
plt.plot(episode_rewards)
plt.xlabel("Episode")
plt.ylabel("Total Reward")
plt.title("Actor-Critic on CartPole-v1")
plt.grid(True)
plt.show()


Episode 10, Reward: 24.00
Episode 20, Reward: 32.00
Episode 30, Reward: 18.00
Episode 40, Reward: 9.00
Episode 50, Reward: 25.00
Episode 60, Reward: 23.00
Episode 70, Reward: 24.00
Episode 80, Reward: 15.00
Episode 90, Reward: 22.00
Episode 100, Reward: 21.00
Episode 110, Reward: 14.00
Episode 120, Reward: 78.00
Episode 130, Reward: 16.00
Episode 140, Reward: 56.00
Episode 150, Reward: 32.00
Episode 160, Reward: 22.00
Episode 170, Reward: 23.00
Episode 180, Reward: 50.00
Episode 190, Reward: 25.00
Episode 200, Reward: 42.00
Episode 210, Reward: 34.00
Episode 220, Reward: 40.00
Episode 230, Reward: 16.00
Episode 240, Reward: 49.00
Episode 250, Reward: 34.00
Episode 260, Reward: 58.00
Episode 270, Reward: 77.00
Episode 280, Reward: 20.00
Episode 290, Reward: 58.00
Episode 300, Reward: 26.00
Episode 310, Reward: 28.00
Episode 320, Reward: 49.00
Episode 330, Reward: 72.00
Episode 340, Reward: 30.00
Episode 350, Reward: 38.00
Episode 360, Reward: 35.00
Episode 370, Reward: 62.00
Episode 380

Exception ignored in: <function VideoRecorder.__del__ at 0x77a8bab24b80>
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/gym/wrappers/monitoring/video_recorder.py", line 178, in __del__
    self.close()
  File "/usr/local/lib/python3.10/dist-packages/gym/wrappers/monitoring/video_recorder.py", line 141, in close
    if not self.enabled or self._closed:
AttributeError: 'VideoRecorder' object has no attribute 'enabled'


Episode 730, Reward: 56.00
Episode 740, Reward: 123.00
Episode 750, Reward: 59.00
