In [12]:
import gym
import torch
import torch.nn as nn
import torch.optim as optim
import random
import numpy as np
from collections import deque

In [18]:

# Hyperparameters

# -- optimisation --
learning_rate = 0.001       # step size handed to the Adam optimizer
gamma = 0.99                # discount applied to the bootstrapped next-state value
batch_size = 64             # transitions drawn from the replay buffer per update

# -- exploration: epsilon decays exponentially from start towards end --
epsilon_start = 1.0
epsilon_end = 0.01
epsilon_decay = 500         # decay time-constant, measured in episodes

# -- replay buffer --
buffer_limit = 10_000       # maximum number of stored transitions
min_buffer = 1_000          # updates begin once this many transitions are stored

# -- schedule --
target_update_freq = 10     # target network is re-synced every this many episodes
episodes = 500              # number of training episodes

# Q-network
class QNet(nn.Module):
    """Two-layer MLP mapping a state vector to one Q-value per action.

    Args:
        state_dim: Size of the input state vector (default 4).
        n_actions: Number of outputs, one Q-value per discrete action (default 2).
        hidden_dim: Width of the single hidden layer (default 128).

    The defaults reproduce the original fixed 4 -> 128 -> 2 architecture, and
    the layers still live under ``self.fc``, so state-dict keys are unchanged.
    """

    def __init__(self, state_dim=4, n_actions=2, hidden_dim=128):
        super().__init__()
        self.fc = nn.Sequential(
            nn.Linear(state_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, n_actions),
        )

    def forward(self, x):
        """Return Q-values for ``x``; the last dimension of ``x`` must be ``state_dim``."""
        return self.fc(x)

# Replay buffer
class ReplayBuffer:
    """Fixed-capacity FIFO store of ``(s, a, r, s_prime, done)`` transitions.

    Args:
        capacity: Maximum number of transitions kept; the oldest are dropped
            first. Defaults to the module-level ``buffer_limit``.
    """

    def __init__(self, capacity=None):
        self.buffer = deque(maxlen=buffer_limit if capacity is None else capacity)

    def put(self, transition):
        """Append one ``(s, a, r, s_prime, done)`` tuple."""
        self.buffer.append(transition)

    def sample(self, n=None):
        """Draw ``n`` transitions uniformly without replacement.

        Args:
            n: Number of transitions to draw. Defaults to the module-level
                ``batch_size``.

        Returns:
            Tuple ``(s, a, r, s_prime, done)`` of tensors; ``a`` is int64 and
            the rest are float32.

        Raises:
            ValueError: If the buffer holds fewer than ``n`` transitions
                (raised by ``random.sample``).
        """
        if n is None:
            n = batch_size
        batch = random.sample(self.buffer, n)
        s, a, r, s_prime, done = zip(*batch)
        # Stack through NumPy first: building a tensor straight from a tuple of
        # ndarrays is very slow and triggers a PyTorch warning.
        # dtypes are pinned so they never depend on what the env happens to return.
        return (
            torch.tensor(np.asarray(s), dtype=torch.float),
            torch.tensor(np.asarray(a), dtype=torch.int64),
            torch.tensor(np.asarray(r), dtype=torch.float),
            torch.tensor(np.asarray(s_prime), dtype=torch.float),
            torch.tensor(np.asarray(done), dtype=torch.float),
        )

    def size(self):
        """Return the number of transitions currently stored."""
        return len(self.buffer)

# Epsilon-greedy policy
def epsilon_greedy(q_net, state, epsilon):
    """Pick an action: random with probability ``epsilon``, otherwise greedy.

    Args:
        q_net: Callable mapping a ``(1, state_dim)`` float tensor to Q-values.
        state: A single observation (array-like).
        epsilon: Exploration probability.

    Returns:
        The chosen action index as a Python int.
    """
    explore = random.random() < epsilon
    if explore:
        # Uniform choice between the two actions (0 or 1).
        return random.randint(0, 1)

    # Exploit: add a batch dimension and take the highest-valued action.
    obs = torch.tensor(state, dtype=torch.float).unsqueeze(0)
    with torch.no_grad():
        best = q_net(obs).argmax()
    return best.item()

# Main training loop

# Seed the RNGs this run draws from so a fresh-kernel re-run starts from the
# same state instead of producing a different learning curve each time.
seed = 0
random.seed(seed)         # epsilon-greedy draws and replay-buffer sampling
np.random.seed(seed)
torch.manual_seed(seed)   # network weight initialisation

env = gym.make("CartPole-v1", render_mode="rgb_array")
env.reset(seed=seed)      # seed the environment's own RNG once, right after creation

# Online network plus a target network that starts as an exact copy of it.
q_net = QNet()
target_net = QNet()
target_net.load_state_dict(q_net.state_dict())
optimizer = optim.Adam(q_net.parameters(), lr=learning_rate)
memory = ReplayBuffer()

def train(q_net, target_net, memory, optimizer):
    """Run one gradient step on a minibatch drawn from ``memory``.

    The regression target is ``r + gamma * max_a' Q_target(s', a')``, with the
    bootstrap term zeroed for transitions whose done flag is 1. The loss is the
    mean squared error between that target and ``Q(s, a)`` from ``q_net``.
    Only ``q_net`` is updated, through ``optimizer``.
    """
    states, actions, rewards, next_states, dones = memory.sample()

    # Target values come from the target network and carry no gradient.
    with torch.no_grad():
        best_next_q = target_net(next_states).max(dim=1).values
        td_target = rewards + gamma * best_next_q * (1 - dones)

    # Q-value of the action actually taken in each sampled transition.
    action_index = actions.unsqueeze(1)
    predicted_q = q_net(states).gather(1, action_index).squeeze(1)

    criterion = nn.MSELoss()
    loss = criterion(predicted_q, td_target)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

# Run `episodes` training episodes; epsilon decays exponentially with the episode index.
for episode in range(episodes):
    state = env.reset()[0]
    done = False
    total_reward = 0
    epsilon = epsilon_end + (epsilon_start - epsilon_end) * np.exp(-1. * episode / epsilon_decay)

    while not done:
        action = epsilon_greedy(q_net, state, epsilon)
        # step() returns (obs, reward, terminated, truncated, info). Both flags
        # must be read: ignoring `truncated` lets an episode run past the time limit.
        next_state, reward, terminated, truncated, _ = env.step(action)

        # Store only `terminated` as the done flag: a time-limit cut-off is not a
        # real terminal state, so the target should still bootstrap from next_state.
        memory.put((state, action, reward, next_state, terminated))
        state = next_state
        total_reward += reward
        done = terminated or truncated

        # Start learning only once the buffer holds enough transitions.
        if memory.size() >= min_buffer:
            train(q_net, target_net, memory, optimizer)

    # Periodically copy the online weights into the target network.
    if episode % target_update_freq == 0:
        target_net.load_state_dict(q_net.state_dict())

    print(f"Episode {episode+1}, Reward: {total_reward:.2f}, Epsilon: {epsilon:.3f}")

env.close()

Episode 1, Reward: 49.00, Epsilon: 1.000
Episode 2, Reward: 27.00, Epsilon: 0.998
Episode 3, Reward: 15.00, Epsilon: 0.996
Episode 4, Reward: 16.00, Epsilon: 0.994
Episode 5, Reward: 22.00, Epsilon: 0.992
Episode 6, Reward: 22.00, Epsilon: 0.990
Episode 7, Reward: 20.00, Epsilon: 0.988
Episode 8, Reward: 10.00, Epsilon: 0.986
Episode 9, Reward: 12.00, Epsilon: 0.984
Episode 10, Reward: 53.00, Epsilon: 0.982
Episode 11, Reward: 30.00, Epsilon: 0.980
Episode 12, Reward: 15.00, Epsilon: 0.978
Episode 13, Reward: 13.00, Epsilon: 0.977
Episode 14, Reward: 31.00, Epsilon: 0.975
Episode 15, Reward: 19.00, Epsilon: 0.973
Episode 16, Reward: 19.00, Epsilon: 0.971
Episode 17, Reward: 37.00, Epsilon: 0.969
Episode 18, Reward: 50.00, Epsilon: 0.967
Episode 19, Reward: 12.00, Epsilon: 0.965
Episode 20, Reward: 22.00, Epsilon: 0.963
Episode 21, Reward: 21.00, Epsilon: 0.961
Episode 22, Reward: 15.00, Epsilon: 0.959
Episode 23, Reward: 14.00, Epsilon: 0.957
Episode 24, Reward: 13.00, Epsilon: 0.955
E

In [14]:
def evaluate_agent(q_net, env, episodes=20, render=False):
    """Run the greedy policy of ``q_net`` and report the mean episode return.

    Args:
        q_net: Callable mapping a ``(1, state_dim)`` float tensor to Q-values.
        env: Environment whose ``reset()`` returns ``(obs, info)`` and whose
            ``step()`` returns ``(obs, reward, terminated, truncated, info)``.
        episodes: Number of evaluation episodes.
        render: If true, call ``env.render()`` before every step.

    Returns:
        The mean total reward over the evaluated episodes (also printed).
    """
    total_rewards = []

    for episode in range(episodes):
        state = env.reset()[0]
        done = False
        total_reward = 0

        while not done:
            if render:
                env.render()

            with torch.no_grad():
                state_tensor = torch.tensor(state, dtype=torch.float).unsqueeze(0)
                action = q_net(state_tensor).argmax().item()

            next_state, reward, terminated, truncated, _ = env.step(action)
            state = next_state
            total_reward += reward
            # The episode is over on either flag; ignoring `truncated` would keep
            # stepping an environment that has already hit its time limit.
            done = terminated or truncated

        total_rewards.append(total_reward)

    avg_reward = np.mean(total_rewards)
    print(f"\nEvaluation over {episodes} episodes: Average Reward = {avg_reward:.2f}")
    return avg_reward

In [None]:
import gym
from gym.wrappers import RecordVideo
import os
from IPython.display import Video

def record_agent(q_net, episodes=1, video_dir='videos'):
    """Record the greedy policy of ``q_net`` on CartPole-v1 and return the video path.

    Args:
        q_net: Trained Q-network; it is switched to eval mode and left that way.
        episodes: Number of episodes to play; every episode is recorded.
        video_dir: Directory the recordings are written to.

    Returns:
        Path of the most recently written ``.mp4`` file in ``video_dir``.

    Raises:
        FileNotFoundError: If no ``.mp4`` file exists in ``video_dir`` afterwards.
    """
    env = gym.make("CartPole-v1", render_mode="rgb_array")
    # RecordVideo's parameter is `video_folder`; passing `video_dir=` raises TypeError.
    env = RecordVideo(env, video_folder=video_dir, episode_trigger=lambda x: True)
    q_net.eval()

    try:
        for episode in range(episodes):
            state = env.reset()[0]
            done = False
            while not done:
                with torch.no_grad():
                    state_tensor = torch.tensor(state, dtype=torch.float).unsqueeze(0)
                    action = q_net(state_tensor).argmax().item()
                state, _, terminated, truncated, _ = env.step(action)
                # Stop on either flag so the recording ends at the time limit too.
                done = terminated or truncated
    finally:
        # Close even when a step fails, so the environment is not left open.
        env.close()

    # Keep only video files (the directory can hold other files) and pick the
    # newest by modification time; a lexicographic sort would rank
    # "episode-10" before "episode-2".
    recordings = [
        os.path.join(video_dir, name)
        for name in os.listdir(video_dir)
        if name.endswith(".mp4")
    ]
    if not recordings:
        raise FileNotFoundError(f"no .mp4 recording was written to {video_dir!r}")
    return max(recordings, key=os.path.getmtime)

# Record one episode (the default) with the trained network and print the path record_agent returns.
print(record_agent(q_net))

In [23]:
# Evaluate the trained network; as the cell's last expression, the returned average is displayed.
# NOTE(review): `env` is the training environment, created with render_mode="rgb_array"
# and already closed at the end of the training cell — presumably render=True only
# produces frames that are discarded here; confirm whether it is needed.
evaluate_agent(q_net, env, render=True)


Evaluation over 20 episodes: Average Reward = 288.00


288.0

In [24]:
!pip install moviepy


Collecting moviepy
  Downloading moviepy-2.2.1-py3-none-any.whl.metadata (6.9 kB)
Collecting imageio<3.0,>=2.5 (from moviepy)
  Downloading imageio-2.37.0-py3-none-any.whl.metadata (5.2 kB)
Collecting imageio_ffmpeg>=0.2.0 (from moviepy)
  Downloading imageio_ffmpeg-0.6.0-py3-none-win_amd64.whl.metadata (1.5 kB)
Collecting proglog<=1.0.0 (from moviepy)
  Downloading proglog-0.1.12-py3-none-any.whl.metadata (794 bytes)
Collecting python-dotenv>=0.10 (from moviepy)
  Downloading python_dotenv-1.1.0-py3-none-any.whl.metadata (24 kB)
Downloading moviepy-2.2.1-py3-none-any.whl (129 kB)
Downloading imageio-2.37.0-py3-none-any.whl (315 kB)
Downloading imageio_ffmpeg-0.6.0-py3-none-win_amd64.whl (31.2 MB)
   ---------------------------------------- 0.0/31.2 MB ? eta -:--:--
   - -------------------------------------- 0.8/31.2 MB 4.2 MB/s eta 0:00:08
   -- ------------------------------------- 1.6/31.2 MB 4.4 MB/s eta 0:00:07
   --- ------------------------------------ 2.4/31.2 MB 3.8 MB/s eta 