<a href="https://colab.research.google.com/github/baldeoJV/CCRNFLRL_EXERCISE_COM221-ML/blob/main/Copy_of_Exercise7.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 🚀 Exercise 7

In this exercise, you will train a **Deep Q-Network (DQN)** to play the
[CartPole-v1 environment](https://gymnasium.farama.org/environments/classic_control/cart_pole/) from Gymnasium.

`CartPole-v1` is a classic reinforcement learning task in which a pole is attached to a cart that moves along a horizontal track. The **goal** is to keep the pole upright for as long as possible.  

**State Space**:  
  1. Cart position  
  2. Cart velocity  
  3. Pole angle  
  4. Pole angular velocity  

**Action Space**:  
  1. Push cart to the **left**  
  2. Push cart to the **right**  

**Reward**: +1 for every timestep the pole remains upright  

**Episode ends** when:  
  - The pole angle exceeds ±12°  
  - The cart moves beyond the edges of the track  
  - Maximum of 500 timesteps is reached


## Step 1: Install dependencies



In [11]:
!pip install gymnasium torch numpy matplotlib imageio[ffmpeg] --quiet

## Step 2: Imports


In [12]:
import random
import numpy as np
import gymnasium as gym
import torch
import torch.nn as nn
import torch.optim as optim
from collections import deque
import matplotlib.pyplot as plt
import imageio
from IPython.display import Video


# Prefer CUDA when PyTorch can see a GPU; otherwise run everything on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device:", device)



Using device: cuda


## Step 3: Define the Deep Q-Network



In [13]:
class DQN(nn.Module):
    """Fully connected Q-network: state vector in, one Q-value per action out.

    The network has two hidden layers of 80 units with ReLU activations and
    a linear output layer of size ``action_size``.
    """

    def __init__(self, state_size, action_size):
        super().__init__()
        hidden_units = 80
        # Attribute names are kept as fc1/fc2/fc3 so state_dict keys stay stable.
        self.fc1 = nn.Linear(state_size, hidden_units)
        self.fc2 = nn.Linear(hidden_units, hidden_units)
        self.fc3 = nn.Linear(hidden_units, action_size)

    def forward(self, x):
        """Return the Q-values for ``x`` (last dimension is ``state_size``)."""
        activations = x
        for hidden_layer in (self.fc1, self.fc2):
            activations = torch.relu(hidden_layer(activations))
        # No activation on the output layer: Q-values are unbounded.
        return self.fc3(activations)

## Step 4: Replay Buffer

We store past experiences `(state, action, reward, next_state, done)`  
in a buffer and sample random minibatches for training.  
This **breaks correlation** between consecutive samples and stabilizes learning.


In [14]:
class ReplayBuffer:
    """Fixed-capacity FIFO store of transitions for experience replay."""

    def __init__(self, capacity=5000):
        # A deque with maxlen drops the oldest transition once it is full.
        self.buffer = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        """Append one ``(state, action, reward, next_state, done)`` transition."""
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        """Draw ``batch_size`` distinct transitions uniformly at random.

        Returns:
            A 5-tuple of CPU tensors ``(states, actions, rewards,
            next_states, dones)``. ``actions`` is int64; the others are
            float32 (``dones`` holds 0.0 / 1.0).

        Raises:
            ValueError: If ``batch_size`` exceeds the number of stored
                transitions (raised by ``random.sample``).
        """
        batch = random.sample(self.buffer, batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)
        # Stack each field into one ndarray before handing it to torch:
        # building a tensor straight from a tuple of ndarrays goes through a
        # slow per-element path and triggers a PyTorch UserWarning.
        return (
            torch.from_numpy(np.asarray(states, dtype=np.float32)),
            torch.from_numpy(np.asarray(actions, dtype=np.int64)),
            torch.from_numpy(np.asarray(rewards, dtype=np.float32)),
            torch.from_numpy(np.asarray(next_states, dtype=np.float32)),
            torch.from_numpy(np.asarray(dones, dtype=np.float32)),
        )

    def __len__(self):
        """Return the number of transitions currently stored."""
        return len(self.buffer)


## Step 5: Training Function




In [15]:
def train_dqn(episodes,
              gamma,
              lr,
              batch_size,
              epsilon_decay,
              min_epsilon,
              env_name="CartPole-v1",
              target_update_every=50,
              buffer_capacity=10000):
    """Train a DQN agent with ε-greedy exploration, replay and a target network.

    Args:
        episodes: Number of training episodes to run.
        gamma: Discount factor applied to the bootstrapped next-state value.
        lr: Learning rate for the Adam optimizer.
        batch_size: Minibatch size; learning starts once the replay buffer
            holds at least this many transitions.
        epsilon_decay: Factor epsilon is multiplied by after every episode.
        min_epsilon: Lower bound for epsilon.
        env_name: Gymnasium environment id passed to ``gym.make``.
        target_update_every: Every this many episodes the online weights are
            copied into the target network and a progress line is printed.
        buffer_capacity: Maximum number of transitions kept for replay.

    Returns:
        A tuple ``(q_net, rewards_history)``: the trained online network and
        the list of total rewards, one entry per episode.
    """
    env = gym.make(env_name)
    try:
        state_size = env.observation_space.shape[0]
        action_size = env.action_space.n

        q_net = DQN(state_size, action_size).to(device)
        target_net = DQN(state_size, action_size).to(device)
        target_net.load_state_dict(q_net.state_dict())
        optimizer = optim.Adam(q_net.parameters(), lr=lr)
        loss_fn = nn.MSELoss()

        memory = ReplayBuffer(buffer_capacity)
        epsilon = 1.0
        rewards_history = []

        for ep in range(episodes):
            state, _ = env.reset()
            done = False
            total_reward = 0

            while not done:
                # ε-greedy policy
                if random.random() < epsilon:
                    action = env.action_space.sample()
                else:
                    state_tensor = torch.FloatTensor(state).to(device)
                    with torch.no_grad():
                        action = torch.argmax(q_net(state_tensor)).item()

                next_state, reward, terminated, truncated, _ = env.step(action)
                done = terminated or truncated

                # Store `terminated`, not `done`: a time-limit truncation ends
                # the episode but the next state is not terminal, so its value
                # must still be bootstrapped in the TD target.
                memory.push(state, action, reward, next_state, terminated)
                state = next_state
                total_reward += reward

                # One gradient step per environment step once enough data exists.
                if len(memory) >= batch_size:
                    states, actions, rewards, next_states, dones = (
                        t.to(device) for t in memory.sample(batch_size)
                    )

                    # squeeze(1) rather than squeeze(): keeps shape (batch,)
                    # even when batch_size == 1.
                    q_values = q_net(states).gather(1, actions.unsqueeze(1)).squeeze(1)
                    with torch.no_grad():
                        next_q = target_net(next_states).max(1)[0]
                    targets = rewards + gamma * next_q * (1 - dones)

                    loss = loss_fn(q_values, targets)
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()

            epsilon = max(min_epsilon, epsilon * epsilon_decay)
            rewards_history.append(total_reward)

            # Update target network periodically
            if ep % target_update_every == 0:
                target_net.load_state_dict(q_net.state_dict())
                print(f"Episode {ep}, Reward: {total_reward}, Epsilon: {epsilon:.2f}")
    finally:
        # Release the environment even if training is interrupted or fails.
        env.close()

    return q_net, rewards_history

## Step 6: Train the Agent

In [16]:
# Training hyperparameters.
#
# An earlier configuration (labelled "168" in the original notes) used
# gamma = 0.85 and lr = 0.001 with all other values the same as below.

# Length of the training run, in episodes.
episodes = 1_000
# Discount factor for future rewards.
gamma = 0.9
# Step size for the optimizer.
lr = 5e-4
# Number of experiences sampled from the replay buffer per update.
batch_size = 64
# Per-episode multiplicative decay of the exploration rate.
epsilon_decay = 0.995
# Floor below which the exploration rate never drops.
min_epsilon = 0.01


In [17]:
# Train the agent with the hyperparameters defined in the previous cell.
q_net, rewards = train_dqn(
    episodes=episodes,
    gamma=gamma,
    lr=lr,
    batch_size=batch_size,
    epsilon_decay=epsilon_decay,
    min_epsilon=min_epsilon,
)

Episode 0, Reward: 11.0, Epsilon: 0.99
Episode 50, Reward: 12.0, Epsilon: 0.77
Episode 100, Reward: 11.0, Epsilon: 0.60
Episode 150, Reward: 14.0, Epsilon: 0.47
Episode 200, Reward: 23.0, Epsilon: 0.37
Episode 250, Reward: 50.0, Epsilon: 0.28
Episode 300, Reward: 184.0, Epsilon: 0.22
Episode 350, Reward: 237.0, Epsilon: 0.17
Episode 400, Reward: 175.0, Epsilon: 0.13
Episode 450, Reward: 156.0, Epsilon: 0.10
Episode 500, Reward: 118.0, Epsilon: 0.08
Episode 550, Reward: 100.0, Epsilon: 0.06
Episode 600, Reward: 106.0, Epsilon: 0.05
Episode 650, Reward: 128.0, Epsilon: 0.04
Episode 700, Reward: 153.0, Epsilon: 0.03
Episode 750, Reward: 76.0, Epsilon: 0.02
Episode 800, Reward: 199.0, Epsilon: 0.02
Episode 850, Reward: 500.0, Epsilon: 0.01
Episode 900, Reward: 257.0, Epsilon: 0.01
Episode 950, Reward: 26.0, Epsilon: 0.01


## Step 7: Record Multiple Episodes as Video


In [18]:
# Roll out the trained greedy policy for 10 episodes and save the rendered
# frames as a single MP4 file.
env = gym.make("CartPole-v1", render_mode="rgb_array")
video_path = "cartpole_dqn.mp4"

try:
    writer = imageio.get_writer(video_path, fps=30)
    try:
        for ep in range(10):
            state, _ = env.reset()
            done = False
            while not done:
                state_tensor = torch.FloatTensor(state).to(device)
                with torch.no_grad():
                    action = torch.argmax(q_net(state_tensor)).item()
                state, _, terminated, truncated, _ = env.step(action)
                done = terminated or truncated
                frame = env.render()
                writer.append_data(frame)
    finally:
        # Always finalize the video file, even if a rollout step raises.
        writer.close()
finally:
    # Always release the environment, even if writing the video fails.
    env.close()

print(f"Video saved to {video_path}")




Video saved to cartpole_dqn.mp4


## Step 8: Watch the Video

In [19]:
# Display the recorded rollout in the notebook; embed=True asks IPython to
# inline the video data in the cell output instead of linking to the file.
Video("cartpole_dqn.mp4", embed=True)

## Step 9: Evaluate the Agent

In [20]:
def evaluate_agent(env, q_net, episodes=20):
    """Run the greedy policy of ``q_net`` and return its mean episode reward.

    Args:
        env: Environment exposing ``reset()`` and ``step(action)`` with the
            five-value Gymnasium step result.
        q_net: Callable mapping a float state tensor to per-action Q-values.
        episodes: Number of evaluation episodes to run.

    Returns:
        The mean (``np.mean``) of the total rewards collected per episode.
    """
    episode_returns = []
    for _ in range(episodes):
        obs, _ = env.reset()
        episode_return = 0
        finished = False
        while not finished:
            obs_tensor = torch.FloatTensor(obs).to(device)
            # No exploration here: always take the highest-valued action.
            with torch.no_grad():
                q_values = q_net(obs_tensor)
            greedy_action = torch.argmax(q_values).item()
            obs, reward, terminated, truncated, _ = env.step(greedy_action)
            episode_return += reward
            finished = terminated or truncated
        episode_returns.append(episode_return)
    return np.mean(episode_returns)

# Evaluate the trained greedy policy on a fresh, non-rendering environment.
n_eval_episodes = 20
eval_env = gym.make("CartPole-v1")
try:
    avg_reward = evaluate_agent(eval_env, q_net, episodes=n_eval_episodes)
finally:
    # Release the environment even if evaluation raises.
    eval_env.close()
print(f"Average reward over {n_eval_episodes} eval episodes: {avg_reward:.2f}")

Average reward over 20 eval episodes: 459.60
