In [1]:
import os
import time
import random
from collections import deque, namedtuple
from typing import Tuple, List, Deque
import math
import copy

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import gymnasium as gym
import wandb

# --- 1. Introduction: Hyperparameter Tuning Notebook ---

# This notebook is designed for running systematic hyperparameter tuning experiments.
# It allows you to define multiple experiment configurations and runs them sequentially,
# saving the results, trained models, and evaluation videos in separate folders for
# easy comparison. This is ideal for use on platforms like Kaggle.

# --- Device Detection ---
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

# --- WandB Configuration (Optional) ---
# Set USE_WANDB to False if you do not want to log results.
USE_WANDB = True
PROJECT_NAME = "RL-Assignment2-Hyperparameter-Tuning"
# Local fallback file holding the API key (keep it out of version control).
WANDB_KEY_FILE = "key.txt"

if USE_WANDB:
    try:
        # Prefer the WANDB_API_KEY environment variable (this is also where a
        # Kaggle secret can be exported); fall back to the local key file.
        api_key = os.environ.get("WANDB_API_KEY", "").strip()
        if not api_key and os.path.exists(WANDB_KEY_FILE):
            with open(WANDB_KEY_FILE, "r") as f:
                api_key = f.read().strip()

        if not api_key:
            # Covers a missing key file as well as an empty one.
            print(f"WandB API key not found (checked WANDB_API_KEY and {WANDB_KEY_FILE}). "
                  "Set USE_WANDB to False or provide a key.")
            USE_WANDB = False
        elif wandb.login(key=api_key):
            # Only report success when wandb.login confirms the key is configured.
            print("WandB login successful.")
        else:
            print("Could not log in to WandB: login was not confirmed.")
            USE_WANDB = False
    except Exception as e:
        # Logging is optional: any login failure disables it rather than
        # aborting the notebook.
        print(f"Could not log in to WandB: {e}")
        USE_WANDB = False

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: C:\Users\abdul\_netrc
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: C:\Users\abdul\_netrc


Using device: cuda


[34m[1mwandb[0m: Currently logged in as: [33mabdelrahmant3[0m ([33mabdelrahmant3-cairo-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


WandB login successful.


## 2. Hyperparameter Experiments

Instead of a single config, we define a list of `experiments`. Each item in the list is a dictionary containing:
- `name`: A unique name for the experiment (used for creating folders).
- `config`: A dictionary of hyperparameters for that specific experiment.

This structure allows us to test many different configurations in a single run.

In [2]:
# --- Define All Experiment Configurations Here ---

def _make_experiment(name, **overrides):
    """Build one experiment entry: baseline DDQN settings plus per-experiment overrides.

    Returns a dict with the experiment `name` and its full `config`. A fresh
    settings dict (and a fresh NET_ARCHITECTURE list) is created on every call,
    so experiments never share mutable state.
    """
    settings = {
        "USE_DDQN": True,
        "BATCH_SIZE": 128,
        "GAMMA": 0.99,
        "EPS_DECAY": 1000,
        "TAU": 0.005,
        "LR": 1e-4,
        "REPLAY_MEMORY_SIZE": 10000,
        "NET_ARCHITECTURE": [128, 128],
    }
    settings.update(overrides)
    return {"name": name, "config": settings}


experiments = [
    # Plain DQN with the baseline settings.
    _make_experiment("Baseline_DQN", USE_DDQN=False),
    # Double DQN with the same baseline settings.
    _make_experiment("Baseline_DDQN"),
    # Larger batch, faster epsilon decay, faster target update, higher learning rate.
    _make_experiment(
        "High_LR_Fast_Decay",
        BATCH_SIZE=256,
        EPS_DECAY=500,
        TAU=0.01,
        LR=5e-4,
    ),
    # Slower epsilon decay (more exploration), bigger replay buffer, deeper network.
    _make_experiment(
        "Deeper_Network_More_Exploration",
        EPS_DECAY=2000,
        REPLAY_MEMORY_SIZE=50000,
        NET_ARCHITECTURE=[256, 256, 128],
    ),
]

# Common parameters that don't change between experiments
config = dict(EPS_START=0.9, EPS_END=0.05)

---
## 3. Core DQN Components

This section defines the three fundamental building blocks of the agent:
1.  **DQN Model**: A simple feed-forward neural network that estimates Q-values.
2.  **Replay Memory**: A buffer that stores past experiences (`state`, `action`, `next_state`, `reward`) so the agent can learn from them in batches. This decorrelates experiences and stabilizes training.
3.  **Transition**: A `namedtuple` for conveniently storing a single experience.

In [3]:
# One stored experience: the state, the action taken, the state that followed
# and the reward received.
Transition = namedtuple('Transition', ['state', 'action', 'next_state', 'reward'])


class ReplayMemory:
    """Bounded buffer of `Transition` tuples with uniform random sampling."""

    def __init__(self, capacity):
        # A deque with maxlen drops its oldest entry once `capacity` is reached.
        self.memory = deque(maxlen=capacity)

    def push(self, *args):
        """Save a transition"""
        transition = Transition(*args)
        self.memory.append(transition)

    def sample(self, batch_size):
        """Return `batch_size` distinct stored transitions chosen at random."""
        return random.sample(self.memory, batch_size)

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.memory)

class DQN(nn.Module):
    """
    Fully connected Q-network whose hidden layers are given by `net_arch`.

    Each entry of `net_arch` adds a Linear layer of that width followed by a
    ReLU; a final Linear layer maps to one output per action.
    """
    def __init__(self, n_observations, n_actions, net_arch):
        super().__init__()
        # Layer widths from the input through every hidden layer.
        widths = [n_observations, *net_arch]
        modules = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            modules += [nn.Linear(fan_in, fan_out), nn.ReLU()]
        # Output head: no activation, one value per action.
        modules.append(nn.Linear(widths[-1], n_actions))

        self.network = nn.Sequential(*modules)

    def forward(self, x):
        """Run `x` through the stacked layers and return the network output."""
        return self.network(x)

---
### Discretization Wrapper for Continuous Environments

Since our DQN agent can only output discrete actions (e.g., action 0, 1, 2), it cannot directly handle environments with continuous action spaces like `Pendulum-v1`.

To solve this, we create a **wrapper** class. This class sits "on top" of the original environment and modifies its behavior. The `DiscretizeActionWrapper` does the following:
1.  It takes the continuous action space (e.g., a range from -2.0 to 2.0 for Pendulum) and converts it into a fixed number of discrete actions (e.g., 5 bins).
2.  It tells our agent that there are now 5 possible actions.
3.  When our agent picks a discrete action (e.g., action `2`), the wrapper translates it back into the corresponding continuous value (e.g., `0.0`) before passing it to the actual environment.

This allows us to use our DQN agent on `Pendulum-v1` without changing the agent's core logic.

In [4]:
class DiscretizeActionWrapper(gym.Wrapper):
    """
    A wrapper to discretize a one-dimensional continuous action space.

    The wrapped environment must expose a `Box` action space of shape `(1,)`.
    The wrapper advertises `Discrete(n_bins)` instead and maps each discrete
    index to one of `n_bins` evenly spaced values between the Box's low and
    high bounds (both bounds included).

    Raises:
        ValueError: if the wrapped action space is not a 1-D Box, or if
            `n_bins` is smaller than 1.
    """
    def __init__(self, env, n_bins):
        super().__init__(env)

        continuous_space = env.action_space
        # Only the first component of low/high is used below, so anything other
        # than a single-dimensional Box would be silently mis-handled.
        if not isinstance(continuous_space, gym.spaces.Box) or continuous_space.shape != (1,):
            raise ValueError(
                "DiscretizeActionWrapper requires a Box action space of shape (1,), "
                f"got {continuous_space!r}"
            )
        if n_bins < 1:
            raise ValueError(f"n_bins must be at least 1, got {n_bins}")

        self.n_bins = n_bins
        self.action_space = gym.spaces.Discrete(n_bins)
        self.continuous_action_space = continuous_space

        # Create a mapping from discrete actions to continuous values
        self.action_map = np.linspace(
            self.continuous_action_space.low[0],
            self.continuous_action_space.high[0],
            n_bins
        )

    def step(self, action):
        """Translate the discrete `action` index and step the wrapped environment."""
        # Map the discrete action to a continuous action
        continuous_action = np.array([self.action_map[action]], dtype=np.float32)
        return self.env.step(continuous_action)

    def reset(self, **kwargs):
        """Reset the wrapped environment, forwarding all keyword arguments."""
        return self.env.reset(**kwargs)

---
## 4. Training and Optimization Functions

This section contains the core logic for the agent's learning process.

- **`select_action`**: Implements an epsilon-greedy policy. With probability `epsilon`, it takes a random action (exploration). Otherwise, it takes the action with the highest predicted Q-value (exploitation).
- **`optimize_model`**: This is the heart of the learning algorithm. It samples a batch of experiences from the replay memory and computes the loss. It supports both standard DQN and Double DQN (DDQN) based on the `USE_DDQN` flag in the config.
- **Environment details**: there is no separate helper for this; the experiment runner in Section 5 reads the action and observation space sizes directly from each environment.

In [5]:
# Number of actions selected so far; it drives the epsilon decay schedule.
steps_done = 0


def select_action(state, env, policy_net, n_actions, current_config):
    """Pick an action with an exponentially decaying epsilon-greedy policy.

    Epsilon starts at EPS_START and decays towards EPS_END with time constant
    EPS_DECAY, measured in calls to this function (`steps_done`).

    Args:
        state: observation tensor passed to `policy_net`.
        env: environment whose `action_space.sample()` supplies random actions.
        policy_net: network producing one Q-value per action along dim 1.
        n_actions: unused; kept for interface compatibility.
        current_config: dict providing EPS_START, EPS_END and EPS_DECAY.

    Returns:
        A (1, 1) tensor holding the chosen action index.
    """
    global steps_done
    roll = random.random()
    eps_end = current_config["EPS_END"]
    eps_span = current_config["EPS_START"] - eps_end
    decay = math.exp(-1. * steps_done / current_config["EPS_DECAY"])
    eps_threshold = eps_end + eps_span * decay
    steps_done += 1

    # Explore: draw a random action from the environment's action space.
    if roll <= eps_threshold:
        return torch.tensor([[env.action_space.sample()]], device=device, dtype=torch.long)

    # Exploit: take the action with the highest predicted Q-value.
    with torch.no_grad():
        return policy_net(state).max(1)[1].view(1, 1)

def optimize_model(memory, policy_net, target_net, optimizer, current_config):
    """Run one optimization step on a batch sampled from the replay memory.

    The TD target is `reward + GAMMA * V(next_state)`, where `V` is 0 for
    terminal transitions (`next_state is None`). With USE_DDQN the next action
    is chosen by `policy_net` and evaluated by `target_net` (Double DQN);
    otherwise the target network's maximum Q-value is used.

    Args:
        memory: replay buffer supporting `len()` and `sample(batch_size)`; each
            sampled item exposes `state`, `action`, `next_state` and `reward`.
        policy_net: network being trained.
        target_net: network used to evaluate next states.
        optimizer: optimizer over `policy_net`'s parameters.
        current_config: dict providing BATCH_SIZE, GAMMA and USE_DDQN.

    Returns:
        The Smooth L1 loss as a float, or None when the memory holds fewer
        than BATCH_SIZE transitions (no update is performed).
    """
    batch_size = current_config["BATCH_SIZE"]
    if len(memory) < batch_size:
        return None
    transitions = memory.sample(batch_size)

    state_batch = torch.cat([t.state for t in transitions])
    action_batch = torch.cat([t.action for t in transitions])
    reward_batch = torch.cat([t.reward for t in transitions])
    next_states = [t.next_state for t in transitions]

    # Work on whatever device the sampled tensors live on instead of relying
    # on a module-level global.
    batch_device = state_batch.device
    non_final_mask = torch.tensor([s is not None for s in next_states],
                                  device=batch_device, dtype=torch.bool)
    non_final = [s for s in next_states if s is not None]

    state_action_values = policy_net(state_batch).gather(1, action_batch)

    # Terminal transitions keep a next-state value of zero.
    next_state_values = torch.zeros(batch_size, device=batch_device,
                                    dtype=state_action_values.dtype)
    # torch.cat rejects an empty list, so skip the bootstrap entirely when
    # every sampled transition is terminal.
    if non_final:
        non_final_next_states = torch.cat(non_final)
        with torch.no_grad():
            target_q = target_net(non_final_next_states)
            if current_config["USE_DDQN"]:
                best_actions = policy_net(non_final_next_states).argmax(1, keepdim=True)
                # squeeze(1) keeps a 1-D result even for a single non-final state.
                next_state_values[non_final_mask] = target_q.gather(1, best_actions).squeeze(1)
            else:
                next_state_values[non_final_mask] = target_q.max(1)[0]

    # Rewards may arrive as float64 (e.g. built from a NumPy scalar); match the
    # network's dtype so the target is not promoted to double.
    reward_batch = reward_batch.to(state_action_values.dtype)
    expected_state_action_values = (next_state_values * current_config["GAMMA"]) + reward_batch

    criterion = nn.SmoothL1Loss()
    loss = criterion(state_action_values, expected_state_action_values.unsqueeze(1))

    optimizer.zero_grad()
    loss.backward()
    # Clamp each gradient element to [-100, 100] before the update.
    torch.nn.utils.clip_grad_value_(policy_net.parameters(), 100)
    optimizer.step()

    return loss.item()

---
## 5. Main Experiment Runner

This is the main execution block. It iterates through every `experiment` defined in Cell 2.

For each experiment, it:
1.  Creates a unique directory to store the trained models and videos.
2.  Merges the specific experiment config with the base config.
3.  Loops through each environment (`CartPole`, `Acrobot`, etc.).
4.  Trains the agent using the specified hyperparameters.
5.  Saves the trained `policy_net` to the experiment's directory.
6.  Runs a full evaluation and saves the video recordings to the experiment's directory.
7.  Prints a final summary of all results.

**Once all of the cells above have been run, this is the only cell you need to run to start the entire tuning process.**

In [6]:
# --- Main Experiment Runner ---
environments = ["CartPole-v1", "Acrobot-v1", "MountainCar-v0", "Pendulum-v1"]
overall_results = {}

N_ACTION_BINS = 11            # discrete bins used for continuous-action environments
MAX_STEPS_PER_EPISODE = 1000  # hard cap on steps in one training episode
N_EVAL_EPISODES = 100         # greedy evaluation episodes per (experiment, environment)
VIDEO_EVERY = 20              # record every VIDEO_EVERY-th evaluation episode


def _make_env(env_name, **make_kwargs):
    """Create `env_name`, wrapping it in DiscretizeActionWrapper when its action space is a Box."""
    env = gym.make(env_name, **make_kwargs)
    if isinstance(env.action_space, gym.spaces.Box):
        env = DiscretizeActionWrapper(env, n_bins=N_ACTION_BINS)
    return env


def _to_state_tensor(observation):
    """Convert an observation to a float32 tensor on `device` with a leading batch dimension."""
    return torch.tensor(observation, dtype=torch.float32, device=device).unsqueeze(0)


def _soft_update(target_net, policy_net, tau):
    """Blend the policy weights into the target network: target = tau*policy + (1-tau)*target."""
    target_state = target_net.state_dict()
    policy_state = policy_net.state_dict()
    for key in policy_state:
        target_state[key] = policy_state[key] * tau + target_state[key] * (1 - tau)
    target_net.load_state_dict(target_state)


def _evaluate(policy_net, env_name, video_folder, name_prefix):
    """Run N_EVAL_EPISODES greedy episodes with video recording; return the average episode reward."""
    eval_env = gym.wrappers.RecordVideo(
        _make_env(env_name, render_mode="rgb_array"),
        video_folder,
        episode_trigger=lambda episode: episode % VIDEO_EVERY == 0,
        name_prefix=name_prefix,
    )
    try:
        total_eval_reward = 0
        for _ in range(N_EVAL_EPISODES):
            state, _ = eval_env.reset()
            state = _to_state_tensor(state)
            ep_reward = 0
            done = False
            while not done:
                with torch.no_grad():
                    action = policy_net(state).max(1)[1].view(1, 1)
                obs, reward, terminated, truncated, _ = eval_env.step(action.item())
                ep_reward += float(reward)
                done = terminated or truncated
                if not done:
                    state = _to_state_tensor(obs)
            total_eval_reward += ep_reward
        return total_eval_reward / N_EVAL_EPISODES
    finally:
        # Always close the recorder, even if evaluation is interrupted.
        eval_env.close()


for experiment in experiments:
    exp_name = experiment["name"]
    exp_config = {**config, **experiment["config"]} # Merge base and experiment-specific configs
    algo_name = "DDQN" if exp_config["USE_DDQN"] else "DQN"

    print(f"\n{'='*60}\nRunning Experiment: {exp_name}\n{'='*60}\n")

    # Create a dedicated folder for this experiment's outputs
    output_dir = f"./{exp_name}/"
    os.makedirs(output_dir, exist_ok=True)

    overall_results[exp_name] = {}

    for env_name in environments:
        print(f"\n--- Environment: {env_name} ---\n")

        # --- Environment Setup ---
        env = _make_env(env_name)
        try:
            n_actions = env.action_space.n
            state, _ = env.reset()
            n_observations = len(state)

            # --- Model Initialization ---
            policy_net = DQN(n_observations, n_actions, exp_config["NET_ARCHITECTURE"]).to(device)
            target_net = DQN(n_observations, n_actions, exp_config["NET_ARCHITECTURE"]).to(device)
            target_net.load_state_dict(policy_net.state_dict())

            optimizer = optim.AdamW(policy_net.parameters(), lr=exp_config["LR"], amsgrad=True)
            memory = ReplayMemory(exp_config["REPLAY_MEMORY_SIZE"])

            # Restart the epsilon schedule read by select_action for every run.
            steps_done = 0

            # --- WandB Setup ---
            run_name = f"{exp_name}_{env_name}"
            if USE_WANDB:
                if wandb.run is not None:
                    wandb.finish()
                wandb.init(project=PROJECT_NAME, name=run_name, config=exp_config, reinit=True)

            # --- Training Loop ---
            num_episodes = 2000 if env_name == "MountainCar-v0" else 1000
            for i_episode in range(num_episodes):
                state, info = env.reset()
                state = _to_state_tensor(state)
                ep_reward = 0
                loss = None

                for t in range(MAX_STEPS_PER_EPISODE):
                    action = select_action(state, env, policy_net, n_actions, exp_config)
                    observation, reward, terminated, truncated, _ = env.step(action.item())
                    ep_reward += float(reward)
                    # Force float32: some environments return NumPy float64
                    # rewards, which would otherwise produce a double tensor.
                    reward = torch.tensor([reward], device=device, dtype=torch.float32)
                    done = terminated or truncated

                    # Only true termination ends the bootstrap; a truncated
                    # episode still stores its successor state.
                    next_state = None if terminated else _to_state_tensor(observation)

                    memory.push(state, action, next_state, reward)
                    state = next_state

                    loss = optimize_model(memory, policy_net, target_net, optimizer, exp_config)
                    _soft_update(target_net, policy_net, exp_config["TAU"])

                    if done:
                        break

                if USE_WANDB:
                    # Log the reward for every episode; the loss only exists
                    # once the replay buffer holds a full batch.
                    metrics = {"reward": ep_reward}
                    if loss is not None:
                        metrics["loss"] = loss
                    wandb.log(metrics, step=i_episode)
                if i_episode % 100 == 0:
                    print(f"  Episode {i_episode}/{num_episodes} | Reward: {ep_reward}")

            print("--- Training Complete ---")
            model_path = os.path.join(output_dir, f"{algo_name}_{env_name}_policy.pth")
            torch.save(policy_net.state_dict(), model_path)
            print(f"Model saved to {model_path}")
            if USE_WANDB:
                wandb.finish()

            # --- Evaluation ---
            print(f"--- Evaluating for {N_EVAL_EPISODES} episodes (with video recording) ---")
            video_folder = os.path.join(output_dir, "videos", f"{algo_name}_{env_name}")
            os.makedirs(video_folder, exist_ok=True)

            avg_reward = _evaluate(policy_net, env_name, video_folder, f"{exp_name}-{env_name}")
            overall_results[exp_name][env_name] = avg_reward
            print(f"Average Reward over {N_EVAL_EPISODES} episodes: {avg_reward:.2f}")
            print(f"Videos saved in: {video_folder}\n")
        finally:
            # Runs on errors and on manual interruption too, so neither the
            # wandb run nor the environment is left open.
            if USE_WANDB and wandb.run is not None:
                wandb.finish()
            env.close()

# --- Final Summary ---
print("\n\n--- Overall Evaluation Summary ---")
for exp_name, results in overall_results.items():
    print(f"\n--- Experiment: {exp_name} ---")
    for env_name, avg_reward in results.items():
        print(f"  {env_name}: Average Reward ({N_EVAL_EPISODES} eps) = {avg_reward:.2f}")
print("------------------------------------")


Running Experiment: Baseline_DQN


--- Environment: CartPole-v1 ---





  Episode 0/500 | Reward: 20.0
  Episode 100/500 | Reward: 12.0
  Episode 100/500 | Reward: 12.0
  Episode 200/500 | Reward: 132.0
  Episode 200/500 | Reward: 132.0
  Episode 300/500 | Reward: 110.0
  Episode 300/500 | Reward: 110.0
  Episode 400/500 | Reward: 288.0
  Episode 400/500 | Reward: 288.0


KeyboardInterrupt: 