In [13]:
import os
import time
import random
from collections import deque, namedtuple
from typing import Tuple, List, Deque
import math

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import gymnasium as gym
import wandb

# --- 1. Introduction and Setup ---

# This notebook implements the DQN and DDQN algorithms based on a more traditional,
# function-oriented style inspired by PyTorch tutorials. It is adapted to fulfill
# the assignment requirements, including training on multiple environments and
# running comprehensive evaluations.

# --- Device Detection ---
# Run on the GPU whenever PyTorch reports CUDA support, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'Using device: {device}')

# --- WandB Configuration (Optional) ---
# Logging is opt-in: flip USE_WANDB to False to train without Weights & Biases.
USE_WANDB = True
PROJECT_NAME = "RL-Assignment2-Handmade"

if USE_WANDB:
    try:
        # Workaround for Windows environments, where WandB console capture
        # can otherwise hit symlink errors.
        os.environ['WANDB_CONSOLE'] = 'off'

        # The API key is read from a local file so it never appears in the notebook.
        with open("key.txt", "r") as key_file:
            api_key = key_file.read().strip()
        wandb.login(key=api_key)
        print("WandB login successful.")
    except Exception as e:
        # Best effort: any failure (missing key file, rejected key, ...) only
        # disables logging for the rest of the notebook instead of aborting it.
        print(f"Could not log in to WandB: {e}")
        USE_WANDB = False



Using device: cuda
WandB login successful.


---
## 2. Hyperparameters and Configuration

This cell centralizes all tunable parameters for the experiments. The assignment requires testing different values for these to find the best setup.

In [14]:
# All tunable hyperparameters live in this single dictionary; the functions
# and the experiment loop below look their values up by key.
config = dict(
    USE_DDQN=True,             # True: Double DQN target; False: standard DQN target
    BATCH_SIZE=128,            # number of transitions sampled from the replay buffer
    GAMMA=0.99,                # discount factor applied to future rewards
    EPS_START=0.9,             # starting value of epsilon
    EPS_END=0.05,              # final value of epsilon
    EPS_DECAY=1000,            # rate of exponential epsilon decay; higher means slower decay
    TAU=0.005,                 # soft-update rate of the target network
    LR=1e-4,                   # learning rate of the AdamW optimizer
    REPLAY_MEMORY_SIZE=10000,  # maximum number of transitions kept in the replay buffer
)

---
## 3. Core DQN Components

This section defines the three fundamental building blocks of the agent:
1.  **DQN Model**: A simple feed-forward neural network that estimates Q-values.
2.  **Replay Memory**: A buffer that stores past experiences (`state`, `action`, `reward`, `next_state`) so the agent can learn from them in batches. This decorrelates experiences and stabilizes training.
3.  **Transition**: A `namedtuple` for conveniently storing a single experience.

In [15]:
# One stored experience: the state observed, the action taken, the state that
# followed, and the reward received.
Transition = namedtuple('Transition', ['state', 'action', 'next_state', 'reward'])


class ReplayMemory:
    """Fixed-capacity buffer of `Transition` records.

    Once `capacity` entries are stored, appending a new transition silently
    drops the oldest one.
    """

    def __init__(self, capacity):
        # A bounded deque handles eviction of the oldest entry automatically.
        self.memory = deque(maxlen=capacity)

    def push(self, *args):
        """Build a `Transition` from the positional arguments and store it."""
        record = Transition(*args)
        self.memory.append(record)

    def sample(self, batch_size):
        """Return `batch_size` distinct stored transitions chosen at random."""
        return random.sample(self.memory, batch_size)

    def __len__(self):
        """Return the number of transitions currently stored."""
        return len(self.memory)

class DQN(nn.Module):
    """Feed-forward Q-network: observation vector in, one Q-value per action out.

    Args:
        n_observations: size of the observation vector fed to the first layer.
        n_actions: number of discrete actions, i.e. the size of the output.
        hidden_size: width of both hidden layers. Defaults to 512, the value
            previously hard-coded, so existing callers and saved checkpoints
            are unaffected.
    """

    def __init__(self, n_observations, n_actions, hidden_size=512):
        super().__init__()
        # Attribute names are part of the checkpoint format (state_dict keys),
        # so they must stay `layer1`, `layer2`, `layer3`.
        self.layer1 = nn.Linear(n_observations, hidden_size)
        self.layer2 = nn.Linear(hidden_size, hidden_size)
        self.layer3 = nn.Linear(hidden_size, n_actions)

    def forward(self, x):
        """Return the Q-values for `x`; the output layer has no activation."""
        x = F.relu(self.layer1(x))
        x = F.relu(self.layer2(x))
        return self.layer3(x)

---
### Discretization Wrapper for Continuous Environments

Since our DQN agent can only output discrete actions (e.g., action 0, 1, 2), it cannot directly handle environments with continuous action spaces like `Pendulum-v1`.

To solve this, we create a **wrapper** class. This class sits "on top" of the original environment and modifies its behavior. The `DiscretizeActionWrapper` does the following:
1.  It takes the continuous action space (e.g., a range from -2.0 to 2.0 for Pendulum) and converts it into a fixed number of discrete actions (e.g., 5 bins).
2.  It tells our agent that there are now 5 possible actions.
3.  When our agent picks a discrete action (e.g., action `2`), the wrapper translates it back into the corresponding continuous value (e.g., `0.0`) before passing it to the actual environment.

This allows us to use our DQN agent on `Pendulum-v1` without changing the agent's core logic.

In [16]:
class DiscretizeActionWrapper(gym.Wrapper):
    """
    Expose a continuous-action environment through a discrete action space.

    The wrapped environment's action range (first dimension only) is split
    into `n_bins` evenly spaced values; a discrete action is an index into
    that list of values.
    """
    def __init__(self, env, n_bins):
        super().__init__(env)
        original_space = env.action_space
        self.continuous_action_space = original_space
        self.n_bins = n_bins
        # What the agent sees from now on: n_bins discrete choices.
        self.action_space = gym.spaces.Discrete(n_bins)

        # Lookup table: discrete index -> continuous value, endpoints included.
        lower = original_space.low[0]
        upper = original_space.high[0]
        self.action_map = np.linspace(lower, upper, n_bins)

    def step(self, action):
        """Translate the discrete `action` index and step the wrapped env."""
        value = self.action_map[action]
        # The wrapped environment receives a length-1 float32 array.
        return self.env.step(np.array([value], dtype=np.float32))

    def reset(self, **kwargs):
        """Reset the wrapped environment, forwarding all keyword arguments."""
        return self.env.reset(**kwargs)

---
## 4. Training and Optimization Functions

This section contains the core logic for the agent's learning process.

- **`select_action`**: Implements an epsilon-greedy policy. With probability `epsilon`, it takes a random action (exploration). Otherwise, it takes the action with the highest predicted Q-value (exploitation).
- **`optimize_model`**: This is the heart of the learning algorithm. It samples a batch of experiences from the replay memory and computes the loss. It supports both standard DQN and Double DQN (DDQN) based on the `USE_DDQN` flag in the config.
- **Environment details**: there is no separate helper for this; the main loop in Section 5 reads the action and observation space sizes directly from the environment.

In [17]:
# Global count of action selections; drives the epsilon decay schedule below.
steps_done = 0

def select_action(state, env, policy_net, n_actions):
    """Pick an action with an epsilon-greedy policy.

    Epsilon decays exponentially from EPS_START towards EPS_END as the global
    `steps_done` counter grows; the counter is incremented on every call.

    Args:
        state: input passed to `policy_net`; its output is reduced over dim 1.
        env: environment whose `action_space.sample()` supplies random actions.
        policy_net: network whose highest-valued output is the greedy action.
        n_actions: accepted for interface compatibility; not used here.

    Returns:
        A 1x1 tensor holding the chosen action index.
    """
    global steps_done
    draw = random.random()
    eps_start, eps_end = config["EPS_START"], config["EPS_END"]
    decay = math.exp(-1. * steps_done / config["EPS_DECAY"])
    eps_threshold = eps_end + (eps_start - eps_end) * decay
    steps_done += 1

    if draw <= eps_threshold:
        # Explore: uniformly random action from the environment's action space.
        return torch.tensor([[env.action_space.sample()]], device=device, dtype=torch.long)

    # Exploit: max(1) returns (values, indices) per row; index 1 is the
    # position of the largest Q-value, i.e. the greedy action.
    with torch.no_grad():
        return policy_net(state).max(1)[1].view(1, 1)

def optimize_model(memory, policy_net, target_net, optimizer):
    """Run one gradient step on `policy_net` from a sampled replay batch.

    Args:
        memory: replay buffer supporting `len()` and `sample(batch_size)`.
        policy_net: network being trained; provides Q(s, a).
        target_net: network used to evaluate next-state values.
        optimizer: optimizer holding `policy_net`'s parameters.

    Returns:
        The Huber loss of the step as a Python float, or None when the buffer
        holds fewer than BATCH_SIZE transitions and no step was taken.
    """
    batch_size = config["BATCH_SIZE"]
    if len(memory) < batch_size:
        return None
    transitions = memory.sample(batch_size)
    # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for
    # detailed explanation). This converts batch-array of Transitions
    # to Transition of batch-arrays.
    batch = Transition(*zip(*transitions))

    # A next_state of None marks a transition after which the episode ended.
    non_final_mask = torch.tensor([s is not None for s in batch.next_state],
                                  device=device, dtype=torch.bool)
    non_final_next = [s for s in batch.next_state if s is not None]
    state_batch = torch.cat(batch.state)
    action_batch = torch.cat(batch.action)
    reward_batch = torch.cat(batch.reward)

    # Q(s_t, a): evaluate every action, then pick the column of the action
    # that was actually taken in each transition.
    state_action_values = policy_net(state_batch).gather(1, action_batch)

    # V(s_{t+1}): stays 0 for final states, otherwise filled in from target_net.
    next_state_values = torch.zeros(batch_size, device=device)
    # torch.cat rejects an empty list, so skip the bootstrap entirely when
    # every sampled transition is terminal.
    if non_final_next:
        non_final_next_states = torch.cat(non_final_next)
        # The target is a constant for this step: computing it without
        # autograd avoids building a graph through target_net and piling up
        # gradients on its parameters, which no optimizer ever clears.
        with torch.no_grad():
            target_q = target_net(non_final_next_states)
            if config["USE_DDQN"]:
                # DDQN: policy_net selects the action, target_net evaluates it.
                best_actions = policy_net(non_final_next_states).argmax(1, keepdim=True)
                # squeeze(1) keeps the batch dimension even for a single row.
                next_state_values[non_final_mask] = target_q.gather(1, best_actions).squeeze(1)
            else:
                # Standard DQN: target_net both selects and evaluates.
                next_state_values[non_final_mask] = target_q.max(1)[0]

    # Compute the expected Q values
    expected_state_action_values = (next_state_values * config["GAMMA"]) + reward_batch

    # Compute Huber loss
    criterion = nn.SmoothL1Loss()
    loss = criterion(state_action_values, expected_state_action_values.unsqueeze(1))

    # Optimize the model
    optimizer.zero_grad()
    loss.backward()
    # In-place gradient clipping
    torch.nn.utils.clip_grad_value_(policy_net.parameters(), 100)
    optimizer.step()

    return loss.item()

---
## 5. Main Training and Evaluation Loop

This is the final, runnable part of the notebook. It automates the entire process as required by the assignment:
1.  It defines the list of environments to be trained on.
2.  It loops through both **DQN and DDQN** algorithms.
3.  For each algorithm and each environment, it:
    *   Initializes the policy and target networks, optimizer, and replay memory.
    *   Runs the main training loop for a set number of episodes.
    *   Logs metrics to WandB (if enabled).
    *   Soft-updates the target network after every environment step (blending in the policy network's weights at rate `TAU`).
    *   Saves the final trained model.
4.  After training, it runs a **100-episode evaluation** to test the agent's performance and prints the average reward.
5.  Finally, it prints a summary of all results.

**Once all the cells above have been run, this is the only cell you need to run to complete the assignment.**

In [18]:
# --- Environments to run ---
environments = ["CartPole-v1", "Acrobot-v1", "MountainCar-v0", "Pendulum-v1"]
evaluation_results = {}

# Experiment constants, named once so training and evaluation cannot drift apart.
N_ACTION_BINS = 11            # discrete bins used for continuous action spaces
NUM_EVAL_EPISODES = 100       # greedy evaluation episodes per trained agent
MAX_STEPS_PER_EPISODE = 1500  # hard cap on steps in one training episode

# --- Main Experiment Loop ---
# Outer loop: algorithm (DQN, then DDQN). Inner loop: environment.
# For every pair the agent is trained (or loaded from disk) and then evaluated.
for use_ddqn_flag in [False, True]:
    algo_name = "DDQN" if use_ddqn_flag else "DQN"
    # optimize_model reads this flag to choose the target computation.
    config["USE_DDQN"] = use_ddqn_flag
    print(f"\n{'='*40}\nRunning Experiment with: {algo_name}\n{'='*40}\n")

    if algo_name not in evaluation_results:
        evaluation_results[algo_name] = {}

    for env_name in environments:
        model_path = f"{algo_name}_{env_name}_policy.pth"

        # --- Environment Setup ---
        # Build the environment once and wrap it only if its actions are
        # continuous (no throwaway probe environment needed).
        env = gym.make(env_name)
        is_continuous = isinstance(env.action_space, gym.spaces.Box)
        if is_continuous:
            print(f"--- Detected continuous action space for {env_name}. Applying discretization wrapper. ---")
            env = DiscretizeActionWrapper(env, n_bins=N_ACTION_BINS)

        # Space sizes come straight from the space definitions, so no
        # extra reset() is needed just to measure an observation.
        n_actions = env.action_space.n
        n_observations = env.observation_space.shape[0]

        policy_net = DQN(n_observations, n_actions).to(device)

        # --- Check if model is already trained ---
        if os.path.exists(model_path):
            print(f"--- Found pre-trained model for {algo_name} on {env_name} ---")
            print(f"Loading model from {model_path} and skipping training.\n")
            # map_location lets a checkpoint saved on GPU load on a CPU-only
            # machine; weights_only restricts unpickling to tensor data.
            policy_net.load_state_dict(
                torch.load(model_path, map_location=device, weights_only=True)
            )
        else:
            print(f"--- Training {algo_name} on {env_name} ---")
            target_net = DQN(n_observations, n_actions).to(device)
            target_net.load_state_dict(policy_net.state_dict())

            optimizer = optim.AdamW(policy_net.parameters(), lr=config["LR"], amsgrad=True)
            memory = ReplayMemory(config["REPLAY_MEMORY_SIZE"])

            # Restart the epsilon schedule used by select_action for this run.
            steps_done = 0

            # --- WandB Setup ---
            run_name = f"{algo_name}_{env_name}_{time.strftime('%Y%m%d-%H%M%S')}"
            if USE_WANDB:
                if wandb.run is not None: wandb.finish()
                wandb.init(project=PROJECT_NAME, name=run_name, config=config, reinit=True)

            # --- Training Loop ---
            num_episodes = 2000 if env_name == "MountainCar-v0" else 600
            for i_episode in range(num_episodes):
                state, info = env.reset()
                state = torch.tensor(state, dtype=torch.float32, device=device).unsqueeze(0)
                ep_reward = 0
                loss = None

                for t in range(MAX_STEPS_PER_EPISODE):
                    action = select_action(state, env, policy_net, n_actions)
                    observation, reward, terminated, truncated, _ = env.step(action.item())
                    ep_reward += reward
                    # Explicit float32 keeps rewards in the same dtype as the
                    # network outputs even if the env returns a NumPy float.
                    reward = torch.tensor([reward], dtype=torch.float32, device=device)
                    done = terminated or truncated

                    # Only true termination gets a None next state; a
                    # truncated episode still stores its last observation.
                    if terminated:
                        next_state = None
                    else:
                        next_state = torch.tensor(observation, dtype=torch.float32, device=device).unsqueeze(0)

                    memory.push(state, action, next_state, reward)
                    state = next_state

                    loss = optimize_model(memory, policy_net, target_net, optimizer)

                    # Soft update of the target network's weights:
                    # target <- TAU * policy + (1 - TAU) * target
                    target_net_state_dict = target_net.state_dict()
                    policy_net_state_dict = policy_net.state_dict()
                    for key in policy_net_state_dict:
                        target_net_state_dict[key] = policy_net_state_dict[key]*config["TAU"] + target_net_state_dict[key]*(1-config["TAU"])
                    target_net.load_state_dict(target_net_state_dict)

                    if done:
                        break

                if USE_WANDB:
                    # Always log the episode reward; the loss is only
                    # available once the replay buffer holds a full batch.
                    metrics = {"reward": ep_reward}
                    if loss is not None:
                        metrics["loss"] = loss
                    wandb.log(metrics, step=i_episode)

                if i_episode % 50 == 0:
                    print(f"  Episode {i_episode}/{num_episodes} | Reward: {ep_reward}")

            print("--- Training Complete ---")
            torch.save(policy_net.state_dict(), model_path)
            print(f"Model saved to {model_path}")
            if USE_WANDB: wandb.finish()

        # --- Evaluation ---
        print(f"--- Evaluating {algo_name} on {env_name} for {NUM_EVAL_EPISODES} episodes (with video recording) ---")

        # Create a directory for videos if it doesn't exist
        video_folder = f"./videos/{algo_name}_{env_name}/"
        os.makedirs(video_folder, exist_ok=True)

        # Setup evaluation environment (with discretization if needed)
        eval_env = gym.make(env_name, render_mode="rgb_array")
        if is_continuous:
            eval_env = DiscretizeActionWrapper(eval_env, n_bins=N_ACTION_BINS)

        # Wrap with video recorder
        eval_env = gym.wrappers.RecordVideo(
            eval_env,
            video_folder,
            episode_trigger=lambda x: x % 25 == 0, # Record every 25 episodes
            name_prefix=f"{algo_name}-{env_name}"
        )

        try:
            total_eval_reward = 0
            for i in range(NUM_EVAL_EPISODES):
                state, _ = eval_env.reset()
                state = torch.tensor(state, dtype=torch.float32, device=device).unsqueeze(0)
                ep_reward = 0
                done = False
                while not done:
                    # Purely greedy policy: no exploration during evaluation.
                    with torch.no_grad():
                        action = policy_net(state).max(1)[1].view(1, 1)
                    obs, reward, terminated, truncated, _ = eval_env.step(action.item())
                    ep_reward += reward
                    done = terminated or truncated
                    if not done:
                        state = torch.tensor(obs, dtype=torch.float32, device=device).unsqueeze(0)
                total_eval_reward += ep_reward

            avg_reward = total_eval_reward / NUM_EVAL_EPISODES
            evaluation_results[algo_name][env_name] = avg_reward
            print(f"Average Reward over {NUM_EVAL_EPISODES} episodes: {avg_reward:.2f}")
            print(f"Videos saved in: {video_folder}\n")
        finally:
            # Release both environments even if evaluation fails part-way.
            eval_env.close()
            env.close()

# --- Final Summary ---
# Collect the whole report first, then emit it with a single print call.
summary_lines = ["\n\n--- Overall Evaluation Summary ---"]
for algo_name, results in evaluation_results.items():
    summary_lines.append(f"\n--- {algo_name} Results ---")
    for env_name, avg_reward in results.items():
        summary_lines.append(f"  Environment: {env_name} | Average Reward (100 eps): {avg_reward:.2f}")
summary_lines.append("------------------------------------")
print("\n".join(summary_lines))


Running Experiment with: DQN

--- Found pre-trained model for DQN on CartPole-v1 ---
Loading model from DQN_CartPole-v1_policy.pth and skipping training.

--- Evaluating DQN on CartPole-v1 for 100 episodes (with video recording) ---


  policy_net.load_state_dict(torch.load(model_path))
  logger.warn(


Average Reward over 100 episodes: 500.00
Videos saved in: ./videos/DQN_CartPole-v1/

--- Found pre-trained model for DQN on Acrobot-v1 ---
Loading model from DQN_Acrobot-v1_policy.pth and skipping training.

--- Evaluating DQN on Acrobot-v1 for 100 episodes (with video recording) ---


  logger.warn(
[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


Average Reward over 100 episodes: -79.13
Videos saved in: ./videos/DQN_Acrobot-v1/

--- Training DQN on MountainCar-v0 ---


0,1
loss,▁▁▁▁▁▁▂▂▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▂▁▁▁▁█▁▁▁▁▁▁▁▁▁▁▁
reward,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
loss,0.0
reward,-200.0


  Episode 0/2000 | Reward: -200.0
  Episode 50/2000 | Reward: -200.0
  Episode 100/2000 | Reward: -200.0
  Episode 150/2000 | Reward: -200.0
  Episode 200/2000 | Reward: -200.0
  Episode 250/2000 | Reward: -200.0
  Episode 300/2000 | Reward: -200.0
  Episode 350/2000 | Reward: -200.0
  Episode 400/2000 | Reward: -200.0
  Episode 450/2000 | Reward: -200.0
  Episode 500/2000 | Reward: -200.0
  Episode 550/2000 | Reward: -200.0
  Episode 600/2000 | Reward: -200.0
  Episode 650/2000 | Reward: -200.0
  Episode 700/2000 | Reward: -200.0
  Episode 750/2000 | Reward: -200.0
  Episode 800/2000 | Reward: -200.0
  Episode 850/2000 | Reward: -200.0
  Episode 900/2000 | Reward: -200.0
  Episode 950/2000 | Reward: -200.0
  Episode 1000/2000 | Reward: -200.0
  Episode 1050/2000 | Reward: -200.0
  Episode 1100/2000 | Reward: -200.0
  Episode 1150/2000 | Reward: -200.0
  Episode 1200/2000 | Reward: -200.0
  Episode 1250/2000 | Reward: -200.0
  Episode 1300/2000 | Reward: -200.0
  Episode 1350/2000 | Re

[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


--- Training Complete ---
Model saved to DQN_MountainCar-v0_policy.pth


0,1
loss,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁█
reward,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁█▁

0,1
loss,1.03905
reward,-200.0


--- Evaluating DQN on MountainCar-v0 for 100 episodes (with video recording) ---


  logger.warn(


Average Reward over 100 episodes: -182.38
Videos saved in: ./videos/DQN_MountainCar-v0/

--- Detected continuous action space for Pendulum-v1. Applying discretization wrapper. ---
--- Training DQN on Pendulum-v1 ---


  Episode 0/600 | Reward: -908.053137971162
  Episode 50/600 | Reward: -479.00549464177084
  Episode 100/600 | Reward: -119.08003088743959
  Episode 150/600 | Reward: -123.30691876914841
  Episode 200/600 | Reward: -354.61207961103304
  Episode 250/600 | Reward: -121.88609196495145
  Episode 300/600 | Reward: -126.1223899475885
  Episode 350/600 | Reward: -2.0555501838268295
  Episode 400/600 | Reward: -362.60375621226285
  Episode 450/600 | Reward: -238.10322862520079
  Episode 500/600 | Reward: -327.7842613121625
  Episode 550/600 | Reward: -123.76596700220102


[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


--- Training Complete ---
Model saved to DQN_Pendulum-v1_policy.pth


0,1
loss,▂▂▅██▂▃▂▁▂▄▂▂▁▁▂▂▄▂▁▂▁▂▁▂▂▂▁▁▁▁▂▂▁▂▁▁▁▃▅
reward,▄▁▂▇█▇▇▇▇██▇▇█▇█▇█▇██▇▆████▇███▇▇▇▇███▇█

0,1
loss,0.04737
reward,-239.05573


--- Evaluating DQN on Pendulum-v1 for 100 episodes (with video recording) ---


  logger.warn(


Average Reward over 100 episodes: -150.39
Videos saved in: ./videos/DQN_Pendulum-v1/


Running Experiment with: DDQN

--- Training DDQN on CartPole-v1 ---


  Episode 0/600 | Reward: 46.0
  Episode 50/600 | Reward: 46.0
  Episode 100/600 | Reward: 300.0
  Episode 150/600 | Reward: 500.0
  Episode 200/600 | Reward: 500.0
  Episode 250/600 | Reward: 500.0
  Episode 300/600 | Reward: 500.0
  Episode 350/600 | Reward: 174.0
  Episode 400/600 | Reward: 276.0
  Episode 450/600 | Reward: 500.0
  Episode 500/600 | Reward: 350.0
  Episode 550/600 | Reward: 500.0


[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


--- Training Complete ---
Model saved to DDQN_CartPole-v1_policy.pth


0,1
loss,▁▁▁▂▂▂▁▁▁▂▅▁▅▅▁▅▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁██▁▁▇▁
reward,▁▁▁▂▅▃▄▄▅▇███▇██████▇███▂██▂███▂█▇█▄████

0,1
loss,0.00127
reward,500.0


--- Evaluating DDQN on CartPole-v1 for 100 episodes (with video recording) ---


  logger.warn(


Average Reward over 100 episodes: 500.00
Videos saved in: ./videos/DDQN_CartPole-v1/

--- Training DDQN on Acrobot-v1 ---


  Episode 0/600 | Reward: -500.0
  Episode 50/600 | Reward: -123.0
  Episode 100/600 | Reward: -87.0
  Episode 150/600 | Reward: -95.0
  Episode 200/600 | Reward: -88.0
  Episode 250/600 | Reward: -120.0
  Episode 300/600 | Reward: -86.0
  Episode 350/600 | Reward: -94.0
  Episode 400/600 | Reward: -109.0
  Episode 450/600 | Reward: -95.0
  Episode 500/600 | Reward: -82.0
  Episode 550/600 | Reward: -69.0


[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


--- Training Complete ---
Model saved to DDQN_Acrobot-v1_policy.pth


0,1
loss,▁▁▁▄▁▁▁▁█▄▄▄▂▄▆▄▄▅▃▃▄▄▃▃▃▃▂▄▅▃▂▂▁▃▂▂▂▂▂▂
reward,▁▁▇▆▇▇▇▇▇█▇▇▇█▇█████▇▇██████████▇██████▇

0,1
loss,0.14734
reward,-69.0


--- Evaluating DDQN on Acrobot-v1 for 100 episodes (with video recording) ---


  logger.warn(


Average Reward over 100 episodes: -84.83
Videos saved in: ./videos/DDQN_Acrobot-v1/

--- Training DDQN on MountainCar-v0 ---


  Episode 0/2000 | Reward: -200.0
  Episode 50/2000 | Reward: -200.0
  Episode 100/2000 | Reward: -200.0
  Episode 150/2000 | Reward: -200.0
  Episode 200/2000 | Reward: -200.0
  Episode 250/2000 | Reward: -200.0
  Episode 300/2000 | Reward: -200.0
  Episode 350/2000 | Reward: -200.0
  Episode 400/2000 | Reward: -200.0
  Episode 450/2000 | Reward: -200.0
  Episode 500/2000 | Reward: -200.0
  Episode 550/2000 | Reward: -200.0
  Episode 600/2000 | Reward: -200.0
  Episode 650/2000 | Reward: -200.0
  Episode 700/2000 | Reward: -200.0
  Episode 750/2000 | Reward: -200.0
  Episode 800/2000 | Reward: -200.0
  Episode 850/2000 | Reward: -200.0
  Episode 900/2000 | Reward: -200.0
  Episode 950/2000 | Reward: -200.0
  Episode 1000/2000 | Reward: -200.0
  Episode 1050/2000 | Reward: -200.0
  Episode 1100/2000 | Reward: -200.0
  Episode 1150/2000 | Reward: -200.0
  Episode 1200/2000 | Reward: -200.0
  Episode 1250/2000 | Reward: -200.0
  Episode 1300/2000 | Reward: -200.0
  Episode 1350/2000 | Re

[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


--- Training Complete ---
Model saved to DDQN_MountainCar-v0_policy.pth


0,1
loss,▁▂▂▂▂▁▁▆▁▁▂▁▂▁▂▁▁▁▁▁▃▃█▃▁▂▁▁▁▁▁▂▃▄▁▁▁▁▁▁
reward,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
loss,0.0
reward,-200.0


--- Evaluating DDQN on MountainCar-v0 for 100 episodes (with video recording) ---


  logger.warn(


Average Reward over 100 episodes: -200.00
Videos saved in: ./videos/DDQN_MountainCar-v0/

--- Detected continuous action space for Pendulum-v1. Applying discretization wrapper. ---
--- Training DDQN on Pendulum-v1 ---


  Episode 0/600 | Reward: -716.2765916039237
  Episode 50/600 | Reward: -284.4596487894778
  Episode 100/600 | Reward: -130.44523797814017
  Episode 150/600 | Reward: -127.31560662259176
  Episode 200/600 | Reward: -130.4071880663065
  Episode 250/600 | Reward: -251.6848404228919
  Episode 300/600 | Reward: -355.3673396853813
  Episode 350/600 | Reward: -237.33689216975736
  Episode 400/600 | Reward: -131.5397487598941
  Episode 450/600 | Reward: -9.549901865240217
  Episode 500/600 | Reward: -130.31847454729876
  Episode 550/600 | Reward: -251.18768122262867


[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


--- Training Complete ---
Model saved to DDQN_Pendulum-v1_policy.pth


0,1
loss,▂███▄▁▁▂▁▁▂▁▁▂▂▁▂▁▃▁▁▂▁▁▂▁▁▁▁▁▂▁▂▁▃▁▁▁▁▁
reward,▁▂▄▃▇██▇▇█▇██▇▇▇▆▇▇▇▇█████▇▇█▇█▇▇███▇█▇▇

0,1
loss,0.10983
reward,-364.56569


--- Evaluating DDQN on Pendulum-v1 for 100 episodes (with video recording) ---


  logger.warn(


Average Reward over 100 episodes: -151.50
Videos saved in: ./videos/DDQN_Pendulum-v1/



--- Overall Evaluation Summary ---

--- DQN Results ---
  Environment: CartPole-v1 | Average Reward (100 eps): 500.00
  Environment: Acrobot-v1 | Average Reward (100 eps): -79.13
  Environment: MountainCar-v0 | Average Reward (100 eps): -182.38
  Environment: Pendulum-v1 | Average Reward (100 eps): -150.39

--- DDQN Results ---
  Environment: CartPole-v1 | Average Reward (100 eps): 500.00
  Environment: Acrobot-v1 | Average Reward (100 eps): -84.83
  Environment: MountainCar-v0 | Average Reward (100 eps): -200.00
  Environment: Pendulum-v1 | Average Reward (100 eps): -151.50
------------------------------------
