In [1]:
import gymnasium as gym
from gymnasium.wrappers import RecordVideo, RecordEpisodeStatistics
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
from collections import deque, namedtuple
import random
import wandb
import os
import itertools
from copy import deepcopy

GLOBAL_SEED = 100 # Use 42 for consistency, or any arbitrary fixed number
def set_seeds(seed_value):
    """Sets a fixed seed for reproducibility across all random components."""
    torch.manual_seed(seed_value)
    np.random.seed(seed_value)
    random.seed(seed_value)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed_value)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

set_seeds(GLOBAL_SEED)

In [2]:
# Define a Transition named tuple for Experience Replay
Transition = namedtuple('Transition', ('state', 'action', 'reward', 'next_state', 'done'))

class ReplayBuffer:
    """A fixed-size buffer to store experience tuples."""
    def __init__(self, capacity):
        self.memory = deque([], maxlen=capacity)

    def push(self, *args):
        """Save a transition."""
        self.memory.append(Transition(*args))

    def sample(self, batch_size):
        """Retrieve a random batch of transitions."""
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)


In [3]:
class DQN_Model(nn.Module):
    """
    Deep Q-Network Model: Takes state as input, outputs Q-values for actions.
    This model will be used for both the Online and Target Networks.
    """
    def __init__(self, state_size, action_size):
        super(DQN_Model, self).__init__()
        # Using a simple 3-layer fully connected network (suitable for CartPole/Acrobot)
        self.fc1 = nn.Linear(state_size, 128)
        self.fc2 = nn.Linear(128, 128)
        self.fc3 = nn.Linear(128, action_size)

    def forward(self, state):
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(x))
        return self.fc3(x)


In [4]:
class DQNAgent:
    """
    The main agent class that implements DQN and DDQN logic,
    including experience collection and the core training step.
    """
    def __init__(self, state_size, action_size, config):
        self.state_size = state_size
        self.action_size = action_size
        self.config = config
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Hyperparameters (from Wandb config)
        self.GAMMA = config.gamma           # Discount Factor
        self.LR = config.learning_rate      # NN Learning Rate
        self.EPSILON_START = config.epsilon_start
        self.EPSILON_END = config.epsilon_end
        self.EPSILON_DECAY = config.epsilon_decay # Epsilon Decay Rate
        self.BATCH_SIZE = config.batch_size # Learning Batch Size
        self.DDQN = config.model == 'DDQN'  # Flag to switch between DQN/DDQN
        self.TARGET_UPDATE = config.target_update_freq
        self.GLOBAL_SEED = config.seed
        self.step_count = 0

        # Initialize Networks
        self.policy_net = DQN_Model(state_size, action_size).to(self.device)
        self.target_net = DQN_Model(state_size, action_size).to(self.device)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval() # Target network is not trained

        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=self.LR)
        self.memory = ReplayBuffer(config.memory_size)
        self.loss_fn = nn.MSELoss() # Or nn.SmoothL1Loss (Huber Loss)

    def select_action(self, state, evaluate=False):
        """Selects an action using the epsilon-greedy policy."""
        # Calculate current epsilon based on decay rate
        epsilon = self.EPSILON_END + (self.EPSILON_START - self.EPSILON_END) * \
                  np.exp(-self.step_count * self.EPSILON_DECAY)

        # In evaluation, always exploit (greedy)
        if evaluate or random.random() > epsilon:
            with torch.no_grad():
                state_tensor = torch.tensor(state, dtype=torch.float32, device=self.device).unsqueeze(0)
                # Select action with max Q-value
                action = self.policy_net(state_tensor).argmax(1).item()
        else:
            # Explore: select a random action
            action = random.randrange(self.action_size)

        return action, epsilon

    def update_target_net(self):
        """Update the target network by copying weights from the policy network."""
        self.target_net.load_state_dict(self.policy_net.state_dict())

    def optimize_model(self):
        """Performs a single step of optimization on the Policy Network."""
        if len(self.memory) < self.BATCH_SIZE:
            return 0.0 # Not enough samples yet

        transitions = self.memory.sample(self.BATCH_SIZE)
        # Transpose the batch (Turn an array of Transitions into a Transition of arrays)
        batch = Transition(*zip(*transitions))

        # Convert state/action/reward arrays to PyTorch tensors
        state_batch = torch.stack(list(batch.state)).to(self.device)
        action_batch = torch.tensor(batch.action, dtype=torch.int64).unsqueeze(1).to(self.device)
        reward_batch = torch.tensor(batch.reward, dtype=torch.float32).unsqueeze(1).to(self.device)
        next_state_batch = torch.stack(list(batch.next_state)).to(self.device)
        done_batch = torch.tensor(batch.done, dtype=torch.float32).unsqueeze(1).to(self.device)

        # 1. Compute Q(s_t, a) - the Q-value from the policy network for the action taken
        state_action_values = self.policy_net(state_batch).gather(1, action_batch)

        # 2. Compute V(s_{t+1}) = max_a Q(s_{t+1}, a) - the target value

        # CORE DIFFERENCE between DQN and DDQN lies here:
        if self.DDQN:
            # DDQN: Select action a' using the ONLINE network, then evaluate a' using the TARGET network.
            # a'_max = argmax_a Q_online(s', a)
            # V(s') = Q_target(s', a'_max)
            with torch.no_grad():
                # Get the action a' that maximizes Q in the next state, from the POLICY (online) net
                next_state_actions = self.policy_net(next_state_batch).argmax(1).unsqueeze(1)

            # Compute the Q-value for that selected action a' from the TARGET net
            next_state_values = self.target_net(next_state_batch).gather(1, next_state_actions)
        else:
            # DQN: Select and evaluate the best next action using only the TARGET network.
            # V(s') = max_a Q_target(s', a)
            with torch.no_grad():
                # Compute the max Q-value for the next state from the TARGET net
                next_state_values = self.target_net(next_state_batch).max(1)[0].unsqueeze(1)

        # Handle terminal states: max Q is 0 if the episode is done
        next_state_values = next_state_values * (1 - done_batch)

        # Compute the Target Q-Value: Y_t = r_t + gamma * V(s_{t+1})
        expected_state_action_values = reward_batch + (self.GAMMA * next_state_values)

        # Compute Loss
        loss = self.loss_fn(state_action_values, expected_state_action_values)

        # Optimize the model
        self.optimizer.zero_grad()
        loss.backward()
        # Clip gradients to prevent large updates
        torch.nn.utils.clip_grad_value_(self.policy_net.parameters(), 1.0)
        self.optimizer.step()

        # Target network update
        if self.step_count % self.TARGET_UPDATE == 0:
            self.update_target_net()

        return loss.item()


In [5]:
def train_agent(env, agent, num_episodes, env_name):
    """The main training loop."""
    print(f"\n--- Starting Training for {agent.config.model} on {env_name} ---")

    for i_episode in range(1, num_episodes + 1):
        seed_arg = agent.GLOBAL_SEED if i_episode == 1 else None
        state, info = env.reset(seed=seed_arg)
        state = torch.tensor(state, dtype=torch.float32, device=agent.device).unsqueeze(0)
        episode_reward = 0
        loss = 0

        for t in itertools.count():
            # Select action
            action, epsilon = agent.select_action(state.squeeze(0).cpu().numpy())

            # Execute action
            next_state_np, reward, terminated, truncated, info = env.step(action)
            done = terminated or truncated

            # Convert to tensors
            next_state = torch.tensor(next_state_np, dtype=torch.float32, device=agent.device).unsqueeze(0)
            reward = torch.tensor([reward], dtype=torch.float32)

            # Store the transition in the Replay Buffer
            agent.memory.push(state.squeeze(0), action, reward, next_state.squeeze(0), done)

            # Move to the next state
            state = next_state
            episode_reward += reward.item()
            agent.step_count += 1

            # Perform one optimization step
            current_loss = agent.optimize_model()
            loss += current_loss

            if done:
                break

        # Log episode results to Wandb
        wandb.log({
            "episode": i_episode,
            "episode_reward": episode_reward,
            "avg_step_loss": loss / t if t > 0 else 0,
            "epsilon": epsilon,
            "episode_length": t,
        })

        if i_episode % 100 == 0:
            print(f"Episode: {i_episode}/{num_episodes} | Reward: {episode_reward:.2f} | Epsilon: {epsilon:.4f}")

    print(f"--- Training finished for {agent.config.model} ---")



In [6]:
def evaluate_agent(env_name, agent, num_tests=100, record_video=False):
    """Evaluates the trained agent for a number of episodes and logs duration."""
    print(f"\n--- Starting Evaluation for {agent.config.model} on {env_name} ({num_tests} tests) ---")

    # Determine render mode
    render_mode = "rgb_array" if record_video else None

    # Create evaluation environment (must use render_mode='rgb_array' for video)
    eval_env = gym.make(env_name, render_mode=render_mode)
    # Wrap for collecting episode statistics
    # The 'deque_size' argument is deprecated/removed in newer Gymnasium versions.
    eval_env = RecordEpisodeStatistics(eval_env)

    # Wrap for video recording (if requested)
    if record_video:
        video_folder = f"./videos/{env_name}_{agent.config.model}"
        # Only record the first test episode
        eval_env = RecordVideo(
            eval_env,
            video_folder=video_folder,
            episode_trigger=lambda x: x == 0,
            name_prefix=f"best_agent"
        )
        print(f"Recording the first episode to: {video_folder}")

    # Run tests
    test_durations = []
    test_rewards = []

    for i in range(num_tests):
        state, info = eval_env.reset()
        done = False

        while not done:
            action, _ = agent.select_action(state, evaluate=True)
            state, reward, terminated, truncated, info = eval_env.step(action)
            done = terminated or truncated

            # RecordEpisodeStatistics wrapper adds episode stats to info on done=True
            if done:
                if 'episode' in info:
                    duration = info['episode']['l']
                    reward = info['episode']['r']

                    # Ensure duration and reward are floats
                    if isinstance(duration, int):
                        duration = float(duration)
                    if isinstance(reward, int):
                        reward = float(reward)

                    test_durations.append(duration)
                    test_rewards.append(reward)

                    # Log individual test result
                    wandb.log({
                        f"{env_name}/Test_Episode_Duration": duration,
                        f"{env_name}/Test_Episode_Reward": reward,
                        "test_episode_index": i
                    })
                break

    eval_env.close()

    if test_durations:
        avg_duration = np.mean(test_durations)
        std_duration = np.std(test_durations)
        avg_reward = np.mean(test_rewards)

        wandb.log({
            f"{env_name}/Avg_Test_Duration": avg_duration,
            f"{env_name}/Std_Test_Duration": std_duration,
            f"{env_name}/Avg_Test_Reward": avg_reward,
        })

        print(f"Evaluation complete. Avg Duration: {avg_duration:.2f} ± {std_duration:.2f} steps.")
        print(f"Avg Reward: {avg_reward:.2f}")

    return avg_reward, avg_duration

In [7]:
def main_run(config):
    """Initializes Wandb, environment, agent, trains, and evaluates."""
    # 1. Initialize Wandb Run
    run = wandb.init(
        project="CartPole-v1-problem-seed-right-value-v8",
        name=f"{config['model']}_DF{config['gamma']}_EDR{config['epsilon_decay']}_NNLR{config['learning_rate']}_MEM{config['memory_size']}_BS{config['batch_size']}",
        config=config
    )

    # 2. Setup Environment
    # Note: Pendulum-v1 and MountainCar-v0 have Continuous Action Spaces.
    # We discretize them for Q-Learning compatibility.

    env_name = config['env_name']

    if env_name in ["Pendulum-v1", "MountainCar-v0"]:
        # Discretize continuous environments for Q-Learning
        if env_name == "Pendulum-v1":
             # Actions: 5 discrete actions: max_torque * [-2.0, -1.0, 0.0, 1.0, 2.0]
             env = gym.make(env_name, max_episode_steps=200) # Default
             class ContinuousActionWrapper(gym.ActionWrapper):
                 def __init__(self, env):
                     super().__init__(env)
                     self.action_range = [-2.0, -1.0, 0.0, 1.0, 2.0]
                     self.action_space = gym.spaces.Discrete(len(self.action_range))
                 def action(self, action_idx):
                     # Map the discrete index to the continuous action value
                     return np.array([self.action_range[action_idx]], dtype=np.float32)
             env = ContinuousActionWrapper(env)

        elif env_name == "MountainCar-v0":
            # Actions: 3 discrete actions: 0:push_left, 1:no_push, 2:push_right
            env = gym.make(env_name, max_episode_steps=200) # Default

    elif env_name in ["CartPole-v1", "Acrobot-v1"]:
        env = gym.make(env_name)
    else:
        raise ValueError(f"Environment {env_name} not supported by this script.")

    # Get environment specs
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n

    # 3. Create Agent
    agent = DQNAgent(state_size, action_size, wandb.config)

    # 4. Train
    #Set episodes based on difficulty (you can adjust these)
    if env_name == "CartPole-v1":
        num_episodes = 500
    elif env_name == "Acrobot-v1":
        num_episodes = 1000
    elif env_name == "MountainCar-v0":
        num_episodes = 2000
    elif env_name == "Pendulum-v1":
        num_episodes = 1000

    train_agent(env, agent, 200, env_name)
    env.close()

    # 5. Evaluate (100 tests)
    # evaluate_agent(env_name, agent, num_tests=100, record_video=False)

    # 6. Record the Best Agent (Separate run for the final deliverable)
    # The record_video=True will only record the FIRST episode of the 100 tests.
    # Run this function separately after identifying your best hyperparameter setup.
    evaluate_agent(env_name, agent, num_tests=100, record_video=True)

    run.finish()

In [None]:
BASELINE_CONFIG = {
    "env_name": "CartPole-v1",       # Environment to run (change this for other envs)
    "model": "DQN",                 # Can be 'DQN' or 'DDQN'
    "num_episodes": 500,            # Training episodes (overwritten in main_run)
    "gamma": 0.99,                  # Discount Factor
    "learning_rate": 2e-4,          # NN Learning Rate
    "epsilon_start": 1.0,           # Start exploration rate
    "epsilon_end": 0.01,            # Minimum exploration rate
    "epsilon_decay": 0.001,         # Epsilon Decay Rate (adjust to control exploration speed)
    "memory_size": 50000,           # Replay Memory Size
    "batch_size": 64,               # Learning Batch Size
    "target_update_freq": 200,      # Target network update frequency (in steps)
    "seed": 100,
}

MOUNTAINCAR_BASELINE_CONFIG = {
        "env_name": "MountainCar-v0",
        "model": "DQN",
        "num_episodes": 2000,          
        "gamma": 0.999,                
        "learning_rate": 2e-4,         
        "epsilon_start": 1.0,
        "epsilon_end": 0.01,
        "epsilon_decay": 0.0005,        
        "memory_size": 50000,
        "batch_size": 32,
        "target_update_freq": 200,
        "seed": 100,
    }


In [9]:
if __name__ == '__main__':
    # Ensure videos folder exists
    os.makedirs("./videos", exist_ok=True)

    # -------------------------------------------------------------------------
    # STEP 1: INITIAL RUN (DQN, CartPole-v1)
    # This will establish a baseline and verify your code works.
    # -------------------------------------------------------------------------
    print("Executing Baseline Run: DQN on CartPole-v1. Check your Wandb dashboard.")
    config_baseline = deepcopy(BASELINE_CONFIG)
    config_baseline['env_name'] = 'CartPole-v1'
    config_baseline['model'] = 'DQN'
    main_run(config_baseline)

    # print("Executing Baseline Run: DQN on CartPole-v1. Check your Wandb dashboard.")
    # config_baseline = deepcopy(BASELINE_CONFIG)
    # config_baseline['env_name'] = 'CartPole-v1'
    # config_baseline['model'] = 'DDQN'
    # main_run(config_baseline)
    #----------------------------------cartpole-----------------
    # # R1: DQN Baseline (Control Group)
    # config_r1 = deepcopy(BASELINE_CONFIG)
    # config_r1['name'] = 'R1_DQN_BASELINE'

    # # R2: DDQN Baseline (Model Comparison)
    # config_r2 = deepcopy(BASELINE_CONFIG)
    # config_r2['model'] = 'DDQN'
    # config_r2['name'] = 'R2_DDQN_COMPARISON'
    
    # # --- DISCOUNT FACTOR (GAMMA) ---
    # # R3: Gamma Too Low (Myopic Agent)
    # config_r3 = deepcopy(BASELINE_CONFIG)
    # config_r3['gamma'] = 0.80
    # config_r3['name'] = 'R3_GAMMA_LOW_0.80'
    
    # # R4: Gamma Too High (Farsighted/Unstable Agent)
    # config_r4 = deepcopy(BASELINE_CONFIG)
    # config_r4['gamma'] = 0.999
    # config_r4['name'] = 'R4_GAMMA_HIGH_0.999'

    # # --- NN LEARNING RATE (LR) ---
    # # R5: LR Too High (Divergence/Oscillation)
    # config_r5 = deepcopy(BASELINE_CONFIG)
    # config_r5['learning_rate'] = 0.01  # 20x higher than baseline
    # config_r5['name'] = 'R5_LR_HIGH_0.01'

    # # R6: LR Too Low (Slow Convergence)
    # config_r6 = deepcopy(BASELINE_CONFIG)
    # config_r6['learning_rate'] = 1e-5
    # config_r6['name'] = 'R6_LR_LOW_1e-5'

    # # --- EPSILON DECAY RATE (ALPHA) ---
    # # R7: Decay Too Slow (Persistent Exploration) - 10x slower
    # config_r7 = deepcopy(BASELINE_CONFIG)
    # config_r7['epsilon_decay'] = 0.0005
    # config_r7['name'] = 'R7_DECAY_SLOW_0.0005'
    
    # # R8: Decay Too Fast (Premature Exploitation) - 10x faster
    # config_r8 = deepcopy(BASELINE_CONFIG)
    # config_r8['epsilon_decay'] = 0.05
    # config_r8['name'] = 'R8_DECAY_FAST_0.05'

    # # --- MEMORY SIZE & BATCH SIZE ---
    # # R9: Small Replay Memory (High Correlation)
    # config_r9 = deepcopy(BASELINE_CONFIG)
    # config_r9['memory_size'] = 5000
    # config_r9['name'] = 'R9_MEM_SMALL_5k'

    # # R10: Small Batch Size (Noisy Gradients)
    # config_r10 = deepcopy(BASELINE_CONFIG)
    # config_r10['batch_size'] = 16
    # config_r10['name'] = 'R10_BATCH_SMALL_16'


    # # --- EXECUTION LOOP ---
    # experiment_configs = [
    #     config_r1, config_r2, config_r3, config_r4, config_r5, 
    #     config_r6, config_r7, config_r8, config_r9, config_r10
    # ]

    # for i, config in enumerate(experiment_configs):
    #     print(f"\n========================================================")
    #     print(f"Starting Experiment {i+1}/{len(experiment_configs)}: {config['name']}")
    #     print(f"========================================================")
        
    #     # Log the specific config name to Wandb for easy identification
    #     config_to_run = deepcopy(config)
        
    #     # NOTE: If you are running this in a notebook, you may need to restart
    #     # the kernel between runs to ensure Wandb is initialized correctly.
        
    #     # Run the experiment
    #     main_run(config_to_run)
        
    # print("\n\nALL 10 CARTPOLE EXPERIMENTS COMPLETE. CHECK WANDB FOR RESULTS.")
    
    # # R1: DQN Baseline (Control Group - Optimized for Sparse Reward)
    # config_r1 = deepcopy(MOUNTAINCAR_BASELINE_CONFIG)
    # config_r1['name'] = 'MC_R1_DQN_BASELINE_OPTM'

    # # R2: DDQN Baseline (Model Comparison)
    # config_r2 = deepcopy(MOUNTAINCAR_BASELINE_CONFIG)
    # config_r2['model'] = 'DDQN'
    # config_r2['name'] = 'MC_R2_DDQN_COMPARISON'
    
    # # --- DISCOUNT FACTOR (GAMMA) ---
    # # R3: Gamma Too Low (Myopic Agent - Expected to Fail)
    # config_r3 = deepcopy(MOUNTAINCAR_BASELINE_CONFIG)
    # config_r3['gamma'] = 0.95  # Significantly lower than 0.999
    # config_r3['name'] = 'MC_R3_GAMMA_LOW_0.95'
    
    # # R4: Gamma Very High (Testing Edge Case)
    # config_r4 = deepcopy(MOUNTAINCAR_BASELINE_CONFIG)
    # config_r4['gamma'] = 1.0 # Perfect Discount
    # config_r4['name'] = 'MC_R4_GAMMA_PERFECT_1.0'

    # # --- NN LEARNING RATE (LR) ---
    # # R5: LR Too High (Divergence/Oscillation)
    # config_r5 = deepcopy(MOUNTAINCAR_BASELINE_CONFIG)
    # config_r5['learning_rate'] = 0.01
    # config_r5['name'] = 'MC_R5_LR_HIGH_0.01'

    # # R6: LR Too Low (Slow Convergence)
    # config_r6 = deepcopy(MOUNTAINCAR_BASELINE_CONFIG)
    # config_r6['learning_rate'] = 1e-5
    # config_r6['name'] = 'MC_R6_LR_LOW_1e-5'

    # # --- EPSILON DECAY RATE (ALPHA) ---
    # # R7: Decay Too Slow (Persistent Exploration - Baseline is already slow, make it ultra-slow)
    # config_r7 = deepcopy(MOUNTAINCAR_BASELINE_CONFIG)
    # config_r7['epsilon_decay'] = 0.0001
    # config_r7['name'] = 'MC_R7_DECAY_ULTRA_SLOW_0.0001'
    
    # # R8: Decay Too Fast (Premature Exploitation - Expected to Fail)
    # config_r8 = deepcopy(MOUNTAINCAR_BASELINE_CONFIG)
    # config_r8['epsilon_decay'] = 0.01 # 10x faster than baseline
    # config_r8['name'] = 'MC_R8_DECAY_FAST_0.01'

    # # --- MEMORY SIZE & BATCH SIZE ---
    # # R9: Small Replay Memory (High Correlation)
    # config_r9 = deepcopy(MOUNTAINCAR_BASELINE_CONFIG)
    # config_r9['memory_size'] = 5000
    # config_r9['name'] = 'MC_R9_MEM_SMALL_5k'

    # # R10: Large Batch Size (Smoother Gradients, but may hinder exploration on sparse rewards)
    # config_r10 = deepcopy(MOUNTAINCAR_BASELINE_CONFIG)
    # config_r10['batch_size'] = 128
    # config_r10['name'] = 'MC_R10_BATCH_LARGE_128'


    # # --- EXECUTION LOOP ---
    # experiment_configs = [
    #     config_r1, config_r2, config_r3, config_r4, config_r5, 
    #     config_r6, config_r7, config_r8, config_r9, config_r10
    # ]

    # for i, config in enumerate(experiment_configs):
    #     print(f"\n========================================================")
    #     print(f"Starting Experiment {i+1}/{len(experiment_configs)}: {config['name']}")
    #     print(f"========================================================")
        
    #     # Log the specific config name to Wandb for easy identification
    #     config_to_run = deepcopy(config)
        
    #     # NOTE: If you are running this in a notebook, you may need to restart
    #     # the kernel between runs to ensure Wandb is initialized correctly.
        
    #     # Run the experiment
    #     main_run(config_to_run)
        
    # print("\n\nALL 10 MOUNTAINCAR EXPERIMENTS COMPLETE. CHECK WANDB FOR RESULTS.")
    
    # -------------------------------------------------------------------------
    # STEP 2: HYPERPARAMETER SEARCH AND DDQN COMPARISON (Step 8)
    # Uncomment and modify these blocks to run your full experiment matrix.
    # Use different parameter values for each run to test their effect!
    # -------------------------------------------------------------------------

    # # Example DDQN run for comparison
    # config_ddqn = deepcopy(BASELINE_CONFIG)
    # config_ddqn['model'] = 'DDQN'
    # # main_run(config_ddqn)

    # # Example DQN run with high Learning Rate (to test LR effect)
    # config_lr_high = deepcopy(BASELINE_CONFIG)
    # config_lr_high['learning_rate'] = 0.01
    # # main_run(config_lr_high)

    # # Example DDQN run on Acrobot-v1
    # config_acrobot = deepcopy(BASELINE_CONFIG)
    # config_acrobot['env_name'] = 'Acrobot-v1'
    # config_acrobot['model'] = 'DDQN'
    # config_acrobot['g
    # 
    # 
    # 
    # 995 # Acrobot needs a higher gamma
    # # main_run(config_acrobot)

    # # Example DDQN run on MountainCar-v0
    # config_mountaincar = deepcopy(BASELINE_CONFIG)
    # config_mountaincar['env_name'] = 'MountainCar-v0'
    # config_mountaincar['model'] = 'DDQN'
    # config_mountaincar['gamma'] = 1.0 # Requires high/perfect discount due to sparse reward
    # config_mountaincar['epsilon_decay'] = 0.001 # Slower exploration decay
    # # main_run(config_mountaincar)

    # -------------------------------------------------------------------------
    # STEP 3: FINAL VIDEO RECORDING (Step 9)
    # Once you find your best performing model and hyperparameters, run the
    # evaluation *only* to generate the video, then stop this script.
    # Replace the configuration below with your best setup.
    # -------------------------------------------------------------------------

    # IMPORTANT: The video generation function is not run by default.
    # To record your final video, manually run the following after finding
    # your best agent/config:

    # BEST_CONFIG = deepcopy(config_acrobot) # Example: use your best Acrobot config
    # FINAL_ENV = 'Acrobot-v1'
    #
    # final_env = gym.make(FINAL_ENV)
    # if FINAL_ENV == 'Pendulum-v1': # Re-apply wrapper if needed
    #      class ContinuousActionWrapper(gym.ActionWrapper):
    #          def __init__(self, env):
    #              super().__init__(env)
    #              self.action_range = [-2.0, -1.0, 0.0, 1.0, 2.0]
    #              self.action_space = gym.spaces.Discrete(len(self.action_range))
    #          def action(self, action_idx):
    #              return np.array([self.action_range[action_idx]], dtype=np.float32)
    #      final_env = ContinuousActionWrapper(final_env)
    #
    # final_agent = DQNAgent(final_env.observation_space.shape[0], final_env.action_space.n, BEST_CONFIG)
    # # NOTE: You would typically load the trained weights here,
    # # but for this self-contained script, you might re-run the training
    # # and then immediately call the evaluation with record_video=True
    #
    # # Temporarily re-run the best training once more
    # print(f"Recording final video for {FINAL_ENV} with best DDQN agent...")
    # # train_agent(final_env, final_agent, 1000, FINAL_ENV) # If you need to re-train
    # # evaluate_agent(FINAL_ENV, final_agent, num_tests=1, record_video=True)
    # final_env.close()

Executing Baseline Run: DQN on CartPole-v1. Check your Wandb dashboard.


wandb: Currently logged in as: amira-elgarf02 (amira-elgarf02-cairo-university) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin



--- Starting Training for DQN on CartPole-v1 ---
Episode: 100/200 | Reward: 317.00 | Epsilon: 0.0100
Episode: 200/200 | Reward: 358.00 | Epsilon: 0.0100
--- Training finished for DQN ---

--- Starting Evaluation for DQN on CartPole-v1 (100 tests) ---


  logger.warn(


Recording the first episode to: ./videos/CartPole-v1_DQN


  from pkg_resources import resource_stream, resource_exists


Evaluation complete. Avg Duration: 352.59 ± 75.48 steps.
Avg Reward: 352.59


0,1
CartPole-v1/Avg_Test_Duration,▁
CartPole-v1/Avg_Test_Reward,▁
CartPole-v1/Std_Test_Duration,▁
CartPole-v1/Test_Episode_Duration,▃▇▂█▂▂▇▃▄▂█▂▄▄▄▅▄▁▁▅▃▃▃█▂▅▄▃▆▂█▃▄▆▁▃▄▇▂▃
CartPole-v1/Test_Episode_Reward,▂▃▅▇▁▆▂▇▃▃▅█▅▄▄▄▄▅▅▄▃▂▃█▄█▄▃▆▂▆▂▃▃▁▃█▇█▃
avg_step_loss,▂▂▁▁▁▂▁▁▁▁▂▁▂▂▂▂▂▃▃▃▃▄▄▅▅▆▅▅▆▅▇█▆▇▆▆▆▅▅▇
episode,▁▁▂▂▂▂▂▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▅▆▆▆▇▇▇▇▇▇▇▇██████
episode_length,▂▁▁▁▁▁▁▁▁▁▁▁▂▅▇▇▆▅▇▇▅▆▅▅▇▆▆▅▅▆▆▇▆▇▆▅▇▆▆█
episode_reward,▁▁▁▁▁▁▁▁▁▁▁▁▂▁▂▅▆▄▅▅▄▇▆▆▆▆▄▄▄▅▅▅▄▄▆▄▅▅█▆
epsilon,█▇▇▆▆▅▅▅▄▄▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
CartPole-v1/Avg_Test_Duration,352.59
CartPole-v1/Avg_Test_Reward,352.59
CartPole-v1/Std_Test_Duration,75.48021
CartPole-v1/Test_Episode_Duration,315
CartPole-v1/Test_Episode_Reward,315
avg_step_loss,3.83539
episode,200
episode_length,357
episode_reward,358
epsilon,0.01
