In [1]:
import torch

In [11]:
import os
import random
import time
import gymnasium as gym
from tqdm import tqdm
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter
from stable_baselines3.common.buffers import ReplayBuffer
import wandb
from huggingface_hub import HfApi, upload_folder


In [10]:

# ===== CONFIGURATION =====
class Config:
    """Central place for every tunable value used by this notebook.

    All settings are plain class attributes, so they can be read either as
    ``Config.name`` or through an instance (``Config().name``).
    """

    # --- Experiment identity ---
    exp_name = "DQN-CartPole"
    seed = 42
    env_id = "CartPole-v1"

    # --- Training hyperparameters ---
    total_timesteps = 10_000
    learning_rate = 0.00025
    buffer_size = 10_000
    gamma = 0.99
    tau = 1.0
    target_network_frequency = 500
    batch_size = 128
    start_e = 1.0
    end_e = 0.05
    exploration_fraction = 0.5
    learning_starts = 1_000
    train_frequency = 10

    # --- Artifacts: videos, checkpoints, Hugging Face upload ---
    capture_video = True
    save_model = True
    upload_model = True
    hf_entity = ""  # Your Hugging Face username

    # --- Weights & Biases ---
    use_wandb = True
    wandb_project = "cleanRL"
    wandb_entity = ""  # Your WandB username/team


In [7]:
class QNet(nn.Module):
    """Three-layer fully connected network producing one output per action.

    Args:
        state_space: Number of input features fed to the first linear layer.
        action_space: Number of outputs produced by the final linear layer.
    """

    def __init__(self, state_space, action_space):
        super().__init__()
        first_width, second_width = 256, 512
        # Layers are created in the same order as they are applied.
        self.fc1 = nn.Linear(state_space, first_width)
        self.fc2 = nn.Linear(first_width, second_width)
        self.q_value = nn.Linear(second_width, action_space)

    def forward(self, x):
        """Apply fc1 -> ReLU -> fc2 -> ReLU -> q_value and return the result."""
        hidden = torch.relu(self.fc1(x))
        hidden = torch.relu(self.fc2(hidden))
        return self.q_value(hidden)
    

In [8]:
class LinearEpsilonDecay(nn.Module):
    """Linearly anneal epsilon from ``initial_eps`` down to ``end_eps``.

    The decay happens over the first ``exploration_fraction * total_timesteps``
    steps; after that the schedule stays at ``end_eps``.

    Args:
        initial_eps: Epsilon returned at timestep 0.
        end_eps: Lower bound that epsilon is clamped to.
        total_timesteps: Total number of timesteps in the run.
        exploration_fraction: Fraction of ``total_timesteps`` over which the
            decay takes place. Defaults to 0.5.
    """

    def __init__(self, initial_eps, end_eps, total_timesteps, exploration_fraction=0.5):
        super(LinearEpsilonDecay, self).__init__()
        self.initial_eps = initial_eps
        self.total_timesteps = total_timesteps
        self.end_eps = end_eps
        # Stored on the instance: forward() previously read an undefined
        # bare name `exploration_fraction` and raised NameError.
        self.exploration_fraction = exploration_fraction

    def forward(self, x, current_timestep):
        """Return the epsilon value for ``current_timestep``.

        Args:
            x: Unused; kept so the call signature stays unchanged.
            current_timestep: Timestep at which to evaluate the schedule.

        Returns:
            The linearly interpolated epsilon, never below ``end_eps``.
        """
        decay_steps = self.exploration_fraction * self.total_timesteps
        if decay_steps <= 0:
            # No decay window: avoid dividing by zero and use the floor value.
            return self.end_eps
        slope = (self.end_eps - self.initial_eps) / decay_steps
        return max(slope * current_timestep + self.initial_eps, self.end_eps)


In [9]:

def make_env(env_id, seed, capture_video, run_name, eval_mode=False):
    """Create an environment with episode statistics and optional video capture.

    Args:
        env_id: Environment id passed to ``gym.make``.
        seed: Seed applied to the environment's action space.
        capture_video: When true, wrap the environment in ``RecordVideo``.
        run_name: Run identifier used to build the video output folder.
        eval_mode: Selects the ``eval`` video folder instead of ``train``.

    Returns:
        The wrapped environment.
    """
    env = gym.make(env_id, render_mode="rgb_array")
    env = gym.wrappers.RecordEpisodeStatistics(env)

    if capture_video:
        # Only the destination folder depends on the mode.
        video_prefix = f"videos/{run_name}/eval" if eval_mode else f"videos/{run_name}/train"
        # The wrapper used to be nested in the training branch, so evaluation
        # environments computed a folder but never recorded anything.
        # NOTE(review): with this trigger a fresh eval env records only its
        # first episode (episode 0) unless it runs 100+ episodes.
        env = gym.wrappers.RecordVideo(
            env,
            video_prefix,
            episode_trigger=lambda x: x % 100 == 0,  # Record every 100 episodes
        )

    env.action_space.seed(seed)
    return env

In [None]:
def evaluate(model, env, device, run_name, eps_decay, num_eval_eps = 10):
    """Roll out the model greedily and return the per-episode returns.

    A fresh evaluation environment is built with ``make_env`` (video capture
    requested, ``eval_mode=True``); each step takes the action with the
    largest model output.

    Args:
        model: Network mapping a batch of observations to per-action values.
        env: Currently unused; the function builds its own evaluation env.
        device: Torch device the model and observations are moved to.
        run_name: Run identifier forwarded to ``make_env``.
        eps_decay: Currently unused; evaluation is fully greedy.
        num_eval_eps: Number of evaluation episodes to run.

    Returns:
        A list with the summed reward of every evaluation episode.
    """
    # NOTE(review): the original cell stopped at a dangling `q_new =`
    # (a SyntaxError); the greedy rollout below is the assumed intent — confirm.
    eval_env = make_env(env_id=Config.env_id, seed=Config.seed, capture_video=True, run_name=run_name, eval_mode=True)
    eval_env.action_space.seed(Config.seed)

    model = model.to(device)
    model = model.eval()

    returns = []

    try:
        for _ in tqdm(range(num_eval_eps)):
            obs, _ = eval_env.reset()
            done = False
            rewards = 0.0

            while not done:
                state_space = torch.tensor(obs, dtype=torch.float32).to(device)
                # No gradients are needed while only selecting actions.
                with torch.no_grad():
                    q_val = model(state_space.unsqueeze(0))
                action = int(torch.argmax(q_val, dim=1).item())

                obs, reward, terminated, truncated, _ = eval_env.step(action)
                rewards += float(reward)
                # An episode ends on either termination or truncation.
                done = terminated or truncated

            returns.append(rewards)
    finally:
        # Close the environment even if a rollout fails.
        eval_env.close()

    return returns

In [12]:
# Build the run configuration and a unique, timestamped run name.
args = Config()
run_name = f"{args.env_id}__{args.exp_name}__{args.seed}__{int(time.time())}"

# Initialize WandB
if args.use_wandb:
    # Config keeps its settings as class attributes, so vars(args) on the
    # instance is an empty dict; collect the public class attributes instead
    # so the hyperparameters are actually logged with the run.
    run_config = {
        key: value
        for key, value in vars(Config).items()
        if not key.startswith("_") and not callable(value)
    }
    wandb.init(
        project=args.wandb_project,
        entity=args.wandb_entity,
        sync_tensorboard=True,
        config=run_config,
        name=run_name,
        monitor_gym=True,
        save_code=True,
    )

# Output folders for videos and TensorBoard event files.
os.makedirs(f"videos/{run_name}/train", exist_ok=True)
os.makedirs(f"videos/{run_name}/eval", exist_ok=True)
os.makedirs(f"runs/{run_name}", exist_ok=True)
writer = SummaryWriter(f"runs/{run_name}")

# Set seeds
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mrajceo2031[0m ([33mrentio[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [14]:
# Build the environment from the run configuration.
env = make_env(
    env_id=args.env_id,
    seed=args.seed,
    capture_video=args.capture_video,
    run_name=run_name,
)

In [None]:
q_net = Q