In [1]:
import random
from collections import deque
import torch.nn.functional as F
import numpy as np
import time
import gymnasium as gym
import torch

import sys
from pathlib import Path
project_root = Path().resolve().parent  
sys.path.append(str(project_root))
from src.agents.sac import SAC

In [2]:
def train_agent(agent, env, num_episodes=1000, max_step = 500, tau=0.005, policy_noise=0.2, noise_clip=0.5):
    rewards = []
    alosses = []
    c1_losses = []
    c2_losses = []
    steps_taken = []
    time_elapsed = []
    successes = []

    for episode in range(num_episodes):
        state, _ = env.reset()
        episode_reward = []
        step = 0
        episode_aloss = 0
        episode_c1loss = 0
        episode_c2loss = 0
        num_sucess = 0
        
        start = time.time()
        while step <= max_step:
            
            action = agent.get_action(state)

            if isinstance(action, torch.Tensor):
                action = action.detach().cpu().numpy()
            action = np.asarray(action, dtype=np.float32).squeeze()

            next_state, reward, done, truncated, info = env.step(action)

            # Store the transition in the replay buffer
            done_flag = done or truncated
            agent.memory.add([state, action, float(reward), next_state, done_flag])

            aloss, c1loss, c2loss = agent.update(soft = True)
            
            # Track the loss for this step
            if aloss is not None:
                episode_aloss += aloss

            if  c1loss is not None and c2loss is not None:
                episode_c1loss += c1loss
                episode_c2loss += c2loss

            # Update the current state
            state = next_state

            episode_reward.append(reward)
        
            step += 1

            if sum(episode_reward) > 200:
                num_sucess = 1
                break

            if done_flag:
                break

        end = time.time()
        
        
        rewards.append(sum(episode_reward))
        alosses.append(episode_aloss / step)
        c1_losses.append(episode_c1loss / step)
        c2_losses.append(episode_c2loss / step)
        steps_taken.append(step)
        time_elapsed.append((end - start) / 60)
        successes.append(num_sucess)
       
        if (episode % 500 == 0 and episode != 0) or (episode == num_episodes - 1):
            recent_alosses = [x for x in alosses[-500:] if x is not None]
            if recent_alosses:
                avg_aloss = np.mean(recent_alosses)
            else:
                avg_aloss = np.nan
            avg_reward = np.mean(rewards[-500:])
            avg_steps = np.mean(steps_taken[-500:])
            avg_c1loss = np.mean(c1_losses[-500:])
            avg_c2loss = np.mean(c2_losses[-500:])
            num_success = np.sum(successes[-500:])
            print(f"Episode {episode} | Avg Reward (last 500): {avg_reward:.2f} | "
                  f"This Reward: {sum(episode_reward):.2f} | Avg Actor Loss: {avg_aloss:.4f} | "
                  f"Avg Critic 1 Loss: {avg_c1loss:.4f} | Avg Critic 2 Loss: {avg_c2loss:.4f} | "
                  f"Avg Steps: {avg_steps:.0f} | "
                  f"Successes: {num_success} | Time: {time_elapsed[-1]:.2f} min")

    return rewards, alosses, c1_losses, c2_losses, steps_taken, time_elapsed

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Training on device:", device)

env = gym.make("LunarLander-v3", continuous=True, gravity=-10.0, enable_wind=False, wind_power=15.0, turbulence_power=1.5)
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]

print(env.observation_space)
print(env.action_space)

agent = SAC(state_dim, action_dim, 256, actor_lr=3e-4, 
                 critic_lr=3e-4, alpha_lr=3e-4, gamma=0.99, buffer_size=10000, batch_size=128, 
                 actor_update_freq=2, tau=0.005, max_action=1.0, lr_schedule=None, normalize_obs=False, device=device)

Training on device: cuda
Box([ -2.5        -2.5       -10.        -10.         -6.2831855 -10.
  -0.         -0.       ], [ 2.5        2.5       10.        10.         6.2831855 10.
  1.         1.       ], (8,), float32)
Box(-1.0, 1.0, (2,), float32)


In [4]:
import subprocess
from contextlib import contextmanager

@contextmanager
def inhibit_sleep(reason="ML training"):
    proc = subprocess.Popen([
        "systemd-inhibit",
        "--what=idle",
        f"--why={reason}",
        "sleep", "infinity"
    ])
    try:
        yield
    finally:
        proc.terminate()
        proc.wait()

with inhibit_sleep("ML training"):
    rewards, alosses, c1_losses, c2_losses, steps_taken, time_elapsed = train_agent(agent, env, num_episodes=10000, max_step = 500, tau=0.01, policy_noise=0.2, noise_clip=0.5)


Episode 500 | Avg Reward (last 500): 32.12 | This Reward: 216.70 | Avg Actor Loss: -2.7279 | Avg Critic 1 Loss: 6.4180 | Avg Critic 2 Loss: 6.4164 | Avg Steps: 475 | Successes: 41 | Time: 0.02 min
Episode 1000 | Avg Reward (last 500): 197.71 | This Reward: 256.57 | Avg Actor Loss: -13.9880 | Avg Critic 1 Loss: 13.7858 | Avg Critic 2 Loss: 13.7785 | Avg Steps: 392 | Successes: 342 | Time: 0.01 min
Episode 1500 | Avg Reward (last 500): 229.53 | This Reward: 282.98 | Avg Actor Loss: -26.7104 | Avg Critic 1 Loss: 16.7708 | Avg Critic 2 Loss: 16.8072 | Avg Steps: 272 | Successes: 414 | Time: 0.01 min
Episode 2000 | Avg Reward (last 500): 254.90 | This Reward: 278.53 | Avg Actor Loss: -39.6248 | Avg Critic 1 Loss: 12.1948 | Avg Critic 2 Loss: 12.1865 | Avg Steps: 203 | Successes: 477 | Time: 0.01 min
Episode 2500 | Avg Reward (last 500): 253.13 | This Reward: 273.02 | Avg Actor Loss: -40.3293 | Avg Critic 1 Loss: 12.5298 | Avg Critic 2 Loss: 12.5378 | Avg Steps: 205 | Successes: 472 | Time: 

KeyboardInterrupt: 