In [1]:
import torch
print(torch.cuda.is_available())
print(torch.cuda.device_count())
print(torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No CUDA device")

True
1
NVIDIA GeForce RTX 4060 Laptop GPU


In [1]:
import torch
import torch.nn as nn
from torch.distributions import Categorical

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

class RolloutBuffer:
    def __init__(self):
        self.actions = []
        self.states = []
        self.logprobs = []
        self.rewards = []
        self.state_values = []
        self.is_terminals = []

    def clear(self):
        self.actions.clear()
        self.states.clear()
        self.logprobs.clear()
        self.rewards.clear()
        self.state_values.clear()
        self.is_terminals.clear()

class ActorCritic(nn.Module):
    def __init__(self, state_dim, action_dim):
        super(ActorCritic, self).__init__()
        
        self.actor = nn.Sequential(
            nn.Linear(state_dim, 64),
            nn.Tanh(),
            nn.Linear(64, 64),
            nn.Tanh(),
            nn.Linear(64, action_dim),
            nn.Softmax(dim=-1)
        )
        
        self.critic = nn.Sequential(
            nn.Linear(state_dim, 64),
            nn.Tanh(),
            nn.Linear(64, 64),
            nn.Tanh(),
            nn.Linear(64, 1)
        )

    def act(self, state, deterministic=False):
        action_probs = self.actor(state)

        if deterministic:
            action = torch.argmax(action_probs, dim=-1)
            action_logprob = torch.log(action_probs.gather(1, action.unsqueeze(-1)).squeeze(-1))
        else:
            dist = torch.distributions.Categorical(action_probs)
            action = dist.sample()
            action_logprob = dist.log_prob(action)

        state_value = self.critic(state)
        return action, action_logprob, state_value


    def evaluate(self, states, actions):
        action_probs = self.actor(states)
        dist = Categorical(action_probs)
        action_logprobs = dist.log_prob(actions)
        dist_entropy = dist.entropy()
        state_values = self.critic(states).squeeze(-1)
        return action_logprobs, state_values, dist_entropy

class PPO:
    def __init__(self, state_dim, action_dim, lr_actor, lr_critic, gamma, K_epochs, eps_clip):
        self.gamma = gamma
        self.eps_clip = eps_clip
        self.K_epochs = K_epochs
        
        self.buffer = RolloutBuffer()

        self.policy = ActorCritic(state_dim, action_dim).to(device)
        self.optimizer = torch.optim.Adam([
            {'params': self.policy.actor.parameters(), 'lr': lr_actor},
            {'params': self.policy.critic.parameters(), 'lr': lr_critic}
        ])

        self.policy_old = ActorCritic(state_dim, action_dim).to(device)
        self.policy_old.load_state_dict(self.policy.state_dict())
        self.MseLoss = nn.MSELoss()

    def select_action(self, state, deterministic=False):
        state = torch.FloatTensor(state).flatten().unsqueeze(0).to(device)

        with torch.no_grad():
            action, action_logprob, state_value = self.policy_old.act(state, deterministic=deterministic)

        if not deterministic:
            self.buffer.states.append(state.squeeze(0))
            self.buffer.actions.append(action)
            self.buffer.logprobs.append(action_logprob)
            self.buffer.state_values.append(state_value)

        return action.item()


    def update(self):
        rewards = []
        discounted_reward = 0
        for reward, is_terminal in zip(reversed(self.buffer.rewards), reversed(self.buffer.is_terminals)):
            discounted_reward = reward + (self.gamma * discounted_reward * (1 - is_terminal))
            rewards.insert(0, discounted_reward)

        rewards = torch.tensor(rewards, dtype=torch.float32).to(device)
        rewards = (rewards - rewards.mean()) / (rewards.std() + 1e-7)

        old_states = torch.stack(self.buffer.states).to(device)
        old_actions = torch.stack(self.buffer.actions).to(device)
        old_logprobs = torch.stack(self.buffer.logprobs).to(device)
        old_state_values = torch.stack(self.buffer.state_values).to(device)

        advantages = rewards - old_state_values.detach()

        for _ in range(self.K_epochs):
            logprobs, state_values, dist_entropy = self.policy.evaluate(old_states, old_actions)
            ratios = torch.exp(logprobs - old_logprobs.detach())

            surr1 = ratios * advantages
            surr2 = torch.clamp(ratios, 1 - self.eps_clip, 1 + self.eps_clip) * advantages

            loss = -torch.min(surr1, surr2) + 0.5 * self.MseLoss(state_values, rewards) - 0.01 * dist_entropy

            self.optimizer.zero_grad()
            loss.mean().backward()
            self.optimizer.step()

        self.policy_old.load_state_dict(self.policy.state_dict())
        self.buffer.clear()


Using device: cuda


In [2]:
import gymnasium as gym
import highway_env
import numpy as np
import matplotlib.pyplot as plt

BASE_CONFIG = {
    "observation":       {"type": "Kinematics"},
    "action":            {"type": "DiscreteMetaAction"},
    "lanes_count":       4,
    "vehicles_count":    50,
    "controlled_vehicles": 1,
    "duration":          40,
    "ego_spacing":       2,
    "vehicles_density":  1,
    "collision_reward":  -1,
    "right_lane_reward": 0.1,
    "high_speed_reward": 0.4,
    "lane_change_reward": 0,
    "normalize_reward":  True,
    "offroad_terminal":  False,
}

In [None]:
def train():
    env_name = "highway-v0"
    env = gym.make(env_name, render_mode="rgb_array").unwrapped
    env.configure(BASE_CONFIG)
    print(env.observation_space)

    state_dim = np.prod(env.observation_space.shape)
    action_dim = env.action_space.n

    max_episodes = 1000
    max_ep_len = 1000
    update_timestep = 4000
    K_epochs = 80
    eps_clip = 0.2
    gamma = 0.99
    lr_actor = 0.0003
    lr_critic = 0.001

    episode_reward_history = []
    running_rewards = []
    last_n_reward = 50 
    
    
    ppo_agent = PPO(state_dim, action_dim, lr_actor, lr_critic, gamma, K_epochs, eps_clip)
    time_step = 0
    i_episode = 0
    

    while i_episode < max_episodes:
        obs, _ = env.reset()
        current_ep_reward = 0

        for _ in range(max_ep_len):
            action = ppo_agent.select_action(obs)
            next_obs, reward, done, truncated, info = env.step(action)

            ppo_agent.buffer.rewards.append(reward)
            ppo_agent.buffer.is_terminals.append(done or truncated)

            time_step += 1
            current_ep_reward += reward

            if time_step % update_timestep == 0:
                ppo_agent.update()

            obs = next_obs

            if done or truncated:
                break

        i_episode += 1
        episode_reward_history.append(current_ep_reward)

        # Compute running average
        if len(episode_reward_history) > last_n_reward:
            running_reward = np.mean(episode_reward_history[-last_n_reward:])
        else:
            running_reward = np.mean(episode_reward_history)
        running_rewards.append(running_reward)
        if i_episode % 10 == 0:
            print(f"Episode {i_episode}, Reward: {current_ep_reward:.2f}, Running Avg ({last_n_reward}): {running_reward:.2f}")
    
    plt.figure(figsize=(10, 5))
    plt.plot(episode_reward_history, label="Episode Reward")
    plt.plot(running_rewards, label=f"Running Avg (last {last_n_reward})", linewidth=2)
    plt.xlabel("Episode")
    plt.ylabel("Reward")
    plt.title("PPO Training Rewards")
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()
    
    env.close()

if __name__ == "__main__":
    train()

Box(-inf, inf, (5, 5), float32)
Episode 10, Reward: 7.19, Running Avg (50): 8.32
Episode 20, Reward: 2.62, Running Avg (50): 8.76
Episode 30, Reward: 29.95, Running Avg (50): 10.92
Episode 40, Reward: 3.26, Running Avg (50): 10.65
Episode 50, Reward: 3.18, Running Avg (50): 11.05
Episode 60, Reward: 1.69, Running Avg (50): 10.85
Episode 70, Reward: 3.91, Running Avg (50): 10.91
Episode 80, Reward: 15.37, Running Avg (50): 10.10
Episode 90, Reward: 8.26, Running Avg (50): 10.21
Episode 100, Reward: 16.49, Running Avg (50): 10.34
Episode 110, Reward: 6.26, Running Avg (50): 11.04
Episode 120, Reward: 30.21, Running Avg (50): 10.92
Episode 130, Reward: 18.12, Running Avg (50): 10.10
Episode 140, Reward: 2.71, Running Avg (50): 9.61
Episode 150, Reward: 6.78, Running Avg (50): 9.14
Episode 160, Reward: 30.01, Running Avg (50): 9.18
Episode 170, Reward: 7.39, Running Avg (50): 8.93
Episode 180, Reward: 17.10, Running Avg (50): 10.18
Episode 190, Reward: 6.08, Running Avg (50): 10.91
Episode

In [6]:
torch.cuda.empty_cache()

In [None]:
DRIVER_BEHAVIORS = {
    "Safe": {
        "reward_speed_range": [15, 25],
        "collision_reward": -5.0,
        "lane_change_reward": -0.2,
        "right_lane_reward": 0.3,
        "vehicles_density": 1,
    },
    "Normal": {
        "reward_speed_range": [20, 30],
        "collision_reward": -2.0,
        "lane_change_reward": 0.0,
        "right_lane_reward": 0.1,
        "vehicles_density": 1,
    },
    "Aggressive": {
        "reward_speed_range": [30, 40],
        "collision_reward": -0.5,
        "lane_change_reward": +0.3,
        "right_lane_reward": -0.1,
        "vehicles_density": 1,
    },
}

def train(cfg, label=""):
    env = gym.make("highway-v0", render_mode="rgb_array").unwrapped
    env.configure(cfg)
    print(f"\nTraining scenario: {label}")

    state_dim = np.prod(env.observation_space.shape)
    action_dim = env.action_space.n

    # PPO hyperparameters
    max_episodes = 1500
    max_ep_len = 1000
    update_timestep = 4000
    K_epochs = 80
    eps_clip = 0.2
    gamma = 0.99
    lr_actor = 0.0003
    lr_critic = 0.001

    # Reward tracking
    episode_reward_history = []
    running_rewards = []
    last_n_reward = 50

    ppo_agent = PPO(state_dim, action_dim, lr_actor, lr_critic, gamma, K_epochs, eps_clip)

    time_step = 0
    i_episode = 0

    while i_episode < max_episodes:
        obs, _ = env.reset()
        current_ep_reward = 0

        for _ in range(max_ep_len):
            action = ppo_agent.select_action(obs)
            next_obs, reward, done, truncated, info = env.step(action)

            ppo_agent.buffer.rewards.append(reward)
            ppo_agent.buffer.is_terminals.append(done or truncated)

            time_step += 1
            current_ep_reward += reward

            if time_step % update_timestep == 0:
                ppo_agent.update()

            obs = next_obs

            if done or truncated:
                break

        i_episode += 1
        episode_reward_history.append(current_ep_reward)

        running_reward = (
            np.mean(episode_reward_history[-last_n_reward:])
            if len(episode_reward_history) > last_n_reward
            else np.mean(episode_reward_history)
        )
        running_rewards.append(running_reward)

        if i_episode % 10 == 0:
            print(f"Episode {i_episode}, Reward: {current_ep_reward:.2f}, Running Avg ({last_n_reward}): {running_reward:.2f}")

    # Plot reward history
    plt.figure(figsize=(10, 5))
    plt.plot(episode_reward_history, label="Episode Reward")
    plt.plot(running_rewards, label=f"Running Avg (last {last_n_reward})", linewidth=2)
    plt.xlabel("Episode")
    plt.ylabel("Reward")
    plt.title(f"PPO Training Rewards â€“ {label}")
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()

    env.close()
    return ppo_agent


if __name__ == "__main__":
    trained_agents = {}
    for label, behavior_cfg in DRIVER_BEHAVIORS.items():
        cfg = BASE_CONFIG.copy()
        cfg.update(behavior_cfg) 
        print(f"\nTraining agent for behavior: {label}")
        agent = train(cfg, label)
        torch.cuda.empty_cache()
        trained_agents[label] = agent
        


Training agent for behavior: Safe

Training scenario: Safe
Episode 10, Reward: 12.36, Running Avg (50): 12.96
Episode 20, Reward: 19.51, Running Avg (50): 14.20
Episode 30, Reward: 29.87, Running Avg (50): 14.79
Episode 40, Reward: 11.64, Running Avg (50): 13.56
Episode 50, Reward: 35.59, Running Avg (50): 14.26
Episode 60, Reward: 3.03, Running Avg (50): 12.63
Episode 70, Reward: 7.90, Running Avg (50): 12.34
Episode 80, Reward: 15.69, Running Avg (50): 11.23
Episode 90, Reward: 4.87, Running Avg (50): 10.41
Episode 100, Reward: 26.50, Running Avg (50): 10.88
Episode 110, Reward: 20.74, Running Avg (50): 12.54
Episode 120, Reward: 3.97, Running Avg (50): 12.55
Episode 130, Reward: 31.17, Running Avg (50): 12.63
Episode 140, Reward: 2.95, Running Avg (50): 13.45
Episode 150, Reward: 2.86, Running Avg (50): 12.11
Episode 160, Reward: 9.81, Running Avg (50): 12.55
Episode 170, Reward: 4.95, Running Avg (50): 13.29
Episode 180, Reward: 13.16, Running Avg (50): 12.70
Episode 190, Reward: 

OutOfMemoryError: CUDA out of memory. Tried to allocate 238.42 GiB. GPU 0 has a total capacity of 8.00 GiB of which 6.50 GiB is free. Of the allocated memory 405.54 MiB is allocated by PyTorch, and 14.46 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
import numpy as np
import pandas as pd

def evaluate_model(env, model, episodes=50, max_steps=1000, is_ppo=True):
    rewards = []
    collisions = []

    for _ in range(episodes):
        state, _ = env.reset()
        total_reward = 0.0
        collision_count = 0
        done = False
        steps = 0

        while not done and steps < max_steps:
            if is_ppo:
                action = model.select_action(state, deterministic=True)
            else:
                q_vals = model.predict(state[np.newaxis], verbose=0)[0]
                action = int(np.argmax(q_vals))

            next_state, reward, terminated, truncated, info = env.step(action)
            done = terminated or truncated

            total_reward += reward
            if info.get("crashed", False) or reward < 0:
                collision_count += 1

            state = next_state
            steps += 1

        rewards.append(total_reward)
        collisions.append(collision_count)

    avg_collisions = np.mean(collisions)
    return rewards, avg_collisions

def evaluate_all_agents(trained_agents, driver_configs, base_config, episodes=50, is_ppo=True):
    results = []

    for label, agent in trained_agents.items():
        print(f"\nEvaluating agent: {label}")

        cfg = base_config.copy()
        cfg.update(driver_configs[label])
        env = gym.make("highway-v0", render_mode=None).unwrapped
        env.configure(cfg)

        rewards, avg_collisions = evaluate_model(env, agent, episodes=episodes, is_ppo=is_ppo)

        results.append({
            "behavior": label,
            "mean_reward": np.mean(rewards),
            "std_reward": np.std(rewards),
            "avg_collisions": avg_collisions
        })

        env.close()

    df = pd.DataFrame(results)
    print("\nEvaluation Results:")
    print(df.to_string(index=False))
    return df


In [None]:
df = evaluate_all_agents(trained_agents, DRIVER_BEHAVIORS, BASE_CONFIG, episodes=50, is_ppo=True)


Evaluating agent: Safe

Evaluating agent: Normal

Evaluating agent: Aggressive

Evaluation Results:
  behavior  mean_reward  std_reward  avg_collisions
      Safe    37.200214    0.701754             0.0
    Normal    32.704489    1.160953             0.1
Aggressive     2.925000    2.616441             1.0


Unnamed: 0,behavior,mean_reward,std_reward,avg_collisions
0,Safe,37.200214,0.701754,0.0
1,Normal,32.704489,1.160953,0.1
2,Aggressive,2.925,2.616441,1.0
