In [24]:
import gymnasium as gym
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers
from collections import deque
import random
import matplotlib.pyplot as plt
import pprint
import pandas as pd

import highway_env

In [20]:
# --- HYPERPARAMETERS ---
NUM_EPISODES       = 500
MAX_STEPS_PER_EP   = 200
BATCH_SIZE         = 64
BUFFER_SIZE        = 50_000
GAMMA              = 0.99
EPS_START, EPS_END = 1.0, 0.01
EPS_DECAY          = 0.99
TARGET_UPD_FREQ    = 1_000      # steps
LR                 = 5e-4

# --- BASE CONFIG & VARIANTS ---
BASE_CONFIG = {
    "observation":       {"type": "Kinematics"},
    "action":            {"type": "DiscreteMetaAction"},
    "lanes_count":       4,
    "vehicles_count":    50,
    "controlled_vehicles": 1,
    "duration":          40,
    "ego_spacing":       2,
    "vehicles_density":  1,
    "collision_reward":  -1,
    "right_lane_reward": 0.1,
    "high_speed_reward": 0.4,        # keep constant
    "lane_change_reward": 0,
    "normalize_reward":  True,
    "offroad_terminal":  False,
}

SCENARIOS = {
    "Slow":   {"reward_speed_range": [10, 20]},
    "Normal": {"reward_speed_range": [20, 30]},
    "Fast":   {"reward_speed_range": [30, 40]},
}

In [21]:
def create_q_model(input_shape, num_actions):
    """Simple MLP Q-network."""
    return tf.keras.Sequential([
        layers.Flatten(input_shape=input_shape),
        layers.Dense(128, activation="relu"),
        layers.Dense(128, activation="relu"),
        layers.Dense(num_actions, activation="linear"),
    ])


def train_on_env(env_config, label):
    """Train a fresh DQN, recording per‐episode total rewards."""
    # 1) build env
    cfg = BASE_CONFIG.copy()
    cfg.update(env_config)
    env = gym.make("highway-v0", render_mode="rgb_array", config=cfg)
    obs_shape   = env.observation_space.shape
    num_actions = env.action_space.n

    # 2) networks, buffer, optimizer
    q_model  = create_q_model(obs_shape, num_actions)
    q_target = create_q_model(obs_shape, num_actions)
    q_target.set_weights(q_model.get_weights())
    buffer   = deque(maxlen=BUFFER_SIZE)
    optimizer = tf.keras.optimizers.Adam(LR)
    loss_fn   = tf.keras.losses.MeanSquaredError()

    epsilon    = EPS_START
    step_count = 0
    episode_rewards = []  # <-- track total reward per episode

    # 3) training loop
    for ep in range(1, NUM_EPISODES + 1):
        state, _ = env.reset(seed=ep)
        total_reward = 0.0

        for t in range(MAX_STEPS_PER_EP):
            # ε-greedy action selection
            if random.random() < epsilon:
                action = env.action_space.sample()
            else:
                q_vals = q_model.predict(state[np.newaxis], verbose=0)[0]
                action = int(np.argmax(q_vals))

            next_state, reward, terminated, truncated, info = env.step(action)
            done = terminated or truncated

            buffer.append((state, action, reward, next_state, done))
            state = next_state
            total_reward += reward
            step_count += 1

            # learning update
            if len(buffer) >= BATCH_SIZE:
                batch = random.sample(buffer, BATCH_SIZE)
                S  = np.array([b[0] for b in batch])
                A  = np.array([b[1] for b in batch])
                R  = np.array([b[2] for b in batch], dtype=float)
                S2 = np.array([b[3] for b in batch])
                D  = np.array([b[4] for b in batch], dtype=float)

                q_next = q_target.predict(S2, verbose=0)
                max_q  = np.max(q_next, axis=1)
                y      = R + (1 - D) * GAMMA * max_q

                with tf.GradientTape() as tape:
                    q_pred = q_model(S)
                    q_sel  = tf.reduce_sum(q_pred * tf.one_hot(A, num_actions), axis=1)
                    loss   = loss_fn(y, q_sel)
                grads = tape.gradient(loss, q_model.trainable_variables)
                optimizer.apply_gradients(zip(grads, q_model.trainable_variables))

            # periodically sync target network
            if step_count % TARGET_UPD_FREQ == 0:
                q_target.set_weights(q_model.get_weights())

            if done:
                break

        # record episode reward and decay ε
        episode_rewards.append(total_reward)
        epsilon = max(EPS_END, epsilon * EPS_DECAY)

        if ep % 10 == 0:
            print(f"[{label}] Episode {ep}/{NUM_EPISODES} — Reward: {total_reward:.1f}, ε: {epsilon:.3f}")

    return env, q_model, episode_rewards

In [22]:
pprint.pprint(SCENARIOS)
trained = {}
for label, env_cfg in SCENARIOS.items():
    print("\n" + "="*40)
    print(f" Training scenario ▶️ {label}")
    print("="*40)
    env, q_model = train_on_env(env_cfg, label)
    trained[label] = (env, q_model)

print("\nAll three scenarios trained for 500 episodes.")

{'Fast': {'reward_speed_range': [30, 40]},
 'Normal': {'reward_speed_range': [20, 30]},
 'Slow': {'reward_speed_range': [10, 20]}}

 Training scenario ▶️ Slow


  super().__init__(**kwargs)


[Slow] Episode 10/500 — Reward: 2.2, ε: 0.904
[Slow] Episode 20/500 — Reward: 7.2, ε: 0.818
[Slow] Episode 30/500 — Reward: 3.8, ε: 0.740
[Slow] Episode 40/500 — Reward: 21.6, ε: 0.669
[Slow] Episode 50/500 — Reward: 38.0, ε: 0.605
[Slow] Episode 60/500 — Reward: 19.8, ε: 0.547
[Slow] Episode 70/500 — Reward: 5.3, ε: 0.495
[Slow] Episode 80/500 — Reward: 13.3, ε: 0.448
[Slow] Episode 90/500 — Reward: 11.1, ε: 0.405
[Slow] Episode 100/500 — Reward: 27.4, ε: 0.366
[Slow] Episode 110/500 — Reward: 16.2, ε: 0.331
[Slow] Episode 120/500 — Reward: 37.9, ε: 0.299
[Slow] Episode 130/500 — Reward: 2.3, ε: 0.271
[Slow] Episode 140/500 — Reward: 39.7, ε: 0.245
[Slow] Episode 150/500 — Reward: 39.0, ε: 0.221
[Slow] Episode 160/500 — Reward: 37.9, ε: 0.200
[Slow] Episode 170/500 — Reward: 37.4, ε: 0.181
[Slow] Episode 180/500 — Reward: 37.7, ε: 0.164
[Slow] Episode 190/500 — Reward: 38.2, ε: 0.148
[Slow] Episode 200/500 — Reward: 38.3, ε: 0.134
[Slow] Episode 210/500 — Reward: 39.0, ε: 0.121
[Slow]

  super().__init__(**kwargs)


[Normal] Episode 10/500 — Reward: 14.7, ε: 0.904
[Normal] Episode 20/500 — Reward: 12.8, ε: 0.818
[Normal] Episode 30/500 — Reward: 10.4, ε: 0.740
[Normal] Episode 40/500 — Reward: 10.1, ε: 0.669
[Normal] Episode 50/500 — Reward: 2.1, ε: 0.605
[Normal] Episode 60/500 — Reward: 21.3, ε: 0.547
[Normal] Episode 70/500 — Reward: 4.8, ε: 0.495
[Normal] Episode 80/500 — Reward: 12.8, ε: 0.448
[Normal] Episode 90/500 — Reward: 9.5, ε: 0.405
[Normal] Episode 100/500 — Reward: 9.7, ε: 0.366
[Normal] Episode 110/500 — Reward: 19.6, ε: 0.331
[Normal] Episode 120/500 — Reward: 30.5, ε: 0.299
[Normal] Episode 130/500 — Reward: 7.7, ε: 0.271
[Normal] Episode 140/500 — Reward: 28.7, ε: 0.245
[Normal] Episode 150/500 — Reward: 10.9, ε: 0.221
[Normal] Episode 160/500 — Reward: 9.5, ε: 0.200
[Normal] Episode 170/500 — Reward: 3.3, ε: 0.181
[Normal] Episode 180/500 — Reward: 17.0, ε: 0.164
[Normal] Episode 190/500 — Reward: 13.5, ε: 0.148
[Normal] Episode 200/500 — Reward: 10.2, ε: 0.134
[Normal] Episode

  super().__init__(**kwargs)


[Fast] Episode 10/500 — Reward: 10.4, ε: 0.904
[Fast] Episode 20/500 — Reward: 28.4, ε: 0.818
[Fast] Episode 30/500 — Reward: 28.8, ε: 0.740
[Fast] Episode 40/500 — Reward: 3.6, ε: 0.669
[Fast] Episode 50/500 — Reward: 14.5, ε: 0.605
[Fast] Episode 60/500 — Reward: 12.8, ε: 0.547
[Fast] Episode 70/500 — Reward: 2.7, ε: 0.495
[Fast] Episode 80/500 — Reward: 2.2, ε: 0.448
[Fast] Episode 90/500 — Reward: 16.2, ε: 0.405
[Fast] Episode 100/500 — Reward: 1.5, ε: 0.366
[Fast] Episode 110/500 — Reward: 28.4, ε: 0.331
[Fast] Episode 120/500 — Reward: 24.3, ε: 0.299
[Fast] Episode 130/500 — Reward: 27.1, ε: 0.271
[Fast] Episode 140/500 — Reward: 15.6, ε: 0.245
[Fast] Episode 150/500 — Reward: 29.3, ε: 0.221
[Fast] Episode 160/500 — Reward: 26.7, ε: 0.200
[Fast] Episode 170/500 — Reward: 28.1, ε: 0.181
[Fast] Episode 180/500 — Reward: 26.7, ε: 0.164
[Fast] Episode 190/500 — Reward: 27.1, ε: 0.148
[Fast] Episode 200/500 — Reward: 26.7, ε: 0.134
[Fast] Episode 210/500 — Reward: 26.7, ε: 0.121
[Fast

In [25]:
def evaluate_model(env, model, episodes=50):
    rewards = []
    collisions = []

    for _ in range(episodes):
        state, _ = env.reset()
        total_reward = 0.0
        collision_count = 0
        done = False
        steps = 0

        while not done and steps < MAX_STEPS_PER_EP:
            q_vals = model.predict(state[np.newaxis], verbose=0)[0]
            action = int(np.argmax(q_vals))
            next_state, reward, terminated, truncated, info = env.step(action)
            done = terminated or truncated

            total_reward += reward
            # increment collision counter if flagged or via negative reward
            if info.get("crashed", False) or reward < 0:
                collision_count += 1

            state = next_state
            steps += 1

        rewards.append(total_reward)
        collisions.append(collision_count)

    avg_collisions = np.mean(collisions)
    return rewards, avg_collisions

# Collect metrics for each scenario
metrics = []
for label, (env, q_model) in trained.items():
    rewards = evaluate_model(env, q_model, episodes=50)
    metrics.append({
        "scenario":     label,
        "mean_reward":  np.mean(rewards),
        "std_reward":   np.std(rewards),
    })

# Display as a table
df = pd.DataFrame(metrics)
print(df.to_string(index=False))

scenario  mean_reward  std_reward
    Slow    37.030362    6.439829
  Normal    28.102651    6.022562
    Fast    25.687556    7.172528
