# DQN Hyperparameter Tuning Per Reward Function

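This notebook tunes DQN hyperparameters separately for each reward function, using random search over a discrete search space.
For every reward function, the per-configuration results (CSV) and the best configuration (YAML) are saved under `results/hyperparameter_tuning/dqn_per_reward`, together with an overall summary CSV.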

In [1]:
import sys

sys.path.append('..')

import random
import time

import numpy as np
import pandas as pd

from reinforcement_learning_taxi.agents.dqn_agent import DQNAgent
from reinforcement_learning_taxi.environments import make_taxi_env
from reinforcement_learning_taxi.environments.reward_wrappers import REWARD_FUNCTIONS
from reinforcement_learning_taxi.evaluation.metrics import evaluate_agent
from reinforcement_learning_taxi.training.dqn_trainer import DQNTrainer
from reinforcement_learning_taxi.utils.config_utils import load_config, save_optimized_config
from reinforcement_learning_taxi.utils.path_utils import get_repo_root

In [2]:
# --- Experiment budget and reproducibility knobs -----------------------------
TRAINING_TIMESTEPS = 250_000  # timesteps passed to trainer.train() per configuration
N_EVAL_EPISODES = 100         # episodes per evaluation call
N_RANDOM_CONFIGS = 5          # random configurations sampled per reward function
SEED = 42                     # base seed for the configuration sampler

# --- Paths (all built from the repository root) ------------------------------
ROOT_DIR = get_repo_root()
RESULTS_DIR = ROOT_DIR / 'results/hyperparameter_tuning/dqn_per_reward'
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

# --- Baseline DQN configuration ----------------------------------------------
# Only the 'agent' section is used by this notebook; it supplies the settings
# that are not part of the search space.
DQN_BASE_CONFIG = load_config(ROOT_DIR / 'src/reinforcement_learning_taxi/configs/dqn_config_baseline.yaml')
DQN_BASE_AGENT = DQN_BASE_CONFIG['agent']

# One tuning run is performed for every registered reward function.
reward_functions = [*REWARD_FUNCTIONS.keys()]

print(
    f'Reward functions: {reward_functions}\n'
    f'Training timesteps: {TRAINING_TIMESTEPS:,}\n'
    f'Random configs per reward: {N_RANDOM_CONFIGS}\n'
    f'Evaluation episodes: {N_EVAL_EPISODES}'
)


Reward functions: ['default', 'distance_based', 'modified_penalty', 'enhanced']
Training timesteps: 250,000
Random configs per reward: 5
Evaluation episodes: 100


In [3]:
# Discrete search space: each hyperparameter maps to the list of candidate
# values the random search may pick from.
dqn_param_space = dict(
    learning_rate=[5e-5, 1e-4, 2e-4, 5e-4],
    batch_size=[32, 64, 128],
    buffer_size=[50000, 100000, 200000],
    gamma=[0.95, 0.99, 0.999],
    target_update_interval=[500, 1000, 2000],
    exploration_fraction=[0.8, 0.9],
    exploration_final_eps=[0.1, 0.2, 0.3],
    learning_starts=[10000, 20000],
    train_freq=[1, 4],
    gradient_steps=[1, 4],
)

print('DQN Search Space:')
print('\n'.join(f'  {name}: {candidates}' for name, candidates in dqn_param_space.items()))

# Size of the full grid, shown for context: the random search only samples
# a handful of these combinations.
total_combinations = np.prod(list(map(len, dqn_param_space.values())))
print(f'Total possible combinations: {total_combinations}')

DQN Search Space:
  learning_rate: [5e-05, 0.0001, 0.0002, 0.0005]
  batch_size: [32, 64, 128]
  buffer_size: [50000, 100000, 200000]
  gamma: [0.95, 0.99, 0.999]
  target_update_interval: [500, 1000, 2000]
  exploration_fraction: [0.8, 0.9]
  exploration_final_eps: [0.1, 0.2, 0.3]
  learning_starts: [10000, 20000]
  train_freq: [1, 4]
  gradient_steps: [1, 4]
Total possible combinations: 15552


In [4]:
def random_sample_params(param_space, n_samples, seed=42):
    """Draw random configurations from a discrete hyperparameter search space.

    Each configuration picks one value per parameter, independently and
    uniformly at random. Sampling is with replacement, so duplicate
    configurations are possible.

    Args:
        param_space: Mapping of parameter name to a non-empty sequence of
            candidate values.
        n_samples: Number of configurations to draw.
        seed: Seed for the sampler; the same seed always yields the same
            list of configurations.

    Returns:
        A list of ``n_samples`` dicts, each mapping every parameter name in
        ``param_space`` to one of its candidate values.
    """
    # Use a private generator instead of random.seed(): it produces the same
    # draws for a given seed but does not reset the process-wide RNG that
    # other code may rely on.
    rng = random.Random(seed)
    return [
        {param: rng.choice(values) for param, values in param_space.items()}
        for _ in range(n_samples)
    ]

In [5]:
# Random search per reward function.
# For every reward function: sample N_RANDOM_CONFIGS configurations, train a
# DQN agent for each, evaluate it on the shaped reward and on the default
# (unshaped) environment, then persist the per-reward results, the best
# configuration, and finally a cross-reward summary.
summary_rows = []

for reward_index, reward_name in enumerate(reward_functions):
    print(f"\n{'=' * 70}")
    print(f"DQN tuning for reward: {reward_name}")
    print(f"{'=' * 70}")

    # Offset the sampler seed per reward function so each one gets its own
    # (still reproducible) set of configurations.
    configs = random_sample_params(dqn_param_space, N_RANDOM_CONFIGS, seed=SEED + reward_index * 100)
    reward_results = []

    for i, config in enumerate(configs, 1):
        print(f'\nConfiguration {i}/{N_RANDOM_CONFIGS}: {config}')
        start_time = time.time()
        # Pre-declare both environments so the `finally` clause can close
        # whichever ones were actually created.
        env = None
        eval_env_default = None

        try:
            env = make_taxi_env(
                use_feature_wrapper=True,
                reward_wrapper_name=reward_name,
                use_action_masking=False,
            )

            # Searched hyperparameters come from `config`; everything else is
            # taken from the baseline agent configuration.
            agent = DQNAgent(
                env=env,
                policy=DQN_BASE_AGENT['policy'],
                learning_rate=config['learning_rate'],
                buffer_size=config['buffer_size'],
                learning_starts=config['learning_starts'],
                batch_size=config['batch_size'],
                gamma=config['gamma'],
                train_freq=config['train_freq'],
                gradient_steps=config['gradient_steps'],
                target_update_interval=config['target_update_interval'],
                exploration_fraction=config['exploration_fraction'],
                exploration_initial_eps=DQN_BASE_AGENT['exploration_initial_eps'],
                exploration_final_eps=config['exploration_final_eps'],
                policy_kwargs=DQN_BASE_AGENT.get('policy_kwargs') or {},
                verbose=0,
                seed=DQN_BASE_AGENT.get('seed'),
            )

            trainer = DQNTrainer(
                env=env,
                agent=agent,
                log_dir=ROOT_DIR / f'results/logs/reward_tuning/dqn_{reward_name}/config_{i}',
                eval_freq=20000,
                eval_episodes=N_EVAL_EPISODES,
            )
            stats = trainer.train(total_timesteps=TRAINING_TIMESTEPS)

            # Score on the reward the agent was trained with ...
            eval_shaped = evaluate_agent(agent, env, n_episodes=N_EVAL_EPISODES, deterministic=True)

            # ... and on an environment without a reward wrapper, so success
            # rate and episode length are comparable across reward functions.
            eval_env_default = make_taxi_env(
                use_feature_wrapper=True,
                reward_wrapper_name=None,
                use_action_masking=False,
            )
            eval_default = evaluate_agent(
                agent,
                eval_env_default,
                n_episodes=N_EVAL_EPISODES,
                deterministic=True,
            )

            # Prefer the trainer's own timing; fall back to wall-clock time
            # (which then also includes the evaluation above).
            training_time = stats.get('training_time', time.time() - start_time)

            reward_results.append({
                'config_id': i,
                'reward_name': reward_name,
                **config,
                'mean_reward_shaped': eval_shaped['mean_reward'],
                'std_reward_shaped': eval_shaped['std_reward'],
                'success_rate_default': eval_default['success_rate'],
                'mean_length_default': eval_default['mean_length'],
                'training_time': training_time,
            })

            print(f"  Mean reward (shaped): {eval_shaped['mean_reward']:.2f}")
            print(f"  Success rate (default): {eval_default['success_rate']:.2%}")
        except Exception as exc:
            # Best effort: a failing configuration is reported and skipped so
            # the remaining configurations still run.
            print(f'  ERROR: {exc}')
        finally:
            if eval_env_default is not None:
                eval_env_default.close()
            if env is not None:
                env.close()

    # If every configuration failed there are no rows and therefore no
    # 'mean_reward_shaped' column; sorting would raise KeyError and abort the
    # remaining reward functions. Skip this reward function instead.
    if not reward_results:
        print(f'\nNo configuration completed for reward {reward_name}; skipping result files.')
        continue

    reward_df = pd.DataFrame(reward_results).sort_values(by='mean_reward_shaped', ascending=False)

    reward_csv = RESULTS_DIR / f'dqn_tuning_{reward_name}.csv'
    reward_df.to_csv(reward_csv, index=False)
    print(f'\nSaved DQN tuning results to {reward_csv}')

    # The best configuration is the one with the highest mean shaped reward.
    best_row = reward_df.iloc[0]
    # Take the values from the originally sampled config (config_id is
    # 1-based) so they stay plain Python ints/floats rather than the numpy
    # scalars a DataFrame row would return.
    best_params = dict(configs[int(best_row['config_id']) - 1])
    best_yaml = RESULTS_DIR / f'dqn_{reward_name}_best.yaml'
    save_optimized_config(
        best_params=best_params,
        base_config_path=ROOT_DIR / 'src/reinforcement_learning_taxi/configs/dqn_config_baseline.yaml',
        output_path=best_yaml,
        algorithm='DQN',
    )
    summary_rows.append({
        'reward_name': reward_name,
        'best_config_id': int(best_row['config_id']),
        'best_mean_reward_shaped': float(best_row['mean_reward_shaped']),
        'best_success_rate_default': float(best_row['success_rate_default']),
    })

summary_df = pd.DataFrame(summary_rows)
summary_csv = RESULTS_DIR / 'dqn_tuning_summary.csv'
summary_df.to_csv(summary_csv, index=False)
print(f'\nSaved DQN tuning summary to {summary_csv}')


DQN tuning for reward: default

Configuration 1/5: {'learning_rate': 5e-05, 'batch_size': 32, 'buffer_size': 200000, 'gamma': 0.99, 'target_update_interval': 500, 'exploration_fraction': 0.8, 'exploration_final_eps': 0.1, 'learning_starts': 10000, 'train_freq': 1, 'gradient_steps': 4}
Eval num_timesteps=20000, episode_reward=-200.00 +/- 0.00
Episode length: 200.00 +/- 0.00
New best mean reward!
Eval num_timesteps=40000, episode_reward=-147.61 +/- 90.75
Episode length: 152.86 +/- 81.65
New best mean reward!
Eval num_timesteps=60000, episode_reward=-98.02 +/- 104.06
Episode length: 108.31 +/- 93.56
New best mean reward!
Eval num_timesteps=80000, episode_reward=8.07 +/- 2.64
Episode length: 12.93 +/- 2.64
New best mean reward!
Eval num_timesteps=100000, episode_reward=8.37 +/- 2.77
Episode length: 12.63 +/- 2.77
New best mean reward!
Eval num_timesteps=120000, episode_reward=8.02 +/- 2.60
Episode length: 12.98 +/- 2.60
Eval num_timesteps=140000, episode_reward=7.88 +/- 2.53
Episode lengt