# PPO Hyperparameter Tuning Per Reward Function

This notebook tunes PPO hyperparameters separately for each reward function.
Outputs are saved under `results/hyperparameter_tuning/ppo_per_reward`.

In [1]:
import sys

sys.path.append('..')

import random
import time

import numpy as np
import pandas as pd

from reinforcement_learning_taxi.agents.ppo_agent import PPOAgent
from reinforcement_learning_taxi.environments import make_taxi_env
from reinforcement_learning_taxi.environments.reward_wrappers import REWARD_FUNCTIONS
from reinforcement_learning_taxi.evaluation.metrics import evaluate_agent
from reinforcement_learning_taxi.training.ppo_trainer import PPOTrainer
from reinforcement_learning_taxi.utils.config_utils import load_config, save_optimized_config
from reinforcement_learning_taxi.utils.path_utils import get_repo_root

In [2]:
# Repository-relative locations. Tuning artefacts are written under
# RESULTS_DIR, which is created here if it does not exist yet.
ROOT_DIR = get_repo_root()
RESULTS_DIR = ROOT_DIR / 'results/hyperparameter_tuning/ppo_per_reward'
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

# Baseline PPO settings; the tuning loop reads every non-searched agent
# parameter from the 'agent' section.
PPO_BASE_CONFIG = load_config(ROOT_DIR / 'src/reinforcement_learning_taxi/configs/ppo_config_baseline.yaml')
PPO_BASE_AGENT = PPO_BASE_CONFIG['agent']

# Search budget and the base seed used when sampling configurations.
SEED = 42
N_RANDOM_CONFIGS = 20
TRAINING_TIMESTEPS = 200_000
N_EVAL_EPISODES = 200

reward_functions = [*REWARD_FUNCTIONS.keys()]

# One banner summarising the run settings, one setting per line.
print(
    f'Reward functions: {reward_functions}',
    f'Training timesteps: {TRAINING_TIMESTEPS:,}',
    f'Random configs per reward: {N_RANDOM_CONFIGS}',
    f'Evaluation episodes: {N_EVAL_EPISODES}',
    sep='\n',
)

Reward functions: ['default', 'distance_based', 'modified_penalty', 'enhanced']
Training timesteps: 200,000
Random configs per reward: 20
Evaluation episodes: 200


In [3]:
# Discrete candidate values for each PPO hyperparameter covered by the
# random search.
ppo_param_space = dict(
    learning_rate=[1e-4, 3e-4, 5e-4, 1e-3],
    n_steps=[512, 1024, 2048],
    batch_size=[32, 64, 128],
    clip_range=[0.1, 0.2, 0.3],
    ent_coef=[0.0, 0.01, 0.05],
)

print('PPO Search Space:')
for param in ppo_param_space:
    values = ppo_param_space[param]
    print(f'  {param}: {values}')

# Size of the full grid: product of the candidate counts per parameter.
total_combinations = np.prod(list(map(len, ppo_param_space.values())))
print(f'Total possible combinations: {total_combinations}')

PPO Search Space:
  learning_rate: [0.0001, 0.0003, 0.0005, 0.001]
  n_steps: [512, 1024, 2048]
  batch_size: [32, 64, 128]
  clip_range: [0.1, 0.2, 0.3]
  ent_coef: [0.0, 0.01, 0.05]
Total possible combinations: 324


In [4]:
def random_sample_params(param_space, n_samples, seed=42):
    """Draw ``n_samples`` random configurations from a discrete search space.

    Each configuration picks one candidate per parameter, independently and
    uniformly at random. Sampling is with replacement, so the returned list
    may contain duplicate configurations.

    Args:
        param_space: Mapping of parameter name to a non-empty sequence of
            candidate values.
        n_samples: Number of configurations to draw.
        seed: Seed for the sampler; the same seed yields the same list.

    Returns:
        A list of ``n_samples`` dicts, each mapping every parameter name in
        ``param_space`` to one of its candidate values.

    Raises:
        IndexError: If any parameter has an empty candidate sequence.
    """
    # A private generator keeps the draw reproducible without reseeding the
    # module-level `random` state that other code may rely on. It produces
    # the same sequence as `random.seed(seed)` followed by `random.choice`.
    rng = random.Random(seed)
    return [
        {param: rng.choice(values) for param, values in param_space.items()}
        for _ in range(n_samples)
    ]

In [5]:
# Random search per reward function: train one PPO agent per sampled
# configuration, evaluate it, then persist the per-reward results, the best
# configuration, and a cross-reward summary.
summary_rows = []

for reward_index, reward_name in enumerate(reward_functions):
    print(f"\n{'=' * 70}")
    print(f'PPO tuning for reward: {reward_name}')
    print(f"{'=' * 70}")

    # The seed is offset per reward so each reward function draws its own
    # sample of configurations.
    configs = random_sample_params(ppo_param_space, N_RANDOM_CONFIGS, seed=SEED + reward_index * 100)
    reward_results = []

    for i, config in enumerate(configs, 1):
        print(f'\nConfiguration {i}/{N_RANDOM_CONFIGS}: {config}')
        start_time = time.time()
        # Pre-bind both handles so the `finally` clause can tell which
        # environments were actually created before a failure.
        env = None
        eval_env_default = None

        try:
            env = make_taxi_env(
                use_feature_wrapper=True,
                reward_wrapper_name=reward_name,
                use_action_masking=True,
            )

            # Searched parameters come from `config`; everything else is
            # taken from the baseline agent settings.
            # NOTE(review): the agent seed comes from the baseline config, not
            # from SEED (which only drives configuration sampling) — confirm
            # this is intended.
            agent = PPOAgent(
                env=env,
                policy=PPO_BASE_AGENT['policy'],
                learning_rate=config['learning_rate'],
                n_steps=config['n_steps'],
                batch_size=config['batch_size'],
                n_epochs=PPO_BASE_AGENT['n_epochs'],
                gamma=PPO_BASE_AGENT['gamma'],
                gae_lambda=PPO_BASE_AGENT['gae_lambda'],
                clip_range=config['clip_range'],
                clip_range_vf=PPO_BASE_AGENT['clip_range_vf'],
                ent_coef=config['ent_coef'],
                vf_coef=PPO_BASE_AGENT['vf_coef'],
                max_grad_norm=PPO_BASE_AGENT['max_grad_norm'],
                policy_kwargs=PPO_BASE_AGENT.get('policy_kwargs') or {},
                verbose=0,
                seed=PPO_BASE_AGENT.get('seed'),
                use_action_masking=True,
            )

            trainer = PPOTrainer(
                env=env,
                agent=agent,
                log_dir=ROOT_DIR / f'results/logs/reward_tuning/ppo_{reward_name}/config_{i}',
                eval_freq=20000,
            )
            stats = trainer.train(total_timesteps=TRAINING_TIMESTEPS)

            # First evaluation: on the training environment, i.e. under the
            # reward wrapper being tuned.
            eval_shaped = evaluate_agent(agent, env, n_episodes=N_EVAL_EPISODES, deterministic=True)

            # Second evaluation: on an environment built without a reward
            # wrapper, so success rate and episode length are measured the
            # same way for every reward function.
            eval_env_default = make_taxi_env(
                use_feature_wrapper=True,
                reward_wrapper_name=None,
                use_action_masking=True,
            )
            eval_default = evaluate_agent(
                agent,
                eval_env_default,
                n_episodes=N_EVAL_EPISODES,
                deterministic=True,
            )

            # Prefer the trainer-reported time; the wall-clock fallback also
            # includes the two evaluations above.
            training_time = stats.get('training_time', time.time() - start_time)

            reward_results.append({
                'config_id': i,
                'reward_name': reward_name,
                **config,
                'mean_reward_shaped': eval_shaped['mean_reward'],
                'std_reward_shaped': eval_shaped['std_reward'],
                'success_rate_default': eval_default['success_rate'],
                'mean_length_default': eval_default['mean_length'],
                'training_time': training_time,
            })

            print(f"  Mean reward (shaped): {eval_shaped['mean_reward']:.2f}")
            print(f"  Success rate (default): {eval_default['success_rate']:.2%}")
        except Exception as exc:
            # Deliberate best effort: a failing configuration is reported and
            # skipped so the remaining configurations still run.
            print(f'  ERROR: {exc}')
        finally:
            if eval_env_default is not None:
                eval_env_default.close()
            if env is not None:
                env.close()

    reward_df = pd.DataFrame(reward_results)
    if reward_df.empty:
        # Every configuration failed. An empty frame has no
        # 'mean_reward_shaped' column, so sorting would raise KeyError and
        # abort the remaining reward functions.
        print(f'\nNo successful configurations for reward: {reward_name}; nothing saved.')
        continue

    # Rank configurations by shaped reward, best first.
    reward_df = reward_df.sort_values(by='mean_reward_shaped', ascending=False)

    reward_csv = RESULTS_DIR / f'ppo_tuning_{reward_name}.csv'
    reward_df.to_csv(reward_csv, index=False)
    print(f'\nSaved PPO tuning results to {reward_csv}')

    best_row = reward_df.iloc[0]
    best_config_id = int(best_row['config_id'])
    # `config_id` is the 1-based position in `configs`, so this recovers the
    # sampled dict with its native Python values. Values read back from a
    # mixed-dtype DataFrame row are NumPy scalars, which may not serialize
    # cleanly — NOTE(review): confirm how save_optimized_config writes them.
    best_params = dict(configs[best_config_id - 1])
    best_yaml = RESULTS_DIR / f'ppo_{reward_name}_best.yaml'
    save_optimized_config(
        best_params=best_params,
        base_config_path=ROOT_DIR / 'src/reinforcement_learning_taxi/configs/ppo_config_baseline.yaml',
        output_path=best_yaml,
        algorithm='PPO',
    )
    summary_rows.append({
        'reward_name': reward_name,
        'best_config_id': best_config_id,
        'best_mean_reward_shaped': float(best_row['mean_reward_shaped']),
        'best_success_rate_default': float(best_row['success_rate_default']),
    })

# One row per reward function that produced at least one successful run.
summary_df = pd.DataFrame(summary_rows)
summary_csv = RESULTS_DIR / 'ppo_tuning_summary.csv'
summary_df.to_csv(summary_csv, index=False)
print(f'\nSaved PPO tuning summary to {summary_csv}')


PPO tuning for reward: default

Configuration 1/20: {'learning_rate': 0.0001, 'n_steps': 512, 'batch_size': 128, 'clip_range': 0.2, 'ent_coef': 0.0}
Eval num_timesteps=20000, episode_reward=-195.72 +/- 29.96
Episode length: 196.14 +/- 27.02
New best mean reward!
Eval num_timesteps=40000, episode_reward=-195.75 +/- 29.75
Episode length: 196.17 +/- 26.81
Eval num_timesteps=60000, episode_reward=-200.00 +/- 0.00
Episode length: 200.00 +/- 0.00
Eval num_timesteps=80000, episode_reward=-191.63 +/- 41.01
Episode length: 192.47 +/- 36.89
New best mean reward!
Eval num_timesteps=100000, episode_reward=-170.96 +/- 71.98
Episode length: 173.90 +/- 64.69
New best mean reward!
Eval num_timesteps=120000, episode_reward=-162.44 +/- 80.18
Episode length: 166.22 +/- 72.11
New best mean reward!
Eval num_timesteps=140000, episode_reward=-81.34 +/- 103.08
Episode length: 93.31 +/- 92.68
New best mean reward!
Eval num_timesteps=160000, episode_reward=-121.24 +/- 100.61
Episode length: 129.22 +/- 90.42
Ev