# PPO Training for Lateral Control

This notebook implements **Proximal Policy Optimization (PPO)** for lateral vehicle control.

**Optimized for:**
- Single GPU (A100) training
- JupyterLab environment
- No root permissions required
- Local training (no AWS/cloud infrastructure)

In [2]:
# Scratch cell (unrelated to training): build the integers 0..9 and display them.
# The former bare `a[-2:]` expression was dropped: only the last expression of a
# cell is displayed, so it computed a slice and threw it away.
a = list(range(10))
a

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

## 1. Setup Environment and Dependencies

Install and import required libraries.

In [None]:
# Install dependencies (uncomment if needed; %pip installs into the running kernel's environment)
# %pip install torch numpy pandas pyyaml tensorboard matplotlib tqdm gym

import os
import sys
import yaml
import glob
import random
import numpy as np
import pandas as pd
import torch
import matplotlib.pyplot as plt
from pathlib import Path
from tqdm.notebook import tqdm
from IPython.display import clear_output

# Add current directory to path
sys.path.append(os.getcwd())

# Import custom modules
from environment import LateralControlEnv, VectorizedLateralControlEnv
from policy import PPOAgent
from utils import RolloutBuffer, Logger

# Report the PyTorch build and, when a CUDA device is visible, its name and memory.
cuda_ready = torch.cuda.is_available()
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {cuda_ready}")
if cuda_ready:
    print(f"CUDA device: {torch.cuda.get_device_name(0)}")
    # total_memory is divided by 1e9 to display it in GB.
    gpu_props = torch.cuda.get_device_properties(0)
    print(f"CUDA memory: {gpu_props.total_memory / 1e9:.2f} GB")

## 2. Load Configuration

Load training hyperparameters from YAML config file.

In [None]:
# Load configuration
# The encoding is given explicitly: without it open() decodes with the
# locale's default encoding, which can mis-read a UTF-8 YAML file.
# safe_load builds only plain Python objects (dicts, lists, scalars).
with open('config.yaml', 'r', encoding='utf-8') as f:
    config = yaml.safe_load(f)

# Set random seeds
def set_seed(seed):
    """Seed Python's `random`, NumPy and PyTorch (CPU and CUDA) with one value.

    Also switches cuDNN to deterministic mode and turns its benchmark mode off.

    Args:
        seed: Seed value handed unchanged to every generator.
    """
    # Every random number generator used by the notebook gets the same seed.
    for seed_fn in (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    ):
        seed_fn(seed)

    # cuDNN flags: deterministic kernels on, benchmark mode off.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Seed all RNGs from the configured seed before anything random happens.
set_seed(config['seed'])

# Print key configuration
config_summary = (
    "Configuration:",
    f"  Seed: {config['seed']}",
    f"  Number of environments: {config['env']['num_envs']}",
    f"  Learning rate: {config['ppo']['learning_rate']}",
    f"  Total timesteps: {config['training']['total_timesteps']}",
    f"  Device: {config['device']}",
)
print("\n".join(config_summary))

## 3. Create Environment

Set up vectorized lateral control environment.

In [None]:
# Get training data files: every *.csv under <data_path>/<platform>/ for each
# configured platform.
# glob returns paths in arbitrary (filesystem-dependent) order, so each
# platform's list is sorted to keep the seeded random.sample below reproducible.
data_files = []
for platform in config['env']['platforms']:
    platform_files = glob.glob(os.path.join(config['env']['data_path'], platform, '*.csv'))
    data_files.extend(sorted(platform_files))

print(f"Found {len(data_files)} training files")

# Fail early with a clear message instead of handing an empty file list to the
# environment constructor.
if not data_files:
    raise FileNotFoundError(
        f"No training CSV files found under {config['env']['data_path']!r} "
        f"for platforms {config['env']['platforms']!r}"
    )

# Sample files for parallel environments (never more than the files available)
requested_num_envs = config['env']['num_envs']
sampled_files = random.sample(data_files, min(requested_num_envs, len(data_files)))

# Create vectorized environment
model_path = '../models/tinyphysics.onnx'
envs = VectorizedLateralControlEnv(
    sampled_files,
    model_path=model_path,
    max_steps=config['env']['max_steps']
)

# Later cells size the rollout buffer and the advantage arrays with `num_envs`,
# so it must match the environments that were actually created, not the
# configured value (the two differ when fewer files than requested exist).
num_envs = envs.num_envs
if num_envs != requested_num_envs:
    print(
        f"Warning: requested {requested_num_envs} environments but only "
        f"{num_envs} were created ({len(data_files)} data files available)"
    )

print(f"Created {envs.num_envs} parallel environments")
print(f"Observation space: {envs.observation_space}")
print(f"Action space: {envs.action_space}")

## 4. Initialize PPO Agent

Create the PPO agent with Actor-Critic network.

In [None]:
# Sizes of the observation and action vectors: the first entry of each space's shape.
obs_dim = envs.observation_space.shape[0]
action_dim = envs.action_space.shape[0]

# Build the PPO agent. Every hyperparameter below is copied from the `ppo`
# section of the config under the keyword of the same name.
agent = PPOAgent(
    obs_dim=obs_dim,
    action_dim=action_dim,
    device=config['device'],
    **{
        hyperparam: config['ppo'][hyperparam]
        for hyperparam in (
            'hidden_sizes',
            'activation',
            'learning_rate',
            'gamma',
            'gae_lambda',
            'clip_ratio',
            'value_coef',
            'entropy_coef',
            'max_grad_norm',
        )
    },
)

# Parameter count of the network held in `agent.ac`.
total_params = sum(p.numel() for p in agent.ac.parameters())

print("Created PPO agent:")
print(f"  Observation dim: {obs_dim}")
print(f"  Action dim: {action_dim}")
print(f"  Hidden sizes: {config['ppo']['hidden_sizes']}")
print(f"  Total parameters: {total_params:,}")

## 5. Training Loop

Main PPO training loop with progress tracking.

In [None]:
# Make sure the checkpoint and log directories exist.
for dir_key in ('model_save_path', 'log_dir'):
    os.makedirs(config['training'][dir_key], exist_ok=True)

# Rollout storage, sized by number of environments and steps per rollout.
buffer = RolloutBuffer(
    action_dim=action_dim,
    obs_dim=obs_dim,
    num_steps=config['ppo']['num_steps'],
    num_envs=num_envs,
)

# Metric histories, one list per metric; the training loop appends to them.
training_stats = {
    metric: []
    for metric in (
        'episode_rewards',
        'episode_costs',
        'episode_lengths',
        'policy_losses',
        'value_losses',
        'entropies',
    )
}

# Training schedule: one update consumes num_steps * num_envs environment steps.
num_steps = config['ppo']['num_steps']
total_timesteps = config['training']['total_timesteps']
num_updates = total_timesteps // (num_envs * num_steps)

print("Starting PPO training:")
print(f"  Total updates: {num_updates}")
print(f"  Total timesteps: {total_timesteps}")
print(f"  Steps per update: {num_steps}")
print(f"  Num environments: {num_envs}")

In [None]:
# Main training loop
# Each update: collect `num_steps` transitions from every environment, compute
# GAE advantages/returns per environment, run the PPO update, record metrics,
# and periodically save a checkpoint.
global_step = 0

# Environment steps consumed by one update (all parallel environments together).
steps_per_update = num_steps * num_envs
# Checkpoint interval measured in updates. Clamped to >= 1: a save_freq smaller
# than one rollout would otherwise make the interval 0 and the modulo below
# raise ZeroDivisionError.
save_interval = max(1, config['training']['save_freq'] // steps_per_update)

for update in tqdm(range(num_updates), desc="Training Progress"):
    # Collect rollouts
    # NOTE(review): the environments are reset at the start of every update, so
    # an episode longer than `num_steps` never finishes and no episode metrics
    # are recorded for it. Confirm this is intended.
    obs = envs.reset()
    episode_rewards_batch = []
    episode_costs_batch = []
    episode_lengths_batch = []

    # Per-environment accumulators for the episode currently in progress.
    running_returns = np.zeros(num_envs, dtype=np.float64)
    running_lengths = np.zeros(num_envs, dtype=np.int64)

    for step in range(num_steps):
        # Get actions
        actions, values = agent.get_action(obs)

        # Compute log probs of the sampled actions (get_action does not return
        # them, so they are re-evaluated here without tracking gradients).
        with torch.no_grad():
            obs_tensor = torch.FloatTensor(obs).to(agent.device)
            action_tensor = torch.FloatTensor(actions).to(agent.device)
            _, log_probs, _ = agent.ac.evaluate_actions(obs_tensor, action_tensor)
            log_probs = log_probs.cpu().numpy().flatten()

        # Step environments
        next_obs, rewards, dones, infos = envs.step(actions)

        # Store transition
        buffer.add(obs, actions, rewards, values.flatten(), log_probs, dones)

        # Track episodes: accumulate each environment's own reward and step
        # count, and record them when that environment reports `done`.
        # (Previously the "episode reward" was the sum of info['cost'] over all
        # environments at the terminal step, which is not an episode return.)
        # assumes `rewards` holds one scalar per environment — TODO confirm
        running_returns += np.asarray(rewards, dtype=np.float64).reshape(-1)
        running_lengths += 1
        for i, (done, info) in enumerate(zip(dones, infos)):
            if done:
                episode_rewards_batch.append(float(running_returns[i]))
                episode_lengths_batch.append(int(running_lengths[i]))
                episode_costs_batch.append(info.get('total_cost', 0))
                running_returns[i] = 0.0
                running_lengths[i] = 0

        obs = next_obs

    # Compute advantages and returns, bootstrapping from the value estimate of
    # the observation that follows the last collected step.
    with torch.no_grad():
        _, next_values = agent.get_action(obs)

    advantages = np.zeros((num_steps, num_envs), dtype=np.float32)
    returns = np.zeros((num_steps, num_envs), dtype=np.float32)

    # GAE is computed independently for each environment's column of the buffer.
    for env_idx in range(num_envs):
        env_rewards = buffer.rewards[:, env_idx]
        env_values = buffer.values[:, env_idx]
        env_dones = buffer.dones[:, env_idx]
        env_next_value = next_values[env_idx]

        env_advantages, env_returns = agent.compute_gae(
            env_rewards, env_values, env_dones, env_next_value
        )

        advantages[:, env_idx] = env_advantages
        returns[:, env_idx] = env_returns

    # Get rollout data and update
    rollout_data = buffer.get(next_values=None, advantages=advantages, returns=returns)
    train_stats = agent.update(
        rollout_data,
        num_epochs=config['ppo']['num_epochs'],
        batch_size=config['ppo']['batch_size']
    )

    # Update statistics (episode metrics only when an episode ended this update)
    if episode_rewards_batch:
        training_stats['episode_rewards'].append(np.mean(episode_rewards_batch))
    if episode_costs_batch:
        training_stats['episode_costs'].append(np.mean(episode_costs_batch))
    if episode_lengths_batch:
        training_stats['episode_lengths'].append(np.mean(episode_lengths_batch))
    training_stats['policy_losses'].append(train_stats['policy_loss'])
    training_stats['value_losses'].append(train_stats['value_loss'])
    training_stats['entropies'].append(train_stats['entropy'])

    # Update global step
    global_step += steps_per_update

    # Print progress
    if update % 10 == 0:
        clear_output(wait=True)
        print(f"Update {update}/{num_updates} | Step {global_step}/{total_timesteps}")
        if training_stats['episode_rewards']:
            print(f"  Mean Reward: {training_stats['episode_rewards'][-1]:.2f}")
        if training_stats['episode_costs']:
            print(f"  Mean Cost: {training_stats['episode_costs'][-1]:.4f}")
        print(f"  Policy Loss: {train_stats['policy_loss']:.4f}")
        print(f"  Value Loss: {train_stats['value_loss']:.4f}")

    # Save checkpoint
    if update % save_interval == 0:
        save_path = os.path.join(
            config['training']['model_save_path'],
            f"checkpoint_{global_step}.pt"
        )
        agent.save(save_path)

    # Reset buffer
    buffer.reset()

# A real newline here; the escaped "\\n" used to print a literal backslash-n.
print("\nTraining complete!")

## 6. Visualize Training Metrics

Plot training curves to monitor progress.

In [None]:
# Plot training metrics on a 2x2 grid, one panel per metric.
fig, axs = plt.subplots(2, 2, figsize=(15, 10))

# (axes, training_stats key, title, y-axis label, draw even when empty)
panel_specs = (
    (axs[0, 0], 'episode_rewards', 'Episode Rewards', 'Mean Reward', False),
    (axs[0, 1], 'episode_costs', 'Episode Costs', 'Mean Cost', False),
    (axs[1, 0], 'policy_losses', 'Policy Loss', 'Loss', True),
    (axs[1, 1], 'value_losses', 'Value Loss', 'Loss', True),
)

for ax, stat_key, title, ylabel, always_draw in panel_specs:
    series = training_stats[stat_key]
    # The episode panels are left untouched when no episode metrics were recorded.
    if not (always_draw or series):
        continue
    ax.plot(series)
    ax.set_title(title)
    ax.set_xlabel('Update')
    ax.set_ylabel(ylabel)
    ax.grid(True)

plt.tight_layout()
# The figure is written next to the model checkpoints before being displayed.
plt.savefig(os.path.join(config['training']['model_save_path'], 'training_curves.png'))
plt.show()

## 7. Save Model

Save the final trained model and the training statistics, then close the environments.

In [None]:
# Both artifacts are written into the configured model directory.
save_dir = config['training']['model_save_path']
final_model_path = os.path.join(save_dir, "final_model.pt")
stats_path = os.path.join(save_dir, "training_stats.npz")

# Final model weights
agent.save(final_model_path)
print(f"Model saved to: {final_model_path}")

# Training statistics: each metric list becomes one named array in the .npz file
np.savez(stats_path, **training_stats)
print(f"Training stats saved to: {stats_path}")

# Close the environments now that training is finished
envs.close()
print("Training complete!")