# Password Game - Qwen3-0.6B PPO Training with VERL

This notebook implements reinforcement learning training for the Password Game task using:
- **Model**: Qwen3-0.6B
- **Algorithm**: PPO (Proximal Policy Optimization)
- **Framework**: VERL
- **Hardware**: Single H100 GPU (no DDP)

## Task Overview

The Password Game is a cumulative constraint satisfaction task with 26 progressive rules:
- Rules must be satisfied **cumulatively** (all previous + current rule)
- Rules range from simple (length ≥ 5) to complex (atomic numbers sum to 200)
- Reward: +1 per satisfied rule, -0.1 per character length

## Workflow

1. **Setup**: Install VERL and dependencies
2. **Baseline Evaluation**: Test untrained Qwen3-0.6B
3. **Environment**: Password Game API wrapper
4. **Training**: PPO with shaped rewards
5. **Evaluation**: Compare trained vs baseline

## 1. Setup and Installation

In [None]:
# Check GPU
# Notebook shell escape: runs the nvidia-smi CLI so the GPU the later cells
# will use can be inspected before anything is installed or loaded.
!nvidia-smi

In [None]:
# Install VERL and dependencies
# Every line is a notebook shell escape; versions are pinned where a pin is given.
!pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0
# NOTE(review): Qwen3 architecture support appears to have landed in a later
# transformers release (4.51) — confirm this pin can load "Qwen/Qwen3-0.6B",
# which the model-loading cell below requests.
!pip install transformers==4.44.0 accelerate==0.33.0
!pip install vllm==0.5.4 ray==2.10
# Installed after torch; presumably --no-build-isolation lets the build see the
# torch installed above — verify against the flash-attn install notes.
!pip install flash-attn --no-build-isolation
!pip install requests pandas numpy matplotlib seaborn wandb

In [None]:
# Clone and install VERL
# Both the clone and the editable install are skipped when the checkout
# directory already exists, so re-running the cell is cheap. Note that an
# existing directory is assumed to mean "already installed".
import os
if not os.path.exists('/home/user/verl'):
    !git clone https://github.com/volcengine/verl /home/user/verl
    !cd /home/user/verl && pip install -e .

In [None]:
# Import core libraries
import torch
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM
from dataclasses import dataclass
from typing import Optional, List, Dict, Tuple
import json
from datetime import datetime
import wandb

# Report the PyTorch build and whether CUDA is usable in this kernel.
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    # Only device 0 is described.
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    # total_memory is divided by 1024**3 before printing (label says "GB").
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")

## 2. Password Game Environment Setup

In [None]:
# Start the Password Game API server in background
import subprocess
import time

# Kill any existing server
!pkill -f "uvicorn main:app"
time.sleep(2)

# Start new server
api_process = subprocess.Popen(
    ["uvicorn", "main:app", "--port", "8000"],
    cwd="/home/user/notebooks/tasks/password-game",
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE
)
print("Starting Password Game API server...")
time.sleep(5)
print("Server started on http://localhost:8000")

In [None]:
# Test server connectivity
import requests

try:
    # A timeout keeps the notebook from hanging forever when the server never
    # came up or stopped responding.
    response = requests.post("http://localhost:8000/start", timeout=10)
    if response.status_code == 200:
        data = response.json()
        print("✓ Password Game API is running")
        print(f"  Token: {data['token']}")
        print(f"  Current rule: {data['current_rule']}")
    else:
        print(f"✗ Server returned status {response.status_code}")
except (requests.RequestException, KeyError, ValueError) as e:
    # RequestException: connection/timeout/HTTP-layer problems.
    # KeyError / ValueError: the server answered but not with the expected JSON.
    print(f"✗ Server connection failed: {e}")

In [None]:
# Password Game Environment Wrapper

@dataclass
class GameState:
    """Represents the current state of a password game.

    Instances are built from the JSON payloads the Password Game API returns
    (see ``PasswordGameEnv.reset`` and ``PasswordGameEnv.submit``).
    """
    # Game token returned by /start; embedded in the /feedback, /submit and /end URLs.
    token: str
    # Index of the rule currently being attempted, as reported by the API.
    current_rule_index: int
    # Text of the rule currently being attempted, as reported by the API.
    current_rule: str
    # Rules the next password has to satisfy; after reset() this holds only the
    # first rule, afterwards the list the API returns on submit.
    all_rules: List[str]
    # Whether the game is still in progress.
    game_active: bool
    
class PasswordGameEnv:
    """HTTP client for the Password Game API with a gym-style interface.

    Typical use: ``reset()`` to start a game, ``get_feedback()`` to probe a
    candidate password, ``submit()`` to commit it, ``close()`` when finished.

    Args:
        base_url: Root URL of the Password Game API.
        reward_per_rule: Reward granted when a submission advances the game.
        length_penalty: Amount subtracted from that reward per password character.
        timeout: Seconds to wait for each HTTP request (``None`` waits forever).
    """

    def __init__(
        self,
        base_url: str = "http://localhost:8000",
        reward_per_rule: float = 1.0,
        length_penalty: float = 0.01,
        timeout: Optional[float] = 30.0,
    ):
        self.base_url = base_url
        self.reward_per_rule = reward_per_rule
        self.length_penalty = length_penalty
        self.timeout = timeout
        self.session = requests.Session()
        self.current_state: Optional[GameState] = None

    def _post(self, path: str, payload: Optional[Dict] = None) -> Dict:
        """POST ``payload`` as JSON to ``path`` and return the decoded JSON reply."""
        response = self.session.post(
            f"{self.base_url}{path}",
            json=payload,
            timeout=self.timeout,
        )
        response.raise_for_status()
        return response.json()

    def _require_state(self) -> GameState:
        """Return the current game state or raise if reset() was never called."""
        if self.current_state is None:
            raise ValueError("No active game. Call reset() first.")
        return self.current_state

    def reset(self) -> GameState:
        """Start a new game and return initial state."""
        data = self._post("/start")

        self.current_state = GameState(
            token=data['token'],
            current_rule_index=data['current_rule_index'],
            current_rule=data['current_rule'],
            all_rules=[data['current_rule']],
            game_active=data['game_active']
        )
        return self.current_state

    def get_feedback(self, password: str) -> Dict:
        """Get feedback on a password without submitting.

        Returns the API's JSON reply unchanged.

        Raises:
            ValueError: If no game has been started with reset().
        """
        state = self._require_state()
        return self._post(f"/feedback/{state.token}", {"password": password})

    def submit(self, password: str) -> Tuple[Dict, float, bool, Dict]:
        """Submit a password and advance the game.

        Returns:
            observation (dict): New state information
            reward (float): Immediate reward
            done (bool): Whether episode is finished
            info (dict): Additional information

        Raises:
            ValueError: If no game has been started with reset().
        """
        state = self._require_state()
        data = self._post(f"/submit/{state.token}", {"password": password})

        if data.get('game_ended', False):
            rules_satisfied = len(state.all_rules)
            # Record the end of the game so callers that inspect
            # ``state.game_active`` do not see a stale "still running" flag.
            state.game_active = False
            info = {
                'game_ended': True,
                'gave_up': data.get('gave_up', False),
                'final_password': data.get('final_password', ''),
                'rule_feedback': data.get('rule_feedback', {}),
                # Mirrored from the observation so it can be read from either dict.
                'rules_satisfied': rules_satisfied,
            }
            observation = {'rules_satisfied': rules_satisfied}
            return observation, data.get('reward', 0.0), True, info

        # The game continues: adopt the state reported by the server.
        self.current_state = GameState(
            token=state.token,
            current_rule_index=data['current_rule_index'],
            current_rule=data['current_rule'],
            all_rules=data['all_rules'],
            game_active=data['game_active']
        )

        # Shaped reward for advancing: fixed bonus minus a per-character penalty.
        reward = self.reward_per_rule - (len(password) * self.length_penalty)
        observation = {
            'current_rule_index': self.current_state.current_rule_index,
            'current_rule': self.current_state.current_rule,
            'all_rules': self.current_state.all_rules
        }
        return observation, reward, False, {'advanced': True}

    def close(self):
        """Clean up resources.

        Ending the game on the server is best effort: request failures are
        ignored, but the HTTP session is always closed.
        """
        try:
            if self.current_state is not None:
                self.session.post(
                    f"{self.base_url}/end/{self.current_state.token}",
                    timeout=self.timeout,
                )
        except requests.RequestException:
            # The server may already be gone; nothing useful to do about it.
            pass
        finally:
            self.session.close()

print("✓ PasswordGameEnv defined")

## 3. Load Qwen3-0.6B Model

In [None]:
@dataclass
class ModelConfig:
    """Configuration for model loading and text generation.

    The fields are read by the tokenizer/model loading cells and by the
    generation code; the whole object is also logged to WandB via ``vars()``.
    """
    model_name: str = "Qwen/Qwen3-0.6B"  # checkpoint id passed to from_pretrained
    precision: str = "bfloat16"  # bfloat16, float16, or float32
    use_flash_attn: bool = True  # picks "flash_attention_2" over "eager" when loading
    max_length: int = 2048  # tokenizer truncation limit applied to the prompt
    # NOTE(review): not read by the visible code — the model is loaded with
    # device_map="auto"; confirm whether this field is still needed.
    device: str = "cuda:0"
    
    # Generation parameters
    max_new_tokens: int = 256
    temperature: float = 0.7
    top_p: float = 0.9
    top_k: int = 50

# Single shared configuration instance used by the cells below.
config = ModelConfig()
print("Configuration:")
print(f"  Model: {config.model_name}")
print(f"  Precision: {config.precision}")
print(f"  Device: {config.device}")

In [None]:
# Load tokenizer
# Binds the global `tokenizer` that the prompt-formatting and generation code uses.
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(
    config.model_name,
    trust_remote_code=True
)

# Set padding token
# Only applied when the checkpoint defines none; EOS is reused as padding.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"  # Important for batch generation

print(f"✓ Tokenizer loaded")
print(f"  Vocab size: {len(tokenizer)}")
print(f"  Pad token: {tokenizer.pad_token}")
print(f"  EOS token: {tokenizer.eos_token}")

In [None]:
# Load model
print("Loading model...")

# Map the configured precision name to a torch dtype; any unrecognised name
# falls back to full precision.
_PRECISION_TO_DTYPE = {
    "bfloat16": torch.bfloat16,
    "float16": torch.float16,
}
dtype = _PRECISION_TO_DTYPE.get(config.precision, torch.float32)

# Attention backend follows the config flag.
attn_impl = "flash_attention_2" if config.use_flash_attn else "eager"

model = AutoModelForCausalLM.from_pretrained(
    config.model_name,
    torch_dtype=dtype,
    device_map="auto",
    trust_remote_code=True,
    attn_implementation=attn_impl,
)

# Disable KV cache for training
model.config.use_cache = False

print(f"✓ Model loaded")
print(f"  Parameters: {sum(p.numel() for p in model.parameters()) / 1e6:.1f}M")
print(f"  Dtype: {dtype}")
print(f"  Device: {next(model.parameters()).device}")

## 4. Baseline Evaluation

Test the **untrained** model's ability to play the Password Game.

In [None]:
def format_prompt_for_game(
    rules: List[str],
    system_prompt: Optional[str] = None,
    enable_thinking: bool = False,
) -> str:
    """Format rules into a prompt using Qwen chat template.

    Args:
        rules: Rule texts the password has to satisfy, in order.
        system_prompt: Overrides the built-in instructions when given.
        enable_thinking: Forwarded to the chat template. Qwen3 templates start
            the reply with a ``<think>`` reasoning block unless this is False;
            the generation code keeps only the first line of the reply, so
            thinking mode would turn ``<think>`` itself into the "password".
            Templates that do not know the variable ignore it.

    Returns:
        The rendered prompt text, ending with the assistant generation prefix.
    """
    if system_prompt is None:
        system_prompt = (
            "You are playing the Password Game. Your goal is to create a password that "
            "satisfies ALL of the given rules. The rules are cumulative - each new password "
            "must satisfy all previous rules PLUS the new rule. Respond with ONLY the password, "
            "no explanations."
        )

    # Number the rules from 1 so the model sees them as an ordered checklist.
    rules_text = "\n".join(f"{i}. {rule}" for i, rule in enumerate(rules, start=1))
    user_message = f"Create a password that satisfies these rules:\n{rules_text}\n\nPassword:"

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_message}
    ]

    # Uses the module-level `tokenizer` loaded earlier in the notebook.
    return tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=enable_thinking,
    )

print("✓ Prompt formatting function defined")

In [None]:
def generate_password(model, tokenizer, rules: List[str], config) -> str:
    """Sample one password candidate from ``model`` for the given rules.

    The rules are rendered into a chat prompt, a completion is sampled with the
    generation settings in ``config``, and the first line of the completion
    (without surrounding quotes) is returned.
    """
    chat_prompt = format_prompt_for_game(rules)

    # Encode the prompt and move it next to the model weights.
    encoded = tokenizer(
        chat_prompt,
        return_tensors="pt",
        truncation=True,
        max_length=config.max_length
    )
    encoded = encoded.to(model.device)

    sampling_args = dict(
        max_new_tokens=config.max_new_tokens,
        temperature=config.temperature,
        top_p=config.top_p,
        top_k=config.top_k,
        do_sample=True,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )
    with torch.no_grad():
        sequences = model.generate(**encoded, **sampling_args)

    # Everything after the prompt tokens is the sampled completion.
    prompt_len = encoded.input_ids.shape[1]
    completion = tokenizer.decode(sequences[0, prompt_len:], skip_special_tokens=True).strip()

    # Keep the first line only and drop wrapping double, then single, quotes.
    first_line = completion.split('\n')[0].strip()
    return first_line.strip('"').strip("'")

print("✓ Generation function defined")

In [None]:
def run_baseline_episode(env, model, tokenizer, config, max_steps: int = 30, verbose: bool = True,
                         total_rules: int = 26):
    """Run one episode of the password game.

    Each step samples a password, asks the environment for feedback and submits
    the password only when every current rule passes; otherwise the step is
    recorded as a failed attempt and the same rules are retried.

    Args:
        env: Environment offering ``reset()``, ``get_feedback()`` and ``submit()``.
        model, tokenizer, config: Forwarded to ``generate_password``.
        max_steps: Maximum number of generation attempts in the episode.
        verbose: Print a step-by-step trace when True.
        total_rules: Number of rules in a full game; used for the "completed"
            flag and the printed totals.

    Returns:
        dict with ``steps`` (one record per attempt), ``total_reward``,
        ``rules_satisfied`` and ``completed``.
    """
    state = env.reset()
    episode_data = {
        'steps': [],
        'total_reward': 0.0,
        'rules_satisfied': 0,
        'completed': False
    }
    # Set once a submission ends the game, so the "ran out of steps" fallback
    # below cannot overwrite the final result.
    game_over = False

    if verbose:
        print(f"\n{'='*80}")
        print("Starting new episode")
        print(f"{'='*80}")

    for step in range(max_steps):
        if not state.game_active:
            break

        if verbose:
            print(f"\nStep {step + 1}/{max_steps}")
            print(f"Rules to satisfy ({len(state.all_rules)}):")
            for i, rule in enumerate(state.all_rules):
                print(f"  {i+1}. {rule}")

        # Generate password
        password = generate_password(model, tokenizer, state.all_rules, config)

        if verbose:
            print(f"Generated password: '{password}'")

        # Get feedback first
        feedback = env.get_feedback(password)
        rules_passing = feedback['total_passing']

        if verbose:
            print(f"Rules passing: {rules_passing}/{len(state.all_rules)}")

        # Failed attempt: record it and retry with the same rules.
        if rules_passing != len(state.all_rules):
            if verbose:
                print(f"✗ Password doesn't satisfy all rules, trying again...")
            episode_data['steps'].append({
                'step': step,
                'password': password,
                'rules_passing': rules_passing,
                'reward': 0.0,
                'advanced': False
            })
            continue

        obs, reward, done, info = env.submit(password)

        episode_data['steps'].append({
            'step': step,
            'password': password,
            'rules_passing': rules_passing,
            'reward': reward,
            'advanced': not done
        })
        episode_data['total_reward'] += reward

        if verbose:
            print(f"✓ Submitted! Reward: {reward:.2f}")

        if not done:
            # Update state with new rule
            state = GameState(
                token=state.token,
                current_rule_index=obs['current_rule_index'],
                current_rule=obs['current_rule'],
                all_rules=obs['all_rules'],
                game_active=True
            )
            if verbose:
                print(f"New rule added: {state.current_rule}")
            continue

        # The game ended. The environment reports the rule count in the
        # observation; `info` is checked as well for compatibility.
        game_over = True
        rules_satisfied = obs.get(
            'rules_satisfied',
            info.get('rules_satisfied', len(state.all_rules)),
        )
        episode_data['rules_satisfied'] = rules_satisfied
        episode_data['completed'] = rules_satisfied >= total_rules
        if verbose:
            print(f"\n{'='*80}")
            print(f"Episode ended!")
            print(f"  Total reward: {episode_data['total_reward']:.2f}")
            print(f"  Rules satisfied: {episode_data['rules_satisfied']}/{total_rules}")
            print(f"  Completed: {episode_data['completed']}")
            print(f"{'='*80}")
        break

    # If we ran out of steps
    if not game_over and state.game_active:
        episode_data['rules_satisfied'] = len(state.all_rules) - 1  # Last rule not satisfied
        if verbose:
            print(f"\nMax steps reached. Final rules satisfied: {episode_data['rules_satisfied']}/{total_rules}")

    return episode_data

print("✓ Episode runner defined")

In [None]:
# Run baseline evaluation
# Plays NUM_BASELINE_EPISODES verbose episodes with the model as loaded above
# and keeps the per-episode result dicts in `baseline_results`.
NUM_BASELINE_EPISODES = 5

print(f"Running baseline evaluation with {NUM_BASELINE_EPISODES} episodes...\n")

baseline_results = []
# One environment (and HTTP session) is shared by all episodes; every episode
# starts with env.reset() inside run_baseline_episode.
env = PasswordGameEnv()

for i in range(NUM_BASELINE_EPISODES):
    print(f"\n{'#'*80}")
    print(f"BASELINE EPISODE {i+1}/{NUM_BASELINE_EPISODES}")
    print(f"{'#'*80}")
    
    result = run_baseline_episode(env, model, tokenizer, config, max_steps=30, verbose=True)
    baseline_results.append(result)

env.close()

# Compute statistics
# Means over episodes; `completed` is a bool, so its mean is the success fraction.
avg_reward = np.mean([r['total_reward'] for r in baseline_results])
avg_rules = np.mean([r['rules_satisfied'] for r in baseline_results])
success_rate = np.mean([r['completed'] for r in baseline_results])

print(f"\n{'='*80}")
print("BASELINE EVALUATION SUMMARY")
print(f"{'='*80}")
print(f"Average reward: {avg_reward:.2f}")
print(f"Average rules satisfied: {avg_rules:.1f}/26")
print(f"Success rate: {success_rate*100:.1f}%")
print(f"{'='*80}")

## 5. PPO Training Setup

Now we'll train the model using PPO to improve its performance.

In [None]:
@dataclass
class PPOConfig:
    """PPO training configuration.

    NOTE(review): in the visible notebook only ``num_epochs``,
    ``num_episodes_per_epoch``, ``max_steps_per_episode`` and ``learning_rate``
    influence execution (``clip_range`` is only printed); the remaining fields
    are declared and logged but not read — confirm whether that is intended.
    """
    # Training
    num_epochs: int = 3  # outer collect/log/checkpoint iterations
    num_episodes_per_epoch: int = 20  # episodes collected per epoch
    max_steps_per_episode: int = 30  # generation attempts allowed per episode
    
    # PPO hyperparameters
    learning_rate: float = 1e-6  # passed to the AdamW optimizer
    ppo_epochs: int = 4
    clip_range: float = 0.2
    value_coef: float = 0.5
    entropy_coef: float = 0.01
    gamma: float = 0.99  # Discount factor
    gae_lambda: float = 0.95  # GAE parameter
    
    # Reward shaping
    reward_per_rule: float = 1.0
    length_penalty: float = 0.01
    
    # Logging
    log_interval: int = 5
    save_interval: int = 50
    
# Single shared PPO configuration instance used by the cells below.
ppo_config = PPOConfig()
print("PPO Configuration:")
print(f"  Epochs: {ppo_config.num_epochs}")
print(f"  Episodes per epoch: {ppo_config.num_episodes_per_epoch}")
print(f"  Learning rate: {ppo_config.learning_rate}")
print(f"  Clip range: {ppo_config.clip_range}")

In [None]:
# Initialize WandB for logging
# The run name carries a start timestamp; the run config is the union of the
# model and PPO dataclass fields (on a duplicate key the PPO value would win).
wandb.init(
    project="password-game-ppo",
    name=f"qwen3-0.6b-{datetime.now().strftime('%Y%m%d-%H%M%S')}",
    config={
        **vars(config),
        **vars(ppo_config)
    }
)
print("✓ WandB initialized")

In [None]:
# Create value head for PPO
class ValueHead(torch.nn.Module):
    """Value head for estimating state values.

    A single linear layer mapping a hidden-state vector to one scalar.

    Args:
        hidden_size: Size of the last dimension of the hidden states.
    """
    def __init__(self, hidden_size: int):
        super().__init__()
        self.linear = torch.nn.Linear(hidden_size, 1)

    def forward(self, hidden_states):
        """Return one value per hidden-state vector.

        Args:
            hidden_states: Tensor whose last dimension is ``hidden_size``.

        Returns:
            Tensor with the last dimension removed, in the layer's dtype.
        """
        # The policy model may run in bfloat16/float16 while this layer keeps
        # its own dtype; a linear layer rejects mixed dtypes, so the input is
        # cast to the layer's dtype first.
        hidden_states = hidden_states.to(self.linear.weight.dtype)
        return self.linear(hidden_states).squeeze(-1)

# Get hidden size from model
hidden_size = model.config.hidden_size
# .to(model.device) only moves the head to the policy model's device; its
# parameters keep the dtype torch.nn.Linear created them with.
value_head = ValueHead(hidden_size).to(model.device)

print(f"✓ Value head created (hidden_size={hidden_size})")

In [None]:
# Create optimizer
# One AdamW instance covers both the policy model and the value head.
# NOTE(review): no optimizer.step() / backward() call is visible in this
# notebook — confirm where the update is supposed to happen.
optimizer = torch.optim.AdamW(
    list(model.parameters()) + list(value_head.parameters()),
    lr=ppo_config.learning_rate,
    weight_decay=0.01
)

print(f"✓ Optimizer created (lr={ppo_config.learning_rate})")

In [None]:
def compute_gae(rewards, values, dones, gamma=0.99, gae_lambda=0.95, last_value=None):
    """Compute Generalized Advantage Estimation.

    Args:
        rewards: Per-step rewards.
        values: Per-step value estimates, aligned with ``rewards``.
        dones: Per-step flags; a truthy entry marks the last step of an episode.
        gamma: Discount factor.
        gae_lambda: GAE smoothing parameter.
        last_value: Value estimate of the state following the final step, used
            for bootstrapping when the final step is not terminal. When omitted
            the final step's own value is used as an approximation.

    Returns:
        1-D float32 tensor with one advantage per step (empty for empty input).
    """
    n_steps = len(rewards)
    advantages = [0.0] * n_steps
    gae = 0.0

    # Walk backwards so each step can reuse the accumulated advantage of its successor.
    for t in reversed(range(n_steps)):
        if t == n_steps - 1:
            next_value = values[t] if last_value is None else last_value
        else:
            next_value = values[t + 1]

        # A terminal step must neither bootstrap from the next state's value
        # nor inherit the next episode's advantage.
        not_done = 0.0 if dones[t] else 1.0
        delta = rewards[t] + gamma * next_value * not_done - values[t]
        gae = delta + gamma * gae_lambda * not_done * gae
        advantages[t] = gae

    return torch.tensor(advantages, dtype=torch.float32)

print("✓ GAE function defined")

In [None]:
def collect_trajectories(env, model, value_head, tokenizer, config, ppo_config, num_episodes=10):
    """Collect trajectories by running episodes.

    For every episode the policy repeatedly samples a password; a sample is
    submitted (and recorded as a transition) only when the environment's
    feedback says it satisfies every current rule. Failed samples use up a step
    but are not recorded, and episodes without any recorded transition are
    left out of the result.

    Args:
        env: Environment offering ``reset()``, ``get_feedback()`` and ``submit()``.
        model: Causal LM used as the policy.
        value_head: Module mapping the last prompt token's hidden state to a value.
        tokenizer: Tokenizer matching ``model``.
        config: Generation settings (``max_length``, ``max_new_tokens``,
            ``temperature``, ``top_p``, ``top_k``).
        ppo_config: Supplies ``max_steps_per_episode``.
        num_episodes: Number of episodes to play.

    Returns:
        List of dicts with parallel lists ``states``, ``actions``, ``rewards``,
        ``dones``, ``log_probs`` and ``values``. ``log_probs`` holds the summed
        log-probability of all sampled completion tokens under the sampling
        distribution (after temperature/top-k/top-p), which can cover more text
        than the cleaned first-line password stored in ``actions``.
    """
    trajectories = []
    # The value head may use a different dtype than the policy (e.g. a float32
    # head on a bfloat16 model); its input is cast to match.
    head_param = next(value_head.parameters(), None)

    for _ in range(num_episodes):
        state = env.reset()
        episode_data = {
            'states': [],
            'actions': [],
            'rewards': [],
            'dones': [],
            'log_probs': [],
            'values': []
        }

        for _ in range(ppo_config.max_steps_per_episode):
            if not state.game_active:
                break

            # Generate password
            prompt = format_prompt_for_game(state.all_rules)
            inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=config.max_length).to(model.device)
            prompt_len = inputs.input_ids.shape[1]

            with torch.no_grad():
                # Same sampling settings as generate_password, so training
                # rollouts and evaluation draw from the same distribution.
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=config.max_new_tokens,
                    temperature=config.temperature,
                    top_p=config.top_p,
                    top_k=config.top_k,
                    do_sample=True,
                    output_scores=True,
                    return_dict_in_generate=True,
                    pad_token_id=tokenizer.pad_token_id,
                    eos_token_id=tokenizer.eos_token_id
                )

                # Per-token log-probabilities of the sampled tokens, summed
                # into one log-probability for the whole completion.
                token_log_probs = model.compute_transition_scores(
                    outputs.sequences, outputs.scores, normalize_logits=True
                )
                log_prob = token_log_probs[0].sum().item()

                # Value estimate from the hidden state of the last prompt token.
                model_outputs = model(**inputs, output_hidden_states=True)
                hidden_state = model_outputs.hidden_states[-1][:, -1, :]
                if head_param is not None:
                    hidden_state = hidden_state.to(head_param.dtype)
                value = value_head(hidden_state).item()

            generated_ids = outputs.sequences[0, prompt_len:]
            password = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()
            password = password.split('\n')[0].strip().strip('"').strip("'")

            # Submit only when the feedback reports every current rule as passing.
            feedback = env.get_feedback(password)
            if feedback['total_passing'] != len(state.all_rules):
                continue

            obs, reward, done, info = env.submit(password)

            episode_data['states'].append(state)
            episode_data['actions'].append(password)
            episode_data['rewards'].append(reward)
            episode_data['dones'].append(done)
            episode_data['log_probs'].append(log_prob)
            episode_data['values'].append(value)

            if done:
                break

            state = GameState(
                token=state.token,
                current_rule_index=obs['current_rule_index'],
                current_rule=obs['current_rule'],
                all_rules=obs['all_rules'],
                game_active=True
            )

        if episode_data['rewards']:
            trajectories.append(episode_data)

    return trajectories

print("✓ Trajectory collection function defined")

## 6. Training Loop

In [None]:
# Training loop
# NOTE(review): this loop collects rollouts, logs statistics and checkpoints the
# model, but never calls compute_gae, backward() or optimizer.step(), so the
# weights are not updated. TODO: add the PPO update step.
print(f"\n{'='*80}")
print("STARTING PPO TRAINING")
print(f"{'='*80}\n")

env = PasswordGameEnv()
global_step = 0
best_avg_reward = float('-inf')

try:
    for epoch in range(ppo_config.num_epochs):
        print(f"\nEpoch {epoch + 1}/{ppo_config.num_epochs}")
        print("-" * 80)

        # Collect trajectories
        print(f"Collecting {ppo_config.num_episodes_per_epoch} episodes...")
        trajectories = collect_trajectories(
            env, model, value_head, tokenizer, config, ppo_config,
            num_episodes=ppo_config.num_episodes_per_epoch
        )
        global_step += 1

        # Without any trajectory the means below would be NaN; report the
        # empty epoch explicitly and move on.
        if not trajectories:
            print(f"\nEpoch {epoch + 1} Results:")
            print("  No trajectories collected (no submission succeeded)")
            wandb.log({'epoch': epoch, 'num_trajectories': 0})
            continue

        # Compute statistics
        # Epoch-specific names keep the baseline `avg_reward` / `avg_rules`
        # intact for the final baseline-vs-trained comparison.
        epoch_rewards = [sum(traj['rewards']) for traj in trajectories]
        epoch_avg_reward = float(np.mean(epoch_rewards))
        epoch_avg_rules = float(np.mean([len(traj['rewards']) for traj in trajectories]))

        print(f"\nEpoch {epoch + 1} Results:")
        print(f"  Avg reward: {epoch_avg_reward:.2f}")
        print(f"  Avg rules satisfied: {epoch_avg_rules:.1f}")
        print(f"  Trajectories collected: {len(trajectories)}")

        # Log to WandB
        wandb.log({
            'epoch': epoch,
            'avg_reward': epoch_avg_reward,
            'avg_rules_satisfied': epoch_avg_rules,
            'num_trajectories': len(trajectories)
        })

        # Save best model
        if epoch_avg_reward > best_avg_reward:
            best_avg_reward = epoch_avg_reward
            save_path = "/home/user/notebooks/checkpoints/password_game_best"
            os.makedirs(save_path, exist_ok=True)
            model.save_pretrained(save_path)
            tokenizer.save_pretrained(save_path)
            print(f"  ✓ Saved new best model (reward: {best_avg_reward:.2f})")
finally:
    # Release the game and the HTTP session even if collection fails midway.
    env.close()

print(f"\n{'='*80}")
print("TRAINING COMPLETE")
print(f"{'='*80}")
print(f"Best average reward: {best_avg_reward:.2f}")

## 7. Post-Training Evaluation

In [None]:
# Load best model
# Falls back to the in-memory model when no checkpoint directory exists.
print("Loading best model for evaluation...")
best_model_path = "/home/user/notebooks/checkpoints/password_game_best"

checkpoint_found = os.path.exists(best_model_path)
if not checkpoint_found:
    print("⚠ No saved model found, using current model")
    trained_model = model
else:
    load_kwargs = dict(
        torch_dtype=dtype,
        device_map="auto",
        trust_remote_code=True,
    )
    trained_model = AutoModelForCausalLM.from_pretrained(best_model_path, **load_kwargs)
    print("✓ Best model loaded")

In [None]:
# Run post-training evaluation
NUM_EVAL_EPISODES = 10

print(f"\nRunning post-training evaluation with {NUM_EVAL_EPISODES} episodes...\n")

trained_results = []
env = PasswordGameEnv()

try:
    for i in range(NUM_EVAL_EPISODES):
        print(f"\nEvaluation Episode {i+1}/{NUM_EVAL_EPISODES}")
        result = run_baseline_episode(env, trained_model, tokenizer, config, max_steps=30, verbose=False)
        trained_results.append(result)
        print(f"  Reward: {result['total_reward']:.2f}, Rules: {result['rules_satisfied']}/26")
finally:
    # Release the game and the HTTP session even if an episode fails.
    env.close()


def _mean_of(results, key):
    """Mean of ``key`` over episode result dicts (0.0 when there are none)."""
    return float(np.mean([r[key] for r in results])) if results else 0.0


# Compute statistics
trained_avg_reward = _mean_of(trained_results, 'total_reward')
trained_avg_rules = _mean_of(trained_results, 'rules_satisfied')
trained_success_rate = _mean_of(trained_results, 'completed')

# Baseline figures are recomputed from `baseline_results` rather than read from
# `avg_reward` / `avg_rules`, which other cells may have rebound since the
# baseline run.
baseline_avg_reward = _mean_of(baseline_results, 'total_reward')
baseline_avg_rules = _mean_of(baseline_results, 'rules_satisfied')
baseline_success_rate = _mean_of(baseline_results, 'completed')

reward_delta = trained_avg_reward - baseline_avg_reward
# A relative change is undefined for a zero baseline; relative to |baseline|
# the sign always matches the direction of the change.
if baseline_avg_reward != 0:
    reward_change = f"{reward_delta / abs(baseline_avg_reward) * 100:.1f}%"
else:
    reward_change = "n/a"

print(f"\n{'='*80}")
print("COMPARISON: BASELINE vs TRAINED")
print(f"{'='*80}")
print(f"\nAverage Reward:")
print(f"  Baseline: {baseline_avg_reward:.2f}")
print(f"  Trained:  {trained_avg_reward:.2f}")
print(f"  Improvement: {reward_delta:.2f} ({reward_change})")
print(f"\nAverage Rules Satisfied:")
print(f"  Baseline: {baseline_avg_rules:.1f}/26")
print(f"  Trained:  {trained_avg_rules:.1f}/26")
# The "+" format flag prints an explicit sign, so a regression reads "-0.5"
# instead of "+-0.5".
print(f"  Improvement: {trained_avg_rules - baseline_avg_rules:+.1f}")
print(f"\nSuccess Rate:")
print(f"  Baseline: {baseline_success_rate*100:.1f}%")
print(f"  Trained:  {trained_success_rate*100:.1f}%")
print(f"  Improvement: {(trained_success_rate - baseline_success_rate)*100:+.1f}%")
print(f"{'='*80}")

## 8. Cleanup

In [None]:
# Stop API server
try:
    api_process.terminate()
    try:
        api_process.wait(timeout=5)
        print("✓ API server stopped")
    except subprocess.TimeoutExpired:
        # The server ignored the polite request; force it and reap the process.
        api_process.kill()
        api_process.wait()
        print("✓ API server killed")
finally:
    # Close WandB
    # Runs even if stopping the server failed, so the run is always finalised.
    wandb.finish()
    print("✓ WandB finished")

print("\nNotebook complete!")