# Poker Agent Model Comparison

This notebook compares different approaches to playing heads-up Texas Hold'em Poker:
- MA-DVRL (Multi-Agent Deep Variational Reinforcement Learning)
- CFR+ (Counterfactual Regret Minimization Plus)
- PPO (Proximal Policy Optimization)
- Policy Gradient

## Setup

In [None]:
import torch
import numpy as np
from tqdm.notebook import tqdm
import wandb
from datetime import datetime
import matplotlib.pyplot as plt

from environments.poker_env import PokerEnv
from models.poker_dvrl import PokerMADVRL
from models.cfr_plus import CFRPlusTrainer
from models.ppo import PPOTrainer
from models.policy_gradient import PolicyGradientTrainer
from utils.plotting import (
    plot_training_curves,
    plot_model_comparison,
    plot_win_rates,
    plot_learning_curves_comparison
)

## Configuration

In [None]:
# Hyperparameters and run settings shared by every trainer in this notebook
config = {
    # Episode budgets for training and for evaluation
    'num_episodes': 10000,
    'eval_episodes': 1000,
    # Optimisation settings
    'batch_size': 32,
    'learning_rate': 3e-4,
    'gamma': 0.99,
    # Network sizes passed to the model constructors
    'card_dim': 32,
    'belief_dim': 256,
    'hidden_dim': 128,
    'num_heads': 4,
    # Starting stack handed to PokerEnv
    'initial_chips': 1000,
}
# Use the GPU when PyTorch reports one, otherwise fall back to the CPU
config['device'] = 'cuda' if torch.cuda.is_available() else 'cpu'

# Open the Weights & Biases run; the run name carries the launch timestamp
wandb.init(
    project='poker-comparison',
    config=config,
    name='model_comparison_' + datetime.now().strftime('%Y%m%d_%H%M%S'),
)

## Training Functions

In [None]:
def train_dvrl(config):
    """Train the MA-DVRL agent and record per-episode statistics.

    Builds a ``PokerEnv`` and a ``PokerMADVRL`` network from ``config``,
    optimises it with Adam, and runs ``config['num_episodes']`` episodes.

    NOTE(review): ``train_dvrl_episode`` is neither defined nor imported in
    the visible part of this notebook; it must be supplied elsewhere.

    Args:
        config: Dict providing 'initial_chips', 'card_dim', 'belief_dim',
            'hidden_dim', 'num_heads', 'device', 'learning_rate' and
            'num_episodes'.

    Returns:
        Tuple ``(model, history)`` where ``history`` maps 'reward' and
        'loss' to lists with one entry per episode.
    """
    env = PokerEnv(initial_chips=config['initial_chips'])

    # The network takes its four size arguments straight from the config
    size_keys = ('card_dim', 'belief_dim', 'hidden_dim', 'num_heads')
    network = PokerMADVRL(**{key: config[key] for key in size_keys})
    model = network.to(config['device'])

    optimizer = torch.optim.Adam(model.parameters(), lr=config['learning_rate'])

    rewards = []
    losses = []
    progress = tqdm(range(config['num_episodes']), desc='Training MA-DVRL')
    for _ in progress:
        episode_reward, metrics = train_dvrl_episode(env, model, optimizer, config)
        rewards.append(episode_reward)
        losses.append(metrics['loss'])

    return model, {'reward': rewards, 'loss': losses}

def train_cfr(config):
    """Train the CFR+ agent and record per-iteration statistics.

    Runs ``trainer.train_iteration(env)`` once per configured episode and
    stores the reward and the reported 'regret_sum' metric each time.

    Args:
        config: Dict providing 'initial_chips' and 'num_episodes'.

    Returns:
        Tuple ``(trainer, history)`` where ``history`` maps 'reward' and
        'regret_sum' to lists with one entry per iteration.
    """
    env = PokerEnv(initial_chips=config['initial_chips'])
    trainer = CFRPlusTrainer()

    reward_log = []
    regret_log = []
    iterations = range(config['num_episodes'])
    for _ in tqdm(iterations, desc='Training CFR+'):
        reward, metrics = trainer.train_iteration(env)
        reward_log.append(reward)
        regret_log.append(metrics['regret_sum'])

    return trainer, {'reward': reward_log, 'regret_sum': regret_log}

def train_ppo(config):
    """Train the PPO agent and record per-episode statistics.

    The trainer's input width is the first dimension of the environment's
    observation space.

    NOTE(review): ``train_ppo_episode`` is neither defined nor imported in
    the visible part of this notebook; it must be supplied elsewhere.

    Args:
        config: Dict providing 'initial_chips', 'hidden_dim', 'device' and
            'num_episodes'.

    Returns:
        Tuple ``(trainer, history)`` where ``history`` maps 'reward',
        'policy_loss' and 'value_loss' to lists with one entry per episode.
    """
    env = PokerEnv(initial_chips=config['initial_chips'])
    obs_width = env.observation_space.shape[0]
    trainer = PPOTrainer(
        input_dim=obs_width,
        hidden_dim=config['hidden_dim'],
        device=config['device'],
    )

    # Metrics copied verbatim from each episode's metrics dict
    loss_keys = ('policy_loss', 'value_loss')
    history = {key: [] for key in ('reward',) + loss_keys}

    for _ in tqdm(range(config['num_episodes']), desc='Training PPO'):
        episode_reward, metrics = train_ppo_episode(env, trainer, config)
        history['reward'].append(episode_reward)
        for key in loss_keys:
            history[key].append(metrics[key])

    return trainer, history

def train_pg(config):
    """Train the Policy Gradient agent and record per-episode statistics.

    The trainer's input width is the first dimension of the environment's
    observation space.

    NOTE(review): ``train_pg_episode`` is neither defined nor imported in
    the visible part of this notebook; it must be supplied elsewhere.

    Args:
        config: Dict providing 'initial_chips', 'hidden_dim', 'device' and
            'num_episodes'.

    Returns:
        Tuple ``(trainer, history)`` where ``history`` maps 'reward' and
        'policy_loss' to lists with one entry per episode.
    """
    env = PokerEnv(initial_chips=config['initial_chips'])
    trainer_kwargs = {
        'input_dim': env.observation_space.shape[0],
        'hidden_dim': config['hidden_dim'],
        'device': config['device'],
    }
    trainer = PolicyGradientTrainer(**trainer_kwargs)

    history = {'reward': [], 'policy_loss': []}
    record_reward = history['reward'].append
    record_loss = history['policy_loss'].append

    episode_count = config['num_episodes']
    for _ in tqdm(range(episode_count), desc='Training PG'):
        episode_reward, metrics = train_pg_episode(env, trainer, config)
        record_reward(episode_reward)
        record_loss(metrics['policy_loss'])

    return trainer, history

## Evaluation Functions

In [None]:
def evaluate_model(model, env, num_episodes=1000):
    """Evaluate model against random opponent.

    On turns where ``env.current_player == 0`` the action comes from
    ``model.get_action(obs)``. On every other turn an action is drawn
    uniformly at random from the indices where
    ``obs[1]['valid_actions'] == 1``. An episode's return is the sum of
    ``reward[0]`` over all steps, and it counts as a win when that sum is
    strictly positive.

    Args:
        model: Object exposing ``get_action(obs)``.
        env: Environment exposing ``reset()``, ``current_player`` and
            ``step(action)`` returning ``(obs, reward, done, info)``.
        num_episodes: Number of episodes to play; must be at least 1.

    Returns:
        Dict with 'mean_reward' and 'std_reward' (mean and standard
        deviation of the episode returns) and 'win_rate' (fraction of
        episodes won).

    Raises:
        ValueError: If ``num_episodes`` is less than 1. Without this check
            a zero count fails with a bare ZeroDivisionError and a negative
            count silently yields NaN statistics.
    """
    # Fail fast, before any environment interaction, on an empty evaluation
    if num_episodes < 1:
        raise ValueError(
            f'num_episodes must be at least 1, got {num_episodes}'
        )

    rewards = []
    wins = 0

    for _ in tqdm(range(num_episodes), desc='Evaluating'):
        obs = env.reset()
        done = False
        episode_reward = 0

        while not done:
            # Model plays as player 0
            if env.current_player == 0:
                action = model.get_action(obs)
            else:
                # Random opponent: pick uniformly among the flagged actions
                valid_actions = obs[1]['valid_actions']
                valid_indices = np.where(valid_actions == 1)[0]
                action = np.random.choice(valid_indices)

            obs, reward, done, _ = env.step(action)
            # Only index 0 of the reward is accumulated for the model
            episode_reward += reward[0]

        rewards.append(episode_reward)
        # A zero-return episode is not counted as a win
        if episode_reward > 0:
            wins += 1

    return {
        'mean_reward': np.mean(rewards),
        'std_reward': np.std(rewards),
        'win_rate': wins / num_episodes
    }

## Train Models

In [None]:
# Train every agent in a fixed order, keeping each trained object and its history
models = {}
histories = {}

# (result key, progress banner, training function) for each agent
training_plan = (
    ('MA-DVRL', 'Training MA-DVRL...', train_dvrl),
    ('CFR+', '\nTraining CFR+...', train_cfr),
    ('PPO', '\nTraining PPO...', train_ppo),
    ('PG', '\nTraining Policy Gradient...', train_pg),
)

for label, banner, train_fn in training_plan:
    print(banner)
    models[label], histories[label] = train_fn(config)

## Compare Results

In [None]:
# Score every trained agent in one shared evaluation environment
env = PokerEnv(initial_chips=config['initial_chips'])
results = {}

for name, model in models.items():
    print(f'\nEvaluating {name}...')
    results[name] = evaluate_model(model, env, config['eval_episodes'])

# Win rates are pulled out separately because plot_win_rates takes them alone
win_rates = {name: metrics['win_rate'] for name, metrics in results.items()}

# Training reward curves for all agents on one figure
fig = plot_learning_curves_comparison(
    histories, metric='reward', title='Learning Curves Comparison'
)
wandb.log({'learning_curves': wandb.Image(fig)})
plt.show()

# Side-by-side evaluation metrics
fig = plot_model_comparison(
    results, metrics=['mean_reward', 'win_rate'], title='Model Performance Comparison'
)
wandb.log({'model_comparison': wandb.Image(fig)})
plt.show()

# Win rate per agent
fig = plot_win_rates(win_rates)
wandb.log({'win_rates': wandb.Image(fig)})
plt.show()

# Close the Weights & Biases run once every figure has been logged
wandb.finish()

## Analysis

Let's analyze the results of our comparison:

1. **Learning Speed**
   - Which algorithm learned fastest?
   - Were there any significant plateaus in learning?

2. **Final Performance**
   - Which algorithm achieved the highest mean reward?
   - Which algorithm had the most consistent performance (lowest std)?
   - How do the win rates compare?

3. **Strengths and Weaknesses**
   - MA-DVRL: How well did it handle partial observability?
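   - CFR+: Did its strategy approach a Nash equilibrium (for example, does the recorded `regret_sum` level off)? Note that the win rate against a random opponent does not measure this on its own.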
   - PPO: How stable was the learning process?
   - Policy Gradient: Did it suffer from high variance?

4. **Practical Considerations**
   - Training time and computational requirements
   - Ease of implementation and tuning
   - Scalability to larger game variants