# Multi-Agent Deep Variational Reinforcement Learning for Poker

This notebook implements and trains an MA-DVRL model for playing heads-up Texas Hold'em Poker. The model combines variational inference with deep reinforcement learning to handle partial observability and opponent modeling in a competitive setting.

## Setup
First, let's import the required packages:

In [None]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
from collections import deque
import wandb
from tqdm.notebook import tqdm
from datetime import datetime

from environments.poker_env import PokerEnv
from models.poker_dvrl import PokerMADVRL

## Configuration
Set up training parameters and initialize Weights & Biases for experiment tracking:

In [None]:
# Hyperparameters and runtime settings read by the cells below.
config = {
    # Optimisation schedule
    'num_episodes': 10000,
    'batch_size': 32,
    'learning_rate': 3e-4,
    'gamma': 0.99,
    # Model sizes
    'card_dim': 32,
    'belief_dim': 256,
    'hidden_dim': 128,
    'num_heads': 4,
    # Environment
    'initial_chips': 1000,
}

# Run on the GPU when PyTorch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    config['device'] = 'cuda'
else:
    config['device'] = 'cpu'

# Experiment tracking with Weights & Biases; the run name carries a
# timestamp so that repeated runs can be told apart.
use_wandb = True  # Set to False to disable W&B logging
if use_wandb:
    wandb.init(
        name=f'poker_dvrl_{datetime.now():%Y%m%d_%H%M%S}',
        project='poker-dvrl',
        config=config,
    )

## Helper Functions
Define utility functions for training and evaluation:

In [None]:
def convert_to_tensor(obs_dict, device):
    """Return a copy of ``obs_dict`` whose leaf values are PyTorch tensors.

    ``obs_dict`` is a two-level mapping (outer key -> inner key -> value).
    Every value is passed through ``torch.as_tensor`` with ``device`` as the
    target device; the keys and nesting of the input are kept as they are.
    """
    tensors = {}
    for outer_key, inner in obs_dict.items():
        converted = {}
        for field, value in inner.items():
            converted[field] = torch.as_tensor(value, device=device)
        tensors[outer_key] = converted
    return tensors

def evaluate_model(model, env, config, num_episodes=100):
    """Play the model as player 0 against a uniformly random player 1.

    Args:
        model: Policy network. It is called as ``model(obs_tensor)`` and must
            return a pair whose first element maps player index to action
            probabilities.
        env: Environment providing ``reset()`` and ``step(actions)``, where
            ``step`` returns ``(obs, rewards, dones, info)`` keyed by player.
        config: Mapping that supplies the torch ``'device'``.
        num_episodes: Number of evaluation episodes to play.

    Returns:
        ``(mean, std)`` of player 0's total reward per episode.
    """
    model.eval()
    # One entry per finished episode. This list deliberately has a different
    # name from the per-step reward dict returned by ``env.step``; sharing a
    # name let the dict overwrite the list and broke the ``append`` below.
    episode_returns = []

    for _ in tqdm(range(num_episodes), desc='Evaluating'):
        obs = env.reset()
        episode_reward = 0
        done = False

        while not done:
            # Greedy action for player 0: take the most probable action.
            # NOTE(review): the argmax is not restricted to
            # obs[0]['valid_actions'] here; confirm the model already masks
            # invalid actions.
            obs_tensor = convert_to_tensor(obs, config['device'])
            with torch.no_grad():
                actions_prob, _ = model(obs_tensor)
                action_0 = torch.argmax(actions_prob[0]).item()

            # Uniformly random action for player 1 among those flagged valid.
            valid_actions = obs[1]['valid_actions']
            valid_indices = np.where(valid_actions == 1)[0]
            action_1 = np.random.choice(valid_indices)

            # Advance the environment by one joint action.
            obs, step_rewards, dones, _ = env.step({0: action_0, 1: action_1})
            episode_reward += step_rewards[0]  # Track rewards for player 0
            done = any(dones.values())

        episode_returns.append(episode_reward)

    model.train()
    return np.mean(episode_returns), np.std(episode_returns)

## Initialize Environment and Model

In [None]:
# Build the poker environment with the configured starting stack.
env = PokerEnv(initial_chips=config['initial_chips'])

# The architecture hyperparameters are forwarded to the model by keyword,
# then the model is moved onto the configured device.
model = PokerMADVRL(
    **{key: config[key] for key in ('card_dim', 'belief_dim', 'hidden_dim', 'num_heads')}
)
model = model.to(config['device'])

# Adam over all model parameters, using the configured learning rate.
optimizer = optim.Adam(params=model.parameters(), lr=config['learning_rate'])

## Training Loop
Train the model with logging and visualization:

In [None]:
# Training metrics
# Rolling window holding the total reward of the 100 most recent episodes.
episode_rewards = deque(maxlen=100)
# Highest rolling-average reward seen so far; used to decide when to checkpoint.
best_reward = float('-inf')

# Training loop: one environment episode per iteration, with one optimizer
# step after every environment step.
progress_bar = tqdm(range(config['num_episodes']), desc='Training')
for episode in progress_bar:
    obs = env.reset()
    episode_reward = 0
    episode_loss = 0
    num_steps = 0
    done = False

    while not done:
        # Convert observations to tensors
        obs_tensor = convert_to_tensor(obs, config['device'])

        # Get model predictions
        # Action selection runs under no_grad, so no autograd graph is built here.
        with torch.no_grad():
            actions_prob, opponent_preds = model(obs_tensor)

            # Sample actions from the policy
            # (one action index per entry of actions_prob, drawn from its
            # probabilities).
            actions = {
                i: torch.multinomial(probs, 1).item()
                for i, probs in actions_prob.items()
            }

        # Take actions in the environment
        next_obs, rewards, dones, _ = env.step(actions)
        # The episode ends as soon as any entry of `dones` is truthy.
        done = any(dones.values())

        # Convert everything to tensors for training
        next_obs_tensor = convert_to_tensor(next_obs, config['device'])
        # NOTE(review): the 4 is presumably the number of discrete actions in
        # PokerEnv -- confirm, and keep it in sync with the environment.
        actions_tensor = {
            i: F.one_hot(torch.tensor([a], device=config['device']), 4).float()
            for i, a in actions.items()
        }
        rewards_tensor = {
            i: torch.tensor([r], device=config['device']).float()
            for i, r in rewards.items()
        }
        dones_tensor = {
            i: torch.tensor([d], device=config['device']).float()
            for i, d in dones.items()
        }

        # Compute loss and update model
        # The loss is computed from this single transition.
        loss = model.get_loss(
            obs_tensor,
            actions_tensor,
            rewards_tensor,
            next_obs_tensor,
            dones_tensor,
            config['gamma']
        )

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Update metrics
        # NOTE(review): this adds up the rewards of every player, whereas
        # evaluate_model tracks rewards[0] only. If the environment's rewards
        # are zero-sum this total would carry no signal -- confirm which
        # quantity is intended.
        episode_reward += sum(rewards.values())
        episode_loss += loss.item()
        num_steps += 1
        obs = next_obs

    # Log episode metrics
    episode_rewards.append(episode_reward)
    avg_reward = sum(episode_rewards) / len(episode_rewards)
    # num_steps is at least 1 here: `done` starts as False, so the inner loop
    # body always runs once.
    avg_loss = episode_loss / num_steps

    # Update progress bar
    progress_bar.set_postfix({
        'avg_reward': f'{avg_reward:.2f}',
        'loss': f'{avg_loss:.4f}'
    })

    # Log to wandb if enabled
    if use_wandb:
        wandb.log({
            'episode': episode,
            'reward': episode_reward,
            'avg_reward': avg_reward,
            'loss': avg_loss,
            'steps': num_steps
        })

    # Save best model
    # A checkpoint is written whenever the rolling average sets a new record.
    # NOTE(review): during the first 100 episodes the window is only partly
    # filled, so an early record can come from very few episodes -- confirm
    # this is acceptable for selecting the "best" model.
    if avg_reward > best_reward:
        best_reward = avg_reward
        torch.save(
            {
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'episode': episode,
                'best_reward': best_reward,
                'config': config
            },
            'poker_dvrl_best.pt'
        )
        if use_wandb:
            # Hand the checkpoint file to W&B as well.
            wandb.save('poker_dvrl_best.pt')

## Evaluate Model
Test the trained model against a random opponent:

In [None]:
# Evaluate the best model
# map_location remaps the stored tensors onto the configured device, so a
# checkpoint written on a GPU machine still loads when only a CPU is available.
checkpoint = torch.load('poker_dvrl_best.pt', map_location=config['device'])
model.load_state_dict(checkpoint['model_state_dict'])

# Plays the restored model as player 0 against a random opponent and reports
# the mean and standard deviation of its per-episode reward.
mean_reward, std_reward = evaluate_model(model, env, config)

print('Evaluation Results:')
print(f'Mean Reward: {mean_reward:.2f} ± {std_reward:.2f}')

# Close the W&B run once evaluation has finished.
if use_wandb:
    wandb.finish()