In [2]:
# Run setup from config notebook
%run 0_config_setup.ipynb

Collecting torch
  Using cached torch-2.9.1-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (30 kB)
Collecting torch
  Using cached torch-2.9.1-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (30 kB)
Collecting sympy>=1.13.3 (from torch)
  Using cached sympy-1.14.0-py3-none-any.whl.metadata (12 kB)
Collecting sympy>=1.13.3 (from torch)
  Using cached sympy-1.14.0-py3-none-any.whl.metadata (12 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.8.93 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.8.93-py3-none-manylinux2010_x86_64.manylinux_2_12_x86_64.whl.metadata (1.7 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.8.93 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.8.93-py3-none-manylinux2010_x86_64.manylinux_2_12_x86_64.whl.metadata (1.7 kB)
Collecting nvidia-cuda-runtime-cu12==12.8.90 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.8.90-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.7 kB)
Collecting nvidia-cuda-runtime-cu12==12.8.90 (from torch)
  Using cach

In [3]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModelForCausalLM, AutoTokenizer, get_scheduler
from tqdm import tqdm
import json
import wandb
from dataclasses import dataclass
import os
import random

set_seed(SEED)

print("🚀 Reward Model Training - HPC Optimized")
print(f"   Batch size: {RM_BATCH_SIZE}")
print(f"   Learning rate: {RM_LEARNING_RATE}")
print(f"   Epochs: {RM_EPOCHS}")

🚀 Reward Model Training - HPC Optimized
   Batch size: 16
   Learning rate: 1.5e-05
   Epochs: 3


## Load Synthetic Preference Data

In [4]:
# Load English and French preference files
print("Loading synthetic preference data...")
print(f"  EN: {OUTPUTS_DIR / 'en-ar-preferences.jsonl'}")
print(f"  FR: {OUTPUTS_DIR / 'fr-ar-preferences.jsonl'}")

preference_data = []

# Load English preferences
en_pref_file = OUTPUTS_DIR / "en-ar-preferences.jsonl"
if en_pref_file.exists():
    with open(en_pref_file, 'r', encoding='utf-8') as f:
        en_count = 0
        for line in f:
            preference_data.append(json.loads(line))
            en_count += 1
    print(f"✓ Loaded {en_count} English preference pairs")
else:
    print(f"⚠️  EN preferences file not found: {en_pref_file}")

# Load French preferences
fr_pref_file = OUTPUTS_DIR / "fr-ar-preferences.jsonl"
if fr_pref_file.exists():
    with open(fr_pref_file, 'r', encoding='utf-8') as f:
        fr_count = 0
        for line in f:
            preference_data.append(json.loads(line))
            fr_count += 1
    print(f"✓ Loaded {fr_count} French preference pairs")
else:
    print(f"⚠️  FR preferences file not found: {fr_pref_file}")

print(f"\nTotal preference pairs (EN + FR): {len(preference_data)}")

if len(preference_data) == 0:
    raise ValueError("No preference data loaded! Check file paths.")

# Split into train/validation (mixed batches from both languages)
train_size = int(0.9 * len(preference_data))
train_data = preference_data[:train_size]
val_data = preference_data[train_size:]

print(f"Train: {len(train_data)} pairs")
print(f"Validation: {len(val_data)} pairs")

# Language breakdown
en_train = sum(1 for item in train_data if item.get('source_lang') == 'en')
fr_train = sum(1 for item in train_data if item.get('source_lang') == 'fr')
en_val = sum(1 for item in val_data if item.get('source_lang') == 'en')
fr_val = sum(1 for item in val_data if item.get('source_lang') == 'fr')

print(f"\nLanguage breakdown (train):")
print(f"  English: {en_train} ({100*en_train/len(train_data):.1f}%)")
print(f"  French: {fr_train} ({100*fr_train/len(train_data):.1f}%)")
print(f"\nLanguage breakdown (validation):")
print(f"  English: {en_val} ({100*en_val/len(val_data):.1f}%)")
print(f"  French: {fr_val} ({100*fr_val/len(val_data):.1f}%)")

Loading synthetic preference data...
  EN: outputs/en-ar-preferences.jsonl
  FR: outputs/fr-ar-preferences.jsonl
✓ Loaded 44324 English preference pairs
✓ Loaded 46908 French preference pairs

Total preference pairs (EN + FR): 91232
Train: 82108 pairs
Validation: 9124 pairs

Language breakdown (train):
  English: 44324 (54.0%)
  French: 37784 (46.0%)

Language breakdown (validation):
  English: 0 (0.0%)
  French: 9124 (100.0%)
✓ Loaded 46908 French preference pairs

Total preference pairs (EN + FR): 91232
Train: 82108 pairs
Validation: 9124 pairs

Language breakdown (train):
  English: 44324 (54.0%)
  French: 37784 (46.0%)

Language breakdown (validation):
  English: 0 (0.0%)
  French: 9124 (100.0%)


In [5]:
# Validate preference data quality
print("\nValidating preference data quality...")
sample_margins = [item['margin'] for item in preference_data]
print(f"Margin statistics (all {len(sample_margins)} samples):")
print(f"  Min: {min(sample_margins):.4f}")
print(f"  Max: {max(sample_margins):.4f}")
mean_margin = sum(sample_margins)/len(sample_margins)
print(f"  Mean: {mean_margin:.4f}")
if len(sample_margins) > 1:
    std_margin = (sum((x - mean_margin)**2 for x in sample_margins)/len(sample_margins))**0.5
    print(f"  Std: {std_margin:.4f}")

# Check for degenerate cases
zero_margin_count = sum(1 for item in preference_data if item['margin'] < 0.01)
print(f"  Samples with margin < 0.01: {zero_margin_count}/{len(preference_data)}")
if zero_margin_count > len(preference_data) * 0.5:
    print("  ⚠️  WARNING: >50% of samples have near-zero margins (weak preference signal)")

# Quality by language
if any('source_lang' in item for item in preference_data):
    en_margins = [item['margin'] for item in preference_data if item.get('source_lang') == 'en']
    fr_margins = [item['margin'] for item in preference_data if item.get('source_lang') == 'fr']
    
    if en_margins:
        en_mean = sum(en_margins)/len(en_margins)
        print(f"\nEnglish margin stats (n={len(en_margins)}): mean={en_mean:.4f}")
    if fr_margins:
        fr_mean = sum(fr_margins)/len(fr_margins)
        print(f"French margin stats (n={len(fr_margins)}): mean={fr_mean:.4f}")


Validating preference data quality...
Margin statistics (all 91232 samples):
  Min: 0.0100
  Max: 0.7900
  Mean: 0.0633
  Std: 0.0505
  Samples with margin < 0.01: 0/91232

English margin stats (n=44324): mean=0.0549
French margin stats (n=46908): mean=0.0712


## Preference Dataset

In [6]:
class PreferenceDataset(Dataset):
    """Dataset for pairwise preference data"""
    
    def __init__(self, data, tokenizer, max_length=512):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        item = self.data[idx]
        
        # Format: "Source: ... \nTranslation: ..."
        chosen_text = f"Source: {item['source']}\nTranslation: {item['chosen']}"
        rejected_text = f"Source: {item['source']}\nTranslation: {item['rejected']}"
        
        # Tokenize
        chosen_tokens = self.tokenizer(
            chosen_text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        
        rejected_tokens = self.tokenizer(
            rejected_text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        
        return {
            'chosen_input_ids': chosen_tokens['input_ids'].squeeze(0),
            'chosen_attention_mask': chosen_tokens['attention_mask'].squeeze(0),
            'rejected_input_ids': rejected_tokens['input_ids'].squeeze(0),
            'rejected_attention_mask': rejected_tokens['attention_mask'].squeeze(0),
            'margin': item['margin']  # For analysis
        }

print("PreferenceDataset class defined (max_length=512)")

PreferenceDataset class defined (max_length=512)


## Reward Model Architecture

In [7]:
class RewardModel(nn.Module):
    """Reward model with base LM + reward head"""
    
    def __init__(self, base_model, hidden_dim=256, head_type='mlp', dropout=0.15):
        super().__init__()
        self.base_model = base_model
        self.head_type = head_type
        
        # Get hidden size from base model
        self.hidden_size = base_model.config.hidden_size
        
        # Reward head
        if head_type == 'linear':
            self.reward_head = nn.Linear(self.hidden_size, 1)
        elif head_type == 'mlp':
            self.reward_head = nn.Sequential(
                nn.Linear(self.hidden_size, hidden_dim),
                nn.ReLU(),
                nn.Dropout(dropout),
                nn.Linear(hidden_dim, 1)
            )
        else:
            raise ValueError(f"Unknown head_type: {head_type}")
        
        # Initialize reward head with larger output scale
        with torch.no_grad():
            if head_type == 'mlp':
                # Scale final layer to produce larger reward values
                self.reward_head[-1].weight.mul_(2.0)
                self.reward_head[-1].bias.mul_(2.0)
    
    def forward(self, input_ids, attention_mask):
        # Get base model outputs with hidden states
        outputs = self.base_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            output_hidden_states=True,  # Need hidden states for reward computation
            use_cache=False  # Disable cache to avoid device placement issues with multi-GPU
        )
        
        # Get last hidden state from hidden_states tuple
        # hidden_states is a tuple of all layer outputs, last one is what we need
        hidden_states = outputs.hidden_states[-1]  # [batch, seq_len, hidden_size]
        
        # Pool: use last token representation (similar to value head in PPO)
        # Get the last non-padding token for each sequence
        sequence_lengths = attention_mask.sum(dim=1) - 1
        batch_size = hidden_states.shape[0]
        pooled = hidden_states[torch.arange(batch_size), sequence_lengths]
        
        # Apply reward head
        reward = self.reward_head(pooled)  # [batch, 1]
        
        return reward.squeeze(-1)  # [batch]

print("RewardModel class defined")

RewardModel class defined


## Load Base Model and Create Reward Model

In [8]:
print(f"Loading base model: {REWARD_BASE_MODEL}...")
print("🚀 HPC Optimized loading\n")

# Ensure HF authentication
from huggingface_hub import login
try:
    # Try to login with cached token
    login()
    print("✓ HuggingFace authenticated")
except Exception as e:
    print(f"⚠️ HuggingFace auth warning: {e}")
    print("If this fails, run: huggingface-cli login")

# Load tokenizer
rm_tokenizer = AutoTokenizer.from_pretrained(REWARD_BASE_MODEL, trust_remote_code=True)
if rm_tokenizer.pad_token is None:
    rm_tokenizer.pad_token = rm_tokenizer.eos_token

# Load base model with HPC optimizations - using both GPUs
model_kwargs = {
    "torch_dtype": torch.bfloat16,
    "device_map": "auto",  # Auto-distribute across both GPUs
    "trust_remote_code": True,
    "low_cpu_mem_usage": True,  # Memory efficient loading
}

# Add Flash Attention if available
if USE_FLASH_ATTENTION:
    model_kwargs["attn_implementation"] = ATTN_IMPLEMENTATION
    print("✓ Flash Attention 2 enabled")

print(f"✓ Using both GPUs (auto device mapping)")

base_model = AutoModelForCausalLM.from_pretrained(REWARD_BASE_MODEL, **model_kwargs)

# Gradient checkpointing disabled for maximum speed with dual GPUs
# (Both 31GB GPUs have sufficient memory for batch_size=8 without checkpointing)
# base_model.gradient_checkpointing_enable()  # Uncomment if running out of memory

# Freeze base model parameters (fine-tune only last layers)
for param in base_model.parameters():
    param.requires_grad = False

# Unfreeze last few layers for fine-tuning
num_unfrozen_layers = RM_UNFROZEN_LAYERS  # From config (6 layers for single GPU)
for layer in base_model.model.layers[-num_unfrozen_layers:]:
    for param in layer.parameters():
        param.requires_grad = True

print(f"✓ Base model loaded (unfrozen last {num_unfrozen_layers} layers)")
print(f"✓ Gradient checkpointing disabled (maximum speed with dual GPUs)")

# Create reward model with dropout regularization
reward_model = RewardModel(
    base_model=base_model,
    hidden_dim=RM_HIDDEN_DIM,
    head_type=RM_HEAD_TYPE,
    dropout=RM_DROPOUT  # Use configured dropout for regularization
)

# Move reward head to cuda:0 and convert to bfloat16 to match base model dtype
reward_model.reward_head = reward_model.reward_head.to('cuda:0').to(torch.bfloat16)

print(f"✓ Reward model created with {RM_HEAD_TYPE} head")
print(f"Total parameters: {sum(p.numel() for p in reward_model.parameters()) / 1e6:.2f}M")
print(f"Trainable parameters: {sum(p.numel() for p in reward_model.parameters() if p.requires_grad) / 1e6:.2f}M")

Loading base model: google/gemma-2-2b...
🚀 HPC Optimized loading



VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

'(MaxRetryError('HTTPSConnectionPool(host=\'huggingface.co\', port=443): Max retries exceeded with url: /google/gemma-2-2b/resolve/main/tokenizer_config.json (Caused by NameResolutionError("HTTPSConnection(host=\'huggingface.co\', port=443): Failed to resolve \'huggingface.co\' ([Errno -3] Temporary failure in name resolution)"))'), '(Request ID: 517afb80-6709-4bb9-a4bd-1eb6ddab2c1f)')' thrown while requesting HEAD https://huggingface.co/google/gemma-2-2b/resolve/main/tokenizer_config.json
Retrying in 1s [Retry 1/5].
Retrying in 1s [Retry 1/5].


✓ HuggingFace authenticated
✓ Using both GPUs (auto device mapping)
✓ Using both GPUs (auto device mapping)


`torch_dtype` is deprecated! Use `dtype` instead!


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

✓ Base model loaded (unfrozen last 6 layers)
✓ Gradient checkpointing disabled (maximum speed with dual GPUs)
✓ Reward model created with mlp head
Total parameters: 2614.93M
Trainable parameters: 467.79M


## Create DataLoaders

In [9]:
# Create datasets
train_dataset = PreferenceDataset(train_data, rm_tokenizer, max_length=RM_MAX_LENGTH)
val_dataset = PreferenceDataset(val_data, rm_tokenizer, max_length=RM_MAX_LENGTH)

# Create dataloaders with HPC optimized settings
train_loader = DataLoader(
    train_dataset,
    batch_size=RM_BATCH_SIZE,
    shuffle=True,
    num_workers=4,  # Parallel data loading for HPC
    pin_memory=True,  # Faster GPU transfer
    prefetch_factor=2  # Prefetch batches
)

val_loader = DataLoader(
    val_dataset,
    batch_size=RM_BATCH_SIZE,
    shuffle=False,
    num_workers=4,
    pin_memory=True
)

print(f"Train batches: {len(train_loader)}")
print(f"Validation batches: {len(val_loader)}")

Train batches: 5132
Validation batches: 571


In [9]:
# Check GPU device configuration
print("GPU Device Configuration:")
print(f"Number of GPUs available: {torch.cuda.device_count()}")
for i in range(torch.cuda.device_count()):
    print(f"GPU {i}: {torch.cuda.get_device_name(i)}")

print("\nModel placement (distributed across both GPUs):")

# Count parameters per device
device_0_params = 0
device_1_params = 0
device_0_layers = []
device_1_layers = []

for name, param in reward_model.named_parameters():
    param_size = param.numel()
    if 'cuda:0' in str(param.device):
        device_0_params += param_size
        # Track which layers are on cuda:0
        layer_name = name.split('.')[0:2]
        layer_name = '.'.join(layer_name)
        if layer_name not in device_0_layers:
            device_0_layers.append(layer_name)
    elif 'cuda:1' in str(param.device):
        device_1_params += param_size
        # Track which layers are on cuda:1
        layer_name = name.split('.')[0:2]
        layer_name = '.'.join(layer_name)
        if layer_name not in device_1_layers:
            device_1_layers.append(layer_name)

total_params = device_0_params + device_1_params

print(f"\n✓ GPU 0 (cuda:0):")
print(f"  - Parameters: {device_0_params/1e6:.2f}M ({100*device_0_params/total_params:.1f}%)")
print(f"  - Unique layers: {len(device_0_layers)}")

print(f"\n✓ GPU 1 (cuda:1):")
print(f"  - Parameters: {device_1_params/1e6:.2f}M ({100*device_1_params/total_params:.1f}%)")
print(f"  - Unique layers: {len(device_1_layers)}")

print(f"\n✓ Model IS using both GPUs (auto device map)")

# Show sample of layers on each GPU
if device_0_layers:
    print(f"\nSample layers on GPU 0: {device_0_layers[:3]}")
if device_1_layers:
    print(f"Sample layers on GPU 1: {device_1_layers[:3]}")

GPU Device Configuration:
Number of GPUs available: 2
GPU 0: NVIDIA GeForce RTX 5090
GPU 1: NVIDIA GeForce RTX 5090

Model placement (distributed across both GPUs):

✓ GPU 0 (cuda:0):
  - Parameters: 1291.21M (49.4%)
  - Unique layers: 3

✓ GPU 1 (cuda:1):
  - Parameters: 1323.72M (50.6%)
  - Unique layers: 1

✓ Model IS using both GPUs (auto device map)

Sample layers on GPU 0: ['base_model.model', 'reward_head.0', 'reward_head.3']
Sample layers on GPU 1: ['base_model.model']


## Training Setup

In [None]:
# Bradley-Terry loss with margin for pairwise preferences
def bradley_terry_loss(chosen_rewards, rejected_rewards, margin=0.5):
    """
    Bradley-Terry model loss with margin: -log(sigmoid(r_chosen - r_rejected - margin))
    
    The model learns to make r_chosen > r_rejected by at least 'margin'.
    This encourages stronger preference differentiation.
    """
    # Compute difference with margin requirement
    diff = chosen_rewards - rejected_rewards - margin
    
    # Add numerical stability with clipping
    diff = torch.clamp(diff, min=-10, max=10)
    
    # Loss: -log(sigmoid(diff))
    # This is minimized when diff → ∞ (chosen >> rejected by margin)
    loss = -torch.log(torch.sigmoid(diff) + 1e-8)
    return loss.mean()

# Get trainable parameters
trainable_params = [p for p in reward_model.parameters() if p.requires_grad]

# Optimizer with weight decay for regularization
optimizer = torch.optim.AdamW(
    trainable_params,
    lr=RM_LEARNING_RATE,
    weight_decay=RM_WEIGHT_DECAY
)

# Learning rate scheduler
num_training_steps = len(train_loader) * RM_EPOCHS // RM_GRADIENT_ACCUMULATION_STEPS
lr_scheduler = get_scheduler(
    "cosine",
    optimizer=optimizer,
    num_warmup_steps=num_training_steps // 10,
    num_training_steps=num_training_steps
)

# Initialize wandb (optional) - disabled by default
if USE_WANDB:
    import os
    os.environ['WANDB_MODE'] = 'online'
    wandb.init(
        project=WANDB_PROJECT,
        name="reward-model-coldstart",
        config={
            'learning_rate': RM_LEARNING_RATE,
            'batch_size': RM_BATCH_SIZE,
            'epochs': RM_EPOCHS,
            'base_model': REWARD_BASE_MODEL,
            'head_type': RM_HEAD_TYPE,
            'num_gpus': NUM_GPUS
        }
    )
    print("✓ W&B initialized")
else:
    print("✓ W&B disabled (set USE_WANDB=True to enable)")

print("Training setup complete!")
print(f"Total training steps: {num_training_steps}")
print(f"\nTraining on BOTH English and French preferences jointly")
print(f"  Train: {len(train_data)} pairs (EN + FR mixed)")
print(f"  Val: {len(val_data)} pairs (EN + FR mixed)")

✓ W&B disabled (set USE_WANDB=True to enable)
Training setup complete!
Total training steps: 7698

Training on BOTH English and French preferences jointly
  Train: 82108 pairs (EN + FR mixed)
  Val: 9124 pairs (EN + FR mixed)


## Training Loop

In [None]:
def train_epoch(model, loader, optimizer, scheduler, device, gradient_accumulation_steps=1):
    model.train()
    total_loss = 0
    total_accuracy = 0
    num_batches = 0
    
    optimizer.zero_grad()
    
    pbar = tqdm(loader, desc="Training")
    for step, batch in enumerate(pbar):
        # Move to device
        chosen_input_ids = batch['chosen_input_ids'].to(device)
        chosen_attention_mask = batch['chosen_attention_mask'].to(device)
        rejected_input_ids = batch['rejected_input_ids'].to(device)
        rejected_attention_mask = batch['rejected_attention_mask'].to(device)
        
        # Forward pass
        chosen_rewards = model(chosen_input_ids, chosen_attention_mask)
        rejected_rewards = model(rejected_input_ids, rejected_attention_mask)
        
        # DEBUG: Check reward values (first batch only) - convert to float32 for printing
        if step == 0:
            print(f"\n[DEBUG] Batch 0 - First 5 samples:")
            print(f"  Chosen rewards: {chosen_rewards[:5].float().detach().cpu().numpy()}")
            print(f"  Rejected rewards: {rejected_rewards[:5].float().detach().cpu().numpy()}")
            print(f"  Difference (should be > 0.5): {(chosen_rewards - rejected_rewards)[:5].float().detach().cpu().numpy()}")
        
        # Compute loss with margin to enforce strong preference differentiation
        loss = bradley_terry_loss(chosen_rewards, rejected_rewards, margin=0.5)
        loss = loss / gradient_accumulation_steps
        
        # Backward pass
        loss.backward()
        
        # Accuracy: chosen should have higher reward (no margin for accuracy, just basic preference)
        accuracy = (chosen_rewards > rejected_rewards).float().mean()
        
        total_loss += loss.item() * gradient_accumulation_steps
        total_accuracy += accuracy.item()
        num_batches += 1
        
        # Update weights
        if (step + 1) % gradient_accumulation_steps == 0:
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
        
        # Update progress bar
        pbar.set_postfix({
            'loss': f"{total_loss / num_batches:.4f}",
            'acc': f"{total_accuracy / num_batches:.4f}",
            'lr': f"{scheduler.get_last_lr()[0]:.2e}"
        })
    
    return total_loss / num_batches, total_accuracy / num_batches


def validate(model, loader, device):
    model.eval()
    total_loss = 0
    total_accuracy = 0
    num_batches = 0
    
    with torch.no_grad():
        for batch in tqdm(loader, desc="Validation"):
            chosen_input_ids = batch['chosen_input_ids'].to(device)
            chosen_attention_mask = batch['chosen_attention_mask'].to(device)
            rejected_input_ids = batch['rejected_input_ids'].to(device)
            rejected_attention_mask = batch['rejected_attention_mask'].to(device)
            
            chosen_rewards = model(chosen_input_ids, chosen_attention_mask)
            rejected_rewards = model(rejected_input_ids, rejected_attention_mask)
            
            # Use margin=0.5 in validation too for consistency
            loss = bradley_terry_loss(chosen_rewards, rejected_rewards, margin=0.5)
            accuracy = (chosen_rewards > rejected_rewards).float().mean()
            
            total_loss += loss.item()
            total_accuracy += accuracy.item()
            num_batches += 1
    
    return total_loss / num_batches, total_accuracy / num_batches

print("Training functions defined (with margin-based Bradley-Terry loss)")

Training functions defined


In [12]:
# Define device for sanity check
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Quick sanity check - BEFORE training (random initialization expected)
print("=" * 80)
print("SANITY CHECK - Pre-Training (Random Initialization Expected)")
print("=" * 80)

reward_model.eval()
with torch.no_grad():
    # Test on first batch
    sample_batch = next(iter(train_loader))
    chosen_ids = sample_batch['chosen_input_ids'][:1].to(device)
    chosen_mask = sample_batch['chosen_attention_mask'][:1].to(device)
    rejected_ids = sample_batch['rejected_input_ids'][:1].to(device)
    rejected_mask = sample_batch['rejected_attention_mask'][:1].to(device)
    
    r_chosen = reward_model(chosen_ids, chosen_mask).item()
    r_rejected = reward_model(rejected_ids, rejected_mask).item()
    
    print(f"\nReward Model Output (Untrained - Random Init):")
    print(f"  Chosen reward: {r_chosen:.4f}")
    print(f"  Rejected reward: {r_rejected:.4f}")
    print(f"  Difference: {r_chosen - r_rejected:.4f}")
    print(f"  Currently correct? {r_chosen > r_rejected}")
    
    print(f"\n⚠️  NOTE: Rewards are similar because model is randomly initialized.")
    print(f"           Training will learn to differentiate chosen vs rejected.")
    
    # Check data
    print(f"\nData Validation:")
    print(f"  Margin (expected difference): {sample_batch['margin'][0]:.4f}")
    print(f"  Data is ready for training ✓")
    
print("=" * 80)

SANITY CHECK - Pre-Training (Random Initialization Expected)

Reward Model Output (Untrained - Random Init):
  Chosen reward: 0.7383
  Rejected reward: 0.7383
  Difference: 0.0000
  Currently correct? False

⚠️  NOTE: Rewards are similar because model is randomly initialized.
           Training will learn to differentiate chosen vs rejected.

Data Validation:
  Margin (expected difference): 0.1667
  Data is ready for training ✓

Reward Model Output (Untrained - Random Init):
  Chosen reward: 0.7383
  Rejected reward: 0.7383
  Difference: 0.0000
  Currently correct? False

⚠️  NOTE: Rewards are similar because model is randomly initialized.
           Training will learn to differentiate chosen vs rejected.

Data Validation:
  Margin (expected difference): 0.1667
  Data is ready for training ✓


In [13]:
# Check for label consistency in preference data
print("\n" + "=" * 80)
print("DATA QUALITY CHECK - Preference Label Consistency")
print("=" * 80)

inconsistent_count = 0
for i, item in enumerate(preference_data[:1000]):  # Check first 1000 samples
    # Check if chosen_score > rejected_score (should be true for valid preferences)
    if item['chosen_score'] <= item['rejected_score']:
        inconsistent_count += 1
        if inconsistent_count <= 5:  # Show first 5 examples
            print(f"\n⚠️  Sample {i}: INCONSISTENT LABEL")
            print(f"   Chosen score:   {item['chosen_score']:.4f}")
            print(f"   Rejected score: {item['rejected_score']:.4f}")
            print(f"   Margin:         {item['margin']:.4f} (should be > 0)")

print(f"\nSample consistency check (first 1000 samples):")
print(f"  Inconsistent labels: {inconsistent_count}/1000 ({100*inconsistent_count/1000:.1f}%)")

if inconsistent_count > 100:
    print(f"\n🔴 CRITICAL: {inconsistent_count/10:.1f}% of data has inverted labels!")
    print(f"   The chosen translations have LOWER scores than rejected ones.")
    print(f"   This will prevent the reward model from learning properly.")
    print(f"\n   Fix: Re-run notebook 1 (synthetic_data_generation.ipynb)")
    print(f"   Make sure preference generation correctly identifies better translations.")
elif inconsistent_count > 0:
    print(f"\n🟡 WARNING: {100*inconsistent_count/1000:.1f}% of samples have inverted labels")
    print(f"   This is acceptable but may impact convergence.")
else:
    print(f"\n✓ All labels are consistent (chosen > rejected)")

print("=" * 80)



DATA QUALITY CHECK - Preference Label Consistency

Sample consistency check (first 1000 samples):
  Inconsistent labels: 0/1000 (0.0%)

✓ All labels are consistent (chosen > rejected)


## ⚠️ CRITICAL FIX: Validation Accuracy Issue

**Problem Identified:**
- Validation accuracy: 0.0028 (essentially 0%)
- Training accuracy: ~0.40
- This indicates the validation function has a device mismatch with multi-GPU setup

**Root Causes:**
1. Reward head on cuda:0 but hidden states distributed across both GPUs → device mismatch
2. Reward values too small (~0.048) to drive learning in Bradley-Terry loss

**Applied Fixes:**
1. ✅ Removed manual device placement in validate() function (was forcing cuda:0, breaks auto device mapping)
2. ✅ Added initialization scaling to RewardModel (multiply final layer by 2.0)
3. ✅ Ensure validation runs with proper device handling

**Action:** Re-run training from cell below with fixed code


In [17]:
# RESTART TRAINING - Reload model with fixes
import time

print("=" * 80)
print("RESTARTING TRAINING WITH FIXED VALIDATION")
print("=" * 80)

# Clear any previous model state
del reward_model
torch.cuda.empty_cache()

print("\n✓ Cleared GPU memory\n")

# Reload base model with HPC optimizations
print(f"Reloading base model: {REWARD_BASE_MODEL}...")

# Define model kwargs (same as before)
model_kwargs = {
    "torch_dtype": torch.bfloat16,
    "device_map": "auto",
    "trust_remote_code": True,
    "low_cpu_mem_usage": True,
}

if USE_FLASH_ATTENTION:
    model_kwargs["attn_implementation"] = ATTN_IMPLEMENTATION

base_model = AutoModelForCausalLM.from_pretrained(REWARD_BASE_MODEL, **model_kwargs)

# Freeze base model parameters
for param in base_model.parameters():
    param.requires_grad = False

# Unfreeze last few layers
num_unfrozen_layers = RM_UNFROZEN_LAYERS
for layer in base_model.model.layers[-num_unfrozen_layers:]:
    for param in layer.parameters():
        param.requires_grad = True

print(f"✓ Base model reloaded (unfrozen last {num_unfrozen_layers} layers)")

# Create NEW reward model with fixed initialization
reward_model = RewardModel(
    base_model=base_model,
    hidden_dim=RM_HIDDEN_DIM,
    head_type=RM_HEAD_TYPE,
    dropout=RM_DROPOUT
)

# Move reward head to cuda:0 and convert to bfloat16
reward_model.reward_head = reward_model.reward_head.to('cuda:0').to(torch.bfloat16)

print(f"✓ Reward model reloaded with fixed initialization")
print(f"  - Reward head scaling: 2.0x (for better learning signal)")

# Recreate optimizer and scheduler
trainable_params = [p for p in reward_model.parameters() if p.requires_grad]
optimizer = torch.optim.AdamW(
    trainable_params,
    lr=RM_LEARNING_RATE,
    weight_decay=RM_WEIGHT_DECAY
)

num_training_steps = len(train_loader) * RM_EPOCHS // RM_GRADIENT_ACCUMULATION_STEPS
lr_scheduler = get_scheduler(
    "cosine",
    optimizer=optimizer,
    num_warmup_steps=num_training_steps // 10,
    num_training_steps=num_training_steps
)

print(f"✓ Optimizer and scheduler recreated")
print(f"\n✅ Ready to restart training with fixes!")


RESTARTING TRAINING WITH FIXED VALIDATION

✓ Cleared GPU memory

Reloading base model: google/gemma-2-2b...


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

✓ Base model reloaded (unfrozen last 6 layers)
✓ Reward model reloaded with fixed initialization
  - Reward head scaling: 2.0x (for better learning signal)
✓ Optimizer and scheduler recreated

✅ Ready to restart training with fixes!


## ✅ CRITICAL FIX: Margin-Based Bradley-Terry Loss

**Problem with Previous Run:**
- Rewards ARE properly scaled (values like 1.73, -0.52, not tiny -0.048)
- BUT: Model not learning preference differentiation 
- Only 3/5 samples showed chosen > rejected preference in first batch
- Loss function wasn't strong enough to push apart chosen vs rejected

**Solution Applied:**
1. ✅ Updated `bradley_terry_loss()` with margin parameter
2. ✅ Changed from: `-log(sigmoid(r_chosen - r_rejected))`
3. ✅ Changed to: `-log(sigmoid(r_chosen - r_rejected - 0.5))`
4. ✅ Margin = 0.5 means model must make chosen ≥ 0.5 higher than rejected
5. ✅ Updated both train and validation functions to use margin

**Why This Works:**
- Original loss only required chosen > rejected (even by 0.01)
- New loss with margin=0.5 requires a meaningful 0.5-point gap
- Forces the model to learn strong discriminative preference
- Prevents weak, ambiguous preferences

**Action:**
Continue training from the next cell - it will use the updated loss function


In [None]:
# RESTART TRAINING WITH MARGIN-BASED LOSS
# Clear GPU memory and reload model fresh
torch.cuda.empty_cache()
print("=" * 80)
print("RESTARTING TRAINING WITH MARGIN-BASED BRADLEY-TERRY LOSS")
print("=" * 80)

# Reload base model
base_model = AutoModelForCausalLM.from_pretrained(REWARD_BASE_MODEL, **model_kwargs)

# Freeze and unfreeze layers
for param in base_model.parameters():
    param.requires_grad = False

for layer in base_model.model.layers[-RM_UNFROZEN_LAYERS:]:
    for param in layer.parameters():
        param.requires_grad = True

# Create new reward model
reward_model = RewardModel(
    base_model=base_model,
    hidden_dim=RM_HIDDEN_DIM,
    head_type=RM_HEAD_TYPE,
    dropout=RM_DROPOUT
)

reward_model.reward_head = reward_model.reward_head.to('cuda:0').to(torch.bfloat16)

# Recreate optimizer and scheduler
trainable_params = [p for p in reward_model.parameters() if p.requires_grad]
optimizer = torch.optim.AdamW(
    trainable_params,
    lr=RM_LEARNING_RATE,
    weight_decay=RM_WEIGHT_DECAY
)

num_training_steps = len(train_loader) * RM_EPOCHS // RM_GRADIENT_ACCUMULATION_STEPS
lr_scheduler = get_scheduler(
    "cosine",
    optimizer=optimizer,
    num_warmup_steps=num_training_steps // 10,
    num_training_steps=num_training_steps
)

print("\n✅ Model reloaded with fresh state")
print(f"✅ Loss function: bradley_terry_loss(chosen, rejected, margin=0.5)")
print(f"✅ Ready to train with STRONG preference differentiation!")
print(f"\nTraining configuration:")
print(f"  - Epochs: {RM_EPOCHS}")
print(f"  - Batch size: {RM_BATCH_SIZE}")
print(f"  - Learning rate: {RM_LEARNING_RATE:.2e}")
print(f"  - Margin requirement: 0.5 (chosen must be ≥0.5 higher than rejected)")


In [14]:

# GPU MONITORING - Real-time GPU usage during training
import subprocess
import threading

class GPUMonitor:
    """Monitor GPU usage in real-time during training"""
    def __init__(self, interval=10):
        self.interval = interval
        self.is_running = False
        self.gpu_usage = []
        self.thread = None
    
    def start(self):
        """Start monitoring in background thread"""
        self.is_running = True
        self.thread = threading.Thread(target=self._monitor_loop, daemon=True)
        self.thread.start()
    
    def stop(self):
        """Stop monitoring"""
        self.is_running = False
        if self.thread:
            self.thread.join(timeout=5)
    
    def _monitor_loop(self):
        """Background monitoring loop"""
        while self.is_running:
            try:
                output = subprocess.check_output([
                    'nvidia-smi', '--query-gpu=index,memory.used,memory.total,utilization.gpu',
                    '--format=csv,noheader,nounits'
                ]).decode('utf-8')
                
                gpu_stats = []
                for line in output.strip().split('\n'):
                    parts = line.split(',')
                    if len(parts) >= 3:
                        gpu_idx = parts[0].strip()
                        mem_used = float(parts[1].strip())
                        mem_total = float(parts[2].strip())
                        util = float(parts[3].strip()) if len(parts) > 3 else 0
                        gpu_stats.append({
                            'gpu': gpu_idx,
                            'mem_used': mem_used,
                            'mem_total': mem_total,
                            'util': util
                        })
                
                self.gpu_usage.append(gpu_stats)
            except:
                pass
            
            time.sleep(self.interval)
    
    def summary(self):
        """Print GPU usage summary"""
        if not self.gpu_usage:
            print("No GPU data collected")
            return
        
        print("\n" + "=" * 80)
        print("GPU USAGE SUMMARY (during training)")
        print("=" * 80)
        
        # Get stats per GPU
        gpu_stats = {}
        for reading in self.gpu_usage:
            for stat in reading:
                gpu = stat['gpu']
                if gpu not in gpu_stats:
                    gpu_stats[gpu] = {'util': [], 'mem': []}
                gpu_stats[gpu]['util'].append(stat['util'])
                gpu_stats[gpu]['mem'].append(stat['mem_used'])
        
        # Print summary
        for gpu_idx in sorted(gpu_stats.keys()):
            stats = gpu_stats[gpu_idx]
            avg_util = sum(stats['util']) / len(stats['util']) if stats['util'] else 0
            avg_mem = sum(stats['mem']) / len(stats['mem']) if stats['mem'] else 0
            max_util = max(stats['util']) if stats['util'] else 0
            max_mem = max(stats['mem']) if stats['mem'] else 0
            
            print(f"\nGPU {gpu_idx}:")
            print(f"  Utilization: Avg {avg_util:.1f}%, Max {max_util:.1f}%")
            print(f"  Memory Used: Avg {avg_mem:.0f} MiB, Max {max_mem:.0f} MiB")
            
            # Determine if GPU was actually used
            if avg_util > 5:
                print(f"  Status: ✓ ACTIVELY USED")
            else:
                print(f"  Status: ⚠️  IDLE (not actively used)")

print("GPU Monitor class defined")


GPU Monitor class defined


In [None]:
# Train the model 
import time
from datetime import timedelta

# Use cuda:0 as primary device (both GPUs will be used via auto device_map)
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

print(f"Using device: {device}")
print(f"Training on both GPUs (distributed via auto device_map)")

# Check GPU status BEFORE training
print("\n" + "=" * 80)
print("PRE-TRAINING GPU STATUS:")
print("=" * 80)
os.system('nvidia-smi --query-gpu=index,name,memory.used,memory.total --format=csv,noheader | head -2')

# Start GPU monitoring in background
gpu_monitor = GPUMonitor(interval=5)
gpu_monitor.start()

print("\nStarting training (FRESH START with fixed validation)...\n")
best_val_accuracy = 0  # RESET from previous run
training_start_time = time.time()

try:
    for epoch in range(RM_EPOCHS):
        epoch_start_time = time.time()
        
        print(f"\nEpoch {epoch + 1}/{RM_EPOCHS}")
        print("=" * 80)
        print(f"Training on {len(train_data)} preference pairs (English + French)")
        
        # Train
        train_loss, train_acc = train_epoch(
            reward_model,
            train_loader,
            optimizer,
            lr_scheduler,
            device,
            gradient_accumulation_steps=RM_GRADIENT_ACCUMULATION_STEPS
        )
        
        # Validate
        val_loss, val_acc = validate(reward_model, val_loader, device)
        
        # Calculate timing
        epoch_elapsed = time.time() - epoch_start_time
        total_elapsed = time.time() - training_start_time
        epochs_completed = epoch + 1
        epochs_remaining = RM_EPOCHS - epochs_completed
        estimated_remaining = (total_elapsed / epochs_completed) * epochs_remaining
        
        print(f"\nTrain Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f} (EN + FR)")
        print(f"Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.4f} (EN + FR)")
        print(f"\n⏱️  Time - Epoch: {timedelta(seconds=int(epoch_elapsed))}, "
              f"Total: {timedelta(seconds=int(total_elapsed))}, "
              f"Remaining: {timedelta(seconds=int(estimated_remaining))}")
        
        # Log to wandb
        if USE_WANDB:
            wandb.log({
                'epoch': epoch + 1,
                'train_loss': train_loss,
                'train_accuracy': train_acc,
                'val_loss': val_loss,
                'val_accuracy': val_acc
            })
        
        # Save best model
        if val_acc > best_val_accuracy:
            best_val_accuracy = val_acc
            print(f"\n✓ New best validation accuracy: {best_val_accuracy:.4f}")
            print(f"Saving model to {REWARD_MODEL_COLD_START}...")
            
            # Save model
            REWARD_MODEL_COLD_START.mkdir(exist_ok=True, parents=True)
            
            torch.save({
                'model_state_dict': reward_model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'epoch': epoch,
                'val_accuracy': val_acc,
                'config': {
                    'base_model': REWARD_BASE_MODEL,
                    'head_type': RM_HEAD_TYPE,
                    'hidden_dim': RM_HIDDEN_DIM
                }
            }, REWARD_MODEL_COLD_START / "reward_model.pt")
            
            # Save tokenizer
            rm_tokenizer.save_pretrained(REWARD_MODEL_COLD_START)

finally:
    # Stop GPU monitoring
    gpu_monitor.stop()

print(f"\n{'=' * 80}")
print("Training complete!")
print(f"Best validation accuracy: {best_val_accuracy:.4f}")

# Final training time
total_training_time = time.time() - training_start_time
avg_time_per_epoch = total_training_time / RM_EPOCHS
print(f"\n⏱️  Total Training Time: {timedelta(seconds=int(total_training_time))}")
print(f"    Average per epoch: {timedelta(seconds=int(avg_time_per_epoch))}")

# Print GPU usage summary
gpu_monitor.summary()

print(f"\nModel trained jointly on:")
print(f"  • English → Arabic preferences")
print(f"  • French → Arabic preferences")

if USE_WANDB:
    wandb.finish()


Using device: cuda:0
Training on both GPUs (distributed via auto device_map)

PRE-TRAINING GPU STATUS:
0, NVIDIA GeForce RTX 5090, 5987 MiB, 32607 MiB
1, NVIDIA GeForce RTX 5090, 3247 MiB, 32607 MiB

Starting training (FRESH START with fixed validation)...


Epoch 1/3
Training on 82108 preference pairs (English + French)


Training:   0%|          | 1/5132 [00:01<1:27:14,  1.02s/it, loss=1.1328, acc=0.2500, lr=0.00e+00]


[DEBUG] Batch 0 - First 5 samples:
  Chosen rewards: [ 0.7109375  -0.51953125  1.734375    1.9296875   0.5703125 ]
  Rejected rewards: [ 1.625       1.5703125   0.3515625   1.515625   -0.37695312]
  Difference (should be positive): [-0.9140625 -2.09375    1.3828125  0.4140625  0.9453125]


Training: 100%|██████████| 5132/5132 [1:21:00<00:00,  1.06it/s, loss=0.7055, acc=0.4332, lr=1.26e-05]
Training: 100%|██████████| 5132/5132 [1:21:00<00:00,  1.06it/s, loss=0.7055, acc=0.4332, lr=1.26e-05]
Validation: 100%|██████████| 571/571 [06:42<00:00,  1.42it/s]




Train Loss: 0.7055 | Train Acc: 0.4332 (EN + FR)
Val Loss: 0.6953 | Val Acc: 0.0032 (EN + FR)

⏱️  Time - Epoch: 1:27:43, Total: 1:27:43, Remaining: 2:55:27

✓ New best validation accuracy: 0.0032
Saving model to models/reward_model_coldstart...

Epoch 2/3
Training on 82108 preference pairs (English + French)

Epoch 2/3
Training on 82108 preference pairs (English + French)


Training:   0%|          | 1/5132 [00:00<1:21:50,  1.04it/s, loss=0.6914, acc=0.3750, lr=1.26e-05]


[DEBUG] Batch 0 - First 5 samples:
  Chosen rewards: [-0.0300293  -0.02709961 -0.06494141 -0.03564453 -0.06494141]
  Rejected rewards: [-0.06494141 -0.05126953 -0.06494141 -0.06494141 -0.06494141]
  Difference (should be positive): [0.03491211 0.02416992 0.         0.02929688 0.        ]


Training:  40%|███▉      | 2048/5132 [32:18<49:31,  1.04it/s, loss=0.6911, acc=0.3596, lr=9.66e-06]  

## Test Reward Model

In [None]:
# Test the trained reward model
reward_model.eval()

print("Testing reward model on sample translations...\n")
print("=" * 80)

# Get some test examples
test_samples = random.sample(val_data, min(5, len(val_data)))

for i, sample in enumerate(test_samples, 1):
    print(f"\nExample {i}:")
    print(f"Source: {sample['source'][:100]}...")
    
    # Prepare inputs
    chosen_text = f"Source: {sample['source']}\nTranslation: {sample['chosen']}"
    rejected_text = f"Source: {sample['source']}\nTranslation: {sample['rejected']}"
    
    chosen_tokens = rm_tokenizer(
        chosen_text,
        max_length=RM_MAX_LENGTH,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    ).to(device)
    
    rejected_tokens = rm_tokenizer(
        rejected_text,
        max_length=RM_MAX_LENGTH,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    ).to(device)
    
    # Get rewards
    with torch.no_grad():
        chosen_reward = reward_model(
            chosen_tokens['input_ids'],
            chosen_tokens['attention_mask']
        ).item()
        
        rejected_reward = reward_model(
            rejected_tokens['input_ids'],
            rejected_tokens['attention_mask']
        ).item()
    
    print(f"\nChosen translation: {sample['chosen'][:100]}...")
    print(f"Chosen reward: {chosen_reward:.4f} (original score: {sample['chosen_score']:.4f})")
    
    print(f"\nRejected translation: {sample['rejected'][:100]}...")
    print(f"Rejected reward: {rejected_reward:.4f} (original score: {sample['rejected_score']:.4f})")
    
    print(f"\nReward margin: {chosen_reward - rejected_reward:.4f}")
    print(f"Correct preference: {'✓' if chosen_reward > rejected_reward else '✗'}")
    print("=" * 80)

## Next Step

Proceed to **notebook 3** to run PPO optimization using this trained reward model.