In [None]:
# Run setup from config notebook
# NOTE(review): the UPPER_CASE constants used below (SEED, HUMAN_PREFERENCES, RM_*,
# PPO_*, USE_WANDB, ...) and the helpers set_seed, load_test_prompts and
# format_translation_prompt are not defined anywhere in this notebook; presumably
# they are all provided by this %run — confirm against 0_config_setup.ipynb.
%run 0_config_setup.ipynb

In [None]:
import itertools
import json
import math
import random
from datetime import datetime

import torch
import torch.nn as nn
import wandb
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer, get_scheduler

# set_seed and SEED are not defined in this notebook; presumably both come from
# the config notebook and this seeds the RNGs for reproducibility — TODO confirm.
set_seed(SEED)

## Load Human Feedback Data

In [None]:
print(f"Loading human feedback from {HUMAN_PREFERENCES}...")

# The file is read as JSON Lines: every non-empty line is parsed as one JSON value
# and appended to human_feedback in file order.
human_feedback = []
try:
    with open(HUMAN_PREFERENCES, 'r', encoding='utf-8') as f:
        for line in f:
            # A blank line (e.g. an extra newline at the end of the file or a gap
            # left by manual editing) is not a record; json.loads would raise on it
            # and abort the whole load, so skip it.
            if not line.strip():
                continue
            human_feedback.append(json.loads(line))
    print(f"✓ Loaded {len(human_feedback)} feedback entries")
except FileNotFoundError:
    # Nothing to fine-tune on: explain how to produce the file, then re-raise so
    # the notebook stops here instead of failing later with an empty dataset.
    print("No human feedback file found!")
    print("Please collect feedback using notebook 4 first.")
    raise

# Advisory only: execution continues regardless of the amount of feedback.
if len(human_feedback) < 100:
    print(f"\n⚠️ Warning: Only {len(human_feedback)} feedback entries found.")
    print("Recommended: 500+ entries for effective fine-tuning.")
    print("Continue anyway? The model may not improve significantly with limited data.")

## Convert to Preference Pairs

In [None]:
def extract_preference_pairs(feedback_entries):
    """Convert human feedback entries into (chosen, rejected) preference pairs.

    Args:
        feedback_entries: iterable of dicts. Each entry must provide 'source' and
            'candidates' (a sequence of dicts with a 'translation' key). It may
            provide 'user_ranking' (a list of 1-based candidate numbers, best
            first) and/or 'custom_translation' (text written by the user).

    Returns:
        A list of dicts with keys 'source', 'chosen', 'rejected' and 'type'
        ('ranking' or 'custom'). Ranking pairs come first for each entry,
        followed by custom-translation pairs.

    Raises:
        KeyError: if an entry lacks 'source' or 'candidates', or a used candidate
            lacks 'translation'.
    """
    preference_pairs = []

    def _add_pair(source, chosen, rejected, pair_type):
        # A pair whose two sides are identical carries no preference signal: the
        # Bradley-Terry margin is always zero, so it only adds a constant to the
        # loss and is always counted as a miss by a strict ">" accuracy.
        if chosen == rejected:
            return
        preference_pairs.append({
            'source': source,
            'chosen': chosen,
            'rejected': rejected,
            'type': pair_type
        })

    for entry in feedback_entries:
        source = entry['source']
        candidates = entry['candidates']
        ranking = entry.get('user_ranking', [])
        custom = entry.get('custom_translation')

        # Case 1: User provided ranking
        if ranking and len(ranking) >= 2:
            # ranking[0] is best, ranking[1] is second best, etc.; every earlier
            # position is preferred over every later one.
            for i in range(len(ranking)):
                for j in range(i + 1, len(ranking)):
                    better_idx = ranking[i] - 1  # Convert to 0-based
                    worse_idx = ranking[j] - 1

                    # Ignore rank numbers that do not refer to an existing
                    # candidate (this also rejects 0, which would otherwise wrap
                    # around to the last candidate).
                    if 0 <= better_idx < len(candidates) and 0 <= worse_idx < len(candidates):
                        _add_pair(
                            source,
                            candidates[better_idx]['translation'],
                            candidates[worse_idx]['translation'],
                            'ranking'
                        )

        # Case 2: User provided custom translation.
        # A whitespace-only string is treated the same as "no custom translation".
        if isinstance(custom, str) and not custom.strip():
            custom = None
        if custom:
            # Custom translation is better than all candidates
            for candidate in candidates:
                _add_pair(source, custom, candidate['translation'], 'custom')

    return preference_pairs

print("Converting feedback to preference pairs...")
human_preference_pairs = extract_preference_pairs(human_feedback)

print(f"\n✓ Extracted {len(human_preference_pairs)} preference pairs")
print(f"  From {len(human_feedback)} feedback entries")

# Split train/val at the feedback-entry level.
# One entry can expand into several pairs that share the same source and
# translations; slicing the flat pair list would put some of them in train and
# the rest in validation, so validation accuracy would be measured on text the
# model has already been trained on. Entries are shuffled with a seeded RNG so
# the validation set is not simply the most recently collected feedback, and the
# split stays reproducible.
shuffled_feedback = list(human_feedback)
random.Random(SEED).shuffle(shuffled_feedback)

train_entry_count = int(0.9 * len(shuffled_feedback))
train_pairs = extract_preference_pairs(shuffled_feedback[:train_entry_count])
val_pairs = extract_preference_pairs(shuffled_feedback[train_entry_count:])
train_size = len(train_pairs)

print(f"\nTrain: {len(train_pairs)} pairs")
print(f"Validation: {len(val_pairs)} pairs")

if not val_pairs:
    print("⚠️ Warning: validation split is empty; validation metrics will not be meaningful.")

## Load Cold-Start Reward Model

In [None]:
print(f"Loading reward model from {REWARD_MODEL_COLD_START}...")

# Tokenizer is read from the cold-start reward-model directory. When it defines no
# padding token, the end-of-sequence token is reused for padding.
rm_tokenizer = AutoTokenizer.from_pretrained(REWARD_MODEL_COLD_START)
if rm_tokenizer.pad_token is None:
    rm_tokenizer.pad_token = rm_tokenizer.eos_token

# Backbone is loaded from REWARD_BASE_MODEL in bfloat16 with automatic device
# placement. NOTE(review): trust_remote_code=True executes code shipped with the
# model repository — only use with a trusted source.
_base_model_load_options = {
    'torch_dtype': torch.bfloat16,
    'device_map': "auto",
    'trust_remote_code': True,
}
base_model = AutoModelForCausalLM.from_pretrained(REWARD_BASE_MODEL, **_base_model_load_options)

# Recreate reward model
class RewardModel(nn.Module):
    """Scalar reward model: a language-model backbone plus a small reward head.

    The backbone's last-layer hidden state at the final attended token of each
    sequence is fed through the head to produce one reward per sequence.

    Args:
        base_model: backbone module. It must expose ``config.hidden_size`` and,
            when called with ``output_hidden_states=True``, return an object
            whose ``hidden_states[-1]`` is indexable as (batch, position).
        hidden_dim: width of the hidden layer of the 'mlp' head (unused for
            'linear').
        head_type: 'linear' (single Linear layer) or 'mlp'
            (Linear -> ReLU -> Dropout(0.1) -> Linear).

    Raises:
        ValueError: if ``head_type`` is neither 'linear' nor 'mlp'.
    """

    def __init__(self, base_model, hidden_dim=256, head_type='mlp'):
        super().__init__()
        self.base_model = base_model
        self.head_type = head_type
        self.hidden_size = base_model.config.hidden_size

        if head_type == 'linear':
            self.reward_head = nn.Linear(self.hidden_size, 1)
        elif head_type == 'mlp':
            self.reward_head = nn.Sequential(
                nn.Linear(self.hidden_size, hidden_dim),
                nn.ReLU(),
                nn.Dropout(0.1),
                nn.Linear(hidden_dim, 1)
            )
        else:
            # Fail at construction time; otherwise the model has no reward_head and
            # the first forward pass dies with an unrelated AttributeError.
            raise ValueError(
                f"Unknown head_type {head_type!r}; expected 'linear' or 'mlp'"
            )

    def forward(self, input_ids, attention_mask):
        """Return one reward per sequence as a 1-D tensor of length batch_size.

        Args:
            input_ids: token ids, indexed (batch, position).
            attention_mask: same shape as ``input_ids``; non-zero marks real
                tokens, zero marks padding.
        """
        outputs = self.base_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            output_hidden_states=True
        )
        hidden_states = outputs.hidden_states[-1]

        # Index of the last attended token in every row. Weighting the mask by
        # (position + 1) and taking the argmax finds it for both right- and
        # left-padded batches; `mask.sum() - 1` is only correct for right padding
        # and would select a padding position when the tokenizer pads on the left.
        batch_size, seq_len = attention_mask.shape
        positions = torch.arange(seq_len, device=attention_mask.device)
        last_token_idx = ((attention_mask != 0).long() * (positions + 1)).argmax(dim=1)

        # Build the index tensors on the device that holds the hidden states (with
        # sharded loading it can differ from the device of the inputs).
        last_token_idx = last_token_idx.to(hidden_states.device)
        batch_idx = torch.arange(batch_size, device=hidden_states.device)
        pooled = hidden_states[batch_idx, last_token_idx]

        # The backbone may run in reduced precision (e.g. bfloat16) while the head
        # keeps its own dtype; align the activations with the head's parameters so
        # the matmul does not fail on mismatched dtypes/devices.
        head_param = next(self.reward_head.parameters())
        pooled = pooled.to(device=head_param.device, dtype=head_param.dtype)

        reward = self.reward_head(pooled)
        return reward.squeeze(-1)

# Wrap the backbone with a reward head configured from RM_HIDDEN_DIM / RM_HEAD_TYPE
reward_model = RewardModel(base_model=base_model, hidden_dim=RM_HIDDEN_DIM, head_type=RM_HEAD_TYPE)

# Load cold-start weights (deserialised onto CPU, then copied into the model)
_cold_start_weights_file = REWARD_MODEL_COLD_START / "reward_model.pt"
checkpoint = torch.load(_cold_start_weights_file, map_location='cpu')
reward_model.load_state_dict(checkpoint['model_state_dict'])

print("✓ Cold-start reward model loaded")

# Freeze every parameter first ...
reward_model.requires_grad_(False)

# ... then re-enable gradients only for the reward head and the last four
# backbone layers (reached through base_model.model.layers).
reward_model.reward_head.requires_grad_(True)
for layer in reward_model.base_model.model.layers[-4:]:
    layer.requires_grad_(True)

_trainable_param_count = sum(p.numel() for p in reward_model.parameters() if p.requires_grad)
print(f"Trainable parameters: {_trainable_param_count / 1e6:.2f}M")

## Create DataLoaders

In [None]:
class PreferenceDataset(Dataset):
    """Map-style dataset that tokenizes (chosen, rejected) translation pairs.

    Args:
        data: indexable collection of dicts with 'source', 'chosen' and
            'rejected' entries.
        tokenizer: callable invoked as
            ``tokenizer(text, max_length=..., padding='max_length',
            truncation=True, return_tensors='pt')``; its result must provide
            'input_ids' and 'attention_mask' tensors with a leading dimension
            that can be squeezed away.
        max_length: length every sequence is padded/truncated to.
    """

    def __init__(self, data, tokenizer, max_length=512):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def _encode(self, text):
        """Tokenize one prompt string with the dataset's fixed-length settings."""
        return self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

    def __getitem__(self, idx):
        record = self.data[idx]

        # Both sides share the same source and prompt layout; only the translation
        # text differs.
        preferred_text = f"Source: {record['source']}\nTranslation: {record['chosen']}"
        dispreferred_text = f"Source: {record['source']}\nTranslation: {record['rejected']}"

        preferred = self._encode(preferred_text)
        dispreferred = self._encode(dispreferred_text)

        # squeeze(0) drops the leading dimension added by return_tensors='pt' so
        # the DataLoader can stack items into a batch.
        sample = {}
        sample['chosen_input_ids'] = preferred['input_ids'].squeeze(0)
        sample['chosen_attention_mask'] = preferred['attention_mask'].squeeze(0)
        sample['rejected_input_ids'] = dispreferred['input_ids'].squeeze(0)
        sample['rejected_attention_mask'] = dispreferred['attention_mask'].squeeze(0)
        return sample

# Wrap the train / validation pair lists in tokenizing datasets
train_dataset = PreferenceDataset(train_pairs, rm_tokenizer, max_length=RM_MAX_LENGTH)
val_dataset = PreferenceDataset(val_pairs, rm_tokenizer, max_length=RM_MAX_LENGTH)

# Both loaders use the same batch size and load in the main process
# (num_workers=0); only the training loader is shuffled.
_shared_loader_options = {'batch_size': RM_BATCH_SIZE, 'num_workers': 0}
train_loader = DataLoader(train_dataset, shuffle=True, **_shared_loader_options)
val_loader = DataLoader(val_dataset, shuffle=False, **_shared_loader_options)

print(f"Train batches: {len(train_loader)}")
print(f"Validation batches: {len(val_loader)}")

## Fine-tune Reward Model

In [None]:
# Training setup
def bradley_terry_loss(chosen_rewards, rejected_rewards):
    """Mean Bradley-Terry preference loss, -log(sigmoid(chosen - rejected)).

    Args:
        chosen_rewards: tensor of rewards for the preferred items.
        rejected_rewards: tensor of rewards for the rejected items, broadcastable
            against ``chosen_rewards``.

    Returns:
        Scalar tensor: the mean loss over all pairs.
    """
    # logsigmoid evaluates log(sigmoid(x)) in a numerically stable way. Computing
    # sigmoid first underflows to 0 for a strongly negative margin, and log(0)
    # then turns the whole batch loss into inf.
    return -torch.nn.functional.logsigmoid(chosen_rewards - rejected_rewards).mean()

# Only parameters that still require gradients are handed to the optimizer.
optimizer = torch.optim.AdamW(
    [p for p in reward_model.parameters() if p.requires_grad],
    lr=RM_LEARNING_RATE * 0.5  # Lower LR for fine-tuning
)

# Number of optimizer steps the scheduler is sized for: one per started
# gradient-accumulation window in every epoch. This is an upper bound on the steps
# actually taken. Sizing it too high only leaves the cosine decay slightly
# unfinished, whereas sizing it too low lets the cosine schedule run past its end
# and the learning rate climbs again. max(1, ...) keeps the scheduler well-defined
# for a tiny or empty training set.
steps_per_epoch = math.ceil(len(train_loader) / RM_GRADIENT_ACCUMULATION_STEPS)
num_training_steps = max(1, steps_per_epoch * RM_EPOCHS)
lr_scheduler = get_scheduler(
    "cosine",
    optimizer=optimizer,
    num_warmup_steps=num_training_steps // 10,  # warm up over the first 10% of steps
    num_training_steps=num_training_steps
)

if USE_WANDB:
    wandb.init(
        project=WANDB_PROJECT,
        name="reward-model-human-aligned",
        config={'stage': 'human_alignment', 'base': 'cold_start_rm'}
    )

print("Starting reward model fine-tuning with human preferences...\n")

In [None]:
# Training functions (same as notebook 2)
def train_epoch(model, loader, optimizer, scheduler, device, gradient_accumulation_steps=1):
    """Run one training epoch over ``loader`` with gradient accumulation.

    Args:
        model: callable as ``model(input_ids, attention_mask)`` returning one
            reward per sequence.
        loader: iterable of batches with 'chosen_input_ids',
            'chosen_attention_mask', 'rejected_input_ids' and
            'rejected_attention_mask' tensors.
        optimizer: optimizer stepped once per accumulation window.
        scheduler: LR scheduler stepped together with the optimizer.
        device: device the batch tensors are moved to.
        gradient_accumulation_steps: number of micro-batches per optimizer step.

    Returns:
        (mean loss, mean accuracy) over the micro-batches of the epoch, or
        (nan, nan) when ``loader`` yields no batches.
    """
    model.train()
    total_loss = 0
    total_accuracy = 0
    num_batches = 0

    def _apply_update():
        # Clip, step optimizer and scheduler, then reset gradients for the next window.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    optimizer.zero_grad()
    pending_micro_batches = 0  # micro-batches accumulated since the last update

    pbar = tqdm(loader, desc="Training")
    for step, batch in enumerate(pbar):
        chosen_input_ids = batch['chosen_input_ids'].to(device)
        chosen_attention_mask = batch['chosen_attention_mask'].to(device)
        rejected_input_ids = batch['rejected_input_ids'].to(device)
        rejected_attention_mask = batch['rejected_attention_mask'].to(device)

        chosen_rewards = model(chosen_input_ids, chosen_attention_mask)
        rejected_rewards = model(rejected_input_ids, rejected_attention_mask)

        loss = bradley_terry_loss(chosen_rewards, rejected_rewards)
        # Scale so the accumulated gradient is the mean over the window.
        loss = loss / gradient_accumulation_steps
        loss.backward()
        pending_micro_batches += 1

        # Fraction of pairs where the chosen item is scored strictly higher.
        accuracy = (chosen_rewards > rejected_rewards).float().mean()

        # Undo the accumulation scaling so the reported loss is per micro-batch.
        total_loss += loss.item() * gradient_accumulation_steps
        total_accuracy += accuracy.item()
        num_batches += 1

        if pending_micro_batches == gradient_accumulation_steps:
            _apply_update()
            pending_micro_batches = 0

        pbar.set_postfix({
            'loss': f"{total_loss / num_batches:.4f}",
            'acc': f"{total_accuracy / num_batches:.4f}"
        })

    # Apply the gradients of a trailing, incomplete accumulation window. Without
    # this they are thrown away by the zero_grad() at the start of the next epoch,
    # and a loader with fewer batches than gradient_accumulation_steps would never
    # update the model at all. The scheduler should therefore be sized for
    # ceil(len(loader) / gradient_accumulation_steps) steps per epoch.
    if pending_micro_batches > 0:
        _apply_update()

    if num_batches == 0:
        # Empty loader: nothing was trained; report NaN instead of dividing by zero.
        return float('nan'), float('nan')

    return total_loss / num_batches, total_accuracy / num_batches


def validate(model, loader, device):
    """Evaluate the reward model on ``loader`` without gradient tracking.

    Args:
        model: callable as ``model(input_ids, attention_mask)`` returning one
            reward per sequence.
        loader: iterable of batches with 'chosen_input_ids',
            'chosen_attention_mask', 'rejected_input_ids' and
            'rejected_attention_mask' tensors.
        device: device the batch tensors are moved to.

    Returns:
        (mean loss, mean accuracy) over the batches, or (nan, nan) when
        ``loader`` yields no batches (e.g. an empty validation split).
    """
    model.eval()
    total_loss = 0
    total_accuracy = 0
    num_batches = 0

    with torch.no_grad():
        for batch in tqdm(loader, desc="Validation"):
            chosen_input_ids = batch['chosen_input_ids'].to(device)
            chosen_attention_mask = batch['chosen_attention_mask'].to(device)
            rejected_input_ids = batch['rejected_input_ids'].to(device)
            rejected_attention_mask = batch['rejected_attention_mask'].to(device)

            chosen_rewards = model(chosen_input_ids, chosen_attention_mask)
            rejected_rewards = model(rejected_input_ids, rejected_attention_mask)

            loss = bradley_terry_loss(chosen_rewards, rejected_rewards)
            # Fraction of pairs where the chosen item is scored strictly higher.
            accuracy = (chosen_rewards > rejected_rewards).float().mean()

            total_loss += loss.item()
            total_accuracy += accuracy.item()
            num_batches += 1

    if num_batches == 0:
        # With very little feedback the validation split can be empty; report NaN
        # rather than crashing the whole training run with ZeroDivisionError.
        return float('nan'), float('nan')

    return total_loss / num_batches, total_accuracy / num_batches

# Train
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
reward_model = reward_model.to(device)

best_val_accuracy = 0
best_epoch = None   # 0-based epoch whose weights this run wrote to disk (None = nothing saved yet)
last_epoch = None   # 0-based index of the most recently finished epoch
best_checkpoint_path = REWARD_MODEL_HUMAN_ALIGNED / "reward_model.pt"

for epoch in range(RM_EPOCHS):
    print(f"\nEpoch {epoch + 1}/{RM_EPOCHS}")
    print("=" * 80)

    train_loss, train_acc = train_epoch(
        reward_model, train_loader, optimizer, lr_scheduler, device,
        gradient_accumulation_steps=RM_GRADIENT_ACCUMULATION_STEPS
    )

    val_loss, val_acc = validate(reward_model, val_loader, device)
    last_epoch = epoch

    print(f"\nTrain Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f}")
    print(f"Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.4f}")

    if USE_WANDB:
        wandb.log({
            'epoch': epoch + 1,
            'train_loss': train_loss,
            'train_accuracy': train_acc,
            'val_loss': val_loss,
            'val_accuracy': val_acc
        })

    # Always checkpoint the first epoch so the directory holds a model from THIS
    # run even if validation accuracy never rises above its initial value; after
    # that, only checkpoint on improvement.
    if best_epoch is None or val_acc > best_val_accuracy:
        best_val_accuracy = val_acc
        best_epoch = epoch
        print(f"\n✓ New best validation accuracy: {best_val_accuracy:.4f}")

        REWARD_MODEL_HUMAN_ALIGNED.mkdir(exist_ok=True, parents=True)
        torch.save({
            'model_state_dict': reward_model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'epoch': epoch,
            'val_accuracy': val_acc,
            'config': {
                'base_model': REWARD_BASE_MODEL,
                'head_type': RM_HEAD_TYPE,
                'hidden_dim': RM_HIDDEN_DIM,
                'stage': 'human_aligned'
            }
        }, best_checkpoint_path)

        rm_tokenizer.save_pretrained(REWARD_MODEL_HUMAN_ALIGNED)

# The in-memory model holds the weights of the LAST epoch, which are not
# necessarily the best ones. Later cells keep using `reward_model` and record the
# checkpoint directory as the reward model that was used, so make the in-memory
# weights match the checkpoint on disk. Skipped when the last epoch was the best.
if best_epoch is not None and best_epoch != last_epoch:
    best_state = torch.load(best_checkpoint_path, map_location='cpu')
    reward_model.load_state_dict(best_state['model_state_dict'])
    del best_state  # release the CPU copy of the weights
    print(f"\n✓ Restored best reward model weights (epoch {best_epoch + 1})")

print(f"\n{'=' * 80}")
print("Reward model fine-tuning complete!")
print(f"Best validation accuracy: {best_val_accuracy:.4f}")

## Re-run PPO with Human-Aligned Reward Model

Now use the human-aligned reward model to further optimize the translation model.

In [None]:
# This is similar to notebook 3, but using the human-aligned reward model
print("\nPreparing for final PPO optimization...")
print("This will use the human-aligned reward model.\n")

from trl import PPOTrainer, PPOConfig, AutoModelForCausalLMWithValueHead
# NOTE(review): from here on `Dataset` refers to datasets.Dataset and no longer to
# torch.utils.data.Dataset imported at the top of the notebook.
from datasets import Dataset

# Policy tokenizer comes from the cold-start PPO model directory; when it defines
# no padding token, the end-of-sequence token is reused.
print(f"Loading policy model from {PPO_MODEL_COLD_START}...")
policy_tokenizer = AutoTokenizer.from_pretrained(PPO_MODEL_COLD_START)
if policy_tokenizer.pad_token is None:
    policy_tokenizer.pad_token = policy_tokenizer.eos_token

# Policy and reference model are loaded from the same directory with the same options.
_value_head_load_options = dict(
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,
)

policy_model = AutoModelForCausalLMWithValueHead.from_pretrained(
    PPO_MODEL_COLD_START, **_value_head_load_options
)
print("✓ Policy model loaded")

# The reference model is a second copy with gradients disabled on all parameters.
ref_model = AutoModelForCausalLMWithValueHead.from_pretrained(
    PPO_MODEL_COLD_START, **_value_head_load_options
)
ref_model.requires_grad_(False)
print("✓ Reference model loaded")

# The reward model fine-tuned above is reused in place: switched to eval mode
# with gradients disabled on all parameters.
reward_model.eval()
reward_model.requires_grad_(False)
print("✓ Using human-aligned reward model")

In [None]:
# Prepare training prompts (no parallel corpus needed)
try:
    all_data = load_test_prompts(TEST_PROMPTS)
except Exception as exc:
    # Best-effort fallback is kept, but it must be loud: silently optimizing the
    # final model on three placeholder sentences would go unnoticed otherwise.
    # `except Exception` (instead of a bare `except:`) lets KeyboardInterrupt and
    # SystemExit propagate.
    print(f"⚠️ Could not load training prompts ({type(exc).__name__}: {exc}).")
    print("   Falling back to 3 built-in placeholder sentences repeated 1000 times;")
    print("   do NOT use the resulting model for production.")
    all_data = [
        {"source": "Hello, how are you?", "source_lang": "en"},
        {"source": "Good morning.", "source_lang": "en"},
        {"source": "Bonjour, comment allez-vous?", "source_lang": "fr"},
    ] * 1000

# Use at most the first 5000 items; each prompt keeps its raw source text next to
# the formatted query.
prompts = []
for item in all_data[:5000]:
    prompt = format_translation_prompt(item['source'], item['source_lang'])
    prompts.append({'query': prompt, 'source': item['source']})

dataset = Dataset.from_list(prompts)
print(f"Loaded {len(dataset)} training prompts")

In [None]:
# PPO configuration
# The reward-model fine-tuning run may still be the active wandb run. Close it so
# the PPO tracker opens its own run instead of appending PPO metrics to the
# reward-model run.
if USE_WANDB and wandb.run is not None:
    wandb.finish()

ppo_config = PPOConfig(
    model_name=str(PPO_MODEL_COLD_START),
    learning_rate=PPO_LEARNING_RATE * 0.5,  # Lower LR for fine-tuning
    batch_size=PPO_BATCH_SIZE,
    mini_batch_size=PPO_MINI_BATCH_SIZE,
    gradient_accumulation_steps=PPO_GRADIENT_ACCUMULATION_STEPS,
    ppo_epochs=PPO_EPOCHS,
    init_kl_coef=KL_PENALTY_COEF,
    target_kl=0.1,
    cliprange=CLIP_RANGE,
    cliprange_value=VALUE_CLIP_RANGE,
    vf_coef=0.1,
    seed=SEED,
    log_with="wandb" if USE_WANDB else None,
    tracker_project_name=WANDB_PROJECT,
    # tracker_kwargs is forwarded as per-tracker init kwargs, so the options must
    # be nested under the tracker's name; a top-level "name" key is ignored and
    # the run would get an auto-generated name.
    tracker_kwargs={"wandb": {"name": "ppo-human-aligned"}},
)

# Initialize PPO trainer
ppo_trainer = PPOTrainer(
    config=ppo_config,
    model=policy_model,
    ref_model=ref_model,
    tokenizer=policy_tokenizer,
    dataset=dataset,
)

print("PPO Trainer initialized for final optimization!")

In [None]:
# Reward function using human-aligned reward model
def compute_reward(source_texts, translations):
    """Score each (source, translation) pair with the reward model.

    Pairs are scored one at a time using the same "Source: ...\\nTranslation: ..."
    layout and fixed-length tokenization as the reward-model training data.

    Args:
        source_texts: iterable of source strings.
        translations: iterable of translation strings, paired positionally with
            ``source_texts`` (extra items on the longer side are ignored by zip).

    Returns:
        A list with one CPU tensor per pair, each being the reward model's output
        for a batch of one.
    """
    scores = []
    for src, hyp in zip(source_texts, translations):
        prompt = f"Source: {src}\nTranslation: {hyp}"
        encoded = rm_tokenizer(
            prompt,
            max_length=RM_MAX_LENGTH,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        # Move the encoded inputs to the device the reward backbone lives on.
        encoded = encoded.to(reward_model.base_model.device)

        with torch.no_grad():
            score = reward_model(encoded['input_ids'], encoded['attention_mask'])
        scores.append(score.cpu())
    return scores

# Sampling configuration passed to ppo_trainer.generate for every batch.
# NOTE(review): top_k / top_p truncation during PPO rollouts is worth
# double-checking against the TRL guidance on generation settings — confirm.
generation_kwargs = dict(
    max_new_tokens=PPO_MAX_NEW_TOKENS,
    temperature=PPO_TEMPERATURE,
    top_k=50,
    top_p=0.95,
    do_sample=True,
    pad_token_id=policy_tokenizer.pad_token_id,
    eos_token_id=policy_tokenizer.eos_token_id,
)

print("Starting final PPO training with human-aligned rewards...\n")
print(f"Total steps: {PPO_STEPS}")
print("=" * 80)

In [None]:
# Training loop
# Each iteration: generate translations for a batch of prompts, score them with
# the reward model, and run one PPO optimisation step.
completed_steps = 0
for step, batch in enumerate(tqdm(ppo_trainer.dataloader, total=PPO_STEPS)):
    if step >= PPO_STEPS:
        break

    # The prompt dataset is built from {'query', 'source'} records and is never
    # tokenized, so the batch normally carries raw 'query' strings and no
    # 'input_ids'. Tokenize them here; a pre-tokenized batch is used as-is.
    if 'input_ids' in batch:
        query_tensors = batch['input_ids']
    else:
        ppo_device = ppo_trainer.accelerator.device
        query_tensors = [
            policy_tokenizer(query, return_tensors='pt')['input_ids'][0].to(ppo_device)
            for query in batch['query']
        ]

    response_tensors = ppo_trainer.generate(query_tensors, return_prompt=False, **generation_kwargs)

    batch_texts = policy_tokenizer.batch_decode(query_tensors, skip_special_tokens=True)
    response_texts = policy_tokenizer.batch_decode(response_tensors, skip_special_tokens=True)

    # Extract source texts by cutting them back out of the decoded prompt. The
    # markers must match the prompt template; if neither is found the whole
    # prompt is used as the source.
    source_texts = []
    for text in batch_texts:
        if "English text to Arabic:" in text:
            source = text.split("English text to Arabic:")[1].split("\n\nArabic translation:")[0].strip()
        elif "French text to Arabic:" in text:
            source = text.split("French text to Arabic:")[1].split("\n\nArabic translation:")[0].strip()
        else:
            source = text
        source_texts.append(source)

    rewards = compute_reward(source_texts, response_texts)
    stats = ppo_trainer.step(query_tensors, response_tensors, rewards)

    if step % 10 == 0:
        # log_stats expects the decoded responses next to the queries in the batch
        # when logging to wandb.
        batch['response'] = response_texts
        ppo_trainer.log_stats(stats, batch, rewards)

    if step % 50 == 0:
        # rewards is a list of small tensors; flatten and concatenate before averaging.
        mean_reward = torch.cat([r.reshape(-1) for r in rewards]).float().mean().item()
        print(f"\nStep {step}: Mean reward: {mean_reward:.4f} | KL: {stats['objective/kl']:.4f}")

    if step > 0 and step % 200 == 0:
        checkpoint_path = PPO_MODEL_FINAL / f"checkpoint-{step}"
        checkpoint_path.mkdir(exist_ok=True, parents=True)
        ppo_trainer.model.save_pretrained(checkpoint_path)
        policy_tokenizer.save_pretrained(checkpoint_path)
        print(f"\n✓ Checkpoint saved to {checkpoint_path}")

    completed_steps = step + 1

# The loop makes a single pass over the dataloader, so it can stop early when there
# are fewer batches than PPO_STEPS. Say so instead of finishing silently.
if completed_steps < PPO_STEPS:
    print(f"\n⚠️ Dataloader exhausted after {completed_steps} of {PPO_STEPS} planned steps.")

print("\n" + "=" * 80)
print("Final PPO training complete!")

## Save Final Production Model

In [None]:
print(f"Saving final production model to {PPO_MODEL_FINAL}...")

PPO_MODEL_FINAL.mkdir(exist_ok=True, parents=True)
ppo_trainer.model.save_pretrained(PPO_MODEL_FINAL)
policy_tokenizer.save_pretrained(PPO_MODEL_FINAL)

# Provenance record written next to the model. Every value must be
# JSON-serialisable, so path-like settings are converted with str() (a no-op for
# values that are already strings).
training_info = {
    'base_model': str(SFT_MODEL_PATH),
    'reward_model': str(REWARD_MODEL_HUMAN_ALIGNED),
    'stage': 'human_aligned',
    'human_feedback_entries': len(human_feedback),
    'preference_pairs': len(human_preference_pairs),
    'ppo_steps': PPO_STEPS,
    'final_training': datetime.now().isoformat()
}

with open(PPO_MODEL_FINAL / "training_info.json", 'w', encoding='utf-8') as f:
    json.dump(training_info, f, indent=2)

print("✓ Final model saved successfully!")
print(f"\nPath: {PPO_MODEL_FINAL}")
print("\nThis model is now ready for production use!")

# Close the wandb run that is still open at this point.
if USE_WANDB:
    wandb.finish()

## Summary

**Congratulations!** You have completed the full RLHF pipeline:

1. ✅ **Phase 1**: Supervised Fine-Tuning (Gemma-2-9B - pre-completed)
2. ✅ **Phase 2.1**: Generated synthetic preference data from automatic metrics
3. ✅ **Phase 2.2**: Trained cold-start reward model
4. ✅ **Phase 2.3**: Ran PPO optimization with synthetic rewards
5. ✅ **Phase 3.1**: Collected human feedback and fine-tuned the reward model
6. ✅ **Phase 3.2**: Re-ran PPO with human-aligned rewards

### Final Models:
- **Translation Model**: saved to the directory configured as `PPO_MODEL_FINAL`
- **Reward Model**: saved to the directory configured as `REWARD_MODEL_HUMAN_ALIGNED`

### Continuous Improvement:
To continuously improve the system:
1. Keep collecting user feedback using notebook 4
2. Periodically re-run this notebook to update models
3. Monitor translation quality metrics

### Next Steps:
- Deploy the final model in production
- Set up automated feedback collection
- Schedule regular retraining cycles