In [1]:
# Run setup from config notebook
%run 0_config_setup.ipynb

GPU Available: True
Number of GPUs: 2
  GPU 0: NVIDIA GeForce RTX 5090
    Memory: 33.67 GB
  GPU 1: NVIDIA GeForce RTX 5090
    Memory: 33.67 GB

Total VRAM: 67.34 GB
No Hugging Face token found. Please run `huggingface-cli login`.
‚úÖ Detected project directory: .
üìÅ Data directory: data
üìÅ Models directory: models
‚ö†Ô∏è Translation data not found at: Data/english-arabic
‚úÖ Directory structure and model paths are ready!
HPC Config: 2 GPUs, Flash Attention: False
COMET Config: model=Unbabel/wmt22-cometkiwi-da, batch_size=64, gpu=1
‚úÖ Configuration loaded successfully!
   - COMET enabled for scoring
   - LoRA enabled: True
   - KL penalty: 0.15
   - Sample size: 100,000
   - Batch sizes: Generation=64, RM=32, PPO=32
Utility functions loaded!
Configuration saved to config.json


In [2]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from tqdm import tqdm
import json
import random
import gc
import os
import time
from pathlib import Path

# ===========================
# CONFIGURATION: EVALUATION METHOD
# ===========================
USE_COMET = False  # Set to True to use COMET model, False for fast heuristic-based scoring

set_seed(SEED)

print("Synthetic Data Generation Pipeline")
print("=" * 80)
print(f"Scoring method: {'COMET-based' if USE_COMET else 'Heuristic-based'}")
print(f"Heuristic metrics: Length ratio, Punctuation presence, Non-empty validation")
print("=" * 80)


Synthetic Data Generation Pipeline
Scoring method: Heuristic-based
Heuristic metrics: Length ratio, Punctuation presence, Non-empty validation


## Load SFT Model

In [3]:
# Clear GPU cache and set CUDA environment variables
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    gc.collect()
    print("GPU cache cleared")

# CUDA environment variables for optimized memory management
os.environ['CUDA_LAUNCH_BLOCKING'] = '0'
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

# Unload COMET model if loaded to free memory for SFT model
if USE_COMET and 'comet_model' in globals() and comet_model is not None:
    try:
        del comet_model
        torch.cuda.empty_cache()
        torch.cuda.reset_peak_memory_stats()
        print("COMET unloaded from GPU memory")
    except Exception as e:
        print(f"Warning: Failed to unload COMET: {e}")

# ===========================
# MODEL LOADING CONFIGURATION
# ===========================
FORCE_CPU = False
USE_BFLOAT16 = True

print("\nModel Loading Configuration:")
print(f"Total GPUs: {NUM_GPUS}")
print(f"Total VRAM: {NUM_GPUS * 31.36:.2f}GB")
print(f"SFT Model: 28.9B parameters")
print(f"Quantization: 8-bit")
print(f"Precision: {'bfloat16' if USE_BFLOAT16 else 'float32'}")


GPU cache cleared

Model Loading Configuration:
Total GPUs: 2
Total VRAM: 62.72GB
SFT Model: 28.9B parameters
Quantization: 8-bit
Precision: bfloat16


In [4]:
print("\nLoading SFT model from Hugging Face...")
model_name = "ModelSpace/GemmaX2-28-9B-v0.1"

if FORCE_CPU:
    print("Loading model on CPU")
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map={"": "cpu"},
        torch_dtype=torch.float32,
        trust_remote_code=True,
        low_cpu_mem_usage=True
    )
else:
    print(f"Loading model with 8-bit quantization across {NUM_GPUS} GPUs")
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    
    # Configure memory for both GPUs
    max_memory = {
        0: "31GB",
        1: "31GB",
        "cpu": "64GB"
    }
    
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="auto",  # Automatically distributes model layers across GPUs
        load_in_8bit=True,
        torch_dtype=torch.float16,
        trust_remote_code=True,
        low_cpu_mem_usage=True,
        max_memory=max_memory
    )
    
    print("\nGPU Memory Allocation:")
    if torch.cuda.is_available():
        for i in range(NUM_GPUS):
            allocated = torch.cuda.memory_allocated(i) / 1e9
            reserved = torch.cuda.memory_reserved(i) / 1e9
            total = torch.cuda.get_device_properties(i).total_memory / 1e9
            print(f"  GPU {i}: {allocated:.1f}GB allocated, {reserved:.1f}GB reserved, {total:.1f}GB total")

# Set padding token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = tokenizer.eos_token_id

tokenizer.padding_side = "left"

# Report detailed device placement
try:
    if hasattr(model, 'hf_device_map'):
        devices_used = set(str(v) for v in model.hf_device_map.values())
        print(f"\nModel layers distributed across: {devices_used}")
        
        # Count layers per device
        device_layer_count = {}
        for layer_name, device in model.hf_device_map.items():
            device_str = str(device)
            device_layer_count[device_str] = device_layer_count.get(device_str, 0) + 1
        
        print("Layer distribution:")
        for device, count in sorted(device_layer_count.items()):
            print(f"  Device {device}: {count} layers")
    else:
        print(f"Model device: {next(model.parameters()).device}")
except Exception as e:
    print(f"Device info: {e}")

total_params = sum(p.numel() for p in model.parameters()) / 1e9
print(f"\nModel size: {total_params:.2f}B parameters")
print("Model ready for inference (layers distributed across GPUs)")



Loading SFT model from Hugging Face...
Loading model with 8-bit quantization across 2 GPUs


`torch_dtype` is deprecated! Use `dtype` instead!
The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]


GPU Memory Allocation:
  GPU 0: 4.4GB allocated, 4.5GB reserved, 33.7GB total
  GPU 1: 5.8GB allocated, 5.9GB reserved, 33.7GB total

Model layers distributed across: {'1', '0'}
Layer distribution:
  Device 0: 15 layers
  Device 1: 31 layers

Model size: 9.24B parameters
Model ready for inference (layers distributed across GPUs)


## Load Training Data

In [18]:
# ===========================
# LOAD TRAINING DATA (EN + FR)
# ===========================
USE_SAMPLES = False  # Set False for full dataset, True for samples

print("\nLoading source data for synthetic translation generation...")
print(f"Data source: {'SAMPLES' if USE_SAMPLES else 'FULL'}\n")

all_data = []

# Load English data
english_inputs_path = PROJECT_DIR / ("data/english_inputs_samples.json" if USE_SAMPLES else "data/english_inputs.json")

if english_inputs_path.exists():
    with open(english_inputs_path, 'r', encoding='utf-8') as f:
        english_data = json.load(f)
    
    if isinstance(english_data, list):
        for item in english_data:
            if isinstance(item, str):
                all_data.append({'source': item, 'source_lang': 'en'})
            elif isinstance(item, dict):
                text = item.get('text', item.get('source', item.get('sentence', '')))
                if text:
                    all_data.append({'source': text, 'source_lang': 'en'})
    print(f"Loaded {len(english_data)} English samples")
else:
    print(f"Warning: {english_inputs_path.name} not found")

# Load French data
french_inputs_path = PROJECT_DIR / "data/french_inputs.json"

if french_inputs_path.exists():
    with open(french_inputs_path, 'r', encoding='utf-8') as f:
        french_data = json.load(f)
    
    if isinstance(french_data, list):
        for item in french_data:
            if isinstance(item, str):
                all_data.append({'source': item, 'source_lang': 'fr'})
            elif isinstance(item, dict):
                text = item.get('text', item.get('source', item.get('sentence', '')))
                if text:
                    all_data.append({'source': text, 'source_lang': 'fr'})
    print(f"Loaded {len(french_data)} French samples")
else:
    print(f"Warning: french_inputs.json not found")

print(f"\nTotal available data: {len(all_data):,} samples")

# ===========================
# SAMPLE DATA FOR BALANCED TRAINING
# ===========================
SAMPLE_SIZE_PER_LANG = 10_000  # 10K per language = 20K total

en_data = [s for s in all_data if s['source_lang'] == 'en']
fr_data = [s for s in all_data if s['source_lang'] == 'fr']

print(f"Available by language:")
print(f"  English: {len(en_data):,} samples")
print(f"  French: {len(fr_data):,} samples")

random.shuffle(en_data)
random.shuffle(fr_data)

en_samples = en_data[:min(SAMPLE_SIZE_PER_LANG, len(en_data))]
fr_samples = fr_data[:min(SAMPLE_SIZE_PER_LANG, len(fr_data))]

training_samples = en_samples + fr_samples
random.shuffle(training_samples)

total_samples = len(training_samples)
en_pct = 100 * len(en_samples) / total_samples if total_samples > 0 else 0
fr_pct = 100 * len(fr_samples) / total_samples if total_samples > 0 else 0

print(f"\nSampled {total_samples:,} samples for generation:")
print(f"  English to Arabic: {len(en_samples):,} ({en_pct:.1f}%)")
print(f"  French to Arabic: {len(fr_samples):,} ({fr_pct:.1f}%)")



Loading source data for synthetic translation generation...
Data source: FULL

Loaded 3294856 English samples
Loaded 3294856 English samples
Loaded 484003 French samples

Total available data: 3,778,859 samples
Loaded 484003 French samples

Total available data: 3,778,859 samples
Available by language:
  English: 3,294,856 samples
  French: 484,003 samples
Available by language:
  English: 3,294,856 samples
  French: 484,003 samples

Sampled 20,000 samples for generation:
  English to Arabic: 10,000 (50.0%)
  French to Arabic: 10,000 (50.0%)

Sampled 20,000 samples for generation:
  English to Arabic: 10,000 (50.0%)
  French to Arabic: 10,000 (50.0%)


## Generate Translation Candidates

In [6]:
# ===========================
# GENERATION CONFIGURATION
# ===========================
# OPTIMIZED FOR DUAL RTX 5090 (62GB VRAM)
MEGA_BATCH_SIZE = 16  # Increased from 4 to 16 (4x improvement)
NUM_CANDIDATES = 1     # Keep 1 candidate per method for speed
MAX_NEW_TOKENS = 128

print("\nGeneration Configuration (OPTIMIZED):")
print(f"  Batch size: {MEGA_BATCH_SIZE} (increased for dual GPU)")
print(f"  Candidates per source: {NUM_CANDIDATES}")
print(f"  Max tokens: {MAX_NEW_TOKENS}")
print(f"  Methods: 4 (temperature, top-k, nucleus, greedy)")
print(f"  Expected speedup: 3-4x faster with larger batches")

print("\nGPU Utilization Mode:")
print("  Model parallelism: Layers distributed across both GPUs")
print("  Each batch processes through both GPUs sequentially")
print("  Larger batches = better GPU utilization")



Generation Configuration (OPTIMIZED):
  Batch size: 16 (increased for dual GPU)
  Candidates per source: 1
  Max tokens: 128
  Methods: 4 (temperature, top-k, nucleus, greedy)
  Expected speedup: 3-4x faster with larger batches

GPU Utilization Mode:
  Model parallelism: Layers distributed across both GPUs
  Each batch processes through both GPUs sequentially
  Larger batches = better GPU utilization


In [7]:
# ===========================
# TRANSLATION GENERATION METHODS
# ===========================
# Four different sampling strategies for diverse translation candidates

def generate_with_temperature(sources, langs, num_candidates=1):
    """High temperature sampling for diverse outputs"""
    prompts = [format_translation_prompt(src, lang) for src, lang in zip(sources, langs)]
    inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True)
    input_ids = inputs["input_ids"].to(model.device)
    attention_mask = inputs["attention_mask"].to(model.device)
    
    outputs = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_new_tokens=MAX_NEW_TOKENS,
        do_sample=True,
        temperature=1.2,
        top_p=0.95,
        top_k=50,
        num_return_sequences=num_candidates,
        pad_token_id=tokenizer.eos_token_id
    )
    
    generated_texts = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    candidates = []
    
    for i in range(0, len(generated_texts), num_candidates):
        batch_candidates = []
        for j in range(num_candidates):
            if i + j < len(generated_texts):
                text = generated_texts[i + j]
                translation = text.split("Arabic translation:")[-1].strip() if "Arabic translation:" in text else text.strip()
                batch_candidates.append({
                    'translation': translation,
                    'method': 'temperature_sampling',
                    'config': {'temperature': 1.2, 'top_p': 0.95, 'top_k': 50}
                })
        candidates.append(batch_candidates)
    return candidates


def generate_with_topk(sources, langs, num_candidates=1):
    """Conservative top-k sampling"""
    prompts = [format_translation_prompt(src, lang) for src, lang in zip(sources, langs)]
    inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True)
    input_ids = inputs["input_ids"].to(model.device)
    attention_mask = inputs["attention_mask"].to(model.device)
    
    outputs = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_new_tokens=MAX_NEW_TOKENS,
        do_sample=True,
        temperature=0.7,
        top_k=30,
        top_p=0.9,
        num_return_sequences=num_candidates,
        pad_token_id=tokenizer.eos_token_id
    )
    
    generated_texts = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    candidates = []
    
    for i in range(0, len(generated_texts), num_candidates):
        batch_candidates = []
        for j in range(num_candidates):
            if i + j < len(generated_texts):
                text = generated_texts[i + j]
                translation = text.split("Arabic translation:")[-1].strip() if "Arabic translation:" in text else text.strip()
                batch_candidates.append({
                    'translation': translation,
                    'method': 'top_k_sampling',
                    'config': {'temperature': 0.7, 'top_k': 30, 'top_p': 0.9}
                })
        candidates.append(batch_candidates)
    return candidates


def generate_with_nucleus(sources, langs, num_candidates=1):
    """Nucleus (top-p) sampling for balanced diversity"""
    prompts = [format_translation_prompt(src, lang) for src, lang in zip(sources, langs)]
    inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True)
    input_ids = inputs["input_ids"].to(model.device)
    attention_mask = inputs["attention_mask"].to(model.device)
    
    outputs = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_new_tokens=MAX_NEW_TOKENS,
        do_sample=True,
        temperature=0.9,
        top_p=0.95,
        top_k=0,
        num_return_sequences=num_candidates,
        pad_token_id=tokenizer.eos_token_id
    )
    
    generated_texts = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    candidates = []
    
    for i in range(0, len(generated_texts), num_candidates):
        batch_candidates = []
        for j in range(num_candidates):
            if i + j < len(generated_texts):
                text = generated_texts[i + j]
                translation = text.split("Arabic translation:")[-1].strip() if "Arabic translation:" in text else text.strip()
                batch_candidates.append({
                    'translation': translation,
                    'method': 'nucleus_sampling',
                    'config': {'temperature': 0.9, 'top_p': 0.95}
                })
        candidates.append(batch_candidates)
    return candidates


def generate_with_greedy(sources, langs, num_candidates=1):
    """Greedy decoding for consistent outputs"""
    prompts = [format_translation_prompt(src, lang) for src, lang in zip(sources, langs)]
    inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True)
    input_ids = inputs["input_ids"].to(model.device)
    attention_mask = inputs["attention_mask"].to(model.device)
    
    outputs = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_new_tokens=MAX_NEW_TOKENS,
        do_sample=False,
        num_return_sequences=num_candidates,
        pad_token_id=tokenizer.eos_token_id
    )
    
    generated_texts = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    candidates = []
    
    for i in range(0, len(generated_texts), num_candidates):
        batch_candidates = []
        for j in range(num_candidates):
            if i + j < len(generated_texts):
                text = generated_texts[i + j]
                translation = text.split("Arabic translation:")[-1].strip() if "Arabic translation:" in text else text.strip()
                batch_candidates.append({
                    'translation': translation,
                    'method': 'greedy_decoding',
                    'config': {'do_sample': False}
                })
        candidates.append(batch_candidates)
    return candidates


GENERATION_METHODS = {
    'temperature_sampling': generate_with_temperature,
    'top_k_sampling': generate_with_topk,
    'nucleus_sampling': generate_with_nucleus,
    'greedy_decoding': generate_with_greedy
}

print("\nGeneration methods configured:")
print("  1. Temperature Sampling (high randomness)")
print("  2. Top-K Sampling (conservative)")
print("  3. Nucleus Sampling (balanced)")
print("  4. Greedy Decoding (deterministic)")



Generation methods configured:
  1. Temperature Sampling (high randomness)
  2. Top-K Sampling (conservative)
  3. Nucleus Sampling (balanced)
  4. Greedy Decoding (deterministic)


## Alternative: vLLM for Better Multi-GPU Utilization (Optional)

**Note on GPU usage:**
- Current setup: Model layers distributed across 2 GPUs (tensor parallelism)
- vLLM: Optimized tensor parallelism + better batching = 3-10x faster
- Install: `pip install vllm`

**Before enabling vLLM, verify your system:**
- CUDA version: 11.8 or higher required
- GPU memory: Will be cleared before vLLM loads
- Driver: Latest NVIDIA drivers recommended

**To activate vLLM:**
1. Make sure transformers model is loaded (run cell 5 if needed)
2. Run the vLLM configuration cell below (it will unload transformers automatically)
3. If it fails, it will automatically fall back to transformers


In [21]:
# ===========================
# OPTIONAL: vLLM FOR OPTIMIZED MULTI-GPU INFERENCE
# ===========================
# vLLM provides better GPU utilization than standard transformers
# IMPORTANT: This model may not be fully compatible with vLLM
# Transformers will work perfectly fine - vLLM is just an optimization

USE_VLLM = True  # RECOMMENDED: Keep False for this model
VLLM_ACTIVE = False

if USE_VLLM:
    try:
        # Set environment variables BEFORE importing vLLM
        import os
        os.environ['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn'
        os.environ['TOKENIZERS_PARALLELISM'] = 'false'
        os.environ['VLLM_LOGGING_LEVEL'] = 'DEBUG'  # More verbose for debugging
        
        from vllm import LLM, SamplingParams
        
        print("=" * 60)
        print("ATTEMPTING vLLM INITIALIZATION")
        print("=" * 60)
        print("Note: This model may have compatibility issues with vLLM")
        print("If it fails, transformers will work perfectly fine!")
        print()
        
        # Unload transformers model
        if 'model' in globals():
            del model
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
                gc.collect()
            print("‚úì Transformers model unloaded")
        
        # Clear CUDA contexts
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            for i in range(torch.cuda.device_count()):
                torch.cuda.reset_peak_memory_stats(i)
                torch.cuda.reset_accumulated_memory_stats(i)
            print("‚úì CUDA contexts cleared")
        
        print("\nInitializing vLLM (this takes 1-2 minutes)...")
        print("Using conservative settings for compatibility...")
        
        # Try with minimal settings first
        vllm_model = LLM(
            model="ModelSpace/GemmaX2-28-9B-v0.1",
            tensor_parallel_size=2,
            dtype="float16",  # Use float16 instead of bfloat16 for better compatibility
            max_model_len=256,  # Reduced from 512
            trust_remote_code=True,
            gpu_memory_utilization=0.80,  # Conservative 80%
            swap_space=8,  # More swap space
            max_num_seqs=64,  # Reduced batch capacity
            enforce_eager=True,  # Disable CUDA graphs for compatibility
            disable_log_stats=True,
        )
        
        print("\n‚úì vLLM model loaded!")
        
        # Define generation functions
        def generate_with_vllm_temperature(batch_sources, batch_langs, num_candidates=1):
            prompts = [format_translation_prompt(src, lang) 
                       for src, lang in zip(batch_sources, batch_langs)]
            sampling_params = SamplingParams(
                temperature=1.2, top_p=0.95, top_k=50,
                max_tokens=MAX_NEW_TOKENS, n=num_candidates,
                skip_special_tokens=True
            )
            outputs = vllm_model.generate(prompts, sampling_params)
            all_candidates = []
            for output in outputs:
                candidates = []
                for completion in output.outputs:
                    text = completion.text
                    translation = text.split("Arabic translation:")[-1].strip() if "Arabic translation:" in text else text.strip()
                    candidates.append({
                        'translation': translation,
                        'method': 'temperature_sampling',
                        'config': {'temperature': 1.2, 'top_p': 0.95, 'top_k': 50}
                    })
                all_candidates.append(candidates)
            return all_candidates
        
        def generate_with_vllm_topk(batch_sources, batch_langs, num_candidates=1):
            prompts = [format_translation_prompt(src, lang) 
                       for src, lang in zip(batch_sources, batch_langs)]
            sampling_params = SamplingParams(
                temperature=0.7, top_p=0.9, top_k=30,
                max_tokens=MAX_NEW_TOKENS, n=num_candidates,
                skip_special_tokens=True
            )
            outputs = vllm_model.generate(prompts, sampling_params)
            all_candidates = []
            for output in outputs:
                candidates = []
                for completion in output.outputs:
                    text = completion.text
                    translation = text.split("Arabic translation:")[-1].strip() if "Arabic translation:" in text else text.strip()
                    candidates.append({
                        'translation': translation,
                        'method': 'top_k_sampling',
                        'config': {'temperature': 0.7, 'top_k': 30, 'top_p': 0.9}
                    })
                all_candidates.append(candidates)
            return all_candidates
        
        def generate_with_vllm_nucleus(batch_sources, batch_langs, num_candidates=1):
            prompts = [format_translation_prompt(src, lang) 
                       for src, lang in zip(batch_sources, batch_langs)]
            sampling_params = SamplingParams(
                temperature=0.9, top_p=0.95, top_k=-1,
                max_tokens=MAX_NEW_TOKENS, n=num_candidates,
                skip_special_tokens=True
            )
            outputs = vllm_model.generate(prompts, sampling_params)
            all_candidates = []
            for output in outputs:
                candidates = []
                for completion in output.outputs:
                    text = completion.text
                    translation = text.split("Arabic translation:")[-1].strip() if "Arabic translation:" in text else text.strip()
                    candidates.append({
                        'translation': translation,
                        'method': 'nucleus_sampling',
                        'config': {'temperature': 0.9, 'top_p': 0.95}
                    })
                all_candidates.append(candidates)
            return all_candidates
        
        def generate_with_vllm_greedy(batch_sources, batch_langs, num_candidates=1):
            prompts = [format_translation_prompt(src, lang) 
                       for src, lang in zip(batch_sources, batch_langs)]
            sampling_params = SamplingParams(
                temperature=0.0, max_tokens=MAX_NEW_TOKENS,
                n=num_candidates, skip_special_tokens=True
            )
            outputs = vllm_model.generate(prompts, sampling_params)
            all_candidates = []
            for output in outputs:
                candidates = []
                for completion in output.outputs:
                    text = completion.text
                    translation = text.split("Arabic translation:")[-1].strip() if "Arabic translation:" in text else text.strip()
                    candidates.append({
                        'translation': translation,
                        'method': 'greedy_decoding',
                        'config': {'do_sample': False}
                    })
                all_candidates.append(candidates)
            return all_candidates
        
        GENERATION_METHODS = {
            'temperature_sampling': generate_with_vllm_temperature,
            'top_k_sampling': generate_with_vllm_topk,
            'nucleus_sampling': generate_with_vllm_nucleus,
            'greedy_decoding': generate_with_vllm_greedy
        }
        
        VLLM_ACTIVE = True
        print("\n" + "=" * 60)
        print("‚úì‚úì‚úì vLLM ACTIVATED SUCCESSFULLY! ‚úì‚úì‚úì")
        print("=" * 60)
        print("Speedup: 3-10x faster than transformers")
        print("Tensor parallelism: Enabled across both GPUs")
        print("=" * 60)
        
    except ImportError as e:
        print("\n" + "=" * 60)
        print("vLLM NOT INSTALLED")
        print("=" * 60)
        print(f"Error: {e}")
        print("\nInstall: pip install vllm")
        print("Using transformers instead (works great!)")
        print("=" * 60)
        VLLM_ACTIVE = False
        
    except Exception as e:
        print("\n" + "=" * 60)
        print("vLLM INITIALIZATION FAILED (THIS IS NORMAL)")
        print("=" * 60)
        error_msg = str(e)
        print(f"Error: {type(e).__name__}")
        if len(error_msg) < 500:
            print(f"Details: {error_msg}")
        
        print("\nüîç Likely causes:")
        print("  ‚Ä¢ This specific model isn't fully compatible with vLLM")
        print("  ‚Ä¢ Model architecture not supported by current vLLM version")
        print("  ‚Ä¢ Custom model code causes initialization issues")
        
        print("\n‚úÖ SOLUTION: Use transformers (recommended for this model)")
        print("  ‚Ä¢ Already loaded and working perfectly")
        print("  ‚Ä¢ Fully supports this model architecture")
        print("  ‚Ä¢ Utilizes both GPUs via model parallelism")
        print("  ‚Ä¢ Proven stable for long-running generation tasks")
        
        print("\nüí° Performance tip:")
        print("  ‚Ä¢ Your dual RTX 5090 setup with transformers is excellent")
        print("  ‚Ä¢ Batch size of 16 already optimized for your hardware")
        print("  ‚Ä¢ Generation will complete efficiently")
        print("=" * 60)
        VLLM_ACTIVE = False
        
        if 'model' not in globals():
            print("\n‚ö†Ô∏è  Re-run cell 5 to load transformers model")
else:
    print("=" * 60)
    print("USING TRANSFORMERS (RECOMMENDED)")
    print("=" * 60)
    print("‚úì Stable and fully compatible with this model")
    print("‚úì Model parallelism across both RTX 5090 GPUs")
    print("‚úì Optimized with 8-bit quantization")
    print("‚úì Large batch size (16) for efficiency")
    print()
    print("Note: vLLM is disabled (USE_VLLM=False)")
    print("  This model has known compatibility issues with vLLM")
    print("  Transformers is the recommended approach here")
    print("=" * 60)
    VLLM_ACTIVE = False


ATTEMPTING vLLM INITIALIZATION
Note: This model may have compatibility issues with vLLM
If it fails, transformers will work perfectly fine!

‚úì CUDA contexts cleared

Initializing vLLM (this takes 1-2 minutes)...
Using conservative settings for compatibility...
INFO 12-11 11:40:13 [utils.py:253] non-default args: {'trust_remote_code': True, 'dtype': 'float16', 'seed': None, 'max_model_len': 256, 'tensor_parallel_size': 2, 'swap_space': 8, 'gpu_memory_utilization': 0.8, 'max_num_seqs': 64, 'disable_log_stats': True, 'enforce_eager': True, 'model': 'ModelSpace/GemmaX2-28-9B-v0.1'}


The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.


INFO 12-11 11:40:14 [model.py:637] Resolved architecture: Gemma2ForCausalLM

vLLM INITIALIZATION FAILED (THIS IS NORMAL)
Error: ValidationError
Details: 1 validation error for ModelConfig
  Value error, The model type 'gemma2' does not support float16. Reason: Numerical instability. Please use bfloat16 or float32 instead. [type=value_error, input_value=ArgsKwargs((), {'model': ...rocessor_plugin': None}), input_type=ArgsKwargs]
    For further information visit https://errors.pydantic.dev/2.12/v/value_error

üîç Likely causes:
  ‚Ä¢ This specific model isn't fully compatible with vLLM
  ‚Ä¢ Model architecture not supported by current vLLM version
  ‚Ä¢ Custom model code causes initialization issues

‚úÖ SOLUTION: Use transformers (recommended for this model)
  ‚Ä¢ Already loaded and working perfectly
  ‚Ä¢ Fully supports this model architecture
  ‚Ä¢ Utilizes both GPUs via model parallelism
  ‚Ä¢ Proven stable for long-running generation tasks

üí° Performance tip:
  ‚Ä¢ Your dual RT

In [16]:
# Verify GPU utilization before generation
if torch.cuda.is_available() and not FORCE_CPU:
    print("\nCurrent GPU Status:")
    print("=" * 60)
    for i in range(NUM_GPUS):
        props = torch.cuda.get_device_properties(i)
        allocated = torch.cuda.memory_allocated(i) / 1e9
        reserved = torch.cuda.memory_reserved(i) / 1e9
        total = props.total_memory / 1e9
        free = total - reserved
        
        print(f"GPU {i}: {props.name}")
        print(f"  Total: {total:.1f}GB")
        print(f"  Allocated: {allocated:.1f}GB ({100*allocated/total:.1f}%)")
        print(f"  Reserved: {reserved:.1f}GB ({100*reserved/total:.1f}%)")
        print(f"  Free: {free:.1f}GB ({100*free/total:.1f}%)")
    print("=" * 60)
    
    # Check engine and model distribution
    if VLLM_ACTIVE:
        print("\nInference Engine: vLLM (Optimized)")
        print("  Tensor parallelism across both GPUs")
        print("  Continuous batching enabled")
        print("  PagedAttention for memory efficiency")
    elif 'model' in globals() and hasattr(model, 'hf_device_map'):
        print("\nInference Engine: Transformers (Standard)")
        gpu0_params = sum(1 for device in model.hf_device_map.values() if str(device) == '0')
        gpu1_params = sum(1 for device in model.hf_device_map.values() if str(device) == '1')
        total_mapped = len(model.hf_device_map)
        
        print(f"  GPU 0: {gpu0_params}/{total_mapped} components ({100*gpu0_params/total_mapped:.1f}%)")
        print(f"  GPU 1: {gpu1_params}/{total_mapped} components ({100*gpu1_params/total_mapped:.1f}%)")
        
        if gpu0_params > 0 and gpu1_params > 0:
            print("  Status: Model split across both GPUs")
        elif gpu0_params > 0 or gpu1_params > 0:
            print("  Warning: Model on single GPU only")
        else:
            print("  Warning: Model distribution unclear")
    print("=" * 60)



Current GPU Status:
GPU 0: NVIDIA GeForce RTX 5090
  Total: 33.7GB
  Allocated: 4.8GB (14.3%)
  Reserved: 5.1GB (15.1%)
  Free: 28.6GB (84.9%)
GPU 1: NVIDIA GeForce RTX 5090
  Total: 33.7GB
  Allocated: 6.5GB (19.4%)
  Reserved: 6.9GB (20.4%)
  Free: 26.8GB (79.6%)


## Score Translations

In [10]:
# ===========================
# TRANSLATION QUALITY SCORING
# ===========================

def score_batch_candidates(sources: list, candidates_list: list) -> list:
    """Score translation candidates using heuristics.
    
    Metrics: length ratio, punctuation presence, non-empty validation
    Returns: List of scored candidates with 'score' and 'method' keys
    """
    scored_batch = []
    
    for src, candidates in zip(sources, candidates_list):
        src_len = len(src.split())
        src_punct = sum(1 for c in src if c in '.!?,;:')
        
        scored_candidates = []
        for cand in candidates:
            tgt = cand['translation']
            method = cand.get('method', 'unknown')
            
            # Length ratio score
            tgt_len = len(tgt.split())
            if src_len > 0:
                length_ratio = min(tgt_len, src_len) / max(tgt_len, src_len)
            else:
                length_ratio = 0.5 if tgt_len == 0 else 0.0
            
            # Punctuation presence score
            tgt_punct = sum(1 for c in tgt if c in '.!?,;:')
            punct_score = 1.0 if (src_punct > 0 and tgt_punct > 0) or (src_punct == 0 and tgt_punct == 0) else 0.7
            
            # Non-empty score
            non_empty_score = 1.0 if len(tgt.strip()) > 0 and '[ERROR]' not in tgt else 0.0
            
            # Combined score
            quality_score = (length_ratio * 0.5 + punct_score * 0.3 + non_empty_score * 0.2)
            quality_score = max(0.0, min(1.0, quality_score))
            
            scored_candidates.append({
                'translation': tgt,
                'method': method,
                'score': quality_score
            })
        
        scored_batch.append(scored_candidates)
    
    return scored_batch


def create_preference_pairs(scored_candidates: list) -> list:
    """Create preference pairs from scored candidates.
    Pairs highest-scored with lowest-scored translations.
    """
    pairs = []
    
    if len(scored_candidates) < 2:
        return pairs
    
    sorted_cands = sorted(scored_candidates, key=lambda x: x['score'], reverse=True)
    
    if len(sorted_cands) >= 2:
        chosen = sorted_cands[0]
        rejected = sorted_cands[-1]
        score_margin = chosen['score'] - rejected['score']
        
        if score_margin > 0.01:
            pairs.append({
                'chosen': chosen['translation'],
                'rejected': rejected['translation'],
                'chosen_score': chosen['score'],
                'rejected_score': rejected['score'],
                'score_margin': score_margin,
                'chosen_method': chosen['method'],
                'rejected_method': rejected['method']
            })
    
    return pairs

print("Scoring functions configured (heuristic-based)")


Scoring functions configured (heuristic-based)


## Generate Synthetic Preference Dataset

In [11]:
# Test scoring functions
print("Testing scoring and preference pair functions...\n")

test_sources = ["Hello world", "This is a test"]
test_candidates = [
    [
        {'translation': 'Hello world', 'method': 'greedy_decoding'},
        {'translation': 'Hi there', 'method': 'temperature_sampling'},
        {'translation': '[ERROR]', 'method': 'nucleus_sampling'}
    ],
    [
        {'translation': 'This is a test', 'method': 'top_k_sampling'},
        {'translation': 'This is testing', 'method': 'greedy_decoding'},
    ]
]

scored_batch = score_batch_candidates(test_sources, test_candidates)

print("Scored candidates:")
for i, scored_list in enumerate(scored_batch):
    print(f"  Source {i+1}:")
    for s in scored_list:
        print(f"    '{s['translation'][:30]}' ({s['method']}): {s['score']:.3f}")

pairs = create_preference_pairs(scored_batch[0])
print("\nPreference pairs created:")
for pair in pairs:
    print(f"  Chosen: '{pair['chosen'][:30]}' ({pair['chosen_score']:.3f})")
    print(f"  Rejected: '{pair['rejected'][:30]}' ({pair['rejected_score']:.3f})")
    print(f"  Margin: {pair['score_margin']:.3f}")

print("\nFunctions working correctly!")


Testing scoring and preference pair functions...

Scored candidates:
  Source 1:
    'Hello world' (greedy_decoding): 1.000
    'Hi there' (temperature_sampling): 1.000
    '[ERROR]' (nucleus_sampling): 0.550
  Source 2:
    'This is a test' (top_k_sampling): 1.000
    'This is testing' (greedy_decoding): 0.875

Preference pairs created:
  Chosen: 'Hello world' (1.000)
  Rejected: '[ERROR]' (0.550)
  Margin: 0.450

Functions working correctly!


In [None]:
# ===========================
# MAIN GENERATION AND SCORING LOOP
# ===========================

print("\n" + "=" * 80)
print("MULTI-METHOD SYNTHETIC DATA GENERATION")
print(f"Inference Engine: {'vLLM (Optimized)' if VLLM_ACTIVE else 'Transformers (Standard)'}")
print(f"Scoring: {'COMET-based' if USE_COMET else 'Heuristic-based'}")
print("=" * 80)

print(f"\nConfiguration:")
print(f"  Total samples: {len(training_samples):,}")
print(f"  Batch size: {MEGA_BATCH_SIZE}")
print(f"  Generation methods: 4")
print(f"  GPUs: {NUM_GPUS}")
print(f"  Checkpoint frequency: Every 1000 sentences")

if VLLM_ACTIVE:
    print(f"\nvLLM Optimizations:")
    print(f"  Tensor parallelism: Enabled")
    print(f"  Continuous batching: Enabled")
    print(f"  Expected speedup: 3-10x vs transformers")

en_count = sum(1 for s in training_samples if s['source_lang'] == 'en')
fr_count = sum(1 for s in training_samples if s['source_lang'] == 'fr')
print(f"\nLanguage distribution:")
print(f"  English to Arabic: {en_count:,} ({100*en_count/len(training_samples):.1f}%)")
print(f"  French to Arabic: {fr_count:,} ({100*fr_count/len(training_samples):.1f}%)")
print("=" * 80)

# Checkpoint configuration
CHECKPOINT_FREQUENCY = 1000  # Save every 1000 sentences
checkpoint_dir = DATA_DIR / "checkpoints"
checkpoint_dir.mkdir(exist_ok=True)

method_stats = {method_name: {'count': 0, 'avg_score': 0, 'scores': []} for method_name in GENERATION_METHODS.keys()}
synthetic_dataset = []
combined_candidates_path = DATA_DIR / "generated_candidates_all_methods.jsonl"

start_time = time.time()
samples_processed = 0
errors_count = 0
quality_scores_collected = []
en_pairs_count = 0
fr_pairs_count = 0
last_checkpoint = 0

num_batches = (len(training_samples) + MEGA_BATCH_SIZE - 1) // MEGA_BATCH_SIZE

with open(combined_candidates_path, 'w', encoding='utf-8') as combined_f:
    for batch_idx in tqdm(range(num_batches), desc="Processing batches"):
        start_idx = batch_idx * MEGA_BATCH_SIZE
        end_idx = min(start_idx + MEGA_BATCH_SIZE, len(training_samples))
        batch_samples = training_samples[start_idx:end_idx]
        
        batch_sources = [s['source'] for s in batch_samples]
        batch_langs = [s['source_lang'] for s in batch_samples]
        
        try:
            # Generate with all methods (vLLM or transformers based on VLLM_ACTIVE flag)
            all_method_candidates = {}
            for method_name, method_func in GENERATION_METHODS.items():
                try:
                    candidates = method_func(batch_sources, batch_langs, num_candidates=NUM_CANDIDATES)
                    all_method_candidates[method_name] = candidates
                except Exception as e:
                    print(f"\nError in method {method_name}: {e}")
                    all_method_candidates[method_name] = [[{'translation': '[ERROR]', 'method': method_name, 'config': {}}] for _ in batch_samples]
            
            # Combine candidates
            combined_candidates = []
            for src_idx in range(len(batch_samples)):
                src_all_candidates = []
                for method_name in GENERATION_METHODS.keys():
                    if src_idx < len(all_method_candidates[method_name]):
                        src_all_candidates.extend(all_method_candidates[method_name][src_idx])
                combined_candidates.append(src_all_candidates)
            
            # Score candidates
            scored_batch = score_batch_candidates(batch_sources, combined_candidates)
            
            # Process each sample
            for sample, candidates, scored in zip(batch_samples, combined_candidates, scored_batch):
                combined_record = {
                    'source': sample['source'],
                    'source_lang': sample['source_lang'],
                    'candidates_by_method': {}
                }
                
                for method_name in GENERATION_METHODS.keys():
                    method_cands = [c for c in candidates if c.get('method') == method_name]
                    combined_record['candidates_by_method'][method_name] = [c['translation'] for c in method_cands]
                    method_stats[method_name]['count'] += 1
                
                combined_f.write(json.dumps(combined_record, ensure_ascii=False) + '\n')
                
                # Collect scores
                for s in scored:
                    quality_scores_collected.append(s['score'])
                    method = s.get('method', 'unknown')
                    if method in method_stats:
                        method_stats[method]['scores'].append(s['score'])
                
                # Create preference pairs
                pairs = create_preference_pairs(scored)
                for pair in pairs:
                    pair_record = {
                        'source': sample['source'],
                        'source_lang': sample['source_lang'],
                        'chosen': pair['chosen'],
                        'rejected': pair['rejected'],
                        'chosen_score': pair['chosen_score'],
                        'rejected_score': pair['rejected_score'],
                        'margin': pair['score_margin'],
                        'chosen_method': pair['chosen_method'],
                        'rejected_method': pair['rejected_method']
                    }
                    synthetic_dataset.append(pair_record)
                    
                    if sample['source_lang'] == 'en':
                        en_pairs_count += 1
                    elif sample['source_lang'] == 'fr':
                        fr_pairs_count += 1
            
            samples_processed += len(batch_samples)
            
            # Save checkpoint every 1000 sentences
            if samples_processed - last_checkpoint >= CHECKPOINT_FREQUENCY:
                checkpoint_path = checkpoint_dir / f"synthetic_preferences_checkpoint_{samples_processed}.jsonl"
                with open(checkpoint_path, 'w', encoding='utf-8') as f:
                    for item in synthetic_dataset:
                        f.write(json.dumps(item, ensure_ascii=False) + '\n')
                
                # Save checkpoint stats
                checkpoint_stats_path = checkpoint_dir / f"stats_checkpoint_{samples_processed}.json"
                with open(checkpoint_stats_path, 'w') as f:
                    json.dump({
                        'samples_processed': samples_processed,
                        'total_pairs': len(synthetic_dataset),
                        'en_pairs': en_pairs_count,
                        'fr_pairs': fr_pairs_count,
                        'timestamp': time.time() - start_time,
                        'engine': 'vllm' if VLLM_ACTIVE else 'transformers'
                    }, f, indent=2)
                
                print(f"\n  Checkpoint saved: {len(synthetic_dataset):,} pairs ({samples_processed:,} samples)")
                last_checkpoint = samples_processed
            
            # Clear GPU cache periodically
            if batch_idx % 50 == 0 and torch.cuda.is_available():
                torch.cuda.empty_cache()
                
        except Exception as e:
            errors_count += 1
            if errors_count <= 5:
                print(f"\nError in batch {batch_idx}: {e}")
            continue
        
        # Progress update every 50 batches
        if (batch_idx + 1) % 50 == 0:
            elapsed = time.time() - start_time
            rate = samples_processed / elapsed if elapsed > 0 else 0
            remaining = (len(training_samples) - samples_processed) / rate if rate > 0 else 0
            
            if quality_scores_collected:
                avg_score = sum(quality_scores_collected) / len(quality_scores_collected)
            else:
                avg_score = 0
            
            print(f"\nProgress (Batch {batch_idx + 1}/{num_batches}):")
            print(f"  Engine: {'vLLM' if VLLM_ACTIVE else 'Transformers'}")
            print(f"  Samples: {samples_processed:,}/{len(training_samples):,} ({100*samples_processed/len(training_samples):.1f}%)")
            print(f"  Preference pairs: {len(synthetic_dataset):,}")
            print(f"    EN->AR: {en_pairs_count:,}, FR->AR: {fr_pairs_count:,}")
            print(f"  Avg quality score: {avg_score:.4f}")
            print(f"  Rate: {rate:.1f} samples/sec")
            print(f"  ETA: {remaining/3600:.2f} hours")
            print(f"  Errors: {errors_count}")
            print(f"  Last checkpoint: {last_checkpoint:,} samples")
        
        combined_f.flush()

# Save final checkpoint
final_checkpoint_path = checkpoint_dir / f"synthetic_preferences_final_{samples_processed}.jsonl"
with open(final_checkpoint_path, 'w', encoding='utf-8') as f:
    for item in synthetic_dataset:
        f.write(json.dumps(item, ensure_ascii=False) + '\n')
print(f"\nFinal checkpoint saved: {final_checkpoint_path.name}")

# Finalize statistics
for method_name in GENERATION_METHODS.keys():
    if method_stats[method_name]['scores']:
        method_stats[method_name]['avg_score'] = sum(method_stats[method_name]['scores']) / len(method_stats[method_name]['scores'])

total_time = time.time() - start_time

print("\n" + "=" * 80)
print("GENERATION COMPLETE")
print("=" * 80)
print(f"  Engine: {'vLLM' if VLLM_ACTIVE else 'Transformers'}")
print(f"  Samples processed: {samples_processed:,}")
print(f"  Preference pairs: {len(synthetic_dataset):,}")
print(f"    English to Arabic: {en_pairs_count:,}")
print(f"    French to Arabic: {fr_pairs_count:,}")
print(f"  Total time: {total_time/3600:.2f} hours")
print(f"  Average rate: {samples_processed/total_time:.1f} samples/sec")
print(f"  Errors: {errors_count}")
print(f"  Checkpoints saved: {checkpoint_dir.name}/")

if quality_scores_collected:
    overall_avg = sum(quality_scores_collected) / len(quality_scores_collected)
    print(f"\nOverall avg quality score: {overall_avg:.4f}")

print("\nMethod statistics:")
for method_name in GENERATION_METHODS.keys():
    print(f"  {method_name}: avg={method_stats[method_name]['avg_score']:.4f}")



MULTI-METHOD SYNTHETIC DATA GENERATION
Scoring: Heuristic-based

Configuration:
  Total samples: 20,000
  Batch size: 16
  Generation methods: 4
  GPUs: 2
  Checkpoint frequency: Every 1000 sentences

Language distribution:
  English to Arabic: 10,000 (50.0%)
  French to Arabic: 10,000 (50.0%)


Processing batches:   0%|          | 0/1250 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Processing batches:   0%|          | 0/1250 [00:15<?, ?it/s]


KeyboardInterrupt: 

## Save Synthetic Dataset

In [None]:
# Save synthetic dataset
print(f"Saving dataset to {SYNTHETIC_PREFERENCES}...\n")

with open(SYNTHETIC_PREFERENCES, 'w', encoding='utf-8') as f:
    for item in synthetic_dataset:
        f.write(json.dumps(item, ensure_ascii=False) + '\n')

print(f"Saved {len(synthetic_dataset)} preference pairs")

# Calculate statistics by language
en_pairs = [item for item in synthetic_dataset if item['source_lang'] == 'en']
fr_pairs = [item for item in synthetic_dataset if item['source_lang'] == 'fr']

# Analyze which methods produced best pairs
method_pair_stats = {}
for method_name in GENERATION_METHODS.keys():
    chosen_count = sum(1 for item in synthetic_dataset if item['chosen_method'] == method_name)
    rejected_count = sum(1 for item in synthetic_dataset if item['rejected_method'] == method_name)
    method_pair_stats[method_name] = {
        'chosen_count': chosen_count,
        'rejected_count': rejected_count
    }

# Prepare statistics
stats = {
    'total_pairs': len(synthetic_dataset),
    'en_pairs': len(en_pairs),
    'fr_pairs': len(fr_pairs),
    'avg_margin': sum(item['margin'] for item in synthetic_dataset) / len(synthetic_dataset) if synthetic_dataset else 0,
    'avg_chosen_score': sum(item['chosen_score'] for item in synthetic_dataset) / len(synthetic_dataset) if synthetic_dataset else 0,
    'avg_rejected_score': sum(item['rejected_score'] for item in synthetic_dataset) / len(synthetic_dataset) if synthetic_dataset else 0,
    'num_sources': len(training_samples),
    'language_breakdown': {
        'english': {
            'pairs': len(en_pairs),
            'avg_margin': sum(item['margin'] for item in en_pairs) / len(en_pairs) if en_pairs else 0,
            'avg_chosen_score': sum(item['chosen_score'] for item in en_pairs) / len(en_pairs) if en_pairs else 0,
        },
        'french': {
            'pairs': len(fr_pairs),
            'avg_margin': sum(item['margin'] for item in fr_pairs) / len(fr_pairs) if fr_pairs else 0,
            'avg_chosen_score': sum(item['chosen_score'] for item in fr_pairs) / len(fr_pairs) if fr_pairs else 0,
        }
    },
    'method_breakdown': {
        method_name: {
            'pairs_as_chosen': method_pair_stats[method_name]['chosen_count'],
            'pairs_as_rejected': method_pair_stats[method_name]['rejected_count'],
            'avg_score': method_stats[method_name]['avg_score'],
            'total_candidates': method_stats[method_name]['count']
        }
        for method_name in GENERATION_METHODS.keys()
    },
    'data_source': 'full' if not USE_SAMPLES else 'samples',
    'scoring_method': 'comet' if USE_COMET else 'heuristic'
}

stats_path = DATA_DIR / "synthetic_data_stats.json"
with open(stats_path, 'w') as f:
    json.dump(stats, f, indent=2)

# Print summary
print("\nDataset Statistics:")
print(f"  Total pairs: {stats['total_pairs']:,}")
print(f"  Average margin: {stats['avg_margin']:.4f}")
print(f"  Average chosen score: {stats['avg_chosen_score']:.4f}")
print(f"  Number of sources: {stats['num_sources']:,}")

print(f"\nLanguage Breakdown:")
print(f"  English to Arabic: {stats['language_breakdown']['english']['pairs']:,} pairs")
print(f"    Avg margin: {stats['language_breakdown']['english']['avg_margin']:.4f}")
print(f"    Avg score: {stats['language_breakdown']['english']['avg_chosen_score']:.4f}")

print(f"  French to Arabic: {stats['language_breakdown']['french']['pairs']:,} pairs")
print(f"    Avg margin: {stats['language_breakdown']['french']['avg_margin']:.4f}")
print(f"    Avg score: {stats['language_breakdown']['french']['avg_chosen_score']:.4f}")

print(f"\nMethod Breakdown:")
for method_name, info in stats['method_breakdown'].items():
    print(f"  {method_name}:")
    print(f"    Chosen: {info['pairs_as_chosen']:,}")
    print(f"    Rejected: {info['pairs_as_rejected']:,}")
    print(f"    Avg score: {info['avg_score']:.4f}")

print(f"\nGenerated files:")
print(f"  - generated_candidates_all_methods.jsonl")
print(f"  - {SYNTHETIC_PREFERENCES.name}")
print(f"  - {stats_path.name}")

print(f"\nStatistics saved to {stats_path}")


Saving dataset to data/synthetic_preferences.jsonl...

Saved 0 preference pairs

Dataset Statistics:
  Total pairs: 0
  Average margin: 0.0000
  Average chosen score: 0.0000
  Number of sources: 20,000

Language Breakdown:
  English to Arabic: 0 pairs
    Avg margin: 0.0000
    Avg score: 0.0000
  French to Arabic: 0 pairs
    Avg margin: 0.0000
    Avg score: 0.0000

Method Breakdown:
  temperature_sampling:
    Chosen: 0
    Rejected: 0
    Avg score: 0.2468
  top_k_sampling:
    Chosen: 0
    Rejected: 0
    Avg score: 0.2468
  nucleus_sampling:
    Chosen: 0
    Rejected: 0
    Avg score: 0.2468
  greedy_decoding:
    Chosen: 0
    Rejected: 0
    Avg score: 0.2468

Generated files:
  - generated_candidates_all_methods.jsonl
  - synthetic_preferences.jsonl
  - synthetic_data_stats.json

Statistics saved to data/synthetic_data_stats.json


## Sample Preference Pairs

In [None]:
# Display sample preference pairs
print("Sample Preference Pairs\n")
print("=" * 80)

en_examples = [item for item in synthetic_dataset if item['source_lang'] == 'en']
fr_examples = [item for item in synthetic_dataset if item['source_lang'] == 'fr']

examples_to_show = []
if en_examples:
    examples_to_show.append(('English to Arabic', random.sample(en_examples, min(1, len(en_examples)))[0]))
if fr_examples:
    examples_to_show.append(('French to Arabic', random.sample(fr_examples, min(1, len(fr_examples)))[0]))

if len(examples_to_show) < 3:
    remaining_count = 3 - len(examples_to_show)
    all_remaining = [item for item in synthetic_dataset if item not in [ex[1] for ex in examples_to_show]]
    for item in random.sample(all_remaining, min(remaining_count, len(all_remaining))):
        lang_label = 'English to Arabic' if item['source_lang'] == 'en' else 'French to Arabic'
        examples_to_show.append((lang_label, item))

for i, (lang_label, item) in enumerate(examples_to_show, 1):
    print(f"\nExample {i}: {lang_label}")
    print(f"Source: {item['source'][:150]}")
    print(f"\nChosen (score: {item['chosen_score']:.3f}, {item['chosen_method']}): {item['chosen'][:150]}")
    print(f"\nRejected (score: {item['rejected_score']:.3f}, {item['rejected_method']}): {item['rejected'][:150]}")
    print(f"Margin: {item['margin']:.3f}")
    print("=" * 80)

print("\nMethod Performance Summary:")
print("=" * 80)
for method_name in GENERATION_METHODS.keys():
    chosen = sum(1 for item in synthetic_dataset if item['chosen_method'] == method_name)
    rejected = sum(1 for item in synthetic_dataset if item['rejected_method'] == method_name)
    total = chosen + rejected
    if total > 0:
        chosen_pct = 100 * chosen / total
        print(f"{method_name}:")
        print(f"  Chosen: {chosen:,} ({chosen_pct:.1f}%)")
        print(f"  Rejected: {rejected:,} ({100-chosen_pct:.1f}%)")
        print(f"  Total: {total:,}")
print("=" * 80)


Sample Preference Pairs


Method Performance Summary:


## Next Step

Proceed to **notebook 2** to train the reward model using this synthetic preference data.

## Quick Process: Convert Partial Data to Preferences

Run this cell to process the candidates you already generated (without re-running generation).