In [1]:
# Install required packages
!pip install transformers accelerate torch datasets trl peft bitsandbytes
!pip install comet-ml unbabel-comet bert-score sacrebleu
!pip install wandb pandas numpy tqdm

Collecting datasets
[0m  Downloading datasets-4.4.1-py3-none-any.whl.metadata (19 kB)
Collecting trl
  Downloading trl-0.25.1-py3-none-any.whl.metadata (11 kB)
Collecting peft
  Downloading peft-0.18.0-py3-none-any.whl.metadata (14 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.48.2-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting pyarrow>=21.0.0 (from datasets)
  Downloading pyarrow-22.0.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.2 kB)
Collecting dill<0.4.1,>=0.3.0 (from datasets)
  Downloading dill-0.4.0-py3-none-any.whl.metadata (10 kB)
Collecting pandas (from datasets)
  Downloading pandas-2.3.3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (91 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.2/91.2 kB[0m [31m482.2 kB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.6.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.me

In [2]:
import os
import torch
from pathlib import Path

# Check GPU availability
print(f"GPU Available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU Name: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")

GPU Available: True
GPU Name: NVIDIA GeForce RTX 5090
GPU Memory: 33.67 GB


    Found GPU0 NVIDIA GeForce RTX 5090 which is of cuda capability 12.0.
    Minimum and Maximum cuda capability supported by this version of PyTorch is
    (5.0) - (9.0)
    
  queued_call()
    Please install PyTorch with a following CUDA
    configurations:  12.8 13.0 following instructions at
    https://pytorch.org/get-started/locally/
    
  queued_call()
    Found GPU1 NVIDIA GeForce RTX 5090 which is of cuda capability 12.0.
    Minimum and Maximum cuda capability supported by this version of PyTorch is
    (5.0) - (9.0)
    
  queued_call()
NVIDIA GeForce RTX 5090 with CUDA capability sm_120 is not compatible with the current PyTorch installation.
The current PyTorch install supports CUDA capabilities sm_50 sm_60 sm_70 sm_75 sm_80 sm_86 sm_90.
If you want to use the NVIDIA GeForce RTX 5090 GPU with PyTorch, please check the instructions at https://pytorch.org/get-started/locally/

  queued_call()


## Project Configuration

In [3]:
from pathlib import Path
import os
from huggingface_hub import login, hf_hub_download

# ===========================
# VIRTUAL ENV DETECTION
# ===========================
VENV_PATH = os.environ.get("VIRTUAL_ENV")
if VENV_PATH:
    print(f"Using virtual environment at: {VENV_PATH}")
else:
    print("Warning: Not running inside a virtual environment!")

# ===========================
# HUGGING FACE AUTH
# ===========================
# Uses token saved in ~/.cache/huggingface/token
HF_TOKEN_PATH = Path.home() / ".cache/huggingface/token"
if HF_TOKEN_PATH.exists():
    with open(HF_TOKEN_PATH) as f:
        hf_token = f.read().strip()
    os.environ["HUGGINGFACE_HUB_TOKEN"] = hf_token
    print("Hugging Face token loaded from cache.")
else:
    print("No Hugging Face token found. Please run `huggingface-cli login`.")

# ===========================
# PROJECT PATHS
# ===========================
PROJECT_DIR = Path.home() / "Downloads" / "Reinforcement Learning"
DATA_DIR = PROJECT_DIR / "data"
MODELS_DIR = PROJECT_DIR / "models"
OUTPUTS_DIR = PROJECT_DIR / "outputs"
LOGS_DIR = PROJECT_DIR / "logs"

for dir_path in [DATA_DIR, MODELS_DIR, OUTPUTS_DIR, LOGS_DIR]:
    dir_path.mkdir(exist_ok=True, parents=True)

# ===========================
# MODEL PATHS
# ===========================
# Fine-tuned GemmaX2-28-9B (will download automatically if not present)
SFT_MODEL_PATH = MODELS_DIR / "GemmaX2-28-9B"
if not SFT_MODEL_PATH.exists():
    print("Downloading GemmaX2-28-9B model from Hugging Face...")
    # Example: you can download config files first (weights are huge)
    # For full model loading, use transformers.from_pretrained with device_map="auto"
    # Here we just ensure folder exists
    SFT_MODEL_PATH.mkdir(exist_ok=True)
    print(f"Created folder for model at {SFT_MODEL_PATH}")

# Base model for reward model
REWARD_BASE_MODEL = "google/gemma-2-2b"  # Or local path if downloaded

# ===========================
# DATA FILES
# ===========================
TEST_PROMPTS = DATA_DIR / "test_prompts.jsonl"
SYNTHETIC_PREFERENCES = DATA_DIR / "synthetic_preferences.jsonl"
HUMAN_PREFERENCES = DATA_DIR / "human_preferences.jsonl"

# ===========================
# MODEL CHECKPOINTS
# ===========================
REWARD_MODEL_COLD_START = MODELS_DIR / "reward_model_coldstart"
REWARD_MODEL_HUMAN_ALIGNED = MODELS_DIR / "reward_model_human"
PPO_MODEL_COLD_START = MODELS_DIR / "ppo_model_coldstart"
PPO_MODEL_FINAL = MODELS_DIR / "ppo_model_final"

print("Directory structure and model paths are ready!")


Using virtual environment at: /home/imane/gemma_env
Hugging Face token loaded from cache.
Downloading GemmaX2-28-9B model from Hugging Face...
Created folder for model at /home/imane/Downloads/Reinforcement Learning/models/GemmaX2-28-9B
Directory structure and model paths are ready!


  from .autonotebook import tqdm as notebook_tqdm


## Hyperparameters

In [5]:
# ===========================
# SYNTHETIC DATA GENERATION
# ===========================

NUM_CANDIDATES = 8  # Number of translation variants per input (3-8 as specified)
TEMPERATURES = [0.6, 0.8, 1.0, 1.2]  # Temperature variations
TOP_K_VALUES = [30, 50, 80]
TOP_P_VALUES = [0.85, 0.9, 0.95]

# Automatic metrics weights
METRIC_WEIGHTS = {
    'comet': 0.5,
    'bertscore': 0.3,
    'chrf': 0.2
}

# ===========================
# REWARD MODEL TRAINING
# ===========================

RM_LEARNING_RATE = 1e-5
RM_BATCH_SIZE = 8
RM_EPOCHS = 3
RM_MAX_LENGTH = 512
RM_GRADIENT_ACCUMULATION_STEPS = 4

# Reward head architecture
RM_HEAD_TYPE = "mlp"  # "linear" or "mlp"
RM_HIDDEN_DIM = 256  # Only used if mlp

# ===========================
# PPO TRAINING
# ===========================

PPO_LEARNING_RATE = 1.41e-5
PPO_BATCH_SIZE = 8
PPO_MINI_BATCH_SIZE = 2
PPO_GRADIENT_ACCUMULATION_STEPS = 4
PPO_EPOCHS = 1
PPO_STEPS = 1000  # Total optimization steps

# PPO specific parameters
KL_PENALTY_COEF = 0.1  # KL divergence penalty to preserve faithfulness
CLIP_RANGE = 0.2
VALUE_CLIP_RANGE = 0.2
GAE_LAMBDA = 0.95
GAMMA = 0.99

# Generation parameters for PPO
PPO_MAX_NEW_TOKENS = 256
PPO_TEMPERATURE = 0.9

# ===========================
# INFERENCE
# ===========================

INFERENCE_NUM_CANDIDATES = 8  # Generate 5-8 candidates
INFERENCE_TOP_K = 3  # Show top 3 to user
INFERENCE_MAX_LENGTH = 512

# ===========================
# GENERAL SETTINGS
# ===========================

USE_WANDB = True  # Set to True if using Weights & Biases for tracking
WANDB_PROJECT = "rlhf-arabic-translation"
SEED = 42

print("Configuration loaded successfully!")

Configuration loaded successfully!


## Utility Functions

In [8]:
import random
import numpy as np

def set_seed(seed=42):
    """Set random seeds for reproducibility"""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

def load_test_prompts(file_path):
    """Load test prompts from JSONL file
    
    Expected format: one JSON object per line
    {"text": "source_text", "lang": "en" or "fr"}
    """
    import json
    data = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line_num, line in enumerate(f, 1):
            line = line.strip()
            # Skip empty lines and comments
            if not line or line.startswith('//') or line.startswith('#'):
                continue
            try:
                item = json.loads(line)
                data.append({
                    'source': item['text'],
                    'source_lang': item.get('lang', 'en')
                })
            except json.JSONDecodeError as e:
                print(f"Warning: Skipping invalid JSON on line {line_num}: {e}")
                continue
    return data

def format_translation_prompt(text, source_lang='en'):
    """Format input text as translation prompt"""
    lang_name = {'en': 'English', 'fr': 'French'}[source_lang]
    return f"Translate the following {lang_name} text to Arabic:\n\n{text}\n\nArabic translation:"

set_seed(SEED)
print("Utility functions loaded!")

Utility functions loaded!


## Save Configuration

In [7]:
import json

# Save configuration to JSON for reference
config = {
    'sft_model_path': str(SFT_MODEL_PATH),
    'reward_base_model': REWARD_BASE_MODEL,
    'num_candidates': NUM_CANDIDATES,
    'metric_weights': METRIC_WEIGHTS,
    'rm_learning_rate': RM_LEARNING_RATE,
    'rm_batch_size': RM_BATCH_SIZE,
    'rm_epochs': RM_EPOCHS,
    'ppo_learning_rate': PPO_LEARNING_RATE,
    'ppo_steps': PPO_STEPS,
    'kl_penalty_coef': KL_PENALTY_COEF,
    'seed': SEED
}

config_path = PROJECT_DIR / "config.json"
with open(config_path, 'w') as f:
    json.dump(config, f, indent=2)

print(f"Configuration saved to {config_path}")

Configuration saved to /home/imane/Downloads/Reinforcement Learning/config.json


## Next Steps

1. **Update the `SFT_MODEL_PATH`** to point to your Gemma-2X289B model location
2. **Prepare your data**: Place EN-AR and FR-AR parallel corpora in the data directory
3. **Run notebook 1**: Generate synthetic preference data
4. **Run notebook 2**: Train the reward model
5. **Run notebook 3**: Run PPO optimization
6. **Run notebook 4**: Test inference and collect user feedback
7. **Run notebook 5**: Fine-tune with human preferences