# Translation Fine-tuning from Base Model

This notebook fine-tunes a standalone translation model with LoRA adapters, starting from the base Qwen3-8B checkpoint (`unsloth/Qwen3-8B-unsloth-bnb-4bit`) rather than from the proofreading checkpoint.

**Strategy:**
- Start from the base Qwen3-8B model (4-bit Unsloth build, `unsloth/Qwen3-8B-unsloth-bnb-4bit`)
- Train on translation data only
- Focus solely on gender-sensitive PL⇄EN translation
- This is a single-task model (translation only, no proofreading)

### Configuration

In [None]:
import os

# --- Model -------------------------------------------------------------------
# 4-bit Unsloth build of Qwen3-8B. The original notebook notes state that this
# is the same base checkpoint that was used for the proofreading run.
BASE_MODEL = "unsloth/Qwen3-8B-unsloth-bnb-4bit"
MODEL_SIZE = "8B"

# --- Sequence length ---------------------------------------------------------
MAX_SEQ_LENGTH = 4096

# --- LoRA adapter settings (noted as identical to the proofreading setup) ----
LORA_RANK = 64
LORA_ALPHA = 64
LORA_DROPOUT = 0

# --- Optimisation hyperparameters (noted as identical to proofreading) -------
LEARNING_RATE = 2e-4
EPOCHS = 2
BATCH_SIZE = 1
GRADIENT_ACCUMULATION_STEPS = 2  # BATCH_SIZE * this = effective batch size of 2
WARMUP_STEPS = 10
WEIGHT_DECAY = 0.01
SEED = 3407

# --- Decoding parameters used when generating with the trained model ---------
REPETITION_PENALTY = 1.1
NO_REPEAT_NGRAM_SIZE = 3

# --- Output location ---------------------------------------------------------
# The run directory name encodes the main hyperparameters so that different
# runs land in different folders; it is resolved to an absolute path.
_run_name = "_".join([
    f"qwen3_{MODEL_SIZE}",
    "translation_standalone_lora",
    f"r{LORA_RANK}",
    f"lr{LEARNING_RATE}",
    f"ep{EPOCHS}",
    f"bs{BATCH_SIZE}",
    f"ga{GRADIENT_ACCUMULATION_STEPS}",
    f"warmup{WARMUP_STEPS}",
    f"seq{MAX_SEQ_LENGTH}",
])
OUTPUT_DIR = os.path.abspath("../../../outputs/" + _run_name)

# --- Input data --------------------------------------------------------------
TRAIN_DATA_PATH = "../../../data/taskB/train.jsonl"

# --- Summary -----------------------------------------------------------------
for _label, _value in (
    ("Base model", BASE_MODEL),
    ("Output directory", OUTPUT_DIR),
    ("Training data", TRAIN_DATA_PATH),
):
    print(f"{_label}: {_value}")

print("\nKey settings:")
for _label, _value in (
    ("Learning rate", LEARNING_RATE),
    ("Epochs", EPOCHS),
    ("Effective batch size", BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS),
    ("Repetition penalty", REPETITION_PENALTY),
    ("No repeat ngram size", NO_REPEAT_NGRAM_SIZE),
):
    print(f"  - {_label}: {_value}")

### Setup Environment

In [None]:
# Point the Hugging Face and Triton caches at a project-local directory.
# This runs before the model-loading cell imports unsloth/torch.
import os
import warnings

os.environ.update({
    'HF_HOME': '/home/adam/Downloads/poleval-gender-new/.cache/huggingface',
    'TRANSFORMERS_CACHE': '/home/adam/Downloads/poleval-gender-new/.cache/huggingface/transformers',
    'HF_DATASETS_CACHE': '/home/adam/Downloads/poleval-gender-new/.cache/huggingface/datasets',
    'TRITON_CACHE_DIR': '/home/adam/Downloads/poleval-gender-new/.cache/triton',
})

# Silence all Python warnings for the rest of the session.
warnings.filterwarnings('ignore')

### Load Base Model

In [None]:
from unsloth import FastLanguageModel
import torch

# Announce which checkpoint is about to be loaded.
print(f"Loading base model: {BASE_MODEL}...")

# Loading options, gathered in one place for readability.
_load_options = dict(
    model_name=BASE_MODEL,
    max_seq_length=MAX_SEQ_LENGTH,
    dtype=None,          # None lets the loader pick the dtype automatically
    load_in_4bit=True,   # load 4-bit quantised weights
)
model, tokenizer = FastLanguageModel.from_pretrained(**_load_options)

print("Base model loaded!")
print(f"Model type: {type(model).__name__}")

### Add LoRA Adapters

In [None]:
# Wrap the loaded base model with trainable LoRA adapters.
# Rank, alpha and dropout come from the configuration cell; the adapters are
# attached to the attention projections (q/k/v/o) and the MLP projections
# (gate/up/down) named in `target_modules`.
model = FastLanguageModel.get_peft_model(
    model,
    r = LORA_RANK,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj"],
    lora_alpha = LORA_ALPHA,
    lora_dropout = LORA_DROPOUT,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    # Use the notebook-wide SEED from the configuration cell rather than a
    # separate hard-coded value, so the declared seed is actually honoured.
    random_state = SEED,
    use_rslora = False,
    loftq_config = None,
)

print("✓ LoRA adapters added!")
print(f"LoRA rank: {LORA_RANK}")
print(f"LoRA alpha: {LORA_ALPHA}")

### Load Translation Training Data

In [None]:
import json
from datasets import Dataset

def load_translation_data(file_path):
    """Load translation training examples from a JSON Lines file.

    Each non-blank line must be a JSON object. Only the six fields used by
    the rest of the notebook are kept; any other keys are dropped.

    Args:
        file_path: Path to a UTF-8 encoded ``.jsonl`` file.

    Returns:
        A list of dicts with the keys ``prompt``, ``source``, ``target``,
        ``prompt_language``, ``source_language`` and ``target_language``,
        in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks one of the required fields.
    """
    required_fields = (
        'prompt',
        'source',
        'target',
        'prompt_language',
        'source_language',
        'target_language',
    )
    data = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            # Blank or whitespace-only lines (e.g. a trailing empty line) are
            # not records; skipping them avoids a JSONDecodeError.
            if not line.strip():
                continue
            item = json.loads(line)
            data.append({field: item[field] for field in required_fields})
    return data

# Read the full training set into memory and preview its first record.
train_data = load_translation_data(TRAIN_DATA_PATH)
_first_example = train_data[0]

print(f"Loaded {len(train_data)} training examples")
print("\nFirst example:")
print(f"  Direction: {_first_example['source_language']} → {_first_example['target_language']}")
# Long text fields are cut to their first 80 characters for the preview.
for _label, _field in (("Prompt", "prompt"), ("Source", "source"), ("Target", "target")):
    print(f"  {_label}: {_first_example[_field][:80]}...")

### Load System Prompts

In [None]:
# Read the two translation system prompts (English and Polish variants).
def _read_stripped(path):
    """Return the UTF-8 text stored at *path* without surrounding whitespace."""
    with open(path, 'r', encoding='utf-8') as handle:
        return handle.read().strip()


SYSTEM_PROMPT_EN = _read_stripped('../../../system_prompts/translation/system_prompt_en_translation')
SYSTEM_PROMPT_PL = _read_stripped('../../../system_prompts/translation/system_prompt_pl_translation')

print("System prompts loaded.")
for _language, _prompt_text in (("English", SYSTEM_PROMPT_EN), ("Polish", SYSTEM_PROMPT_PL)):
    print(f"{_language} prompt: {len(_prompt_text)} chars")

### Prepare Dataset for Training

In [None]:
def format_translation_prompt(example):
    """Render one translation example as a single chat-formatted training string.

    The system prompt is the English one when ``example['prompt_language']``
    is ``'EN'`` and the Polish one otherwise. The user turn is the instruction
    prompt immediately followed by the source text, and the assistant turn is
    the reference translation.

    Args:
        example: Mapping with ``prompt``, ``source``, ``target`` and
            ``prompt_language`` entries.

    Returns:
        ``{"text": <rendered conversation>}``, produced by the tokenizer's
        chat template without a trailing generation prompt.
    """
    if example['prompt_language'] == 'EN':
        system_prompt = SYSTEM_PROMPT_EN
    else:
        system_prompt = SYSTEM_PROMPT_PL

    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": example['prompt'] + example['source']},
        {"role": "assistant", "content": example['target']},
    ]

    # tokenize=False keeps the result as plain text.
    rendered = tokenizer.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=False,
    )
    return {"text": rendered}

# Build a Hugging Face dataset from the raw records, then replace every
# original column with the single chat-formatted "text" column.
_raw_dataset = Dataset.from_list(train_data)
train_dataset = _raw_dataset.map(
    format_translation_prompt,
    remove_columns=_raw_dataset.column_names,
)

print(f"Formatted {len(train_dataset)} training examples")
print("\nExample formatted text (first 500 chars):")
_preview = train_dataset[0]['text'][:500]
print(_preview + "...")

### Setup Training Arguments

In [None]:
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Mixed-precision choice: bf16 when the hardware supports it, otherwise fp16.
# Evaluated once so the two flags are always mutually exclusive.
_use_bf16 = is_bfloat16_supported()

# NOTE(review): in the notebook as visible here, the trainer cell constructs its
# own UnslothTrainingArguments and never receives `training_args`, so this
# object only documents the intended settings. The two also differ in
# `report_to` ("none" here) - keep them in sync or pass this object on.
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    warmup_steps=WARMUP_STEPS,
    num_train_epochs=EPOCHS,
    learning_rate=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
    fp16=not _use_bf16,
    bf16=_use_bf16,
    logging_steps=10,
    optim="adamw_8bit",
    # Honour the SEED declared in the configuration cell instead of a separate
    # hard-coded value.
    seed=SEED,
    save_strategy="steps",
    save_steps=500,
    save_total_limit=3,
    load_best_model_at_end=False,
    report_to="none",
)

print("Training arguments configured:")
print(f"  - Effective batch size: {BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS}")
print(f"  - Learning rate: {LEARNING_RATE}")
print(f"  - Total epochs: {EPOCHS}")
# Rough estimate only: whole optimizer steps per epoch times the epoch count.
print(f"  - Estimated steps: {len(train_dataset) // (BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS) * EPOCHS}")

### Initialize Trainer

In [None]:
from unsloth import UnslothTrainer, UnslothTrainingArguments
from transformers import DataCollatorForLanguageModeling

# Build the trainer over the chat-formatted "text" column of the dataset.
trainer = UnslothTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    dataset_text_field="text",
    max_seq_length=MAX_SEQ_LENGTH,
    # mlm=False selects causal (next-token) language-modelling batches rather
    # than masked-LM ones.
    data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
    args=UnslothTrainingArguments(
        output_dir=OUTPUT_DIR,
        per_device_train_batch_size=BATCH_SIZE,
        gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
        warmup_steps=WARMUP_STEPS,
        num_train_epochs=EPOCHS,
        learning_rate=LEARNING_RATE,
        weight_decay=WEIGHT_DECAY,
        # bf16 when the hardware supports it, otherwise fp16.
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        logging_steps=10,
        optim="adamw_8bit",
        # Honour the SEED declared in the configuration cell instead of a
        # separate hard-coded value, so runs follow the configured seed.
        seed=SEED,
        # Write a checkpoint every 500 steps and keep only the newest three.
        save_strategy="steps",
        save_steps=500,
        save_total_limit=3,
        load_best_model_at_end=False,
        # NOTE(review): presumably requires the mlflow package and a reachable
        # tracking setup - confirm for the target environment.
        report_to="mlflow",
    ),
)

print("✓ Trainer initialized")

### Start Training

This will train a translation-only model from the base Qwen model.

In [None]:
# Print a banner describing the run, train, then print a completion banner.
_rule = "=" * 80

for _banner_line in (
    _rule,
    "STARTING TRANSLATION TRAINING (STANDALONE)",
    _rule,
    f"Base model: {BASE_MODEL}",
    "Task: Translation (PL⇄EN)",
    f"Strategy: Full training, {EPOCHS} epochs",
    _rule,
):
    print(_banner_line)
print()

# The returned object is kept so the statistics cell can read its metrics.
trainer_stats = trainer.train()

print("\n" + _rule)
print("TRAINING COMPLETE!")
print(_rule)

### Save Final Model

In [None]:
# Persist the trained model and tokenizer, plus the recommended decoding
# settings, under <OUTPUT_DIR>/final_model.
import json

final_model_path = os.path.join(OUTPUT_DIR, "final_model")
for _artifact in (model, tokenizer):
    _artifact.save_pretrained(final_model_path)

# Decoding parameters recommended for inference with this model.
generation_config = dict(
    repetition_penalty=REPETITION_PENALTY,
    no_repeat_ngram_size=NO_REPEAT_NGRAM_SIZE,
    temperature=0.3,
    top_p=0.9,
    top_k=50,
    max_new_tokens=4096,
)

_recommended_config_path = os.path.join(final_model_path, "generation_config_recommended.json")
with open(_recommended_config_path, 'w') as _config_file:
    json.dump(generation_config, _config_file, indent=2)

print(f"Model saved to: {final_model_path}")
print("Generation config saved")
print("\nThis model is specialized for:")
print("  - Gender-sensitive Polish ⇄ English translation")
print("\nRecommended inference parameters:")
print(f"  - repetition_penalty: {REPETITION_PENALTY}")
print(f"  - no_repeat_ngram_size: {NO_REPEAT_NGRAM_SIZE}")

### Training Statistics

In [None]:
# Summarise the run using the metrics attached to the trainer's result.
_metrics = trainer_stats.metrics
_rule = "=" * 80

print("TRAINING STATISTICS")
print(_rule)
print(f"Total training time: {_metrics['train_runtime']:.2f} seconds")
print(f"Training samples/second: {_metrics['train_samples_per_second']:.2f}")
print(f"Final loss: {_metrics['train_loss']:.4f}")
print(_rule)

### Test the Model

Let's quickly test the model on a few examples to verify it works.

In [None]:
# Put the model into Unsloth's inference mode before calling generate().
# NOTE(review): presumably this turns off training-only behaviour and enables
# Unsloth's faster generation path - confirm against the Unsloth documentation.
FastLanguageModel.for_inference(model)

# Two hand-written smoke-test inputs, one per translation direction. They use
# the same fields as the training records, minus the reference "target".
test_examples = [
    # Polish source, English instruction prompt.
    dict(
        prompt="Translate the following text from Polish to English:\n",
        source="Wszyscy studenci i studentki muszą zdać egzamin.",
        prompt_language="EN",
        source_language="PL",
        target_language="EN",
    ),
    # English source, Polish instruction prompt.
    dict(
        prompt="Przetłumacz poniższy tekst z angielskiego na polski:\n",
        source="All the teachers must attend the meeting.",
        prompt_language="PL",
        source_language="EN",
        target_language="PL",
    ),
]

# Smoke test: generate a translation for each hand-written example and print it.
print("="*80)
print("TESTING MODEL ON SAMPLE TRANSLATIONS")
print("="*80)

for test_number, example in enumerate(test_examples, start=1):
    print(f"\n{'='*80}")
    print(f"Test {test_number}: {example['source_language']} → {example['target_language']}")
    print(f"{'='*80}")

    # Build the conversation exactly as in training, minus the assistant turn.
    system_prompt = SYSTEM_PROMPT_EN if example['prompt_language'] == 'EN' else SYSTEM_PROMPT_PL
    user_message = example['prompt'] + example['source']

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_message}
    ]

    # add_generation_prompt=True appends the assistant header so the model
    # continues with its answer.
    # NOTE(review): enable_thinking=False asks the Qwen3 chat template for the
    # non-thinking prompt format, so the reply is not prefixed with a <think>
    # block. This assumes the tokenizer ships the standard Qwen3 template; a
    # template that does not use the variable simply ignores it - confirm.
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=False,
    )

    # Move the inputs to whatever device the model lives on instead of
    # assuming a device literally named "cuda".
    inputs = tokenizer(text, return_tensors="pt").to(model.device)

    with torch.inference_mode():
        outputs = model.generate(
            **inputs,
            max_new_tokens=512,
            temperature=0.3,
            top_p=0.9,
            top_k=50,
            do_sample=True,
            repetition_penalty=REPETITION_PENALTY,
            no_repeat_ngram_size=NO_REPEAT_NGRAM_SIZE,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

    # generate() returns prompt + continuation; keep only the new tokens.
    prompt_length = inputs['input_ids'].shape[1]
    generated_ids = outputs[0][prompt_length:]
    response = tokenizer.decode(generated_ids, skip_special_tokens=True)

    # Defensive cleanup in case chat markers survive decoding as plain text:
    # cut at the first end marker, and if a start marker is present keep only
    # what follows the last one, dropping its role line.
    if "<|im_end|>" in response:
        response = response.split("<|im_end|>")[0]
    if "<|im_start|>" in response:
        response = response.split("<|im_start|>")[-1]
        if "\n" in response:
            response = response.split("\n", 1)[-1]

    print(f"\nSource: {example['source']}")
    print(f"\nTranslation: {response.strip()}")

print(f"\n{'='*80}")
print("TEST COMPLETE")
print("="*80)