# Fine-tune Qwen3-0.6B for Entity Extraction

This notebook fine-tunes Qwen3-0.6B with LoRA to extract named entities from early modern texts.

**Requirements:**
- Google Colab with T4 GPU (free tier works)
- Upload `entity_training_data.jsonl` from your local machine

**Time:** ~1-2 hours on T4

## 1. Install Dependencies

In [None]:
%%capture
# Install Unsloth and the training stack; %%capture (above) hides the pip output.
# NOTE(review): no versions are pinned, so every run resolves whatever is newest
# at that moment; results may not be reproducible across sessions.
!pip install unsloth
# Re-install Unsloth from its GitHub repository with the `colab-new` extra.
!pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
# NOTE(review): torch is upgraded after Unsloth is installed, and xformers/triton
# are installed after that — confirm the resulting versions are mutually compatible.
!pip install --upgrade torch
!pip install triton xformers trl peft accelerate bitsandbytes

## 2. Upload Training Data

Upload your `entity_training_data.jsonl` file when prompted.

In [None]:
from google.colab import files
import json

# Upload the training data file. The returned mapping is keyed by the name of
# each uploaded file; the file itself is then read from the working directory.
print("Please upload entity_training_data.jsonl")
uploaded = files.upload()

# A cancelled dialog leaves the mapping empty; fail with a clear message
# instead of an IndexError on the first-key lookup.
if not uploaded:
    raise RuntimeError(
        "No file was uploaded. Re-run this cell and select entity_training_data.jsonl."
    )

# Load and verify: one JSON object per line. UTF-8 is requested explicitly so
# non-ASCII characters decode the same way regardless of the locale default,
# and blank lines (e.g. a trailing newline) are skipped rather than parsed.
filename = next(iter(uploaded))
with open(filename, 'r', encoding='utf-8') as f:
    data = [json.loads(line) for line in f if line.strip()]

if not data:
    raise ValueError(f"{filename} contains no training examples.")

print(f"\nLoaded {len(data)} training examples")
print(f"\nSample example:")
print(json.dumps(data[0], indent=2)[:500] + "...")

## 3. Load Qwen3-0.6B with Unsloth

In [None]:
from unsloth import FastLanguageModel
import torch

# Loader settings. They stay module-level names because a later cell
# (the trainer) reads max_seq_length again.
load_in_4bit = True    # 4-bit quantization for memory efficiency
dtype = None           # None = auto-detect
max_seq_length = 4096

# Fetch the Qwen3-0.6B checkpoint together with its tokenizer
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Qwen3-0.6B",
    load_in_4bit=load_in_4bit,
    dtype=dtype,
    max_seq_length=max_seq_length,
)

# Quick confirmation of what was loaded
print(f"Model loaded: {model.config._name_or_path}")
print(f"Parameters: {model.num_parameters():,}")

## 4. Configure LoRA

In [None]:
# Attach LoRA adapters to the loaded model. The name `model` is rebound to the
# wrapped model, which is what every later cell uses.
model = FastLanguageModel.get_peft_model(
    model,
    target_modules=[
        # attention projections
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        # feed-forward projections
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    r=64,            # LoRA rank
    lora_alpha=64,
    lora_dropout=0,
    bias="none",
    random_state=42,
    use_gradient_checkpointing="unsloth",
)

print("LoRA adapters added")
model.print_trainable_parameters()

## 5. Prepare Dataset

In [None]:
from datasets import Dataset
import random

def format_chat(example):
    """Render an example's chat messages as one ChatML-style training string.

    Args:
        example: Mapping with a 'messages' list; every message is a mapping
            with 'role' and 'content' keys.

    Returns:
        A dict {"text": ...} holding the concatenated turns. Only messages
        whose role is 'system', 'user' or 'assistant' contribute text; any
        other role is skipped.
    """
    pieces = []
    for turn in example['messages']:
        # Both keys are read for every message, whatever its role.
        speaker, content = turn['role'], turn['content']
        if speaker == 'system':
            pieces.append(f"<|im_start|>system\n{content}<|im_end|>\n")
        elif speaker == 'user':
            pieces.append(f"<|im_start|>user\n{content}<|im_end|>\n")
        elif speaker == 'assistant':
            pieces.append(f"<|im_start|>assistant\n{content}<|im_end|>\n")

    return {"text": "".join(pieces)}

# Shuffle and split data (90% train, 10% test).
# The fixed seed makes the shuffle, and therefore the split, repeatable; the
# copy leaves the originally loaded `data` list in its original order.
random.seed(42)
shuffled_data = data.copy()
random.shuffle(shuffled_data)

split_idx = int(len(shuffled_data) * 0.9)
train_data = shuffled_data[:split_idx]
test_data = shuffled_data[split_idx:]

# With fewer than two examples one side of the split is empty, which would
# only surface later as an obscure trainer or evaluation error.
if not train_data or not test_data:
    raise ValueError(
        f"Need at least 2 examples to build a train/test split, got {len(shuffled_data)}."
    )

print(f"Total examples: {len(data)}")
print(f"Training set: {len(train_data)} examples (90%)")
print(f"Test set: {len(test_data)} examples (10%)")

# Save test set for evaluation after training. ensure_ascii=False writes
# non-ASCII characters verbatim, so the file encoding is fixed to UTF-8 rather
# than left to the platform default.
with open("test_set.json", "w", encoding="utf-8") as f:
    json.dump(test_data, f, ensure_ascii=False, indent=2)
print(f"\nTest set saved to test_set.json for post-training evaluation")

# Create training dataset (only train on train_data, NOT full data)
dataset = Dataset.from_list(train_data)
dataset = dataset.map(format_chat)

print(f"\nTraining dataset size: {len(dataset)}")
print(f"\nSample formatted text (first 500 chars):")
print(dataset[0]['text'][:500])

## 6. Training

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments

# Unsloth's own capability probe is used to pick the mixed-precision mode.
# torch.cuda.is_bf16_supported() can report True through emulation on GPUs
# without native bf16 (such as the T4 this notebook targets), which would
# select bf16 where fp16 is the right choice.
from unsloth import is_bfloat16_supported

use_bf16 = is_bfloat16_supported()

# Supervised fine-tuning on the pre-formatted "text" column built earlier.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,
    args=TrainingArguments(
        # 2 sequences per step x 4 accumulation steps = 8 per optimizer update
        # on each device.
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        warmup_steps=10,
        num_train_epochs=3,
        learning_rate=2e-4,
        # Exactly one of fp16 / bf16 is enabled.
        fp16=not use_bf16,
        bf16=use_bf16,
        logging_steps=10,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=42,
        output_dir="outputs",
    ),
)

print("Starting training...")
trainer_stats = trainer.train()
print(f"\nTraining complete!")
print(f"Training time: {trainer_stats.metrics['train_runtime']:.0f} seconds")

## 7. Evaluate on Held-Out Test Set

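Run the fine-tuned model on held-out examples it never saw during training. For speed, only the first 10 test examples are scored; precision, recall, and F1 are computed on lowercased entity names.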

In [None]:
# Evaluate on held-out test set: put the model into Unsloth's inference mode
# before any generation happens.
FastLanguageModel.for_inference(model)

# Load test set. The file is JSON that may contain non-ASCII text, so it is
# decoded as UTF-8 explicitly instead of relying on the platform default.
with open("test_set.json", "r", encoding="utf-8") as f:
    test_examples = json.load(f)

print(f"Evaluating on {len(test_examples)} held-out test examples...\n")

def extract_entities_from_response(response_text):
    """Parse the JSON entity array embedded in a model response.

    The span from the first '[' to the last ']' in the text is treated as the
    candidate JSON array.

    Args:
        response_text: Raw text produced by the model.

    Returns:
        The decoded list, or an empty list when the input is not a string,
        contains no bracketed span, or the span is not valid JSON.
    """
    # Stay best-effort for non-string input instead of raising from re.search.
    if not isinstance(response_text, str):
        return []

    match = re.search(r'\[.*\]', response_text, re.DOTALL)
    if match is None:
        return []

    try:
        parsed = json.loads(match.group())
    except (ValueError, RecursionError):
        # json.JSONDecodeError is a ValueError; malformed model output is
        # expected here and simply counts as "no entities". Anything else
        # (e.g. KeyboardInterrupt) is deliberately allowed to propagate.
        return []

    return parsed if isinstance(parsed, list) else []

import re  # needed by extract_entities_from_response, which looks it up when called

results = []
for i, example in enumerate(test_examples[:10]):  # Evaluate first 10 for speed
    # Get the passage from user message.
    # NOTE(review): the indices assume messages are ordered
    # [system, user, assistant] — confirm against the training data.
    user_msg = example['messages'][1]['content']
    passage = user_msg.replace("Extract all named entities from this passage:\n\n", "")

    # Get expected entities (the assistant turn holds a JSON array)
    expected = json.loads(example['messages'][2]['content'])

    # Generate prediction
    prompt = f"""<|im_start|>system
You extract named entities from early modern European texts (1500-1800).
Return a JSON array with name, category (PERSON/SUBSTANCE/CONCEPT), and brief context.<|im_end|>
<|im_start|>user
Extract all named entities from this passage:

{passage}<|im_end|>
<|im_start|>assistant
"""

    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    prompt_len = inputs["input_ids"].shape[1]
    outputs = model.generate(**inputs, max_new_tokens=512, temperature=0.1, pad_token_id=tokenizer.eos_token_id)

    # Decode only the newly generated tokens. The generated sequence starts
    # with the prompt, and skip_special_tokens=True removes the <|im_start|>
    # markers, so splitting the full decode on "<|im_start|>assistant" never
    # matched and the prompt (including any brackets in the passage) ended up
    # being parsed as the prediction.
    pred_text = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True).strip()

    predicted = extract_entities_from_response(pred_text)

    # Calculate metrics on lowercased names. Malformed predicted entries
    # (not a dict, or no string 'name') are ignored instead of aborting the
    # run; the gold data is still expected to be well-formed.
    expected_names = set(e['name'].lower() for e in expected)
    predicted_names = set(
        e['name'].lower()
        for e in predicted
        if isinstance(e, dict) and isinstance(e.get('name'), str)
    )

    overlap = expected_names & predicted_names
    precision = len(overlap) / len(predicted_names) if predicted_names else 0
    recall = len(overlap) / len(expected_names) if expected_names else 0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0

    results.append({
        'expected': len(expected),
        'predicted': len(predicted),
        'overlap': len(overlap),
        'precision': precision,
        'recall': recall,
        'f1': f1
    })

    print(f"Example {i+1}: Expected {len(expected)}, Predicted {len(predicted)}, Overlap {len(overlap)}, F1={f1:.2f}")

# Summary (macro-average over the evaluated examples)
if not results:
    raise ValueError("No test examples were evaluated; test_set.json is empty.")

avg_precision = sum(r['precision'] for r in results) / len(results)
avg_recall = sum(r['recall'] for r in results) / len(results)
avg_f1 = sum(r['f1'] for r in results) / len(results)

print(f"\n{'='*50}")
print(f"EVALUATION RESULTS (on {len(results)} held-out examples)")
print(f"{'='*50}")
print(f"Average Precision: {avg_precision:.2%}")
print(f"Average Recall: {avg_recall:.2%}")
print(f"Average F1 Score: {avg_f1:.2%}")

## 8. Save the Model

In [None]:
# Persist the LoRA adapters and the tokenizer side by side in one directory
for component in (model, tokenizer):
    component.save_pretrained("qwen3-0.6b-entities-lora")

print("LoRA adapters saved to qwen3-0.6b-entities-lora/")

In [None]:
# Option A: Save merged model (larger but easier to use)
# Writes to the qwen3-0.6b-entities-merged directory using the "merged_16bit"
# save method; the tokenizer is passed along with the model.
# NOTE(review): Options A and B are separate cells, so "Run all" executes both.
model.save_pretrained_merged(
    "qwen3-0.6b-entities-merged",
    tokenizer,
    save_method="merged_16bit",
)
print("Merged model saved to qwen3-0.6b-entities-merged/")

In [None]:
# Option B: Save as GGUF for ollama (recommended for local use)
# Writes to the qwen3-0.6b-entities-gguf directory, which the download cell
# zips afterwards.
# NOTE(review): the exact .gguf file name produced inside that directory is not
# visible here — confirm it before relying on a hard-coded path.
model.save_pretrained_gguf(
    "qwen3-0.6b-entities-gguf",
    tokenizer,
    quantization_method="q4_k_m",  # Good balance of size/quality
)
print("GGUF model saved to qwen3-0.6b-entities-gguf/")

## 9. Download the Model

In [None]:
# Zip and download the GGUF model for use with ollama
# The archive contains the whole qwen3-0.6b-entities-gguf/ directory.
!zip -r qwen3-0.6b-entities-gguf.zip qwen3-0.6b-entities-gguf/

# Hand the archive to the Colab download helper.
from google.colab import files
files.download('qwen3-0.6b-entities-gguf.zip')

# Follow-up instructions for importing the model into ollama.
# NOTE(review): step 2 hard-codes the file name unsloth.Q4_K_M.gguf — confirm it
# matches the file actually present in the unzipped directory.
print("\nDownload complete! To use with ollama:")
print("1. Unzip the file")
print("2. Create a Modelfile with: FROM ./qwen3-0.6b-entities-gguf/unsloth.Q4_K_M.gguf")
print("3. Run: ollama create qwen3-entities -f Modelfile")

## Done!

You now have a fine-tuned Qwen3-0.6B model optimized for extracting entities from early modern texts.

**Next steps:**
1. Download the GGUF file
2. Import into ollama
3. Test against Gemini 2.5 Flash Lite and GPT-5 Nano
4. Run on full Semedo text