# Fine-Tune BGE-M3 for Premodern Concordance (v3)

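This notebook fine-tunes the BGE-M3 multilingual embedding model on 889 curated
cross-lingual entity-matching pairs. The 154 hard negatives in the same file are
not fed to the training loss; they are used only to evaluate the model before and
after fine-tuning.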

**How to use:**
1. Make sure GPU is enabled: Runtime → Change runtime type → T4 GPU
2. Run each cell in order (Shift+Enter or click the play button)
3. Cell 2 will ask you to upload a file — upload `curated_training_pairs.json`
4. Training takes ~10-20 minutes
5. The last cell downloads the fine-tuned model as a zip file

In [None]:
# ── Cell 1: Install dependencies ──────────────────────────────────────────────
# This installs the libraries needed for fine-tuning. Takes ~1-2 minutes.

!pip install -q sentence-transformers torch

import torch
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
else:
    print("WARNING: No GPU detected! Go to Runtime → Change runtime type → T4 GPU")

In [None]:
# ── Cell 2: Upload training data ──────────────────────────────────────────────
# Click 'Choose Files' and select: data/curated_training_pairs.json

from google.colab import files
import json

print("Please upload curated_training_pairs.json when prompted...")
uploaded = files.upload()

# Fail with a readable message (instead of an IndexError) when nothing
# was uploaded, e.g. because the file dialog was dismissed.
if not uploaded:
    raise RuntimeError(
        "No file was uploaded. Re-run this cell and select curated_training_pairs.json."
    )

# Load and verify. Only the first uploaded file is used.
filename = next(iter(uploaded))
# The pairs contain non-ASCII text, so do not rely on the platform's
# default encoding when reading the JSON.
with open(filename, encoding="utf-8") as f:
    data = json.load(f)

batches = data['batches']
print(f"\nLoaded: {filename}")
print(f"Batches: {len(batches)}")
total_pos = sum(len(b['positive_pairs']) for b in batches)
total_neg = sum(len(b['hard_negatives']) for b in batches)
print(f"Total: {total_pos} positive pairs, {total_neg} hard negatives")
for b in batches:
    print(f"  {b['batch_id']}: {len(b['positive_pairs'])} pos, {len(b['hard_negatives'])} neg")

In [None]:
# ── Cell 3: Load base model ───────────────────────────────────────────────────
# Fetches the BGE-M3 checkpoint from Hugging Face (~2.2 GB).
# Expect this to take roughly 2-5 minutes.

from sentence_transformers import SentenceTransformer

print("Downloading BGE-M3 base model from Hugging Face...")
print("(This is ~2.2 GB, may take a few minutes)")
model = SentenceTransformer("BAAI/bge-m3")

# Total number of scalar weights, summed over every parameter tensor.
param_count = sum(weights.numel() for weights in model.parameters())
print(f"Model loaded! Parameters: {param_count:,}")

In [None]:
# ── Cell 4: Prepare training data ─────────────────────────────────────────────

import random
from sentence_transformers import InputExample

# Collapse all batches into one list of positives and one list of negatives,
# keeping only the first occurrence of each pair. Pairs are compared after
# lower-casing and stripping surrounding whitespace.
all_positives = []
all_negatives = []
seen_pos = set()
seen_neg = set()

for batch in data['batches']:
    for pos in batch['positive_pairs']:
        src = pos['source'].lower().strip()
        tgt = pos['target'].lower().strip()
        # Positives are unordered: (a, b) and (b, a) count as the same pair.
        if (src, tgt) in seen_pos or (tgt, src) in seen_pos:
            continue
        seen_pos.add((src, tgt))
        all_positives.append(pos)

    for neg in batch['hard_negatives']:
        fingerprint = (neg['source'].lower().strip(), neg['target'].lower().strip())
        # Negatives are de-duplicated in the given order only.
        if fingerprint in seen_neg:
            continue
        seen_neg.add(fingerprint)
        all_negatives.append(neg)

print(f"Unique positives: {len(all_positives)}")
print(f"Unique negatives: {len(all_negatives)}")

# Emit every positive pair twice: source→target and target→source.
examples = []
for pair in all_positives:
    forward = [pair['source'], pair['target']]
    examples.append(InputExample(texts=forward))
    examples.append(InputExample(texts=forward[::-1]))

random.shuffle(examples)
print(f"Training examples: {len(examples)} (each pair in both directions)")

In [None]:
# ── Cell 5: Baseline evaluation (BEFORE training) ─────────────────────────────
# See how the base model performs on our pairs before we fine-tune it.

# (source, target, should_match) triples printed as qualitative spot checks.
_SPOT_CHECKS = [
    ('canela', 'cinnamon', True),
    ('febre', 'fever', True),
    ('mesmerism', 'hypnosis', True),
    ('Falling sickness', 'epilepsy', True),
    ('vibratiuncles', 'memory traces', True),
    ('unbewusster Schluss', 'unconscious inference', True),
    ('désagrégation', 'dissociation', True),
    ('Galeno', 'Avicenna', False),
    ('canfora', 'canela', False),
    ('phrenology', 'phenology', False),
    ('caloric', 'calorie', False),
    ('hystérie', 'hystérèse', False),
]


def _similarities(model, sources, targets):
    """Return the dot product of the normalized embeddings of each (source, target)."""
    if not sources:
        return []
    # Encode each side as a single batch rather than one call per string.
    src_vecs = model.encode(list(sources), normalize_embeddings=True)
    tgt_vecs = model.encode(list(targets), normalize_embeddings=True)
    return [float(s @ t) for s, t in zip(src_vecs, tgt_vecs)]


def _mean(values):
    """Return the arithmetic mean of *values*, or NaN when it is empty."""
    return sum(values) / len(values) if values else float("nan")


def evaluate(model, positives, negatives, label="", seed=42, sample_size=80):
    """Print and return similarity statistics for a sample of pairs.

    Args:
        model: Object with an ``encode(texts, normalize_embeddings=True)``
            method that accepts a list of strings and returns one vector
            per string.
        positives: List of dicts with 'source' and 'target' keys that
            should match. A random sample of at most ``sample_size`` is scored.
        negatives: List of dicts with 'source' and 'target' keys that
            should not match. The first ``sample_size`` entries are scored.
        label: Heading printed above the report.
        seed: Seed for the private random generator that draws the positive
            sample. Because it is fixed, repeated calls (e.g. before and
            after training) score the same pairs and their averages are
            directly comparable. The global ``random`` state is not touched.
        sample_size: Maximum number of positives and of negatives to score.

    Returns:
        Tuple ``(avg_pos, avg_neg, sep)`` where ``sep = avg_pos - avg_neg``.
        An average over an empty sample is NaN.
    """
    rng = random.Random(seed)
    eval_pos = rng.sample(positives, min(sample_size, len(positives)))
    eval_neg = negatives[:sample_size]

    pos_sims = _similarities(
        model, [p['source'] for p in eval_pos], [p['target'] for p in eval_pos]
    )
    neg_sims = _similarities(
        model, [n['source'] for n in eval_neg], [n['target'] for n in eval_neg]
    )

    avg_pos = _mean(pos_sims)
    avg_neg = _mean(neg_sims)
    sep = avg_pos - avg_neg

    print(f"\n{'='*50}")
    print(f"{label}")
    print(f"{'='*50}")
    print(f"  Avg positive similarity: {avg_pos:.4f} (n={len(pos_sims)})")
    print(f"  Avg negative similarity: {avg_neg:.4f} (n={len(neg_sims)})")
    print(f"  Separation:              {sep:.4f}")

    # Spot checks: a pair counts as a predicted match when similarity > 0.5.
    check_sims = _similarities(
        model, [c[0] for c in _SPOT_CHECKS], [c[1] for c in _SPOT_CHECKS]
    )
    print(f"\n  Spot checks:")
    for (src, tgt, should_match), sim in zip(_SPOT_CHECKS, check_sims):
        ok = (sim > 0.5) == should_match
        mark = 'GOOD' if ok else 'WARN'
        expect = 'match' if should_match else 'no match'
        print(f"    [{mark}] {src:30} ↔ {tgt:25} {sim:.3f}  ({expect})")

    return avg_pos, avg_neg, sep

# Score the untouched base model first so later results have a reference point.
baseline = evaluate(
    model,
    positives=all_positives,
    negatives=all_negatives,
    label="BASELINE (before training)",
)

In [None]:
# ── Cell 6: Fine-tune! ────────────────────────────────────────────────────────
# This is the main training step. ~10-20 minutes on a T4 GPU.

from sentence_transformers import losses
from sentence_transformers.datasets import NoDuplicatesDataLoader
from torch.utils.data import DataLoader

EPOCHS = 3
BATCH_SIZE = 16
WARMUP_STEPS = 100
LEARNING_RATE = 2e-5
OUTPUT_DIR = "finetuned-bge-m3-v3"

# MultipleNegativesRankingLoss treats the other pairs in a batch as negatives.
# `examples` contains every pair in both directions, so a plain shuffled
# DataLoader can put a text's true match among its in-batch negatives and
# penalize a correct match. NoDuplicatesDataLoader (which also shuffles)
# keeps each text at most once per batch.
train_dataloader = NoDuplicatesDataLoader(examples, batch_size=BATCH_SIZE)
train_loss = losses.MultipleNegativesRankingLoss(model)

steps_per_epoch = len(train_dataloader)
total_steps = steps_per_epoch * EPOCHS
print(f"Training config:")
print(f"  Examples:     {len(examples)}")
print(f"  Batch size:   {BATCH_SIZE}")
print(f"  Epochs:       {EPOCHS}")
print(f"  Steps/epoch:  {steps_per_epoch}")
print(f"  Total steps:  {total_steps}")
print(f"  Warmup:       {WARMUP_STEPS}")
print(f"  LR:           {LEARNING_RATE}")
print(f"  Loss:         MultipleNegativesRankingLoss")
print(f"\nStarting training...")

# NOTE(review): no evaluator is passed, so `save_best_model` presumably has
# nothing to compare against and the model written to OUTPUT_DIR is the one
# from the end of training — confirm against the library docs.
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=EPOCHS,
    warmup_steps=WARMUP_STEPS,
    output_path=OUTPUT_DIR,
    show_progress_bar=True,
    save_best_model=True,
    optimizer_params={"lr": LEARNING_RATE},
)

print(f"\nTraining complete! Model saved to {OUTPUT_DIR}/")

In [None]:
# ── Cell 7: Post-training evaluation ──────────────────────────────────────────
# Reload the fine-tuned weights from disk, score them with the same
# evaluate() routine, and print the change against the baseline.

model_v3 = SentenceTransformer("finetuned-bge-m3-v3")
result = evaluate(model_v3, all_positives, all_negatives, "POST-TRAINING (after fine-tuning)")

banner = '=' * 50
print(f"\n{banner}")
print("IMPROVEMENT SUMMARY")
print(banner)

# baseline and result are both (avg_pos, avg_neg, separation) tuples, so the
# three report rows line up with their positions.
row_titles = ("Positive avg:", "Negative avg:", "Separation:  ")
for title, before, after in zip(row_titles, baseline, result):
    print(f"  {title} {before:.4f} → {after:.4f} (Δ {after - before:+.4f})")

In [None]:
# ── Cell 8: Download the fine-tuned model ─────────────────────────────────────
# Packs the saved model directory into a zip (~2.2 GB) and hands it to the
# browser for download. If no download starts, open the file browser in the
# left sidebar (folder icon), right-click the zip and choose Download.

import shutil

print("Zipping model... (this takes a minute)")
shutil.make_archive(
    base_name="finetuned-bge-m3-v3",
    format="zip",
    root_dir=".",
    base_dir="finetuned-bge-m3-v3",
)
print("Done! Starting download...")

files.download("finetuned-bge-m3-v3.zip")

# Tell the user where the archive belongs once it is on their machine.
for hint in (
    "\nAfter downloading, unzip into your project:",
    "  unzip finetuned-bge-m3-v3.zip -d /path/to/Premodern\\ Concordance/models/",
):
    print(hint)