# Fine-tune BGE-M3 for Premodern Cross-Lingual Entity Matching

This notebook fine-tunes the BGE-M3 multilingual embedding model on training pairs from early modern texts.

**Runtime**: Before running any cells, go to Runtime > Change runtime type and select the T4 GPU.

In [None]:
# Install dependencies (-q keeps pip's output quiet).
# sentence-transformers supplies the SentenceTransformer / InputExample / losses
# names imported in a later cell; datasets and accelerate are installed with it.
!pip install -q sentence-transformers datasets accelerate

In [None]:
# Upload training_pairs.json
#
# files.upload() returns a dict mapping each uploaded file name to its raw
# bytes. The later cells read the fixed path 'training_pairs.json', so the
# uploaded bytes are written to that path explicitly. This guards against a
# differently named upload and against Colab storing a re-uploaded file under
# a de-duplicated name (e.g. "training_pairs (1).json"), which would otherwise
# leave a stale or missing file for the loading cell.
from google.colab import files

TRAINING_FILE = 'training_pairs.json'

print("Upload training_pairs.json:")
uploaded = files.upload()

if not uploaded:
    raise RuntimeError(f"No file was uploaded; {TRAINING_FILE} is required.")

if TRAINING_FILE in uploaded:
    upload_name = TRAINING_FILE
elif len(uploaded) == 1:
    # A single file under another name is accepted as the training data.
    upload_name = next(iter(uploaded))
else:
    # Several files and none has the expected name: refuse to guess.
    raise RuntimeError(
        f"Expected {TRAINING_FILE}, but got {sorted(uploaded)}. "
        "Upload exactly one training file."
    )

with open(TRAINING_FILE, 'wb') as f:
    f.write(uploaded[upload_name])
print(f"Saved {upload_name!r} as {TRAINING_FILE} ({len(uploaded[upload_name])} bytes)")

In [None]:
import json
import random
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader

# Load training data
# JSON text is UTF-8; pass the encoding explicitly so any non-ASCII text in the
# pairs is decoded the same way regardless of the platform's default encoding.
with open('training_pairs.json', encoding='utf-8') as f:
    data = json.load(f)

# Summarise what was loaded: the category names, and the number of entries in
# the 'positive_pairs' and 'hard_negatives' lists summed over all categories.
print(f"Categories: {list(data['categories'].keys())}")
total_pos = sum(len(entry['positive_pairs']) for entry in data['categories'].values())
total_neg = sum(len(entry['hard_negatives']) for entry in data['categories'].values())
print(f"Total positive pairs: {total_pos}")
print(f"Total hard negatives: {total_neg}")

In [None]:
def create_pairs(data: dict) -> list:
    """Build bidirectional positive-pair examples for MultipleNegativesRankingLoss.

    For every entry under ``data['categories']`` and every item in that
    entry's ``'positive_pairs'`` list, two ``InputExample`` objects are
    produced: one ordered (source, target) and one ordered (target, source).
    Only ``'positive_pairs'`` is read; other keys of a category entry are
    not used here.

    Args:
        data: Parsed training file containing a top-level ``'categories'``
            mapping.

    Returns:
        A list with two ``InputExample`` objects per positive pair, in the
        iteration order of the categories and their pairs.
    """
    bidirectional = []
    for entry in data['categories'].values():
        for item in entry['positive_pairs']:
            src, tgt = item['source'], item['target']
            # Forward direction first, then the mirrored pair.
            bidirectional.extend((
                InputExample(texts=[src, tgt]),
                InputExample(texts=[tgt, src]),
            ))
    return bidirectional

# Build the training examples, then shuffle them in place so their order no
# longer follows the category-by-category order in which create_pairs emits them.
examples = create_pairs(data)
random.shuffle(examples)
print(f"Created {len(examples)} training examples")

In [None]:
# Start from the base BGE-M3 checkpoint, not from an earlier fine-tuned output.
BASE_MODEL_NAME = 'BAAI/bge-m3'

print("Loading base BGE-M3 model...")
model = SentenceTransformer(BASE_MODEL_NAME)
print("Model loaded!")

In [None]:
# Training configuration
from sentence_transformers.datasets import NoDuplicatesDataLoader

EPOCHS = 3
BATCH_SIZE = 16
WARMUP_STEPS = 100
OUTPUT_DIR = 'finetuned-bge-m3-v2'


def _count_disjoint_examples(candidates) -> int:
    """Greedily count examples whose texts do not overlap with earlier picks."""
    seen = set()
    count = 0
    for example in candidates:
        keys = {text.strip().lower() for text in example.texts}
        if seen.isdisjoint(keys):
            seen |= keys
            count += 1
    return count


# Create data loader.
# MultipleNegativesRankingLoss uses the other positives in a batch as
# negatives for each anchor. create_pairs() emits every pair in both
# directions, so with a plain shuffled DataLoader the pair (a, b) and its
# mirror (b, a) can land in the same batch, which makes "a" a negative for
# itself; the same happens when one text appears in several pairs.
# NoDuplicatesDataLoader keeps any text from occurring twice within a batch.
#
# That loader keeps drawing examples until a batch is full, so it needs enough
# mutually non-overlapping examples to fill one. A greedy selection can be as
# small as half of the best possible one, hence the 2x margin. Smaller
# datasets fall back to the plain shuffled DataLoader.
if _count_disjoint_examples(examples) >= 2 * BATCH_SIZE:
    train_dataloader = NoDuplicatesDataLoader(examples, batch_size=BATCH_SIZE)
else:
    print("Warning: too few non-overlapping examples for duplicate-free batches; "
          "using a plain shuffled DataLoader.")
    train_dataloader = DataLoader(examples, shuffle=True, batch_size=BATCH_SIZE)

# Use MultipleNegativesRankingLoss - works well with pairs
train_loss = losses.MultipleNegativesRankingLoss(model)

total_steps = len(train_dataloader) * EPOCHS
print("Training configuration:")
print(f"  Epochs: {EPOCHS}")
print(f"  Batch size: {BATCH_SIZE}")
print(f"  Total steps: {total_steps}")
print(f"  Warmup steps: {WARMUP_STEPS}")

In [None]:
# Run the fine-tuning loop with the loader, loss and settings configured above.
# NOTE(review): no evaluator is passed to fit(), so save_best_model presumably
# has no evaluation score to compare against — confirm it is still wanted.
print("Starting training...")
training_objectives = [(train_dataloader, train_loss)]
model.fit(
    train_objectives=training_objectives,
    output_path=OUTPUT_DIR,
    epochs=EPOCHS,
    warmup_steps=WARMUP_STEPS,
    save_best_model=True,
    show_progress_bar=True,
)
print(f"Training complete! Model saved to {OUTPUT_DIR}")

In [None]:
# Quick sanity check: reload the saved model and compare a handful of
# hand-picked pairs against a fixed cosine-similarity threshold.
print("\n--- Evaluation on test pairs ---")
model = SentenceTransformer(OUTPUT_DIR)

SIMILARITY_THRESHOLD = 0.7

# Cross-lingual pairs naming the same referent; expected above the threshold.
# NOTE(review): "Diofcorides" looks like a deliberate long-s/OCR-style
# spelling — confirm before "correcting" it.
same_referent = [
    ("Galeno", "Galen"),
    ("Medicina", "Medicine"),
    ("água", "water"),
    ("febre", "fever"),
    ("Lisboa", "Lisbon"),
    ("sangue", "blood"),
    ("Diofcorides", "Dioscorides"),
    ("alambique", "alembic"),
]
# Pairs naming different referents; expected below the threshold.
different_referent = [
    ("Galeno", "Avicenna"),
    ("água", "wine"),
    ("Lisboa", "Paris"),
    ("febre", "plague"),
]

test_pairs = (
    [(src, tgt, "similar") for src, tgt in same_referent]
    + [(src, tgt, "dissimilar") for src, tgt in different_referent]
)

print(f"{'Source':<20} {'Target':<20} {'Similarity':>10} {'Expected':>12}")
print("-" * 65)
for source, target, expected in test_pairs:
    # Embeddings are L2-normalised, so the dot product is the cosine similarity.
    source_vec, target_vec = (
        model.encode(text, normalize_embeddings=True) for text in (source, target)
    )
    sim = float(source_vec @ target_vec)
    if expected == "similar":
        passed = sim > SIMILARITY_THRESHOLD
    else:
        passed = expected == "dissimilar" and sim < SIMILARITY_THRESHOLD
    status = "OK" if passed else "CHECK"
    print(f"{source:<20} {target:<20} {sim:>10.3f} {expected:>12} {status}")

In [None]:
# Zip and download the model
import shutil

# Derive the archive name from OUTPUT_DIR instead of repeating the literal, so
# the zip file, the download and the unzip hint stay in step with the training
# configuration. make_archive returns the path of the archive it created.
archive_path = shutil.make_archive(OUTPUT_DIR, 'zip', root_dir=OUTPUT_DIR)
files.download(archive_path)
print(f"Download complete! Unzip to models/{OUTPUT_DIR} in your project.")