# Fine-tune BGE-M3 for Premodern Concordance

This notebook fine-tunes BGE-M3 on historical entity pairs using Google Colab's free GPU.

**IMPORTANT: Uses a proper train/test split so that evaluation scores are not inflated by pairs the model has already seen.**
- 75% of pairs used for training
- 25% held out for evaluation (never seen during training)

**Instructions:**
1. Go to Runtime → Change runtime type → Select GPU (T4)
2. Upload `combined_pairs.csv` when prompted
3. Run all cells
4. Download the fine-tuned model at the end

In [None]:
# Install dependencies
!pip install -q sentence-transformers pandas scikit-learn

In [None]:
# Check GPU
import torch
# Report whether a CUDA device is visible, and which one.
has_cuda = torch.cuda.is_available()
print(f"CUDA available: {has_cuda}")
print(
    f"GPU: {torch.cuda.get_device_name()}"
    if has_cuda
    else "WARNING: No GPU detected. Go to Runtime → Change runtime type → GPU"
)

In [None]:
# Upload your combined_pairs.csv
# The data-loading cell below reads the file by the exact name
# 'combined_pairs.csv', so the uploaded file must carry that name.
# `files` is reused by the last cell of the notebook to download the results.
from google.colab import files
uploaded = files.upload()

In [None]:
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader

# Link type categories
POSITIVE_LINK_TYPES = {"same_referent", "orthographic_variant", "derivation"}
NEGATIVE_LINK_TYPES = {"hard_negative"}
UNCERTAIN_LINK_TYPES = {"conceptual_overlap", "contested_identity"}

# Columns every later cell reads from the CSV
REQUIRED_COLUMNS = ('term_a', 'term_b', 'link_type')

# Load data
df = pd.read_csv('combined_pairs.csv')

# Fail early with a readable message instead of a KeyError several cells later.
missing_columns = [col for col in REQUIRED_COLUMNS if col not in df.columns]
if missing_columns:
    raise ValueError(
        f"combined_pairs.csv is missing required column(s): {missing_columns}. "
        f"Found columns: {list(df.columns)}"
    )

print(f"Loaded {len(df)} pairs")
print()
print("Distribution by link type:")
print(df['link_type'].value_counts().to_string())

# Link types outside the three categories are skipped when the training
# examples are built, yet the evaluation cell counts every non-'hard_negative'
# row as a match, so surface them here.
known_link_types = POSITIVE_LINK_TYPES | NEGATIVE_LINK_TYPES | UNCERTAIN_LINK_TYPES
unknown_link_types = sorted(set(df['link_type'].dropna()) - known_link_types, key=str)
if unknown_link_types:
    print()
    print(f"WARNING: unrecognised link types (not used for training): {unknown_link_types}")

# Rows with a missing term or link type cannot be encoded or labelled.
incomplete_rows = int(df[list(REQUIRED_COLUMNS)].isna().any(axis=1).sum())
if incomplete_rows:
    print()
    print(f"WARNING: {incomplete_rows} row(s) have a missing term_a, term_b or link_type")

In [None]:
# CRITICAL: split the data BEFORE training so the test pairs stay unseen by
# the model and the evaluation is not inflated.
TEST_SIZE = 0.25
RANDOM_STATE = 42

train_df, test_df = train_test_split(
    df,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    stratify=df['link_type']  # Ensure each link type is represented in both sets
)


def _unordered_pairs(frame):
    """Return the set of {term_a, term_b} pairs in `frame`, ignoring term order."""
    return {frozenset(pair) for pair in zip(frame['term_a'], frame['term_b'])}


# A pair listed twice in the CSV (or once in each order) can land on both sides
# of the split. Training also adds swapped pairs, so compare order-insensitively.
leaked_pairs = _unordered_pairs(train_df) & _unordered_pairs(test_df)

print(f"Training set: {len(train_df)} pairs")
print(f"Test set (HELD OUT): {len(test_df)} pairs")
print()
print("Test set distribution:")
print(test_df['link_type'].value_counts().to_string())

if leaked_pairs:
    print()
    print(
        f"WARNING: {len(leaked_pairs)} test pair(s) also occur in the training set "
        "(same two terms, in either order). Deduplicate combined_pairs.csv for a clean evaluation."
    )

In [None]:
# Prepare training data (ONLY from train_df)
def _similarity_label(link_type):
    """Map a link type to its target cosine similarity.

    Returns 1.0 for positive types, 0.0 for negative types, 0.5 for
    'conceptual_overlap', 0.3 for the remaining uncertain types, and None
    when the link type belongs to none of the known categories.
    """
    if link_type in POSITIVE_LINK_TYPES:
        return 1.0
    if link_type in NEGATIVE_LINK_TYPES:
        return 0.0
    if link_type in UNCERTAIN_LINK_TYPES:
        return 0.5 if link_type == 'conceptual_overlap' else 0.3
    return None


train_examples = []
skipped_by_type = {}  # link type -> number of rows left out of training

for _, row in train_df.iterrows():
    label = _similarity_label(row['link_type'])
    if label is None:
        skipped_by_type[row['link_type']] = skipped_by_type.get(row['link_type'], 0) + 1
        continue
    train_examples.append(InputExample(texts=[row['term_a'], row['term_b']], label=label))

# Make dropped rows visible instead of silently training on less data.
if skipped_by_type:
    print(f"WARNING: skipped {sum(skipped_by_type.values())} row(s) with unrecognised link types: {skipped_by_type}")

# Data augmentation: add swapped pairs
augmented = [InputExample(texts=[ex.texts[1], ex.texts[0]], label=ex.label) for ex in train_examples]
train_examples.extend(augmented)

# Shuffle with a dedicated, seeded generator so the example order is the same
# on every run (RANDOM_STATE is set in the split cell).
random.Random(RANDOM_STATE).shuffle(train_examples)

print(f"Training examples (with augmentation): {len(train_examples)}")

In [None]:
# Instantiate the pretrained base model that will be fine-tuned
print("Loading BGE-M3 (this may take a minute)...")
model = SentenceTransformer(model_name_or_path='BAAI/bge-m3')
print("Model loaded!")

In [None]:
# Training configuration
BATCH_SIZE = 16
EPOCHS = 10
LR = 2e-5
WARMUP_STEPS = 10
OUTPUT_PATH = 'finetuned-bge-m3-premodern'

# Seed PyTorch's global RNG so the shuffled batch order (and any other
# torch-driven randomness during training) starts from a fixed state.
# RANDOM_STATE is set in the split cell.
torch.manual_seed(RANDOM_STATE)

train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=BATCH_SIZE)
# Trains the cosine similarity of each pair towards its label
train_loss = losses.CosineSimilarityLoss(model)

steps_per_epoch = len(train_dataloader)
print(f"Batch size: {BATCH_SIZE}")
print(f"Steps per epoch: {steps_per_epoch}")
print(f"Total epochs: {EPOCHS}")
print(f"Total steps: {steps_per_epoch * EPOCHS}")

In [None]:
# Run fine-tuning; the evaluation cell later reloads the model from OUTPUT_PATH
print("Starting training...")
fit_settings = dict(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=EPOCHS,
    warmup_steps=WARMUP_STEPS,
    output_path=OUTPUT_PATH,
    optimizer_params={'lr': LR},
    show_progress_bar=True,
)
model.fit(**fit_settings)
print("\nTraining complete!")

In [None]:
# Evaluate on HELD-OUT test set: print the section banner
banner_rule = "="*60
print(banner_rule)
print("EVALUATION ON HELD-OUT TEST SET")
print("(These pairs were NEVER seen during training)")
print(banner_rule)

def cosine(a, b):
    """Return the cosine similarity of vectors `a` and `b` as a Python float.

    If either vector has zero norm the similarity is undefined; 0.0 is
    returned in that case instead of dividing by zero and producing NaN.
    """
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    if norm_product == 0:
        return 0.0
    return float(np.dot(a, b) / norm_product)

# Load both models for comparison
original_model = SentenceTransformer('BAAI/bge-m3')
finetuned_model = SentenceTransformer(OUTPUT_PATH)

# Decision threshold for calling a pair a match. The reporting cells below
# hard-code the same 0.7, so keep the two in sync if this is changed.
SIM_THRESHOLD = 0.7
ENCODE_BATCH_SIZE = 32

test_terms_a = test_df['term_a'].tolist()
test_terms_b = test_df['term_b'].tolist()


def _pair_similarities(encoder):
    """Encode both term columns in batches and return one cosine similarity per test row."""
    # One batched encode() call per column instead of one call per term.
    emb_a = encoder.encode(test_terms_a, batch_size=ENCODE_BATCH_SIZE, show_progress_bar=False)
    emb_b = encoder.encode(test_terms_b, batch_size=ENCODE_BATCH_SIZE, show_progress_bar=False)
    return [cosine(vec_a, vec_b) for vec_a, vec_b in zip(emb_a, emb_b)]


orig_sims = _pair_similarities(original_model)
ft_sims = _pair_similarities(finetuned_model)

# Evaluate on test set
results = []
for (_, row), orig_sim, ft_sim in zip(test_df.iterrows(), orig_sims, ft_sims):
    # NOTE(review): every non-'hard_negative' row counts as a match here,
    # including the uncertain link types that training pushes towards 0.5 / 0.3,
    # i.e. below the 0.7 threshold. Confirm this is the intended ground truth.
    is_match = row['link_type'] != 'hard_negative'
    results.append({
        'term_a': row['term_a'],
        'term_b': row['term_b'],
        'link_type': row['link_type'],
        'original_sim': orig_sim,
        'finetuned_sim': ft_sim,
        'delta': ft_sim - orig_sim,
        'is_match': is_match,
        'orig_pred': orig_sim >= SIM_THRESHOLD,
        'ft_pred': ft_sim >= SIM_THRESHOLD,
    })

results_df = pd.DataFrame(results)

In [None]:
# Overall precision / recall / F1 for each model, then a per-link-type breakdown
print("\nOVERALL METRICS (threshold=0.7):")
print("-"*40)

y_true = results_df['is_match'].to_numpy()
prediction_columns = {'Original BGE-M3': 'orig_pred', 'Fine-tuned': 'ft_pred'}
for name, pred_col in prediction_columns.items():
    scores = precision_recall_fscore_support(
        y_true,
        results_df[pred_col].to_numpy(),
        average='binary',
        zero_division=0,
    )
    precision, recall, f1 = scores[0], scores[1], scores[2]
    print(f"{name}:")
    print(f"  Precision: {precision:.3f}, Recall: {recall:.3f}, F1: {f1:.3f}")

print("\nPER LINK-TYPE RESULTS:")
print("-"*40)
for link_type in sorted(results_df['link_type'].unique()):
    subset = results_df.loc[results_df['link_type'].eq(link_type)]
    original_scores = subset['original_sim']
    finetuned_scores = subset['finetuned_sim']
    orig_avg = original_scores.mean()
    ft_avg = finetuned_scores.mean()
    # Share of this link type's pairs scored at or above the 0.7 threshold
    orig_recall = original_scores.ge(0.7).mean()
    ft_recall = finetuned_scores.ge(0.7).mean()
    print(f"{link_type}:")
    print(f"  Avg sim: {orig_avg:.3f} → {ft_avg:.3f} ({ft_avg - orig_avg:+.3f})")
    print(f"  Recall@0.7: {orig_recall:.1%} → {ft_recall:.1%}")

In [None]:
# List the ten matching pairs whose similarity rose the most after fine-tuning
print("\nTOP 10 IMPROVEMENTS (fine-tuned vs original):")
print("-"*60)
matching_pairs = results_df.loc[results_df['is_match']]
improvements = matching_pairs.nlargest(10, 'delta')
for rec in improvements.itertuples(index=False):
    print(f"{rec.term_a} / {rec.term_b} ({rec.link_type})")
    print(f"  {rec.original_sim:.3f} → {rec.finetuned_sim:.3f} ({rec.delta:+.3f})")

In [None]:
# Sanity check: compare the mean-similarity gap between same_referent and
# hard_negative pairs before and after fine-tuning
print("\nHARD NEGATIVE SEPARATION CHECK:")
print("-"*40)
link_types = results_df['link_type']
hard_neg = results_df[link_types == 'hard_negative']
same_ref = results_df[link_types == 'same_referent']

if len(hard_neg) == 0 or len(same_ref) == 0:
    print("Not enough hard_negative or same_referent pairs in test set")
else:
    same_ref_means = same_ref[['original_sim', 'finetuned_sim']].mean()
    hard_neg_means = hard_neg[['original_sim', 'finetuned_sim']].mean()
    orig_gap = same_ref_means['original_sim'] - hard_neg_means['original_sim']
    ft_gap = same_ref_means['finetuned_sim'] - hard_neg_means['finetuned_sim']
    print(f"Gap (same_referent - hard_negative):")
    print(f"  Original: {orig_gap:.3f}")
    print(f"  Fine-tuned: {ft_gap:.3f}")
    if not ft_gap > orig_gap:
        print(f"  ⚠ Separation decreased by {orig_gap - ft_gap:.3f}")
    else:
        print(f"  ✓ Improved separation by {ft_gap - orig_gap:.3f}")

In [None]:
# Save results
# Writes one row per test pair (terms, link type, both similarity scores,
# delta and the thresholded predictions) without the DataFrame index.
# The final cell downloads this file.
results_df.to_csv('test_results.csv', index=False)
print("Saved test results to test_results.csv")

In [None]:
# Download the fine-tuned model
# {OUTPUT_PATH} is interpolated from the Python variable set in the training
# configuration cell; the whole model directory is zipped recursively.
!zip -r finetuned-model.zip {OUTPUT_PATH}
# `files` is the google.colab helper imported in the upload cell.
files.download('finetuned-model.zip')
files.download('test_results.csv')