In [1]:
# =========================
# Imports and Setup
# =========================
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sentence_transformers import SentenceTransformer, util, InputExample, losses, CrossEncoder
from sklearn.metrics import f1_score
from torch.utils.data import DataLoader
os.environ["WANDB_DISABLED"] = "true"

# =========================
# Data Loading and Splitting
# =========================
df = pd.read_csv("train.csv")

# FAST PROTOTYPE: Reduce subset to 10k for quick run
subset_size = 10000
df_subset, _ = train_test_split(
    df,
    train_size=subset_size,
    stratify=df["is_duplicate"],
    random_state=42
)

train, temp = train_test_split(
    df_subset,
    test_size=0.2,
    stratify=df_subset["is_duplicate"],
    random_state=42
)
valid, test = train_test_split(
    temp,
    test_size=0.5,
    stratify=temp["is_duplicate"],
    random_state=42
)

os.makedirs("splits", exist_ok=True)
train.to_csv("splits/train.csv", index=False)
valid.to_csv("splits/valid.csv", index=False)
test.to_csv("splits/test.csv", index=False)

print("Final sizes:", len(train), len(valid), len(test))

# =========================
# Baseline Model (Pre-trained Bi-Encoder)
# =========================
test = pd.read_csv("splits/test.csv")

model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

emb1 = model.encode(test["question1"].tolist(), batch_size=128, convert_to_numpy=True)
emb2 = model.encode(test["question2"].tolist(), batch_size=128, convert_to_numpy=True)

sims = util.cos_sim(emb1, emb2).diagonal()

best_f1, best_thr = 0, 0
for thr in [i/100 for i in range(-100, 101)]:
    sims_np = sims.cpu().numpy()
    preds = (sims_np >= thr).astype(int)
    f1 = f1_score(test["is_duplicate"], preds)
    if f1 > best_f1:
        best_f1, best_thr = f1, thr

print(f"[Baseline] Test F1={best_f1:.4f} at threshold={best_thr:.2f}")

# =========================
# Fine-tuned Bi-Encoder (Cosine Similarity, 1 epoch, batch 64)
# =========================
train = pd.read_csv("splits/train.csv")
valid = pd.read_csv("splits/valid.csv")
test = pd.read_csv("splits/test.csv")

def to_examples(df):
    return [InputExample(texts=[row["question1"], row["question2"]], label=float(row["is_duplicate"])) for _, row in df.iterrows()]

train_examples = to_examples(train)
valid_examples = to_examples(valid)
test_examples = to_examples(test)

def run_biencoder(loss_type, base_model, epochs=1, batch_size=64, lr=2e-5):
    model = SentenceTransformer(base_model)
    # Cosine Loss (labels: +1/-1)
    for ex in train_examples:
        ex.label = 1.0 if ex.label == 1.0 else -1.0
    train_loss = losses.CosineSimilarityLoss(model)
    train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=batch_size)
    model.fit(
        train_objectives=[(train_dataloader, train_loss)],
        epochs=epochs,
        warmup_steps=100,
        optimizer_params={'lr': lr},
        show_progress_bar=True
    )
    def evaluate(model, examples, lt):
        q1 = [ex.texts for ex in examples]
        q2 = [ex.texts[1] for ex in examples]
        labels = [1 if ex.label == 1.0 else 0 for ex in examples]
        emb1 = model.encode(q1, batch_size=128, convert_to_numpy=True)
        emb2 = model.encode(q2, batch_size=128, convert_to_numpy=True)
        sims = util.cos_sim(emb1, emb2).diagonal().cpu().numpy()
        best_f1, best_thr = 0, 0
        for thr in [i/100 for i in range(-100, 101)]:
            preds = (sims >= thr).astype(int)
            f1 = f1_score(labels, preds)
            if f1 > best_f1:
                best_f1, best_thr = f1, thr
        return best_f1, best_thr
    val_f1, thr = evaluate(model, valid_examples, loss_type)
    test_f1, _ = evaluate(model, test_examples, loss_type)
    print(f"[Bi-encoder Cosine] Validation F1={val_f1:.4f} | Test F1={test_f1:.4f} at threshold={thr:.2f}")
    return model, test_f1

cos_model, cos_f1 = run_biencoder("cos", "sentence-transformers/all-MiniLM-L6-v2")

# =========================
# Cross-Encoder (MiniLM-L-6-v2, 1 epoch, batch 64)
# =========================
train_samples = [
    InputExample(texts=[row["question1"], row["question2"]], label=float(row["is_duplicate"]))
    for _, row in train.iterrows()
]
valid_samples = [
    (row["question1"], row["question2"], int(row["is_duplicate"]))
    for _, row in valid.iterrows()
]
test_samples = [
    (row["question1"], row["question2"], int(row["is_duplicate"]))
    for _, row in test.iterrows()
]

train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=64)

ce_model = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2", num_labels=1)

ce_model.fit(
    train_dataloader=train_dataloader,
    epochs=1,
    warmup_steps=100,
    show_progress_bar=True
)

def evaluate_ce(model, samples):
    texts = [(q1, q2) for q1, q2, _ in samples]
    labels = [lbl for _, _, lbl in samples]
    scores = model.predict(texts)
    best_f1, best_thr = 0, 0
    for thr in np.linspace(0, 1, 101):
        preds = (scores >= thr).astype(int)
        f1 = f1_score(labels, preds)
        if f1 > best_f1:
            best_f1, best_thr = f1, thr
    return best_f1, best_thr

val_f1, thr = evaluate_ce(ce_model, valid_samples)
test_f1, _ = evaluate_ce(ce_model, test_samples)

print(f"[CrossEncoder] Validation F1={val_f1:.4f} | Test F1={test_f1:.4f} at threshold={thr:.2f}")

# =========================
# F1 Score Comparison
# =========================
results = {
    "Baseline": best_f1,
    "Bi-encoder (Cosine)": cos_f1,
    "Cross-encoder": test_f1
}
print(results)


Final sizes: 8000 1000 1000


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

[Baseline] Test F1=0.7517 at threshold=0.76


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss


[Bi-encoder Cosine] Validation F1=0.6009 | Test F1=0.5789 at threshold=0.55


config.json:   0%|          | 0.00/794 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss


[CrossEncoder] Validation F1=0.7252 | Test F1=0.7415 at threshold=0.34
{'Baseline': 0.7517401392111369, 'Bi-encoder (Cosine)': 0.578853046594982, 'Cross-encoder': 0.7414634146341463}
