In [1]:
from datasets import load_dataset
from keybert import KeyBERT
import re
from typing import List, Tuple
import numpy as np
from tqdm.notebook import tqdm
import time
import pickle
from datetime import datetime


In [2]:
# Load SemEval dataset
sem_eval_ds = load_dataset("SemEvalWorkshop/sem_eval_2010_task_8")

# Load Inspec dataset
inspec_ds = load_dataset("midas/inspec", "generation")


In [3]:
from sentence_transformers import SentenceTransformer

def clean_sem_eval_sentence(text: str) -> str:
    """Remove entity XML tags from SemEval sentences."""
    return re.sub(r'</?e[12]>', '', text).strip()

def prepare_inspec_text(tokens: List[str]) -> str:
    """Convert list of tokens to clean string for Inspec."""
    return ' '.join([t for t in tokens if not (t.startswith('-') and t.endswith('-'))])

def extract_keyphrases(text: str, model: KeyBERT, top_n: int = 8) -> List[str]:
    """Extract keyphrases using KeyBERT with maxsum."""
    keyphrases = model.extract_keywords(
        text,
        keyphrase_ngram_range=(1, 3),
        stop_words='english',
        top_n=top_n,
        use_maxsum=True
    )
    return [kp[0] for kp in keyphrases]

# Initialize KeyBERT with a stronger embedding model
print("Loading KeyBERT with all-mpnet-base-v2...")
embedding_model = SentenceTransformer("all-mpnet-base-v2")
model = KeyBERT(embedding_model)


Loading KeyBERT with all-mpnet-base-v2...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [4]:
def evaluate_matches(true_phrases: List[str], extracted_phrases: List[str], partial_match: bool = True) -> Tuple[float, float, float]:
    """Calculate precision, recall, and F1 score with optional partial matching."""
    if partial_match:
        matched_true = set()
        matched_extracted = set()
        for i, ext in enumerate(extracted_phrases):
            for j, true in enumerate(true_phrases):
                if ext.lower() in true.lower() or true.lower() in ext.lower():
                    matched_extracted.add(i)
                    matched_true.add(j)
        matches = len(matched_true)
    else:
        matches = sum(1 for ext in extracted_phrases if any(ext.lower() == true.lower() for true in true_phrases))

    precision = matches / len(extracted_phrases) if extracted_phrases else 0
    recall = matches / len(true_phrases) if true_phrases else 0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    return precision, recall, f1


In [5]:
def process_sem_eval(dataset, model, top_n=2):
    results = []
    metrics = []
    for sample in tqdm(dataset):
        sentence = sample['sentence']
        true_entities = re.findall(r'<e[12]>(.*?)</e[12]>', sentence)
        clean_text = clean_sem_eval_sentence(sentence)
        extracted = extract_keyphrases(clean_text, model, top_n=top_n)

        precision, recall, f1 = evaluate_matches(true_entities, extracted)
        metrics.append((precision, recall, f1))

        results.append({
            'sentence': sentence,
            'true_entities': true_entities,
            'extracted_phrases': extracted,
            'metrics': {'precision': precision, 'recall': recall, 'f1': f1}
        })
    avg_metrics = {
        'precision': np.mean([m[0] for m in metrics]),
        'recall': np.mean([m[1] for m in metrics]),
        'f1': np.mean([m[2] for m in metrics]),
    }
    return results, avg_metrics


In [6]:
def process_inspec(dataset, model, top_n=8):
    results = []
    metrics = []
    for sample in tqdm(dataset):
        text = prepare_inspec_text(sample['document'])
        true_keyphrases = sample['extractive_keyphrases']
        extracted = extract_keyphrases(text, model, top_n=top_n)

        precision, recall, f1 = evaluate_matches(true_keyphrases, extracted)
        metrics.append((precision, recall, f1))

        results.append({
            'document': sample['document'],
            'true_keyphrases': true_keyphrases,
            'extracted_phrases': extracted,
            'metrics': {'precision': precision, 'recall': recall, 'f1': f1}
        })
    avg_metrics = {
        'precision': np.mean([m[0] for m in metrics]),
        'recall': np.mean([m[1] for m in metrics]),
        'f1': np.mean([m[2] for m in metrics]),
    }
    return results, avg_metrics


In [7]:
print("Processing SemEval-2010 train dataset...")
sem_eval_train_results, sem_eval_train_metrics = process_sem_eval(sem_eval_ds['train'], model, top_n=2)

print("Processing SemEval-2010 test dataset...")
sem_eval_test_results, sem_eval_test_metrics = process_sem_eval(sem_eval_ds['test'], model, top_n=2)

print("\nSemEval-2010 Metrics:")
print(f"Train - Precision: {sem_eval_train_metrics['precision']:.3f}, Recall: {sem_eval_train_metrics['recall']:.3f}, F1: {sem_eval_train_metrics['f1']:.3f}")
print(f"Test  - Precision: {sem_eval_test_metrics['precision']:.3f}, Recall: {sem_eval_test_metrics['recall']:.3f}, F1: {sem_eval_test_metrics['f1']:.3f}")


Processing SemEval-2010 train dataset...


  0%|          | 0/8000 [00:00<?, ?it/s]

Processing SemEval-2010 test dataset...


  0%|          | 0/2717 [00:00<?, ?it/s]


SemEval-2010 Metrics:
Train - Precision: 0.524, Recall: 0.524, F1: 0.524
Test  - Precision: 0.529, Recall: 0.529, F1: 0.529


In [8]:
print("Processing Inspec train dataset...")
inspec_train_results, inspec_train_metrics = process_inspec(inspec_ds['train'], model, top_n=8)

print("Processing Inspec test dataset...")
inspec_test_results, inspec_test_metrics = process_inspec(inspec_ds['test'], model, top_n=8)

print("\nInspec Metrics:")
print(f"Train - Precision: {inspec_train_metrics['precision']:.3f}, Recall: {inspec_train_metrics['recall']:.3f}, F1: {inspec_train_metrics['f1']:.3f}")
print(f"Test  - Precision: {inspec_test_metrics['precision']:.3f}, Recall: {inspec_test_metrics['recall']:.3f}, F1: {inspec_test_metrics['f1']:.3f}")


Processing Inspec train dataset...


  0%|          | 0/1000 [00:00<?, ?it/s]

Processing Inspec test dataset...


  0%|          | 0/500 [00:00<?, ?it/s]


Inspec Metrics:
Train - Precision: 0.284, Recall: 0.422, F1: 0.313
Test  - Precision: 0.295, Recall: 0.425, F1: 0.321
