# Load a spaCy model with a tokenizer fix for in-word hyphens

In [1]:
import spacy
from spacy.tokenizer import _get_regex_pattern

# Load the small English pipeline with the NER, text-categorizer and parser
# components disabled, then add a sentencizer component to the pipeline.
nlp = spacy.load(
    "en_core_web_sm",
    disable=['ner', 'textcat', 'parser'],
)
nlp.add_pipe("sentencizer")

# Tokenization fix for in-word hyphens (e.g. 'non-linear' is kept as a single
# token instead of spaCy's default split into 'non', '-', 'linear'), see
# https://spacy.io/usage/linguistic-features#native-tokenizer-additions

from spacy.lang.char_classes import ALPHA, ALPHA_LOWER, ALPHA_UPPER
from spacy.lang.char_classes import CONCAT_QUOTES, LIST_ELLIPSES, LIST_ICONS
from spacy.util import compile_infix_regex

# Rebuild the tokenizer's infix rules, leaving out the rule that splits on
# hyphens between letters so that hyphenated words stay in one token.

# arithmetic operators between digits
arithmetic_infix = r"(?<=[0-9])[+\-\*^](?=[0-9-])"
# a period between a lowercase letter/quote and an uppercase letter/quote
period_infix = r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
    al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
)
# a comma between two letters
comma_infix = r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA)
# Deliberately omitted (this is the rule that splits on hyphens between letters):
# r"(?<=[{a}])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS)
# one of : < > = / between a letter/digit and a letter
symbol_infix = r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA)

infixes = LIST_ELLIPSES + LIST_ICONS + [
    arithmetic_infix,
    period_infix,
    comma_infix,
    symbol_infix,
]

# Compile the rules and plug the resulting matcher into the tokenizer
infix_re = compile_infix_regex(infixes)
nlp.tokenizer.infix_finditer = infix_re.finditer

# Pre-process the train and test splits with spaCy and serialize them as DocBin files

In [2]:
import os
from tqdm.notebook import tqdm
from datasets import load_dataset
from spacy.tokens import DocBin

# Load the benchmark dataset; the hub identifier is derived from `benchmark`
# so that the dataset and the output file names cannot drift apart.
benchmark = "semeval-2010-pre"
dataset = load_dataset('taln-ls2n/{}'.format(benchmark))

# Make sure the output directory exists, otherwise opening the output file
# for writing below fails on a fresh checkout.
os.makedirs("data", exist_ok=True)

# pre-process training and test splits
for split in ['train', 'test']:
    output_file = "data/{}.{}.docbin".format(benchmark, split)
    # a split that was already serialized is not processed again
    if os.path.exists(output_file):
        continue
    doc_bin = DocBin()
    for sample in tqdm(dataset[split]):
        # each document is the title followed by the abstract
        doc = nlp(sample["title"]+". "+sample["abstract"])
        doc_bin.add(doc)
    bytes_data = doc_bin.to_bytes()
    with open(output_file, 'wb') as o:
        o.write(bytes_data)
    # release the DocBin before moving on to the next split
    del doc_bin

No config specified, defaulting to: sem_eval/raw


Downloading and preparing dataset sem_eval/raw to /Users/boudinfl/.cache/huggingface/datasets/taln-ls2n___sem_eval/raw/1.0.0/b40e008b5c96137733e24d9d244d70aa1fe6353ee65e180d8f6948af4027fbe4...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/11.2M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/16.1M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset sem_eval downloaded and prepared to /Users/boudinfl/.cache/huggingface/datasets/taln-ls2n___sem_eval/raw/1.0.0/b40e008b5c96137733e24d9d244d70aa1fe6353ee65e180d8f6948af4027fbe4. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

# Compute DF counts (TfIdf) and LDA model (TopicalPageRank)

In [3]:
import os
from pke import compute_document_frequency, compute_lda_model
from string import punctuation

df_file = "data/{}.df.gz".format(benchmark)
lda_file = "data/{}.lda.pickle.gz".format(benchmark)
train_file = "data/{}.train.docbin".format(benchmark)


def load_train_docs():
    """Deserialize the training DocBin file into a list of spaCy docs."""
    with open(train_file, 'rb') as f:
        doc_bin = DocBin().from_bytes(f.read())
    return list(doc_bin.get_docs(nlp.vocab))


# The training documents are only read when at least one of the two output
# files is missing, and at most once.
train = []

if not os.path.exists(df_file):
    train = train or load_train_docs()

    compute_document_frequency(
        documents=train,
        output_file=df_file,
        language='en',                   # language of the input files
        normalization='stemming',        # use porter stemmer
        stoplist=list(punctuation),      # stoplist (punctuation marks)
        n=5                              # compute n-grams up to 5-grams
    )

if not os.path.exists(lda_file):
    train = train or load_train_docs()

    compute_lda_model(
        documents=train,
        output_file=lda_file,
        n_topics=1000,              # number of topics
        language='en',              # language of the input files
        stoplist=list(punctuation), # stoplist (punctuation marks)
        normalization='stemming'    # use porter stemmer
    )



# Load DF counts and LDA model

In [4]:
from pke import load_document_frequency_file, load_lda_model

# Read the document-frequency counts and the LDA model from the files
# referenced by `df_file` and `lda_file`.
df = load_document_frequency_file(
    input_file=df_file,
)
lda_model = load_lda_model(
    input_file=lda_file,
)

In [5]:
from pke.unsupervised import *
from timeit import default_timer as timer

# Deserialize the pre-processed test split
test_file = "data/{}.test.docbin".format(benchmark)
with open(test_file, 'rb') as f:
    doc_bin = DocBin().from_bytes(f.read())
    test = list(doc_bin.get_docs(nlp.vocab))

# Extra keyword arguments passed to candidate_weighting(), keyed by model name;
# models that are not listed are weighted without extra arguments.
weighting_kwargs = {
    "TfIdf": {"df": df},
    "TopicalPageRank": {"lda_model": lda_model},
}

outputs = {}        # model name -> one list of top-10 keyphrases per test document
elapsed_times = {}  # model name -> seconds spent on the whole test split
for model in [FirstPhrases, TextRank, SingleRank, TopicRank, PositionRank, MultipartiteRank, TfIdf, TopicalPageRank]:
    model_name = model.__name__
    model_outputs = []
    outputs[model_name] = model_outputs

    extractor = model()
    # the timed region covers loading, candidate selection, weighting and ranking
    start = timer()
    for doc in tqdm(test):
        extractor.load_document(input=doc, language='en')
        extractor.grammar_selection(grammar="NP: {<ADJ>*<NOUN|PROPN>+}")
        extractor.candidate_weighting(**weighting_kwargs.get(model_name, {}))
        n_best = extractor.get_n_best(n=10, stemming=True)
        # keep only the keyphrase of each (keyphrase, score) pair
        model_outputs.append([keyphrase for keyphrase, _score in n_best])
    end = timer()
    elapsed_times[model_name] = end - start

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]



In [6]:
from nltk.stem.snowball import SnowballStemmer as Stemmer
import numpy as np
    
# Populate the references list with the stemmed gold keyphrases of each test
# document. The stemmer is built once and reused instead of being
# re-instantiated for every token.
porter_stemmer = Stemmer('porter')
references = []
for sample in tqdm(dataset['test']):
    sample_keyphrases = []
    for keyphrase in sample["keyphrases"]:
        # tokenize keyphrase
        tokens = [token.text for token in nlp(keyphrase)]
        # normalize tokens using Porter's stemming
        stems = [porter_stemmer.stem(tok.lower()) for tok in tokens]
        sample_keyphrases.append(" ".join(stems))
    references.append(sample_keyphrases)

def evaluate(top_N_keyphrases, references, cutoff=5):
    """Compute precision, recall and F-score of the top `cutoff` keyphrases.

    Args:
        top_N_keyphrases: ranked list of predicted keyphrases.
        references: list of gold keyphrases.
        cutoff: number of top-ranked predictions to evaluate.

    Returns:
        A (P, R, F) tuple. P is 0 when there are no predictions, R is 0 when
        there are no references, and F is 0 when both P and R are 0.
    """
    predicted = top_N_keyphrases[:cutoff]
    # matches are counted on distinct keyphrases, the intersection is computed once
    n_correct = len(set(predicted) & set(references))
    # guard both ratios against an empty denominator instead of raising
    P = n_correct / len(predicted) if len(predicted) > 0 else 0.0
    R = n_correct / len(references) if len(references) > 0 else 0.0
    F = (2*P*R)/(P+R) if (P+R) > 0 else 0.0
    return (P, R, F)

  0%|          | 0/100 [00:00<?, ?it/s]

In [7]:
# Print the results as a markdown table: throughput and F-score at 5 and 10
print("## Benchmarking on {}".format(benchmark))
print("| Model | it/s |  F@5 | F@10 |")
print("| :---- | ----:| ---: | ---: |")

# one table row per model
for model in outputs:

    f_scores = []
    for cutoff in [5, 10]:
        # (P, R, F) of every test document at this cutoff
        scores = [
            evaluate(output, references[i], cutoff)
            for i, output in enumerate(outputs[model])
        ]

        # average over the documents and keep the F-score only
        P, R, F = np.mean(scores, axis=0)
        f_scores.append(F)

    # throughput is the number of test documents per second of extraction time
    docs_per_second = len(test) / elapsed_times[model]
    print("| {}  | {:.1f} | {:.2f} | {:.2f} |".format(model, docs_per_second, f_scores[0]*100, f_scores[1]*100))

## Benchmarking on semeval-2010-pre
| Model | it/s |  F@5 | F@10 |
| :---- | ----:| ---: | ---: |
| FirstPhrases  | 190.3 | 13.96 | 14.94 |
| TextRank  | 121.6 | 9.32 | 13.42 |
| SingleRank  | 116.9 | 11.55 | 16.29 |
| TopicRank  | 83.3 | 12.07 | 14.46 |
| PositionRank  | 115.7 | 12.54 | 17.32 |
| MultipartiteRank  | 60.9 | 13.84 | 15.61 |
| TfIdf  | 193.5 | 13.20 | 16.08 |
| TopicalPageRank  | 25.4 | 11.53 | 16.32 |


## Benchmarking on semeval-2010-pre
| Model | it/s |  F@5 | F@10 |
| :---- | ----:| ---: | ---: |
| FirstPhrases  | 190.3 | 13.96 | 14.94 |
| TextRank  | 121.6 | 9.32 | 13.42 |
| SingleRank  | 116.9 | 11.55 | 16.29 |
| TopicRank  | 83.3 | 12.07 | 14.46 |
| PositionRank  | 115.7 | 12.54 | 17.32 |
| MultipartiteRank  | 60.9 | 13.84 | 15.61 |
| TfIdf  | 193.5 | 13.20 | 16.08 |
| TopicalPageRank  | 25.4 | 11.53 | 16.32 |