# Load a spaCy model with the in-word hyphen fix

In [1]:
import spacy
from spacy.tokenizer import _get_regex_pattern
from spacy.lang.char_classes import ALPHA, ALPHA_LOWER, ALPHA_UPPER
from spacy.lang.char_classes import CONCAT_QUOTES, LIST_ELLIPSES, LIST_ICONS
from spacy.util import compile_infix_regex

nlp = spacy.load("en_core_web_sm")

# Keep hyphenated words together: with this tokenizer 'non-linear' stays a
# single token, whereas stock spaCy yields 'non', '-', 'linear'.
# Recipe: https://spacy.io/usage/linguistic-features#native-tokenizer-additions

# Infix rules that are kept. The rule splitting on hyphens between letters,
#   r"(?<=[{a}])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
# is deliberately absent from this list.
kept_infix_rules = [
    # arithmetic operator between digits
    r"(?<=[0-9])[+\-\*^](?=[0-9-])",
    # period between a lowercase letter/quote and an uppercase letter/quote
    r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
        al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
    ),
    # comma between two letters
    r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
    # one of : < > = / between a letter/digit and a letter
    r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
]
infixes = LIST_ELLIPSES + LIST_ICONS + kept_infix_rules

# Install the reduced infix rule set on the loaded pipeline's tokenizer.
infix_re = compile_infix_regex(infixes)
nlp.tokenizer.infix_finditer = infix_re.finditer

# Pre-process the train and test splits using spaCy

In [9]:
from tqdm.notebook import tqdm
from datasets import load_dataset

# Benchmark name; also used to name the model files written later on.
benchmark = "inspec"
dataset = load_dataset('taln-ls2n/inspec')


def _sample_text(sample):
    """Join a sample's title and abstract into the text fed to spaCy."""
    return sample["title"] + ". " + sample["abstract"]


# Run the spaCy pipeline over every document of the training and test splits.
train = [nlp(_sample_text(sample)) for sample in tqdm(dataset['train'])]
test = [nlp(_sample_text(sample)) for sample in tqdm(dataset['test'])]



  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

# Compute DF counts (TfIdf) and LDA model (TopicalPageRank)

In [4]:
from pke import compute_document_frequency, compute_lda_model
from string import punctuation

# Output files are named after the benchmark.
df_output_path = "data/{}.df.gz".format(benchmark)
lda_output_path = "data/{}.lda.pickle.gz".format(benchmark)

# Options common to both computations.
shared_options = {
    "language": 'en',             # language of the input files
    "normalization": 'stemming',  # use porter stemmer
}

# Document frequency counts over the training documents (used by TfIdf).
compute_document_frequency(
    documents=train,
    output_file=df_output_path,
    stoplist=list(punctuation),  # stoplist (punctuation marks)
    n=5,                         # compute n-grams up to 5-grams
    **shared_options
)

# LDA topic model over the training documents (used by TopicalPageRank).
compute_lda_model(
    documents=train,
    output_file=lda_output_path,
    n_topics=1000,               # number of topics
    stoplist=list(punctuation),  # stoplist (punctuation marks)
    **shared_options
)



# Load DF counts and LDA model

In [10]:
from pke import load_document_frequency_file, load_lda_model

# Read back the files named after the current benchmark.
df_input_path = 'data/{}.df.gz'.format(benchmark)
lda_input_path = "data/{}.lda.pickle.gz".format(benchmark)

df = load_document_frequency_file(input_file=df_input_path)
lda_model = load_lda_model(input_file=lda_input_path)

In [None]:
from pke.unsupervised import *
from timeit import default_timer as timer

# Extra keyword arguments for candidate_weighting(), keyed by model class
# name; models not listed here are weighted without arguments.
weighting_kwargs = {
    "TfIdf": {"df": df},
    "TopicalPageRank": {"lda_model": lda_model},
}

benchmarked_models = [
    FirstPhrases,
    TextRank,
    SingleRank,
    TopicRank,
    PositionRank,
    MultipartiteRank,
    TfIdf,
    TopicalPageRank,
]

outputs = {}        # model name -> one list of top-10 keyphrases per test doc
elapsed_times = {}  # model name -> seconds spent on the whole test split
for model in benchmarked_models:
    model_name = model.__name__
    outputs[model_name] = []
    extra_kwargs = weighting_kwargs.get(model_name, {})

    extractor = model()
    start = timer()
    for doc in tqdm(test):
        extractor.load_document(input=doc, language='en')
        # every model selects its candidates with the same noun-phrase grammar
        extractor.grammar_selection(grammar="NP: {<ADJ>*<NOUN|PROPN>+}")
        extractor.candidate_weighting(**extra_kwargs)
        ranked = extractor.get_n_best(n=10, stemming=True)
        # keep the keyphrase, drop its score
        outputs[model_name].append([phrase for phrase, _ in ranked])
    elapsed_times[model_name] = timer() - start

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]



In [None]:
from nltk.stem.snowball import SnowballStemmer as Stemmer
import numpy as np

# Build the stemmer once and reuse it for every token, instead of
# constructing a new one per token inside the innermost loop.
porter_stemmer = Stemmer('porter')

# populates the references list with stemmed keyphrases:
# one list of space-joined stem strings per test document
references = []
for sample in tqdm(dataset['test']):
    sample_keyphrases = []
    for keyphrase in sample["keyphrases"]:
        # tokenize keyphrase with the same spaCy pipeline as the documents
        tokens = [token.text for token in nlp(keyphrase)]
        # normalize tokens using Porter's stemming
        stems = [porter_stemmer.stem(tok.lower()) for tok in tokens]
        sample_keyphrases.append(" ".join(stems))
    references.append(sample_keyphrases)

def evaluate(top_N_keyphrases, references, cutoff=5):
    """Score one document's ranked keyphrases against its references.

    Args:
        top_N_keyphrases: ranked list of predicted keyphrases, best first.
        references: reference keyphrases, compared by exact string equality.
        cutoff: only the first ``cutoff`` predictions are scored.

    Returns:
        A ``(P, R, F)`` tuple: precision, recall and their harmonic mean.
        A measure whose denominator would be zero (no predictions, no
        references, or no match at all) is reported as 0 instead of raising
        ``ZeroDivisionError``.
    """
    predicted = top_N_keyphrases[:cutoff]
    # Matches are counted on sets, so a repeated prediction counts once.
    n_matched = len(set(predicted) & set(references))
    n_predicted = len(predicted)
    n_references = len(references)

    P = n_matched / n_predicted if n_predicted > 0 else 0.0
    R = n_matched / n_references if n_references > 0 else 0.0
    F = (2 * P * R) / (P + R) if (P + R) > 0 else 0.0
    return (P, R, F)

In [41]:
# Markdown table header
print("## Benchmarking on {}".format(benchmark))
print("| Model | it/s |  F@5 | F@10 |")
print("| :---- | ----:| ---: | ---: |")

row_template = "| {}  | {:.1f} | {:.2f} | {:.2f} |"

# one table row per benchmarked model
for model in outputs:

    f_scores = []
    for cutoff in [5, 10]:
        # (P, R, F) of every test document at this cutoff
        scores = [
            evaluate(output, references[i], cutoff)
            for i, output in enumerate(outputs[model])
        ]
        # average over documents; only the F-measure goes into the table
        _, _, mean_f = np.mean(scores, axis=0)
        f_scores.append(mean_f)

    # documents processed per second over the whole test split
    docs_per_second = len(test) / elapsed_times[model]
    f_at_5, f_at_10 = f_scores
    print(row_template.format(model, docs_per_second, f_at_5*100, f_at_10*100))


        # print out the performance of the model
        #print("{} at {} P: {:.3f} R: {:.3f} F: {:.3f}".format(model, cutoff, avg_scores[0], avg_scores[1], avg_scores[2]))

## benchmarking on semeval-2010-pre
| Model | it/s |  F@5 | F@10 |
| :---- | ----:| ---: | ---: |
| FirstPhrases  | 368.2 | 13.00 | 14.25 |
| TextRank  | 299.6 | 8.85 | 12.97 |
| SingleRank  | 288.0 | 11.11 | 16.23 |
| TopicRank  | 190.3 | 11.18 | 13.81 |
| PositionRank  | 280.4 | 12.72 | 16.64 |
| MultipartiteRank  | 145.9 | 12.99 | 14.93 |
| TfIdf  | 422.1 | 12.41 | 14.90 |
| TopicalPageRank  | 37.1 | 11.40 | 16.04 |


| Model | F@5 | F@10 |
| :---  | --: | ---: |
| FirstPhrases  | 0.24 | 0.29 |
| TextRank  | 0.27 | 0.34 |
| SingleRank  | 0.27 | 0.34 |
| TopicRank  | 0.25 | 0.28 |
| PositionRank  | 0.28 | 0.33 |
| MultipartiteRank  | 0.25 | 0.29 |
| TfIdf  | 0.28 | 0.35 |
| TopicalPageRank  | 0.28 | 0.34 |