# Vector Space Models

### Required Modules

In [16]:
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from rank_bm25 import BM25Okapi
from rouge_score import rouge_scorer
from bert_score import score as bert_scorer

import os
import json

from typing import Dict, List, Union, Any


nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/guaya/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### Load Data

Load processed data:

In [17]:
DIR_DATA = os.path.join("..", "data")
DIR_PROCESSED = os.path.join(DIR_DATA, "processed")


papers = {}
for filename in os.listdir(DIR_PROCESSED):
    if filename.lower().endswith(".json"):

        filepath = os.path.join(DIR_PROCESSED, filename)
        with open(filepath, 'r', encoding='utf-8') as f:
            papers[filename[:-5]] = {"processed": json.load(f)}

for identifier in papers:
    print(identifier)

2511.21678v1
2511.21636v1
2511.21591v1
2511.21570v1
2511.21569v1
2511.21522v1
2511.21471v1
2511.21460v1
2511.21444v1
2511.21398v1


Load abstracts:

In [18]:
DIR_RAW = os.path.join(DIR_DATA, "raw", "abstracts")


for identifier, paper in papers.items():
    filepath = os.path.join(DIR_RAW, identifier + ".txt")

    with open(filepath,'r', encoding="utf-8") as f:
        paper["abstract"] = f.read()

### Tokenize and Lemmatize

In [19]:
def pre_process_paper(text: str) -> Dict[str, Union[str, List[List[str]]]]:
    """Process text into plain and lemmatized sentences."""
    paper = {}

    stop_words = set(nltk.corpus.stopwords.words("english"))
    lemmatizer = nltk.stem.WordNetLemmatizer()

    sentence_split = nltk.sent_tokenize(text)
    word_split = [
        nltk.word_tokenize(sentence) for
        sentence in sentence_split]

    plain = []
    lemma = []
    for i, sentence in enumerate(word_split):
        lemma.append([])
        plain.append(sentence_split[i].replace('\n', ''))

        for word in sentence:
            token = word.lower()
            if token.isalpha() and token not in stop_words:
                lemma[-1].append(lemmatizer.lemmatize(token))

        # Discard sentences where
        # lemmatization returns nothing
        if not lemma[-1]:
            lemma.pop()
            plain.pop()

    return {"lemma": lemma, "plain": plain }

Pre-process the text:

In [20]:
for identifier, data in papers.items():

    paper = data["processed"]
    paragraphs = []
    for section in paper:
        paragraphs.append(section["paragraphs"])

    papers[identifier]["sentences"] = pre_process_paper("\n".join(paragraphs))
    papers[identifier]["abstract"] = pre_process_paper(papers[identifier]["abstract"])

## TF-IDF

In [21]:
def tfidf_rank_sentences(lemmatized: str) -> List[int]:
    """Order index of sentences by TF-IDF similarity to whole document."""
    vectorizer = TfidfVectorizer()

    sentences_list = [' '.join(s) for s in lemmatized]
    sentences = vectorizer.fit_transform(sentences_list)
    document = vectorizer.transform([' '.join(sentences_list)])
    scores = cosine_similarity(document, sentences).flatten()

    indexes = sorted(
        range(len(scores)),
        key=lambda i: scores[i],
        reverse=True)
    return indexes

Calculate TF-IDF scores:

In [22]:
for identifier, data in papers.items():
    sentences = data["sentences"]

    papers[identifier]["rank"] = {}
    papers[identifier]["rank"]["TF-IDF"] = tfidf_rank_sentences(sentences["lemma"])

Calculate similarity scores:

In [24]:
def build_summary(
        sentences: List[List[str]],
        indexes: List[int]
    ) -> str:
    """Build summary from TF-IDF or BM25 ranking."""
    return ' '.join([sentences[s] for s in indexes])


def rouge_abstract_similarity(
        summary_str: str,
        abstract: List[str]
    ) -> Dict[str, float]:
    """Calculate ROUGE similarities between summary and abstract."""
    abstract_str = ' '.join(abstract)

    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    score = scorer.score(abstract_str, summary_str)
    return {
        "rouge1": score["rouge1"].fmeasure,
        "rouge2": score["rouge2"].fmeasure,
        "rougeL": score["rougeL"].fmeasure}


def bert_abstract_similarity(
        summary_str,
        abstract: List[str]
    ) -> float:
    """Calculate BertScore similarity between summary and abstract."""
    abstract_str = ' '.join(abstract)

    _, _, F1 = bert_scorer(
        [summary_str],
        [abstract_str],
        lang="en",
        model_type="distilbert-base-uncased")
    return float(F1[0])


for identifier, paper in papers.items():

    paper["score"] = {}
    n = len(paper["abstract"]["plain"])
    summary = build_summary(
        paper["sentences"]["plain"],
        paper["rank"]["TF-IDF"][:n])

    paper["score"]["TF-IDF"] = rouge_abstract_similarity(
        summary,
        paper["abstract"]["plain"])

    paper["score"]["TF-IDF"]["bertscore_f1"] = bert_abstract_similarity(
        summary,
        paper["abstract"]["plain"])
        
    print(identifier, paper["score"]["TF-IDF"])

2511.21678v1 {'rouge1': 0.3203592814371257, 'rouge2': 0.08408408408408408, 'rougeL': 0.1467065868263473, 'bertscore_f1': 0.7764676213264465}
2511.21636v1 {'rouge1': 0.3690036900369004, 'rouge2': 0.10408921933085502, 'rougeL': 0.1845018450184502, 'bertscore_f1': 0.7698068022727966}
2511.21591v1 {'rouge1': 0.36363636363636365, 'rouge2': 0.10953346855983773, 'rougeL': 0.19797979797979795, 'bertscore_f1': 0.7866191267967224}
2511.21570v1 {'rouge1': 0.4650205761316873, 'rouge2': 0.0950413223140496, 'rougeL': 0.2139917695473251, 'bertscore_f1': 0.8214453458786011}
2511.21569v1 {'rouge1': 0.21052631578947367, 'rouge2': 0.03571428571428572, 'rougeL': 0.12280701754385966, 'bertscore_f1': 0.7886336445808411}
2511.21522v1 {'rouge1': 0.4268292682926829, 'rouge2': 0.11020408163265306, 'rougeL': 0.19105691056910568, 'bertscore_f1': 0.8165643811225891}
2511.21471v1 {'rouge1': 0.4157303370786517, 'rouge2': 0.12781954887218047, 'rougeL': 0.1797752808988764, 'bertscore_f1': 0.8021263480186462}
2511.2146

## BM25

In [25]:
def bm25_rank_sentences(lemmatized: str) -> List[int]:
    """Order index of sentences by BM25 similarity to whole document."""
    sentences = BM25Okapi(lemmatized)
    scores = sentences.get_scores(sum(lemmatized, []))

    indexes = sorted(
        range(len(scores)),
        key=lambda i: scores[i],
        reverse=True)
    return indexes

Calculate BM25 scores:

In [26]:
for identifier, data in papers.items():
    sentences = data["sentences"]

    papers[identifier]["rank"]["BM25"] = bm25_rank_sentences(sentences["lemma"])

Calculate similarity scores:

In [27]:
for identifier, paper in papers.items():

    n = len(paper["abstract"]["plain"])
    summary = build_summary(
        paper["sentences"]["plain"],
        paper["rank"]["BM25"][:n])

    paper["score"]["BM25"] = rouge_abstract_similarity(
        summary,
        paper["abstract"]["plain"])

    paper["score"]["BM25"]["bertscore_f1"] = bert_abstract_similarity(
        summary,
        paper["abstract"]["plain"])
        
    print(identifier, paper["score"]["BM25"])

2511.21678v1 {'rouge1': 0.14302741358760426, 'rouge2': 0.0047789725209080045, 'rougeL': 0.06436233611442194, 'bertscore_f1': 0.6160234212875366}
2511.21636v1 {'rouge1': 0.144, 'rouge2': 0.004016064257028113, 'rougeL': 0.08399999999999999, 'bertscore_f1': 0.6361541152000427}
2511.21591v1 {'rouge1': 0.38223938223938225, 'rouge2': 0.09302325581395349, 'rougeL': 0.17374517374517373, 'bertscore_f1': 0.7891222238540649}
2511.21570v1 {'rouge1': 0.4787018255578094, 'rouge2': 0.10183299389002036, 'rougeL': 0.2150101419878296, 'bertscore_f1': 0.8201526999473572}
2511.21569v1 {'rouge1': 0.19298245614035087, 'rouge2': 0.01785714285714286, 'rougeL': 0.15789473684210525, 'bertscore_f1': 0.7706606388092041}
2511.21522v1 {'rouge1': 0.4294234592445328, 'rouge2': 0.11576846307385229, 'rougeL': 0.18687872763419483, 'bertscore_f1': 0.8104702830314636}
2511.21471v1 {'rouge1': 0.4307116104868913, 'rouge2': 0.13157894736842102, 'rougeL': 0.17228464419475656, 'bertscore_f1': 0.8052541613578796}
2511.21460v1 {

## Extra: Lead N & Lead N by Section

Calculating the scores for the _Lead N_ and _Lead N by Section_ predictions:

In [28]:
filepath = os.path.join(DIR_DATA, "lead_n_results.json")
with open(filepath, 'r', encoding='utf-8') as f:
    lead_n_results = json.load(f)

for identifier in papers:
    papers[identifier]["lead_n"] = lead_n_results[identifier]["lead_n"]
    papers[identifier]["lead_n_by_section"] = lead_n_results[identifier]["lead_n_by_section"]

Calculate similarity scores:

In [30]:
print("Lead N")
for identifier, paper in papers.items():

    paper["score"]["lead_n"] = rouge_abstract_similarity(
        papers[identifier]["lead_n"],
        paper["abstract"]["plain"])

    paper["score"]["lead_n"]["bertscore_f1"] = bert_abstract_similarity(
        papers[identifier]["lead_n"],
        paper["abstract"]["plain"])
        
    print(identifier, paper["score"]["lead_n"])

print("Lead N by Section")
for identifier, paper in papers.items():

    paper["score"]["lead_n_by_section"] = rouge_abstract_similarity(
        papers[identifier]["lead_n_by_section"],
        paper["abstract"]["plain"])

    paper["score"]["lead_n_by_section"]["bertscore_f1"] = bert_abstract_similarity(
        papers[identifier]["lead_n_by_section"],
        paper["abstract"]["plain"])
        
    print(identifier, paper["score"]["lead_n_by_section"])

Lead N
2511.21678v1 {'rouge1': 0.31216931216931215, 'rouge2': 0.05319148936170213, 'rougeL': 0.13756613756613756, 'bertscore_f1': 0.7616528272628784}
2511.21636v1 {'rouge1': 0.35782747603833864, 'rouge2': 0.07717041800643087, 'rougeL': 0.17252396166134185, 'bertscore_f1': 0.788264274597168}
2511.21591v1 {'rouge1': 0.3409836065573771, 'rouge2': 0.07260726072607261, 'rougeL': 0.15081967213114755, 'bertscore_f1': 0.7599596977233887}
2511.21570v1 {'rouge1': 0.23300970873786403, 'rouge2': 0.032573289902280124, 'rougeL': 0.11650485436893201, 'bertscore_f1': 0.7540249228477478}
2511.21569v1 {'rouge1': 0.3170731707317073, 'rouge2': 0.07407407407407408, 'rougeL': 0.15853658536585366, 'bertscore_f1': 0.8002178072929382}
2511.21522v1 {'rouge1': 0.3095238095238095, 'rouge2': 0.023952095808383235, 'rougeL': 0.13095238095238096, 'bertscore_f1': 0.7477512955665588}
2511.21471v1 {'rouge1': 0.26315789473684215, 'rouge2': 0.05291005291005291, 'rougeL': 0.11052631578947368, 'bertscore_f1': 0.729740858078

### Save Results

In [31]:
filepath = os.path.join("..", "data", "tfidf_bm25_lead_results.json")

with open(filepath, "w") as f:
    json.dump(papers, f, indent=2)

by [Manuel Velarde](mailto:manuel@velarde.me)