# Vector Space Models

### Required Modules

In [33]:
%pip install nltk scikit-learn rank_bm25


import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from rank_bm25 import BM25Okapi

import os
import json

from typing import Dict, List, Union, Any


nltk.download('wordnet')
# %pip install rank_bm25 scikit-learn

# from rank_bm25 import BM25Okapi
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.metrics.pairwise import cosine_similarity
# import nltk
# import json
# import os

# from typing import List, Tuple, Dict, Any

Note: you may need to restart the kernel to use updated packages.


[nltk_data] Downloading package wordnet to /home/guaya/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### Load Data

In [34]:
DIRECTORY = "../data/processed/"


def load_json_data(directory: str=DIRECTORY) -> Dict[str, List[Any]]:
    """Load all JSON files from given directory."""
    json_data = {}

    for filename in os.listdir(directory):
        if filename.lower().endswith(".json"):

            filepath = os.path.join(directory, filename)
            with open(filepath, 'r', encoding='utf-8') as f:
                json_data[filename[:-5]] = {"processed": json.load(f)}

    return json_data

Load processed data:

In [35]:
papers = load_json_data()

for identifier in papers:
    print(identifier)

2511.21678v1
2511.21636v1
2511.21591v1
2511.21570v1
2511.21569v1
2511.21522v1
2511.21471v1
2511.21460v1
2511.21444v1
2511.21398v1


### Tokenize and Lemmatize

In [36]:
def pre_process_paper(text: str) -> Dict[str, Union[str, List[List[str]]]]:
    """Process text into plain and lemmatized sentences."""
    paper = {}

    stop_words = set(nltk.corpus.stopwords.words("english"))
    lemmatizer = nltk.stem.WordNetLemmatizer()

    sentence_split = nltk.sent_tokenize(text)
    word_split = [
        nltk.word_tokenize(sentence) for
        sentence in sentence_split]

    plain = []
    lemma = []
    for i, sentence in enumerate(word_split):
        lemma.append([])
        plain.append(sentence_split[i].replace('\n', ''))

        for word in sentence:
            token = word.lower()
            if token.isalpha() and token not in stop_words:
                lemma[-1].append(lemmatizer.lemmatize(token))

        # Discard sentences where
        # lemmatization returns nothing
        if not lemma[-1]:
            lemma.pop()
            plain.pop()

    return {"lemma": lemma, "plain": plain }

Pre-process the text:

In [37]:
for identifier, data in papers.items():

    paper = data["processed"]
    paragraphs = []
    for section in paper:
        paragraphs.append(section["paragraphs"])

    papers[identifier]["sentences"] = pre_process_paper("\n".join(paragraphs))

## TF-IDF

In [38]:
def tfidf_rank_sentences(lemmatized: str) -> List[int]:
    """Order index of sentences by TF-IDF similarity to whole document."""
    vectorizer = TfidfVectorizer()

    sentences_list = [' '.join(s) for s in lemmatized]
    sentences = vectorizer.fit_transform(sentences_list)
    document = vectorizer.transform([' '.join(sentences_list)])
    scores = cosine_similarity(document, sentences).flatten()

    indexes = sorted(
        range(len(scores)),
        key=lambda i: scores[i],
        reverse=True)
    return indexes

Calculate TF-IDF scores:

In [39]:
for identifier, data in papers.items():
    sentences = data["sentences"]

    papers[identifier]["rank"] = {}
    papers[identifier]["rank"]["TF-IDF"] = tfidf_rank_sentences(sentences["lemma"])

## BM25

In [40]:
def bm25_rank_sentences(lemmatized: str) -> List[int]:
    """Order index of sentences by BM25 similarity to whole document."""
    sentences = BM25Okapi(lemmatized)
    scores = sentences.get_scores(sum(lemmatized, []))

    indexes = sorted(
        range(len(scores)),
        key=lambda i: scores[i],
        reverse=True)
    return indexes

Calculate BM25 scores:

In [41]:
for identifier, data in papers.items():
    sentences = data["sentences"]

    papers[identifier]["rank"]["BM25"] = bm25_rank_sentences(sentences["lemma"])

### Save Results

In [42]:
filepath = os.path.join("..", "data", "results.json")

with open(filepath, "w") as f:
    json.dump(papers, f, indent=2)

by [Manuel Velarde](mailto:manuel@velarde.me)