# Data Encoding and Retrieval

### Required Modules

In [16]:
%pip install rank_bm25 scikit-learn

from rank_bm25 import BM25Okapi
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
import json
import os

from typing import List, Tuple

Note: you may need to restart the kernel to use updated packages.


### Load Data

In [8]:
DIR = "../data/"


def load_literature_texts(
        dir: str=DIR
    ) -> Tuple[List[List[str]], List[str]]:
    """Load every literature paper stored as JSON under ``<dir>/literature/``.

    Each ``*.json`` file is expected to hold an object with a ``"text"`` entry
    (a list of lists, flattened here into one list per paper) and a
    ``"name"`` entry (the paper title).

    Files are visited in sorted filename order so that the index of each
    paper is reproducible across runs and platforms (``os.listdir`` gives no
    ordering guarantee).

    Args:
        dir: Data root directory containing the ``literature/`` sub-folder.

    Returns:
        A ``(papers, titles)`` tuple of equal length, where ``papers[i]`` is
        the flattened content of the i-th file and ``titles[i]`` its
        ``"name"``.

    Raises:
        FileNotFoundError: If the ``literature/`` directory does not exist.
        KeyError: If a JSON document lacks the ``"text"`` or ``"name"`` key.
    """
    papers = []
    titles = []
    directory = os.path.join(dir, "literature/")

    # Sorted for a deterministic paper order; non-JSON files are ignored.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(".json"):
            continue

        filepath = os.path.join(directory, filename)
        with open(filepath, "r", encoding="utf-8") as f:
            document = json.load(f)

        # Flatten sentences into one list per paper. A nested comprehension
        # is linear in the paper length, unlike sum(list_of_lists, []), which
        # copies the accumulated list on every addition.
        papers.append(
            [token for sentence in document["text"] for token in sentence]
        )
        titles.append(document["name"])

    return papers, titles


def load_economics_texts(dir: str=DIR) -> List[List[str]]:
    """Placeholder loader for the economics papers in the given directory.

    Not implemented yet: every call raises ``NotImplementedError``.
    """
    message = "How is the eco data stored?"
    raise NotImplementedError(message)


Load literature papers:

In [9]:
# Load all literature papers from the default data directory. `texts[i]` and
# `titles[i]` are appended together by the loader, so both printed lengths
# should be equal.
texts, titles = load_literature_texts()
print(len(texts), len(titles))

23 23


### Test Data

The dataset includes a limited number of articles featuring _Abstracts_, which are essential for evaluating the summarization process in the next step of the _RAG summarization_ pipeline.
These articles are the ones used as test samples.
Meanwhile, the remaining articles and books are intentionally retained to introduce noise, enabling us to assess the performance of the text retrieval process.

In [11]:
# Hand-written retrieval test cases. Each entry is a
# (query, title of the paper expected to be retrieved) pair; the comments
# inside an entry give its estimated difficulty and the reason for it.
CURATED_TEST_QUERIES = [
    (
        # Medium
        # Topic tangentially mentioned in other samples
        "What is the role of popular romance in the spread of capitalist ideas in our culture?",
        "Women and Romantic Fiction: A Case Study of Harlequin Enterprises, Romances, and Readers"
    ),
    (
        # Hard
        # General question from topic discussed in
        # several samples
        "What are common sexual themes in romance novels?",
        "The Language of Romance: An Open Vocabulary Analysis of the Highest Rated Words Used in Romance Novels"
    ),
    (
        # Medium
        # Topic tangentially mentioned in other samples
        "How does linguistic expression of emotion by American and British authors compare?",
        "The Expression of Emotions in 20th Century Books"
    ),
    (
        # Easy
        # Depends on the proper noun Smith likely
        # only mentioned in the expected article
        "Are there similarities between the views of Mary Wollstonecraft and Adam Smith?",
        "The Ethics and Economics of Middle Class Romance"
    ),
    (
        # Easy
        # Topic specific to this sample
        "Do amateur writers represent gender and sexual stereotypes differently?",
        "Shirtless and Dangerous: Quantifying Linguistic Signals of Gender Bias in an Online Fiction Writing Community"
    ),
    (
        # Easy
        # Topic specific to this sample
        "What is the connection between narrative perspective and objectification?",
        "Reflecting the Male Gaze: Quantifying Female Objectification in 19th and 20th Century Novels"
    ),
    (
        # Medium
        # Topic tangentially mentioned in other samples
        "How do Jane Austen's works challenge Regency social norms?",
        "Negotiating Femininity: Expressions of Gender and the Policing of Female Hierarchies in Jane Austen's Pride and Prejudice"
    ),
    (
        # Easy
        # Topic specific to this sample
        "How does stereotypical depiction of gender differences affect ratings of romance fiction books?",
        "How Male and Female Literary Authors Write About Affect Across Cultures and Over Historical Periods"
    ),
    (
        # Medium
        # Topic tangentially mentioned in other samples
        "How are sexual scenarios described in romance literature genres?",
        "Exploring Consent: An Analysis of Consent in Dark Romance and Contemporary Romance Books"
    ),
    (
        # Hard
        # General question from topic discussed in
        # several samples
        "What are popular types of sexual violence in narrative fiction?",
        "A Parody of Love: the Narrative Uses of Rape in Popular Romance"
    )
]

With the help of the _OpenAI GPT-4.1_ model, we formulate questions similar to the ones used in each test query in order to expand our test data.

In [12]:
# Model-generated rephrasings of curated queries, stored in the same
# (query, title of the paper expected to be retrieved) layout as
# CURATED_TEST_QUERIES.
# NOTE(review): the evaluation loops visible in this notebook iterate only
# over CURATED_TEST_QUERIES — confirm whether these entries should be
# evaluated as well.
SYNTHETIC_TEST_QUERIES = [
    (
        "In what ways do the ideas of Mary Wollstonecraft and Adam Smith overlap?",
        "The Ethics and Economics of Middle Class Romance"
    ),
    (
        "How do the perspectives of Mary Wollstonecraft compare to those of Adam Smith regarding society and human nature?",
        "The Ethics and Economics of Middle Class Romance"
    ),
    (
        "In what ways do Jane Austen's novels critique the gender expectations of the Regency era?",
        "Negotiating Femininity: Expressions of Gender and the Policing of Female Hierarchies in Jane Austen's Pride and Prejudice"
    ),
    (
        "How does Jane Austen subvert the traditional roles assigned to women in early 19th-century English society through her writing?",
        "Negotiating Femininity: Expressions of Gender and the Policing of Female Hierarchies in Jane Austen's Pride and Prejudice"
    ),
]

### Query Pre-processing

Query tokenizer:

In [23]:
def pre_process_text(query: str) -> List[str]:
    """Tokenize a text and normalise its words.

    The text is split with NLTK's word tokenizer; each token is lower-cased,
    kept only if it is purely alphabetic and not an English stop word, and
    finally lemmatized with the WordNet lemmatizer.

    Args:
        query: Text to process; ``None`` is accepted.

    Returns:
        The lemmatized tokens in their original order, or an empty list when
        ``query`` is ``None``.
    """
    if query is None:
        return []

    english_stop_words = set(nltk.corpus.stopwords.words("english"))
    wordnet = nltk.stem.WordNetLemmatizer()

    # Lower-case first so the stop-word lookup is case-insensitive
    lowered = (raw.lower() for raw in nltk.word_tokenize(query))

    return [
        wordnet.lemmatize(candidate)
        for candidate in lowered
        if candidate.isalpha() and candidate not in english_stop_words
    ]


def pre_process_tfidf(query: str) -> List[str]:
    """Turn a query into a one-document list for the TF-IDF method.

    The query is run through ``pre_process_text`` and the resulting tokens
    are re-joined with single spaces; that string is returned wrapped in a
    one-element list.
    """
    tokens = pre_process_text(query)
    document = " ".join(tokens)
    return [document]

## TF-IDF

Join the text for `TfidfVectorizer`:

In [43]:
# Fit the TF-IDF model: every paper's token list is joined back into one
# space-separated string before being handed to the vectorizer.
# NOTE(review): queries go through pre_process_text (lower-casing, stop-word
# removal, lemmatization) while the papers are indexed as loaded — confirm
# the stored text is normalised the same way.
vectorizer = TfidfVectorizer()
tfidf_texts = [" ".join(paper_tokens) for paper_tokens in texts]
tfidf = vectorizer.fit_transform(tfidf_texts)

results = []
for query, expected_title in CURATED_TEST_QUERIES:

    # Project the pre-processed query into the fitted TF-IDF space
    tfidf_query = vectorizer.transform(pre_process_tfidf(query))

    # Cosine similarity between the query and every paper
    scores = cosine_similarity(tfidf_query, tfidf).flatten()

    # Paper indices ordered by decreasing score, cut to the best `top_n`
    top_n = 5
    ranked_indices = sorted(
        range(len(scores)),
        key=scores.__getitem__,
        reverse=True)
    top_n_indices = ranked_indices[:top_n]

    # 1-based position of the expected paper among the top results,
    # or -1 when it is not among them
    rank = -1
    for position, paper_index in enumerate(top_n_indices, start=1):
        if titles[paper_index] == expected_title:
            rank = position

    results.append(rank)

print("TF-IDF results:\n", results)

TF-IDF results:
 [-1, 1, 1, 1, 1, 1, 4, 2, 5, 3]


## BM25

In [44]:
# Index the papers with BM25, using each paper's token list as loaded.
# NOTE(review): queries go through pre_process_text (lower-casing, stop-word
# removal, lemmatization) while `texts` is indexed unchanged — confirm the
# stored tokens are normalised the same way.
bm25 = BM25Okapi(texts)

results = []
for query, expected_title in CURATED_TEST_QUERIES:

    # Tokenize and normalise the query
    bm25_query = pre_process_text(query)

    # BM25 score of the query against every paper
    scores = bm25.get_scores(bm25_query)

    # Paper indices ordered by decreasing score, cut to the best `top_n`
    top_n = 5
    ranked_indices = sorted(
        range(len(scores)),
        key=scores.__getitem__,
        reverse=True)
    top_n_indices = ranked_indices[:top_n]

    # 1-based position of the expected paper among the top results,
    # or -1 when it is not among them
    rank = -1
    for position, paper_index in enumerate(top_n_indices, start=1):
        if titles[paper_index] == expected_title:
            rank = position

    results.append(rank)

print("BM25 results:\n", results)


BM25 results:
 [1, 1, 1, 1, 1, 1, 1, 1, 2, 3]


The End.