# Code for *How Research Works: An "Under the Hood" Exploration of Search Algorithms*

This is the companion code for a workshop done at the Vagelos Computational Science Center at Barnard College by Dan Woulfin and Sydni Meyer. 

It's split into two parts:
- A functional breakdown of boolean search
- An exploration of three information retrieval ranking/relevance algorithms: Jaccard similarity, TF-IDF, and BM25

We will be using a generated collection of documents as our data source. It's about Renaissance painters, the Teenage Mutant Ninja Turtles, and biological turtles. Please note that this is generated text and not verified facts about these topics. 

---

## Boolean search 

Boolean search is a direct search, meaning that it returns the set of documents that match the query. It does so through set theory, testing each boolean operation according to an order of operations.

### Import needed libraries

To keep this simple, we'll only be importing `re` to process regular expressions and pandas to create dataframes in the relevance section.

In [None]:
import re
import pandas as pd

### Boolean search functions

#### Query tokenization and preprocessing functions

These functions tokenize the query and preprocess the tokens: they add an implicit OR where no operator is given and transform the `*` and `?` wildcards into the appropriate regex.

In [None]:
# Tokenize the query into words, phrases, and operators
def tokenize_query(query):
    """Split a search query into lowercase tokens.

    A token is, in order of precedence: a single parenthesis, a
    double-quoted phrase (the quotes are kept), or a run of characters
    containing no whitespace and no right parenthesis (words, operators and
    wildcards).

    Args:
        query: The raw query string.

    Returns:
        A list of lowercased token strings in query order. Whitespace
        between tokens is dropped.
    """
    token_patterns = [
        r'\(',       # Opening parenthesis
        r'\)',       # Closing parenthesis
        r'"[^"]+"',  # Quoted phrase
        # Anything else up to the next whitespace or ")". Excluding every
        # whitespace character (not only the space) keeps tab- or
        # newline-separated words from being glued into a single token.
        r'[^)\s]+',
    ]
    combined_pattern = '|'.join(token_patterns)

    # finditer skips text that no alternative matches, i.e. the whitespace
    # between tokens, so no separate whitespace pattern is needed.
    return [match.group(0).lower() for match in re.finditer(combined_pattern, query)]

############################################
# Transformation function
############################################

# Handle implicit OR (e.g., "word1 word2" should be interpreted as "word1 OR word2")
# Also implicit AND before NOT unless the query starts with NOT
def add_implicit_boolean(tokens):
    """Insert the boolean operators a query leaves implicit.

    An "or" is inserted between two adjacent operands, where an operand is a
    word, a phrase, or a parenthesised group. An "and" is inserted before a
    "not" that directly follows an operand.

    Args:
        tokens: Lowercased query tokens, as produced by tokenize_query.

    Returns:
        A new list of tokens with the implicit operators made explicit.
    """
    processed_tokens = []
    prev_was_term = False  # True when the previous token ended an operand

    for token in tokens:
        if token == "not":
            # "a not b" means "a and not b"
            if prev_was_term:
                processed_tokens.append("and")
            prev_was_term = False
        elif token == "(":
            # A group that follows an operand needs a joining operator too;
            # without it the earlier operand's result would be stranded on
            # the evaluation stack and ignored.
            if prev_was_term:
                processed_tokens.append("or")
            prev_was_term = False
        elif token == ")":
            prev_was_term = True  # A closed group counts as an operand
        elif token in {"and", "or"}:
            prev_was_term = False
        else:
            # Plain word or quoted phrase
            if prev_was_term:
                processed_tokens.append("or")
            prev_was_term = True

        processed_tokens.append(token)

    return processed_tokens

# Returns full and partial matches of a token from document_words
def word_matches(token, document_words, partial = False):
    """Report whether *token* matches any word in *document_words*.

    Three matching modes are supported:

    * Wildcard - when the token contains "*" (any run of characters) or "?"
      (exactly one character). The pattern is anchored at the start of the
      word only, so trailing characters such as punctuation are tolerated.
    * Partial - when ``partial`` is true, the word only has to start with
      the token.
    * Exact - otherwise the token must equal a word once punctuation has
      been removed from that word.

    Every character other than the two wildcards is matched literally, so
    tokens containing regex metacharacters (e.g. "[", "(", "+") neither
    raise ``re.error`` nor match unintended words.

    Args:
        token: The lowercased query term.
        document_words: Iterable of words from one document.
        partial: Use prefix matching instead of exact matching.

    Returns:
        True if at least one word matches, otherwise False.
    """
    if "*" in token or "?" in token:  # Handle wildcards
        # Translate character by character so that only the wildcards carry
        # regex meaning; everything else is escaped.
        pattern = re.compile("".join(
            ".*" if char == "*" else "." if char == "?" else re.escape(char)
            for char in token
        ))
        return any(pattern.match(word) for word in document_words)

    if partial:
        # A literal prefix test; no regex is needed.
        return any(word.startswith(token) for word in document_words)

    # Remove punctuation so that "sculptor," still equals "sculptor"
    cleaned_words = {re.sub(r"[^\w\s]", "", word) for word in document_words}
    return token in cleaned_words

#### Evaluation functions

These functions evaluate the documents using boolean operations.

In [None]:
# Evaluate a Boolean operation
def evaluate(operator, operand1=None, operand2=None):
    """Apply one boolean operator to its operand(s).

    Args:
        operator: "not", "and" or "or".
        operand1: The only operand of "not", or the left operand otherwise.
        operand2: The right operand of "and" / "or"; unused by "not".

    Returns:
        The result of the operation, or None when the operator is not one of
        the three recognised names.
    """
    operations = {
        "not": lambda: not operand1,
        "and": lambda: operand1 and operand2,
        "or": lambda: operand1 or operand2,
    }
    operation = operations.get(operator)
    if operation is None:
        return None
    return operation()



#### Processing functions

These functions test the processed query against a single document.

In [None]:
# Processes an operator from the stack and applies it to the operands.
# Returns the updated operator and operand stacks.
def apply_operator(operator_stack, bool_results_stack):
    """Pop one operator, apply it, and push the result.

    "not" consumes one value from ``bool_results_stack``; any other operator
    consumes two (the most recently pushed value is the right operand).
    Both stacks are modified in place.

    Args:
        operator_stack: Stack (list) of pending operator tokens.
        bool_results_stack: Stack (list) of operand results.

    Returns:
        The tuple ``(operator_stack, bool_results_stack)``. The same
        two-element shape is returned when ``operator_stack`` is empty, so
        callers can always unpack the result.

    Raises:
        IndexError: If the results stack holds fewer operands than the
            operator needs.
    """
    if not operator_stack:
        # Nothing to apply; hand both stacks back unchanged.
        return operator_stack, bool_results_stack

    operator = operator_stack.pop()

    if operator == "not":
        result = evaluate(operator, bool_results_stack.pop())
    else:
        # Pop order matters: the right operand was pushed last.
        operand2 = bool_results_stack.pop()
        operand1 = bool_results_stack.pop()
        result = evaluate(operator, operand1, operand2)

    bool_results_stack.append(result)  # Push result back onto stack
    return operator_stack, bool_results_stack  # Return updated stacks

def process_query(query, document, partial = False):
    """Evaluate a boolean query against one document.

    The query is tokenized, implicit operators are made explicit, and the
    tokens are evaluated with two stacks (operators and boolean results).
    Parenthesised groups are resolved first, NOT binds tighter than AND/OR,
    and AND/OR share one precedence level and are applied left to right.

    Args:
        query: The raw query string.
        document: The document text. Query tokens are lowercased, so the
            caller is expected to pass lowercased text.
        partial: Passed to word_matches; enables prefix matching for plain
            words.

    Returns:
        The value left on top of the results stack (True/False for a
        well-formed query), or False when the query produced no operands.
    """
    tokens = tokenize_query(query)
    tokens = add_implicit_boolean(tokens)

    # The document's words do not change between tokens, so split once.
    document_words = document.split()

    bool_results_stack = []  # Stores boolean results
    operator_stack = []  # Stores operators

    # apply_operator mutates the stacks in place; its return value is
    # reassigned only for readability.
    for token in tokens:
        if token == "(":
            operator_stack.append(token)
        elif token == ")":
            # Evaluate everything back to the matching "("
            while operator_stack and operator_stack[-1] != "(":
                operator_stack, bool_results_stack = apply_operator(operator_stack, bool_results_stack)
            if operator_stack:
                operator_stack.pop()  # Remove "("
        elif token == "not":
            operator_stack.append(token)
        elif token in {"and", "or"}:
            # Pending NOTs bind tighter, so apply them first
            while operator_stack and operator_stack[-1] == "not":
                operator_stack, bool_results_stack = apply_operator(operator_stack, bool_results_stack)
            # Then apply earlier AND/OR operators, giving left-to-right order
            while operator_stack and operator_stack[-1] in {"and", "or"}:
                operator_stack, bool_results_stack = apply_operator(operator_stack, bool_results_stack)
            operator_stack.append(token)
        else:
            if token.startswith('"') and token.endswith('"'):
                # Quoted phrase: substring test against the whole document
                phrase = token.strip('"')
                bool_results_stack.append(phrase in document)
            else:
                bool_results_stack.append(word_matches(token, document_words, partial))

    # Resolve whatever operators are still pending
    while operator_stack:
        if operator_stack[-1] == "(":
            # An unmatched "(" is not an operator. Discard it rather than
            # letting it consume operands and fail on an empty stack.
            operator_stack.pop()
            continue
        operator_stack, bool_results_stack = apply_operator(operator_stack, bool_results_stack)

    return bool_results_stack.pop() if bool_results_stack else False

#### Clean and search all docs functions

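Lowercases each document, evaluates the query against it, and collects every document that matches. Punctuation is only stripped later, inside `word_matches`, when exact matching is used.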

In [None]:
# Evaluate the query across all documents
def search_documents(query, documents, partial = False):
    """Return the documents that satisfy a boolean query.

    Each document is lowercased before it is tested; the documents in the
    returned list keep their original casing and input order.

    Args:
        query: The raw query string.
        documents: Iterable of document strings.
        partial: Forwarded to process_query to enable prefix matching.

    Returns:
        A list of the matching documents.
    """
    return [
        original_text
        for original_text in documents
        if process_query(query, original_text.lower(), partial)
    ]

## Testing/example

In [None]:
# Example dataset
documents = [
    "Michelangelo Buonarroti was an Italian sculptor, painter, and architect, best known for painting the Sistine Chapel ceiling.",
    "Leonardo da Vinci was a polymath who painted the famous Mona Lisa and The Last Supper.",
    "Donatello was a sculptor known for his bronze statue of David, the first free-standing nude male sculpture since antiquity.",
    "Raphael was an Italian painter and architect of the High Renaissance, famous for his fresco The School of Athens.",
    "Michelangelo sculpted the marble statue David, which became a symbol of Florentine strength and beauty.",
    "Leonardo da Vinci filled notebooks with anatomical studies, engineering designs, and artistic sketches.",
    "Donatello’s Gattamelata is one of the earliest Renaissance equestrian statues, depicting a military leader.",
    "Raphael worked on the Vatican’s Stanza della Segnatura, producing masterpieces like The School of Athens.",
    "Michelangelo designed the dome of St. Peter’s Basilica in Vatican City, one of the most iconic architectural feats of the Renaissance.",
    "Leonardo da Vinci studied human anatomy by dissecting corpses to improve his artistic accuracy.",
    "Donatello pioneered the use of linear perspective in sculpture, particularly in his relief work The Feast of Herod.",
    "Raphael was known for his harmonious and balanced compositions, influenced by both Leonardo and Michelangelo.",
    "Michelangelo’s Pietà, housed in St. Peter’s Basilica, is a masterpiece of Renaissance sculpture depicting the Virgin Mary holding Jesus.",
    "Leonardo da Vinci’s Vitruvian Man illustrates ideal human proportions based on the writings of the ancient Roman architect Vitruvius.",
    "Donatello revived the lost-wax casting technique in bronze sculpture, making his works more detailed and expressive.",
    "Raphael’s Sistine Madonna features the famous cherubs that have been widely reproduced in modern culture.",
    "Michelangelo considered himself primarily a sculptor, even though he was also a renowned painter and architect.",
    "Leonardo da Vinci developed early concepts for machines like the helicopter and parachute, centuries ahead of their time.",
    "Donatello’s St. George statue in Florence showcases his skill in creating dynamic, lifelike figures.",
    "Raphael’s premature death at 37 was deeply mourned, and he was buried in the Pantheon in Rome.",
    "Titian was a Venetian Renaissance painter known for his use of vibrant colors and dynamic compositions, as seen in Assumption of the Virgin.",
    "Sandro Botticelli painted The Birth of Venus, which depicts the goddess emerging from the sea on a shell.",
    "Albrecht Dürer was a German Renaissance artist famous for his woodcuts, engravings, and self-portraits.",
    "Jan van Eyck, a Flemish painter, pioneered the use of oil paints in works like The Arnolfini Portrait.",
    "Hieronymus Bosch created surreal, fantastical paintings, including The Garden of Earthly Delights.",
    "Giorgione, a Venetian painter, is best known for his mysterious painting The Tempest.",
    "Masaccio, an early Renaissance painter, applied linear perspective in The Holy Trinity, making it one of the first paintings with true depth.",
    "Paolo Veronese created grand, elaborate compositions, such as The Wedding at Cana, which is displayed in the Louvre.",
    "Caravaggio, though from the later Baroque period, was heavily influenced by Renaissance techniques, particularly chiaroscuro.",
    "Pieter Bruegel the Elder specialized in detailed depictions of peasant life, as seen in The Peasant Wedding.",
    "Andrea Mantegna’s Lamentation of Christ is famous for its extreme use of foreshortening.",
    "Fra Angelico was a Dominican friar known for painting religious frescoes, including The Annunciation.",
    "Piero della Francesca’s The Flagellation of Christ is admired for its precise use of perspective.",
    "Filippo Brunelleschi, an architect, engineered the massive dome of Florence’s Cathedral, a masterpiece of Renaissance architecture.",
    "Lorenzo Ghiberti designed the bronze Gates of Paradise for the Baptistery of Florence.",
    "Domenico Ghirlandaio was Michelangelo’s teacher and painted The Adoration of the Magi.",
    "Benvenuto Cellini was a sculptor and goldsmith, best known for Perseus with the Head of Medusa.",
    "Giovanni Bellini was a Venetian painter who contributed to the development of rich color techniques in oil painting.",
    "Correggio’s Assumption of the Virgin uses dramatic foreshortening to create a dynamic, swirling composition.",
    "Tintoretto’s The Last Supper reimagined the biblical scene with dramatic lighting and motion, differing from Leonardo’s version.",
    "The Teenage Mutant Ninja Turtles (TMNT) were created by Kevin Eastman and Peter Laird in 1984 as a comic book series.",
    "The four turtles—Leonardo, Michelangelo, Donatello, and Raphael—are named after famous Renaissance artists.",
    "Splinter, their sensei and father figure, is a mutant rat who teaches them ninjutsu.",
    "The turtles’ main enemy is the Shredder, the leader of the Foot Clan.",
    "TMNT originally started as a dark and gritty comic but was later adapted into a more family-friendly animated series in 1987.",
    "The turtles live in the sewers of New York City and fight crime while staying hidden from society.",
    "April O’Neil, a journalist and ally of the turtles, often helps them in their battles against villains.",
    "Each turtle has a signature weapon: Leonardo wields katanas, Michelangelo uses nunchaku, Donatello fights with a bo staff, and Raphael has twin sai.",
    "Michelangelo is known for his love of pizza and his laid-back, fun-loving personality.",
    "The phrase Cowabunga! became one of the franchise’s most famous catchphrases, often said by Michelangelo.",
    "TMNT has been adapted into multiple animated series, live-action films, and video games over the decades.",
    "In the 1990 live-action Teenage Mutant Ninja Turtles movie, the turtles’ suits were created by Jim Henson’s Creature Shop.",
    "The 2003 animated series returned to a darker tone, closer to the original comics.",
    "The turtles' main ally in their fight against evil is Casey Jones, a hockey mask-wearing vigilante.",
    "Over the years, TMNT has remained a pop culture phenomenon, with new adaptations and reboots continuing to introduce the turtles to new generations.",
    "Leonardo is the leader of the Teenage Mutant Ninja Turtles and is known for his discipline and mastery of the katana swords.",
    "Michelangelo is the most fun-loving turtle, often cracking jokes and obsessing over pizza.",
    "Donatello is the team’s tech expert, constantly inventing gadgets and using his bo staff in battle.",
    "Raphael is the most aggressive and rebellious turtle, often clashing with Leonardo over leadership.",
    "Leonardo wears a blue mask and is deeply dedicated to following Splinter’s teachings.",
    "Michelangelo is the turtle who most frequently shouts “Cowabunga!” when excited.",
    "Donatello created the Turtle Van, a high-tech vehicle that helps the team travel through New York City.",
    "Raphael has a close friendship with Casey Jones, often teaming up with him in fights.",
    "Leonardo was once brainwashed by Shredder in some versions of the TMNT storylines.",
    "Michelangelo is often underestimated but has proven to be a skilled fighter when necessary.",
    "Leonardo is the disciplined leader of the Ninja Turtles, always striving to keep the team focused on their mission.",
    "Michelangelo and Donatello have a playful dynamic, with Mikey often testing Donnie’s patience with his goofy antics.",
    "Raphael, known for his temper, frequently clashes with Leonardo over leadership decisions.",
    "Donatello is the team’s tech genius, constantly inventing new gadgets to help Michelangelo, Raphael, and Leonardo in battle.",
    "Michelangelo is the turtle who loves pizza the most, though Raphael and Donatello don’t mind grabbing a slice.",
    "Leonardo and Raphael have a sibling rivalry, but deep down, they respect and protect each other.",
    "Donatello once built a special armored vehicle, the Shellraiser, for Leonardo, Michelangelo, and Raphael to use against the Foot Clan.",
    "Raphael is the toughest fighter of the group, but even he admits that Leonardo has the best strategy in battle.",
    "Michelangelo, Donatello, and Raphael sometimes tease Leonardo for being too serious, but they always follow his lead in tough situations.",
    "While Michelangelo loves to joke around, Leonardo, Donatello, and Raphael know they can always count on him when the fight gets serious.",
    "Turtles have been around for over 200 million years, making them one of the oldest reptile groups still roaming the planet.",
    "A turtle’s shell isn’t just armor—it’s part of its skeleton, so taking it off would be like removing its ribs!",
    "Some turtles, like the leatherback sea turtle, are world travelers, crossing entire oceans without a GPS.",
    "Turtles may look slow on land, but put them in water, and many become graceful, speedy swimmers.",
    "A tortoise might take its time, but don’t mistake that for laziness—it’s all about patience and wisdom.",
    "Sea turtles can hold their breath for hours, making them the ultimate underwater meditation masters.",
    "If a turtle could talk, it might brag about its long lifespan—some live over 150 years!",
    "Many turtles have excellent night vision, perfect for sneaking around in the moonlight like little reptilian ninjas.",
    "Unlike humans, turtles don’t have vocal cords, but that doesn’t stop them from making little grunts, hisses, and even some underwater sounds.",
    "The alligator snapping turtle looks like a prehistoric warrior, with its spiky shell and powerful beak.",
    "Baby sea turtles hatch from their eggs and immediately embark on an epic race to the ocean, dodging hungry predators along the way.",
    "Some turtles, like the red-eared slider, love basking in the sun, soaking up rays like tiny, sun-worshipping yogis.",
    "The painted turtle has antifreeze-like blood, allowing it to survive in icy waters during winter—talk about cold-blooded resilience!",
    "A turtle’s sense of smell is incredible, helping it sniff out food, friends, and danger from surprising distances.",
    "While turtles don’t rush through life, their steady and determined nature has made them symbols of wisdom, perseverance, and patience across many cultures."
]



Below is where we do the search. Feel free to change the query in the single quotes below to see what kind of searches you can do with the documents!

For example: 

- Can you just get articles on Michelangelo the painter? 
- Or facts about turtles without getting documents on the Teenage Mutant Ninja Turtles?

In [None]:
# Example query: two words and one quoted phrase with no explicit operators,
# so add_implicit_boolean joins all three with OR.
query = 'renaissance Michelangelo "italian sculptor"'

# Perform the search (partial = False requires whole-word matches for the
# unquoted terms; the quoted phrase is matched as a substring)
results = search_documents(query, documents, partial = False)

# Display the results, one matching document per line
print("Documents matching the query:")
for result in results:
    print("-", result)



---
# Ranking / Relevance examples

This section is going to open up a whole can of worms.
- Do we want to get partial or full matches? 
- How do we handle wildcards and truncation?
- Do we want to rank the results we get or do a new search?
- At what point do we cap results?
- How can we compare the relevance algorithms?

I made some choices to make this easier: phrases are supported, while truncation, wildcards, and parentheses are avoided for now. We'll need to use new queries.

---

## New functions

In [None]:
def get_phrases(query):
    """Extract every double-quoted phrase from a query.

    Args:
        query: The raw query string.

    Returns:
        A list with the text found between each pair of double quotes, in
        order of appearance and without the quotes themselves.
    """
    quoted_text = re.compile(r'"([^"]*)"')
    return quoted_text.findall(query)

# Tokenize and preprocess documents (lowercasing, keeping query phrases whole)
def preprocess(query, doc = None):
    """Tokenize a document, keeping the query's quoted phrases as one token.

    The document is lowercased and split on single spaces. Wherever one of
    the query's quoted phrases occurs, that occurrence is emitted as a single
    token holding the (lowercased) phrase. Punctuation is not removed; note
    that punctuation attached to a phrase occurrence is dropped along with
    the placeholder.

    Args:
        query: The raw query string; only its quoted phrases are used.
        doc: The document text.

    Returns:
        A list of lowercase tokens, or None when no document (or an empty
        one) is given.
    """
    if not doc:
        return None

    # Step 1: Extract the quoted phrases from the query. They are lowercased
    # because they are searched for in the lowercased document below.
    phrases = [phrase.lower() for phrase in get_phrases(query)]

    # Step 2: Replace phrases in the document with placeholders so that the
    # space-based split in step 3 does not break them apart.
    doc = doc.lower()
    for i, phrase in enumerate(phrases):
        if phrase:  # replacing "" would insert a placeholder between every character
            doc = doc.replace(phrase, f'__phrase{i}__')

    # Steps 3 and 4: Tokenize, then swap each placeholder back for its phrase
    final_tokens = []
    for token in doc.split(' '):
        if token.startswith("__phrase"):  # If it's a placeholder for a phrase
            phrase_index = int(token.split('phrase')[1].split('__')[0])  # Get the index of the phrase
            final_tokens.append(phrases[phrase_index])
        else:
            final_tokens.append(token)

    return final_tokens


## Define Your Query

Again, feel free to change this to try different searches. Are there any surprises in the rankings? 

In [None]:
# Query used by the ranking cells below; text in double quotes is kept together as one phrase
query = 'renaissance Michelangelo "italian sculptor"'

## Jaccard Set Similarity

This divides the number of tokens found in both the query set and the document set (the intersection) by the total number of tokens across the query set and the document set (the union).

In [None]:
# Function to calculate Jaccard Similarity
# using word_matches so punctuation attached to a document token is ignored
def jaccard_similarity(query_set, doc_set):
    """Score how much a query's terms overlap with a document's tokens.

    Args:
        query_set: Collection of query terms.
        doc_set: Collection of document tokens.

    Returns:
        intersection / union, where the intersection is the number of query
        terms that word_matches finds in ``doc_set`` and the union is
        ``len(query_set) + len(doc_set) - intersection``. Returns 0 when the
        union is empty.
    """
    # This is the boolean AND. word_matches already scans the whole document,
    # so it is called once per query term.
    intersection = sum(1 for term in query_set if word_matches(term, doc_set))

    # This is the boolean OR
    union = len(query_set) + len(doc_set) - intersection

    return intersection / union if union > 0 else 0  # Avoid division by zero

# Convert the query into a list of terms (tokens)
query_set = tokenize_query(query)

# Strip the double quotes that tokenize_query keeps around phrases
query_set = [phrase.replace('"', '') for phrase in query_set]

# List to store Jaccard similarity scores for each document
similarity_scores = []

# Compute Jaccard similarity for each document
for doc in documents:
    doc_set = preprocess(query, doc)  # Convert document to a list of terms
    score = jaccard_similarity(query_set, doc_set)
    similarity_scores.append(score)

# Combine documents with their similarity scores
doc_scores = list(zip(documents, similarity_scores))

# Sort documents by their similarity score in descending order
# (sorted is stable, so tied documents keep their original order)
sorted_doc_scores = sorted(doc_scores, key=lambda x: x[1], reverse=True)

# Prepare data for Jaccard ranking
jaccard_data = {
    "Document": [],
    "Token Length": [],
    "Rank": [],
    "Similarity": []
}

for idx, (doc, score) in enumerate(sorted_doc_scores, start=1):
    if idx > 25:  # Limit to the top 25 results, the same cutoff the TF-IDF and BM25 tables use
        break
    jaccard_data["Rank"].append(idx)
    jaccard_data["Document"].append(doc)
    jaccard_data["Similarity"].append(f"{score:.4}")
    jaccard_data["Token Length"].append(len(preprocess(query, doc)))

# Create pandas DataFrame for Jaccard results
jaccard_df = pd.DataFrame(jaccard_data)

jaccard_df.head(20)


# TF-IDF

TF-IDF stands for term frequency-inverse document frequency. It weighs how often a term occurs in a document against the number of documents that contain the term, so terms that appear in many documents count for less.

In the query **renaissance Michelangelo "italian sculptor"**, each term or phrase appears only once in any given document. The document frequency, however, is
- Michelangelo = 29
- Renaissance = 16
- italian sculptor = 1

Michelangelo will be penalized the most and 'italian sculptor' the least.

In [None]:
# Get phrases from the query
phrases = get_phrases(query)

# Lowercase the query and swap every quoted phrase for a single placeholder
# token. The replacements accumulate on query_processed, so all phrases are
# handled and the variable is defined even when the query has no phrase.
query_processed = query.lower()
for i, phrase in enumerate(phrases):
    if phrase:  # replacing "" would insert a placeholder between every character
        query_processed = query_processed.replace(phrase.lower(), "__phrase__" + str(i))
query_processed = query_processed.replace('"', '')

# Create a new list of processed documents using the same placeholders
documents_processed = []
for doc in documents:
    processed_doc = doc.lower()
    for i, phrase in enumerate(phrases):
        if phrase:
            processed_doc = processed_doc.replace(phrase.lower(), "__phrase__" + str(i))
    documents_processed.append(processed_doc)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np


# Initialize the TF-IDF Vectorizer
vectorizer = TfidfVectorizer()

# Fit and transform the documents into TF-IDF vectors
# (documents_processed already has the query phrases replaced by placeholders)
tfidf_matrix = vectorizer.fit_transform(documents_processed)

# Transform the query into a TF-IDF vector (using the same vectorizer,
# so the query shares the documents' vocabulary and weights)
query_vector = vectorizer.transform([query_processed])

# Compute cosine similarity between the query and all documents.
# Only one query was transformed, so row 0 holds its score against each document.
# NOTE: this rebinds similarity_scores, which held the Jaccard scores above.
similarity_scores = cosine_similarity(query_vector, tfidf_matrix)[0]

# Rank documents based on similarity scores
tfidf_ranked_indices = np.argsort(similarity_scores)[::-1]  # argsort is ascending; reverse for descending order
tfidf_ranked_scores = similarity_scores[tfidf_ranked_indices]

# Prepare data for TF-IDF ranking
tfidf_data = {
    "Rank": [],
    "Score": [],
    "Document": []
}

for i, (idx, score) in enumerate(zip(tfidf_ranked_indices, tfidf_ranked_scores)):
    if i >= 25:  # Limit to the top 25 results
        break
    tfidf_data["Rank"].append(i+1)  # Ranks are 1-based
    tfidf_data["Score"].append(f"{score:.4f}")
    tfidf_data["Document"].append(documents[idx])  # Report the original, unprocessed text

# Create pandas DataFrame for TF-IDF results
tfidf_df = pd.DataFrame(tfidf_data)

tfidf_df.head(20)

## BM25

BM25 takes TF-IDF and normalizes based on document length. It's a probabilistic model and has parameters that can be tuned.

In the query **renaissance Michelangelo "italian sculptor"**, each term or phrase appears only once in any given document. The document frequency, however, is
- Michelangelo = 29
- Renaissance = 16
- italian sculptor = 1

The minimum number of tokens is 11 and the maximum is 23. The average is 16.7 tokens.

Michelangelo will be penalized more than Renaissance but if either term appears in a short document then it might score higher than documents with two terms. 

This is why the below is going to have different results than TF-IDF. Rarity will matter less if the document length is average or high. 



In [None]:
from rank_bm25 import BM25Okapi

# Tokenize the processed documents (split on single spaces; punctuation stays attached to words)
tokenized_documents = [doc.split(" ") for doc in documents_processed]

# tokenize processed query the same way
tokenized_query = query_processed.split(" ")

# Initialize the BM25 model
bm25 = BM25Okapi(tokenized_documents)

####################################################################
# Set parameters for BM25 Okapi
# These are defaults

bm25.k1 = 1.5  # Term frequency saturation
# A higher k1 means that it's more sensitive to 
# term frequencies. 

bm25.b = 0.75  # Length normalization
# A higher b means that it normalizes more strongly for document length,
# meaning longer-than-average documents are penalized more
# (b = 0 turns length normalization off).
#####################################################################

# Get the BM25 scores for each document based on the query
scores = bm25.get_scores(tokenized_query)

# Retrieve the documents with the highest BM25 scores
# (pairs of (document index, score), highest score first)
bm25_ranked_docs = sorted(enumerate(scores), key=lambda x: x[1], reverse=True)

# Prepare data for BM25 ranking
bm25_data = {
    "Document": [],
    "Length": [],
    "Rank": [],
    "Score": [],
}

counter = 1  # 1-based rank of the document being added
for idx, score in bm25_ranked_docs:
    if counter > 25:  # Limit to the top 25 results
        break
    bm25_data["Rank"].append(counter)
    bm25_data["Score"].append(f"{score:.4f}")
    bm25_data["Document"].append(documents[idx])  # Report the original, unprocessed text
    bm25_data["Length"].append(len(documents_processed[idx].split()))  # Whitespace-separated token count
    counter += 1

# Create pandas DataFrame for BM25 results
bm25_df = pd.DataFrame(bm25_data)

bm25_df.head(20)



# Conclusion - different algorithm choices lead to different rankings

These numbers can be manipulated, and there are entire fields, like Search Engine Optimization, designed to get to the top of certain rankings and not others. How do Jaccard, TF-IDF, and BM25 rank things differently for your search? Why might you choose one and not another?

In [None]:
# Rename the columns for clarity
jaccard_df = jaccard_df.rename(columns={"Rank": "Jaccard Rank", "Similarity": "Jaccard Similarity"})
tfidf_df = tfidf_df.rename(columns={"Rank": "TF-IDF Rank", "Score": "TF-IDF Score"})
bm25_df = bm25_df.rename(columns={"Rank": "BM25 Rank", "Score": "BM25 Score", "Length": "Token Length"})

# Merge the DataFrames on the 'Document' column only. Each table holds its
# own top results, so after the first outer merge a document that TF-IDF
# ranked but Jaccard did not has no Token Length yet; using Token Length as
# a second merge key would then list that document twice.
merged_df = pd.merge(jaccard_df, tfidf_df, on="Document", how="outer")
merged_df = pd.merge(merged_df, bm25_df, on="Document", how="outer", suffixes=("", " (BM25)"))

# Fold the BM25 token length into the shared column, filling any gaps
merged_df["Token Length"] = merged_df["Token Length"].combine_first(merged_df["Token Length (BM25)"])
merged_df = merged_df.drop(columns="Token Length (BM25)")

print("Results for query:", query) 

# Show the merged DataFrame
merged_df.head(20)