# BM25

This notebook uses the BM25 ranking formula for information retrieval of documents based on a query search. The documents are scored and ranked for similarity against a collection of queries.

## Imports and setup

In [None]:
import nltk
import math
import numpy as np
import pandas as pd
import csv
import os
from nltk.corpus import reuters
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.text import log

# Fetch the NLTK resources this notebook relies on.
# Newer NLTK releases load the 'punkt_tab' resource inside word_tokenize, while
# older releases use 'punkt', so both are requested. nltk.download reports an
# unknown resource name instead of raising, so the extra request is harmless
# on versions that do not know 'punkt_tab'.
nltk.download('reuters')
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# English stop words, held in a set for fast membership tests during preprocessing
stop_words = set(stopwords.words("english"))

In [None]:
# Attach Google Drive to the Colab runtime so the indexed files can be read
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


## Part 1 - Ranking by document titles
In this section we score each search query for document title and create a shortlist of the top 100 relevant documents (by title).

### Setup

In [None]:
# Create base dataframe for recording results.
# The score column is named 'BM25_Score' because the sorting cell further down
# orders the rows by that label; any other name makes sort_values raise KeyError.
df_Results = pd.DataFrame(
    columns=['Query_ID', 'Doc_ID', 'BM25_Score', 'Query_Desc', 'Doc_Desc']
)

In [None]:
# Remove every row from the results frame (column labels stay) before scoring
df_Results.drop(index=df_Results.index, inplace=True)

### Bring in the data

Indexed queries and documents preprepared from previous notebook

In [None]:
# Switch the working directory to the folder holding the indexed files,
# so the cells below can open them by bare file name
indexed_files_dir = "/content/drive/MyDrive/CA6005I - Mechanics of Search/Assignment1/Files_Indexed"
os.chdir(indexed_files_dir)

Document titles file

In [None]:
# Import from prepared CSV file - read doc IDs and titles into two parallel lists
documentIDs = []
documents = []
# newline='' hands line-ending handling to the csv module, as its documentation
# requires; without it, newlines embedded in quoted fields can be misread.
with open('Indexed_Titles.csv', 'r', newline='') as file:
    for row in csv.reader(file):
        documentIDs.append(row[1])  # second field: document ID (kept as a string)
        documents.append(row[2])    # third field: title text

Search queries file

In [None]:
# Import from prepared CSV file - read query IDs and search strings into two parallel lists
queryIDs = []
queries = []
# The file name carries the '.csv' extension, matching the name under which the
# same queries file is opened in Part 2 of this notebook.
# newline='' hands line-ending handling to the csv module, as its documentation requires.
with open('Indexed_Queries.csv', 'r', newline='') as file:
    for row in csv.reader(file):
        queryIDs.append(row[1])  # second field: query ID (kept as a string)
        queries.append(row[2])   # third field: search string

In [None]:
# Calculate the average document length, measured in tokens.
# calculate_bm25 measures a document as len(preprocess_text(document)), i.e. the
# number of lower-cased tokens left after stop-word removal. The average has to
# use that same unit; len(doc) on the raw string would count characters instead
# and distort the length normalisation. The steps below mirror preprocess_text.
doc_token_counts = [
    sum(1 for token in word_tokenize(doc.lower()) if token not in stop_words)
    for doc in documents
]
total_doc_len = sum(doc_token_counts)
# An empty collection yields 0 instead of raising ZeroDivisionError
avg_doc_len = total_doc_len / len(documents) if documents else 0

### Preprocessing

In [None]:
def preprocess_text(text):
    """Turn a raw string into a list of lower-cased tokens without stop words.

    The text is lower-cased, split with ``word_tokenize`` and every token found
    in the module-level ``stop_words`` set is discarded. Token order is kept.
    """
    tokens = word_tokenize(text.lower())
    return [token for token in tokens if token not in stop_words]

### Similarity calculation

In [None]:
def calculate_bm25(query, document, avg_doc_len, k1, b, N, df):
    """Return the BM25 score of ``document`` for ``query``.

    Parameters
    ----------
    query, document : str or list of str
        Raw text, which is passed through ``preprocess_text``, or an already
        tokenized list of terms, which is used as-is. Passing token lists lets
        a caller tokenize once and score many times.
    avg_doc_len : float
        Average document length, in the same unit as ``len(document)`` after
        preprocessing (number of tokens).
    k1, b : float
        Scaling parameters: ``k1`` scales the term-frequency component and
        ``b`` weights the document-length ratio.
    N : int
        Number of documents in the collection.
    df : dict
        Maps a term to the number of documents that contain it.

    Returns
    -------
    The sum, over query terms present in ``df``, of
    ``idf * tf * (k1 + 1) / (tf + k1 * (1 - b + b * len(document) / avg_doc_len))``.
    It is 0 when no query term occurs in the document.
    """
    if isinstance(query, str):
        query = preprocess_text(query)
    if isinstance(document, str):
        document = preprocess_text(document)

    # The length normalisation does not depend on the term, so compute it once.
    # Without a usable average (e.g. an empty collection) apply no length
    # adjustment instead of dividing by zero.
    if avg_doc_len:
        length_norm = k1 * (1 - b + b * len(document) / avg_doc_len)
    else:
        length_norm = k1

    score = 0
    for word in query:
        if word not in df:
            continue
        tf = document.count(word)
        if tf == 0:
            # A term absent from the document contributes nothing to the sum
            continue
        # math.log from the standard library, rather than a `log` name that
        # merely happens to be importable from nltk.text.
        # NOTE(review): this idf is negative for terms found in more than half
        # of the documents - confirm that is intended for this collection.
        idf = math.log((N - df[word] + 0.5) / (df[word] + 0.5))
        score += idf * tf * (k1 + 1) / (tf + length_norm)
    return score

In [None]:
# Document frequency: for every term, the number of documents that contain it
df = {}
for raw_document in documents:
    # set() collapses repeated tokens, so a document counts once per term
    for term in set(preprocess_text(raw_document)):
        df[term] = df.get(term, 0) + 1

# Size of the collection
N = len(documents)

# Scaling parameters handed to calculate_bm25
k1 = 1.2
b = 0.75

### Process queries

For each query, a similarity score is computed for every document

In [None]:
# Score every document against every query, producing one result row per pair.
# Rows are collected in a plain list and the DataFrame is built once at the end:
# DataFrame.append was removed in pandas 2.0, and growing a frame one row at a
# time copies it on every step (quadratic in the number of rows).
result_rows = []
for queryID, query in zip(queryIDs, queries):
    query_id = int(queryID)
    for documentID, document in zip(documentIDs, documents):
        bm25_score = calculate_bm25(query, document, avg_doc_len, k1, b, N, df)
        result_rows.append([query_id, int(documentID), bm25_score, query, document])

# Reuse the column labels of the results frame created in the Setup section.
# The frame is rebuilt from scratch, so re-running this cell does not duplicate rows.
df_Results = pd.DataFrame(result_rows, columns=df_Results.columns)

Sort the results: group by query ID, then sort by score in descending order within each query, so the best match comes first. Finally, optionally, retain only the top results for each query search, e.g. 10, 50, 100...

In [None]:
# Order rows by query ID (ascending), then by score (highest first) within each query
sort_columns = ['Query_ID', 'BM25_Score']
sort_ascending = [True, False]
df_SortedResults = df_Results.sort_values(by=sort_columns, ascending=sort_ascending)

In [None]:
# Keep the first 100 rows of each query (the highest-scoring ones, given the
# sort order above) and renumber the index from 0
rows_by_query = df_SortedResults.groupby('Query_ID')
df_TopResults = rows_by_query.head(100).reset_index(drop=True)

In [None]:
# Add a 'Rank' column at position 4, filled with 0 as a placeholder
df_TopResults.insert(loc=4, column='Rank', value=0)

In [None]:
# Rank = 1-based position of the row within its query group
position_in_query = df_TopResults.groupby('Query_ID').cumcount()
df_TopResults['Rank'] = position_in_query + 1

In [None]:
# Write the title-based ranking to CSV; the final analysis happens outside this notebook
title_export_file = "Export_BM25_Top100_by_Title.csv"
df_TopResults.to_csv(title_export_file)

## Part 2 - Ranking by document contents
In this section we score each search query for document contents (main body of the document) and create a shortlist of the top 100 relevant documents (by contents).

### Setup

In [None]:
# Attach Google Drive to the Colab runtime so the indexed files can be read
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [None]:
# Empty results frame: one row per (query, document) pair will be recorded here
result_columns = ['Query_ID', 'Doc_ID', 'BM25_Score']
df_Results = pd.DataFrame(columns=result_columns)

In [None]:
# Remove every row from the results frame (column labels stay) before scoring
df_Results.drop(index=df_Results.index, inplace=True)

### Bring in the data

Indexed queries and documents preprepared from previous notebook

In [None]:
# Switch the working directory to the folder holding the indexed files,
# so the cells below can open them by bare file name
indexed_files_dir = "/content/drive/MyDrive/CA6005I - Mechanics of Search/Assignment1/Files_Indexed"
os.chdir(indexed_files_dir)

Document contents file

In [None]:
# Import from prepared CSV file - read doc IDs and document contents into two parallel lists
documentIDs = []
documents = []
# newline='' hands line-ending handling to the csv module, as its documentation
# requires; without it, newlines embedded in quoted fields can be misread.
with open('Indexed_Contents.csv', 'r', newline='') as file:
    for row in csv.reader(file):
        documentIDs.append(row[1])  # second field: document ID (kept as a string)
        documents.append(row[2])    # third field: document body text

Search queries file

In [None]:
# Import from prepared CSV file - read query IDs and search strings into two parallel lists
queryIDs = []
queries = []
# newline='' hands line-ending handling to the csv module, as its documentation
# requires; without it, newlines embedded in quoted fields can be misread.
with open('Indexed_Queries.csv', 'r', newline='') as file:
    for row in csv.reader(file):
        queryIDs.append(row[1])  # second field: query ID (kept as a string)
        queries.append(row[2])   # third field: search string

In [None]:
# Calculate the average document length, measured in tokens.
# calculate_bm25 measures a document as len(preprocess_text(document)), i.e. the
# number of lower-cased tokens left after stop-word removal. The average has to
# use that same unit; len(doc) on the raw string would count characters instead
# and distort the length normalisation. The steps below mirror preprocess_text.
doc_token_counts = [
    sum(1 for token in word_tokenize(doc.lower()) if token not in stop_words)
    for doc in documents
]
total_doc_len = sum(doc_token_counts)
# An empty collection yields 0 instead of raising ZeroDivisionError
avg_doc_len = total_doc_len / len(documents) if documents else 0

### Preprocessing

In [None]:
def preprocess_text(text):
    """Turn a raw string into a list of lower-cased tokens without stop words.

    The text is lower-cased, split with ``word_tokenize`` and every token found
    in the module-level ``stop_words`` set is discarded. Token order is kept.
    """
    tokens = word_tokenize(text.lower())
    return [token for token in tokens if token not in stop_words]

### Similarity calculation

In [None]:
def calculate_bm25(query, document, avg_doc_len, k1, b, N, df):
    """Return the BM25 score of ``document`` for ``query``.

    Parameters
    ----------
    query, document : str or list of str
        Raw text, which is passed through ``preprocess_text``, or an already
        tokenized list of terms, which is used as-is. Passing token lists lets
        a caller tokenize once and score many times.
    avg_doc_len : float
        Average document length, in the same unit as ``len(document)`` after
        preprocessing (number of tokens).
    k1, b : float
        Scaling parameters: ``k1`` scales the term-frequency component and
        ``b`` weights the document-length ratio.
    N : int
        Number of documents in the collection.
    df : dict
        Maps a term to the number of documents that contain it.

    Returns
    -------
    The sum, over query terms present in ``df``, of
    ``idf * tf * (k1 + 1) / (tf + k1 * (1 - b + b * len(document) / avg_doc_len))``.
    It is 0 when no query term occurs in the document.
    """
    if isinstance(query, str):
        query = preprocess_text(query)
    if isinstance(document, str):
        document = preprocess_text(document)

    # The length normalisation does not depend on the term, so compute it once.
    # Without a usable average (e.g. an empty collection) apply no length
    # adjustment instead of dividing by zero.
    if avg_doc_len:
        length_norm = k1 * (1 - b + b * len(document) / avg_doc_len)
    else:
        length_norm = k1

    score = 0
    for word in query:
        if word not in df:
            continue
        tf = document.count(word)
        if tf == 0:
            # A term absent from the document contributes nothing to the sum
            continue
        # math.log from the standard library, rather than a `log` name that
        # merely happens to be importable from nltk.text.
        # NOTE(review): this idf is negative for terms found in more than half
        # of the documents - confirm that is intended for this collection.
        idf = math.log((N - df[word] + 0.5) / (df[word] + 0.5))
        score += idf * tf * (k1 + 1) / (tf + length_norm)
    return score

In [None]:
# Document frequency: for every term, the number of documents that contain it
df = {}
for raw_document in documents:
    # set() collapses repeated tokens, so a document counts once per term
    for term in set(preprocess_text(raw_document)):
        df[term] = df.get(term, 0) + 1

# Size of the collection
N = len(documents)

# Scaling parameters handed to calculate_bm25
k1 = 1.2
b = 0.75

### Process queries

In [None]:
# Score every document against every query, producing one result row per pair.
# Rows are collected in a plain list and the DataFrame is built once at the end:
# DataFrame.append was removed in pandas 2.0, and growing a frame one row at a
# time copies it on every step (quadratic in the number of rows).
result_rows = []
for queryID, query in zip(queryIDs, queries):
    query_id = int(queryID)
    for documentID, document in zip(documentIDs, documents):
        bm25_score = calculate_bm25(query, document, avg_doc_len, k1, b, N, df)
        result_rows.append([query_id, int(documentID), bm25_score])

# Reuse the column labels of the results frame created in the Setup section.
# The frame is rebuilt from scratch, so re-running this cell does not duplicate rows.
df_Results = pd.DataFrame(result_rows, columns=df_Results.columns)

Sort the results: group by query ID, then sort by score in descending order within each query, so the best match comes first. Finally, optionally, retain only the top results for each query search, e.g. 10, 50, 100...

In [None]:
# Order rows by query ID (ascending), then by score (highest first) within each query
sort_columns = ['Query_ID', 'BM25_Score']
sort_ascending = [True, False]
df_SortedResults = df_Results.sort_values(by=sort_columns, ascending=sort_ascending)

In [None]:
# Keep the first 100 rows of each query (the highest-scoring ones, given the
# sort order above) and renumber the index from 0
rows_by_query = df_SortedResults.groupby('Query_ID')
df_TopResults = rows_by_query.head(100).reset_index(drop=True)

In [None]:
# Rank = 1-based position of the row within its query group
position_in_query = df_TopResults.groupby('Query_ID').cumcount()
df_TopResults['Rank'] = position_in_query + 1

In [None]:
# Write the content-based ranking to CSV in the current working directory
content_export_file = "Export_BM25_Top100_by_Content.csv"
df_TopResults.to_csv(content_export_file)