## **CONSTANTS**

In [1]:
# All paths are relative ("./...") and therefore resolved against the kernel's
# current working directory.
# NOTE(review): FILE_PATH, CSV_OUTPUT and PROCESSED_TOKENS_OUTPUT are not
# referenced by any cell visible in this section - presumably they belong to
# the dump-parsing / index-building stage; confirm before removing.
FILE_PATH = "./data/simplewiki-latest-pages-articles-multistream.xml.bz2"
CSV_OUTPUT = "./data/simplewiki_articles.csv"
PROCESSED_TOKENS_OUTPUT = "./data/dataset_with_processed_tokens.jsonl"
# Pickle that is loaded into `inverted_index` in the BM25 section.
INVERTED_INDEX_FILE = "./data/inverted_index.pkl"
# Pickle whose keys 'idf', 'total_documents', 'doc_lengths' and
# 'avg_doc_length' are read in the BM25 section.
INVERSE_DOCUMENT_FREQUENCY_FILE = "./data/inverse_document_frequency.pkl"
# SQLite database queried by search_query (table `articles`).
SQL_DATABASE_FILENAME = './db/wikipedia_snippets.db'

## **Utility Functions**

In [2]:
from pympler import asizeof
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

In [3]:
# Fetch the NLTK data packages used below: 'punkt' backs word_tokenize and
# 'stopwords' backs stopwords.words('english'). Already-present packages are
# reported as up-to-date rather than downloaded again.
# NOTE(review): recent NLTK releases look up 'punkt_tab' instead of 'punkt' -
# confirm against the installed NLTK version.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\baigj\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\baigj\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
# Shared by preprocess_text: created once here instead of on every call.
STEMMER = PorterStemmer()
# Stored as a set so the per-token stop-word membership test is a hash lookup.
STOP_WORDS = set(stopwords.words('english'))

In [6]:
def get_memory_consumed(obj):
    """Print and return the size of ``obj`` in mebibytes.

    The size is measured with ``pympler.asizeof.asizeof`` and converted from
    bytes by dividing by 1024 * 1024.

    Args:
        obj: Any Python object to measure.

    Returns:
        The measured size in MB as a float (the same value that is printed
        with two decimals).
    """
    bytes_per_mb = 1024 * 1024
    megabytes = asizeof.asizeof(obj) / bytes_per_mb

    print(f"{megabytes:.2f} MB")
    return megabytes

def preprocess_text(text: str) -> list[str]:
    """Turn raw text into a list of stemmed, stop-word-free tokens.

    Pipeline: lowercase -> replace every character that is not a-z, 0-9 or
    whitespace with a space -> collapse whitespace -> tokenize -> drop stop
    words and one-character tokens -> Porter-stem each remaining token.

    Args:
        text: Raw text in any casing.

    Returns:
        The processed tokens, in the order they appear in ``text``.
    """
    # Step 1: Lowercase BEFORE normalising. The character class below only
    # whitelists a-z, so any uppercase letter left in the text would be
    # replaced by a space (e.g. "Apple" -> "pple"). Lowercasing is a no-op for
    # callers that already pass lowercase text.
    text = text.lower()

    # Step 2: Keep only alphanumeric characters and collapse runs of
    # whitespace into a single space.
    text = re.sub(r'[^a-z0-9\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()

    # Step 3: Tokenize the entire text.
    tokens = word_tokenize(text)

    # Step 4: Filter out stop words and one-character tokens, then stem the
    # survivors to their base form.
    processed_tokens = [
        STEMMER.stem(token)
        for token in tokens
        if (token not in STOP_WORDS and len(token) > 1)
    ]

    # Step 5: Return the processed tokens.
    return processed_tokens



    

## **BM25 Implementation**

In [68]:
import pickle
import sqlite3
import numpy as np
from collections import defaultdict
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

In [14]:
# Sentence-embedding model used by search_query to encode the query and the
# candidate documents for the second-stage re-ranking. Loaded once here and
# reused across searches.
MODEL = SentenceTransformer('all-MiniLM-L6-v2')

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Loading weights: 100%|██████████| 103/103 [00:00<00:00, 312.90it/s, Materializing param=pooler.dense.weight]                             
[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


#### **Load the stored inverted index and inverse document frequency**

In [9]:
# SECURITY NOTE: pickle.load executes arbitrary code embedded in the file.
# Only load index files that were produced by this project.
# Used by the BM25 ranking function as {term: {page_id: tf}}.
with open(INVERTED_INDEX_FILE, "rb") as f:
    inverted_index = pickle.load(f)

with open(INVERSE_DOCUMENT_FREQUENCY_FILE, "rb") as f:
    idf_data = pickle.load(f)
    
# Unpack the corpus statistics stored alongside the IDF table.
idf = idf_data['idf']                           # looked up per term via idf.get(term, 0.0)
total_doc_count = idf_data['total_documents']   # not read by the ranking code shown in this notebook section
doc_lengths = idf_data["doc_lengths"]           # looked up per page via doc_lengths.get(page_id, 0)
avg_doc_length = idf_data["avg_doc_length"]     # divisor in the BM25 length normalisation


**Function to rank and retrieve PageIds from the inverted index using BM25**

In [10]:
def rank_and_retrieve_pages_with_bm25(query, top_k=20, k1=1.5, b=0.75):
    """Score pages against ``query`` with BM25 and return the best page ids.

    The query is lowercased and run through ``preprocess_text``; every
    resulting term that exists in the module-level ``inverted_index``
    contributes ``idf * tf * (k1 + 1) / (tf + k1 * (1 - b + b * dl / avgdl))``
    to each page in its postings. Terms missing from the index are ignored.

    Args:
        query: Free-text search query.
        top_k: Maximum number of page ids to return.
        k1: BM25 term-frequency saturation parameter.
        b: BM25 document-length normalisation parameter.

    Returns:
        Page ids ordered from highest to lowest accumulated score, truncated
        to ``top_k`` entries. Empty when no query term is in the index.
    """
    query_terms = preprocess_text(query.lower())
    page_scores = defaultdict(float)

    for term in query_terms:
        if term in inverted_index:
            term_weight = idf.get(term, 0.0)

            # Each posting maps page_id -> term frequency within that page.
            for page_id, tf in inverted_index[term].items():
                doc_len = doc_lengths.get(page_id, 0)
                length_norm = 1 - b + b * (doc_len / avg_doc_length)
                denom = tf + k1 * length_norm

                if denom != 0:
                    saturated_tf = (tf * (k1 + 1)) / denom
                else:
                    saturated_tf = 0

                page_scores[page_id] += term_weight * saturated_tf

    # Stable descending sort of the page ids by their accumulated score.
    best_first = sorted(page_scores, key=page_scores.get, reverse=True)
    return best_first[:top_k]


In [None]:
def search_query(query, top_k=20):
    """Two-stage search: BM25 candidate retrieval, then semantic re-ranking.

    Args:
        query: Free-text search query.
        top_k: Number of BM25 candidates to fetch and re-rank (default 20).

    Returns:
        A list of ``(PageId, Title, Snippet, URL)`` rows from the ``articles``
        table, ordered from highest to lowest cosine similarity between the
        query embedding and the embedding of "Title. Snippet". Returns ``[]``
        when BM25 finds no candidate or none of the candidates is in the
        database.
    """
    # Step 1: First-stage ranking - BM25-ranked candidate ids from the inverted index.
    pageIds = rank_and_retrieve_pages_with_bm25(query, top_k=top_k)
    if not pageIds:
        return []

    # Step 2: Fetch the candidate rows from the database.
    placeholders = ','.join(['?'] * len(pageIds))
    sql = f"SELECT PageId, Title, Snippet, URL FROM articles WHERE PageId IN ({placeholders})"

    # `with sqlite3.connect(...)` only manages the transaction; it does NOT
    # close the connection. Close it explicitly so repeated searches do not
    # leak one open connection per call.
    conn = sqlite3.connect(SQL_DATABASE_FILENAME)
    try:
        results = conn.execute(sql, pageIds).fetchall()
    finally:
        conn.close()

    # Step 3: Restore the BM25 order. SQLite gives no ordering guarantee for an
    # IN (...) lookup, so index the rows by PageId and walk the ranked ids.
    results_dict = {page_data[0]: page_data for page_data in results}
    docs = [results_dict[pageId] for pageId in pageIds if pageId in results_dict]

    if not docs:
        return []

    # Step 4: Second-stage re-ranking - semantic similarity between the query
    # and each candidate's "Title. Snippet" text.
    doc_texts = [f"{page_data[1]}. {page_data[2]}" for page_data in docs]

    query_vector = MODEL.encode([query])
    doc_vectors = MODEL.encode(doc_texts)

    # cosine_similarity returns one row per query; there is only one query.
    semantic_scores = cosine_similarity(query_vector, doc_vectors)[0]

    # Step 5: Re-rank by semantic similarity, highest score first.
    reranked = sorted(
        zip(docs, semantic_scores),
        key=lambda doc: doc[1],
        reverse=True
    )

    return [doc for doc, _ in reranked]
    


In [94]:
# Smoke test: run the full two-stage search and display the returned rows.
search_query("What is an apple fruit?")

[('39',
  'Apple',
  'thumb|Granny Smith green apples\n\nAn apple is a sweet, edible fruit that is usually red or green. The tree (Malus spp.) is grown worldwide. The fruit is low-cost, popular, and common all over the earth & taste is fruity. \n\nThe apple tree comes from southern Kazakhstan; Kyrgyzstan; Uzbekistan; Turkey; and northwestern part of China. Apples have been grown for thousands of years in Asia and in European continent. They were brought to North America by European World Colonial settlers. Apples have Re',
  'https://simple.wikipedia.org/wiki/Apple'),
 ('46074',
  'Pond-apple',
  'A pond-apple is a type of fruit. It is not related to the apple. They usually live near or in water.\n\nCategory:Annona\nCategory:Fruits',
  'https://simple.wikipedia.org/wiki/Pond-apple'),
 ('85724',
  "Adam's apple (disambiguation)",
  'An Adam\'s apple is a structure in the front of the throat more prominent in men than women.\n\nAdam\'s apple can also mean:\n\nAdam\'s Apples, a 2005 Danis

## **Free Memory**

In [20]:
import gc

# Drop the notebook's references to the loaded index structures.
# NOTE: this cell is not idempotent - running it a second time raises
# NameError because the names no longer exist. After it runs,
# rank_and_retrieve_pages_with_bm25 (and therefore search_query) cannot be used
# until the cell that loads the pickles is executed again.
del idf
del idf_data
del inverted_index
del total_doc_count
del doc_lengths
del avg_doc_length

# Force a collection pass; the returned count is shown as the cell output.
gc.collect()

79