## **CONSTANTS**

In [24]:
FILE_PATH = "./data/simplewiki-latest-pages-articles-multistream.xml.bz2"
CSV_OUTPUT = "./data/simplewiki_articles.csv"
PROCESSED_TOKENS_OUTPUT = "./data/dataset_with_processed_tokens.jsonl"
INVERTED_INDEX_FILE = "./data/inverted_index.pkl"
INVERSE_DOCUMENT_FREQUENCY_FILE = "./data/inverse_document_frequency.pkl"
SQL_DATABASE_FILENAME = './db/wikipedia_snippets.db'

## **Utility Functions**

In [2]:
from pympler import asizeof
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

In [3]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\baigj\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\baigj\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
STEMMER = PorterStemmer()
STOP_WORDS = set(stopwords.words('english'))

In [None]:
def get_memory_consumed(obj):
    size_bytes = asizeof.asizeof(obj)
    size_mb = size_bytes / (1024 * 1024)
    print(f"{size_mb:.2f} MB")
    
    return size_mb

def preprocess_text(text) -> list[str]:
    
    # Step 1: Normalize the text to keep only alphanumeric text and single space instead of multiple spaces.
    text = re.sub(r'[^a-z0-9\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    
    # Step 2: Tokenize the entire text
    tokens = word_tokenize(text)

    # Step 3: For each token -> filter out stopwords and tokens with only 1 character, lowercase, and stem to base form
    processed_tokens = [
        STEMMER.stem(token)
        for token in tokens
        if (token not in STOP_WORDS and len(token) > 1)
    ]
    
    # Step 4: Return the processed tokens
    return processed_tokens



    

## **TF-IDF Implementation**

In [26]:
import pickle
import sqlite3
from nltk.tokenize import word_tokenize
from collections import defaultdict


#### **Load the stored inverted index and inverse document frequency**

In [27]:
with open(INVERTED_INDEX_FILE, "rb") as f:
    inverted_index = pickle.load(f)

with open(INVERSE_DOCUMENT_FREQUENCY_FILE, "rb") as f:
    idf_data = pickle.load(f)
    
idf = idf_data['idf']
total_doc_count = idf_data['total_documents']


**Function to get PageIds from Inverted Indexes**

In [29]:
def rank_and_retrieve_pages_with_tfidf(query, top_k=20):

    query = query.lower()
    processed_tokens = preprocess_text(query)

    scores = defaultdict(float)

    for term in processed_tokens:
        if term not in inverted_index:
            continue

        postings = inverted_index[term] # -> {pageId: tf}
        term_idf = idf.get(term, 0.0)

        for page_id, tf in postings.items():
            scores[page_id] += tf * term_idf

    ranked_docs = sorted(
        scores.items(),
        key=lambda x: x[1],
        reverse=True
    )

    return [page_id for page_id, score in ranked_docs[:top_k]]

In [35]:
def search_query(query):
    # Step 1: Get ranked pageIds from TFâ€“IDF
    pageIds = rank_and_retrieve_pages_with_tfidf(query, top_k=20)
    if not pageIds:
        return []

    # Step 2: Preprocess query tokens for matching
    
    query_tokens = preprocess_text(query)

    # Step 3: Connect to DB and fetch metadata
    with sqlite3.connect(SQL_DATABASE_FILENAME) as conn:
        cursor = conn.cursor()
        placeholders = ','.join(['?'] * len(pageIds))
        sql = f"SELECT PageId, Title, Snippet, URL FROM articles WHERE PageId IN ({placeholders})"
        cursor.execute(sql, pageIds)
        results = cursor.fetchall()

    # Step 4: Build lookup dict for ordering
    results_dict = {row[0]: row for row in results}
    docs = [results_dict[pid] for pid in pageIds if pid in results_dict]

    # Step 5: Re-rank based on title match
    def title_match_score(doc):
        title = doc[1].lower()
        # Count query tokens that appear in the title
        return sum(1 for token in query_tokens if token in title)

    # Apply re-ranking: original order + title match boost
    docs.sort(key=lambda doc: (title_match_score(doc), pageIds.index(doc[0])), reverse=True)

    return docs


In [38]:
search_query("Deep learning models in healthcare")

[('37039',
  'Model',
  'Model, models, or modeling can mean: An abstract (idea, theory, simulation) or smaller approximation of an object or system for testing. (Like a car, building or ship.)\n\nIdeas, concepts or software\n Business model\n Model (abstract), an abstract or conceptual object used in the creation of math to predict its behavior\n Causal model\n Mathematical model\n Scientific model\n Model Driven Engineering, a software development technique based on abstract models\n Metamodeling, a model of a model\n Molecular',
  'https://simple.wikipedia.org/wiki/Model'),
 ('218',
  'Earth science',
  "thumb|250px|A volcano eruption is the release of stored energy from below the surface of Earth. The heat comes mostly from radioactive decay, and convection, in the Earth's core and mantle.Encyclopedia of Volcanoes. Academic Press, London, 2000.\n\nEarth science is an all-covering term for the sciences related to the planet Earth. Earth science may also be called geoscience. Geoscie

## **Free Memory**

In [39]:
import gc

del inverted_index
del idf_data
del idf

gc.collect()

274