## **CONSTANTS**

In [2]:
FILE_PATH = "./data/simplewiki-latest-pages-articles-multistream.xml.bz2"
CSV_OUTPUT = "./data/simplewiki_articles.csv"
PROCESSED_TOKENS_OUTPUT = "./data/dataset_with_processed_tokens.jsonl"
INVERTED_INDEX_FILE = "./data/inverted_index.pkl"
INVERSE_DOCUMENT_FREQUENCY_FILE = "./data/inverse_document_frequency.pkl"
SQL_DATABASE_FILENAME = './db/wikipedia_snippets.db'

## **Utility Functions**

In [3]:
from pympler import asizeof
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

In [4]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\baigj\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\baigj\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
STEMMER = PorterStemmer()
STOP_WORDS = set(stopwords.words('english'))

In [6]:
def get_memory_consumed(obj):
    size_bytes = asizeof.asizeof(obj)
    size_mb = size_bytes / (1024 * 1024)
    print(f"{size_mb:.2f} MB")
    
    return size_mb

def generate_url(title):
    return f"https://simple.wikipedia.org/wiki/{title.replace(" ", "_")}"

def preprocess_text(text) -> list[str]:
    
    # Step 1: Normalize the text to keep only alphanumeric text and single space instead of multiple spaces.
    text = re.sub(r'[^a-z0-9\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    
    # Step 2: Tokenize the entire text
    tokens = word_tokenize(text)

    # Step 3: For each token -> filter out stopwords and tokens with only 1 character, lowercase, and stem to base form
    processed_tokens = [
        STEMMER.stem(token)
        for token in tokens
        if (token not in STOP_WORDS and len(token) > 1)
    ]
    
    # Step 4: Return the processed tokens
    return processed_tokens


    

## **Read and Parse the XML content and save into csv file**

In [2]:
import bz2
import mwxml
import mwparserfromhell as mwp
import csv

In [5]:

with open(CSV_OUTPUT, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['PageId', 'Title', 'Text'])

    with bz2.open(FILE_PATH, 'rt', encoding='utf-8') as f:
        dump = mwxml.Dump.from_file(f)

        count = 0
        for page in dump:
            
            if page.namespace != 0 or page.redirect:
                continue

            for revision in page:
                if revision.text:
                    try:
                        wikicode = mwp.parse(revision.text)
                        plain_text = wikicode.strip_code().strip()

                        if plain_text:
                            writer.writerow([page.id, page.title, plain_text])
                            count += 1
                            if count % 100 == 0:
                                print(f"Saved {count} articles...", end='\r')
                    except Exception as e:
                        print(f"Error parsing page {page.title}: {e}")

                break 

print(f"\nDone! Saved {count} articles to '{CSV_OUTPUT}'.")


Saved 278900 articles...
Done! Saved 278950 articles to 'simplewiki_articles.csv'.


## **Text Normalization and Tokenization**

In [8]:
import pandas as pd
import sqlite3
import gc

##### **Loading CSV that contains parsed articles aata**

In [20]:
dataset = pd.read_csv(CSV_OUTPUT)

dataset = dataset[["PageId", "Title", "Text"]]
dataset.dropna(inplace=True)
dataset.drop_duplicates(subset=["PageId"], inplace=True)

dataset.head()

  dataset = pd.read_csv(CSV_OUTPUT)


Unnamed: 0,PageId,Title,Text
0,1,April,April (Apr.) is the fourth month of the year i...
1,2,August,August (Aug.) is the eighth month of the year ...
2,6,Art,thumb|300x300px|A painting by Renoir is a work...
3,8,A,"thumb|Writing ""A"" in cursive font.\n\nA is the..."
4,9,Air,thumb|A fan moves air.\n\nAir is the Earth's a...


**Save this data into a SQLite databse. It will be used in query time**

In [21]:
# Step 1: Extract a short snippet of the original text and create a URL to link back to original source of article
dataset['Snippet'] = dataset['Text'].str[:500]
dataset['URL'] = dataset['Title'].apply(generate_url)

# Step 2: Extract the columns that are required to be stored in the SQLite database
dataset_to_store = dataset[['PageId', 'Title', 'Snippet', "URL"]]
dataset_to_store.head(3)

Unnamed: 0,PageId,Title,Snippet,URL
0,1,April,April (Apr.) is the fourth month of the year i...,https://simple.wikipedia.org/wiki/April
1,2,August,August (Aug.) is the eighth month of the year ...,https://simple.wikipedia.org/wiki/August
2,6,Art,thumb|300x300px|A painting by Renoir is a work...,https://simple.wikipedia.org/wiki/Art


In [22]:
# Step 3: Create a connection with the SQLite database
conn = sqlite3.connect(SQL_DATABASE_FILENAME)
cursor = conn.cursor()

# Step 4: Create an empty table named articles
cursor.execute('''
    CREATE TABLE IF NOT EXISTS articles (
        PageId INTEGER PRIMARY KEY,
        Title TEXT,
        Snippet TEXT,
        URL TEXT
    )
''')
# Step 4: Save the data into `articles` table
dataset_to_store.to_sql('articles', conn, if_exists='replace', index=False)

# Step 5: Commit the changes and close the connection
conn.commit()
conn.close()

# Step 6: After this, dataset_to_store will be consuming almost 258.33 MB of memory useless. Free it.
del dataset_to_store
gc.collect()

214

##### **Apply proprocessing on text and save it in a jsonl file for safe future use**

In [40]:
import json
from tqdm import tqdm

In [41]:

with open(PROCESSED_TOKENS_OUTPUT, "w", encoding="utf-8") as file:

    for row in tqdm(dataset.itertuples(), total=len(dataset), desc="Processing articles"):
        
        # Filter invalid PageIds
        if str(row.PageId).isnumeric():
            record = {
                "PageId": row.PageId,
                "Processed_Tokens": preprocess_text(row.Text),
            }

            file.write(json.dumps(record, ensure_ascii=False) + "\n")


Processing articles: 100%|██████████| 279461/279461 [18:36<00:00, 250.23it/s]


##### **Free Non required memory**

After this step, we don't need the actual dataset DataFrame. We have stored its contents in a faster SQLite database and stored the processed tokens in the jsonl file for future processing if required.

Memory consumed by dataset variable = >800 MB

In [23]:
del dataset
gc.collect()

0

## **Creation of Inverted Indexes**

In [7]:
import json
import pickle
import math
import gc
import numpy as np
from tqdm import tqdm
from collections import defaultdict, Counter

In [9]:
inverted_index = defaultdict(dict)
total_doc_count = 0  
idf = {}

# For BM25 implementation
doc_lengths = {}
avg_doc_length = 0


# Count total lines once (for proper progress bar)
with open(PROCESSED_TOKENS_OUTPUT, "r", encoding="utf-8") as f:
    total_lines = sum(1 for _ in f)


with open(PROCESSED_TOKENS_OUTPUT, "r", encoding="utf-8") as f:
    for line in tqdm(f, total=total_lines, desc="Building inverted index and collecting metadata for inverse document frequency."):
        data = json.loads(line)
        page_id = data["PageId"]
        tokens = data["Processed_Tokens"]

        total_doc_count += 1
        
        doc_length = len(tokens)
        doc_lengths[page_id] = doc_length
        avg_doc_length += doc_length

        term_counts = Counter(tokens)

        for term, tf in term_counts.items():
            inverted_index[term][page_id] = tf
            
avg_doc_length = avg_doc_length / total_doc_count


for term, postings in tqdm(
        inverted_index.items(),
        total=len(inverted_index),
        desc="Computing IDF"
    ):
    
    document_frequency = len(postings)

    idf[term] = np.float32(
        math.log((total_doc_count + 1) / (document_frequency + 1)) + 1
    )


with open(INVERTED_INDEX_FILE, "wb") as f:
    pickle.dump(inverted_index, f, protocol=pickle.HIGHEST_PROTOCOL)


with open(INVERSE_DOCUMENT_FREQUENCY_FILE, "wb") as f:
    pickle.dump(
        {
            "total_documents": total_doc_count,
            "doc_lengths": doc_lengths,
            "avg_doc_length": avg_doc_length,
            "idf": idf
        },
        f,
        protocol=pickle.HIGHEST_PROTOCOL
    )


Building inverted index and collecting metadata for inverse document frequency.: 100%|██████████| 278941/278941 [00:28<00:00, 9829.28it/s] 
Computing IDF: 100%|██████████| 877202/877202 [00:02<00:00, 335595.92it/s]


##### **Free Non required memory**

After this step, we don't need the actual inverted_index, total_doc_count, and idf variable.

Memory consumed by inverted_index variable = >1 GB

Memory consumed by total_doc_count variable = 0 MB

Memory consumed by idf variable = 99.54 MB

Memory consumed by avg_doc_length variable = 0 MB

Memory consumed by doc_lengths variable = 19.87 MB

In [None]:
del inverted_index
del total_doc_count
del doc_lengths
del avg_doc_length
del idf

gc.collect()

0

After this Step, Weh have successfully parsed the XML, Preprocessed the text, and created an efficient inverted index along with term frequencies