# Problems 2 - wikipedia_index_prime_ministers_llamaindex


Q2A â€” Index Wikipedia pages of all Prime Ministers of India with LlamaIndex

In [18]:
# !pip install requests beautifulsoup4 llama-index openai tiktoken

In [19]:
# imports and constants
import os
import requests
from bs4 import BeautifulSoup
from pathlib import Path
import warnings
warnings.filterwarnings("ignore")


In [20]:
# List of prime ministers to download (includes Gulzarilal Nanda as acting PM)
PM_NAMES = [
    "Jawaharlal_Nehru",
    "Lal_Bahadur_Shastri",
    "Gulzarilal_Nanda",
    "Indira_Gandhi",
    "Morarji_Desai",
    "Charan_Singh",
    "Rajiv_Gandhi",
    "V._P._Singh",
    "Chandra_Shekhar",
    "P._V._Narasimha_Rao",
    "Atal_Bihari_Vajpayee",
    "H._D._Deve_Gowda",
    "Inder_Kumar_Gujral",
    "Manmohan_Singh",
    "Narendra_Modi",
]

WIKI_BASE = "https://en.wikipedia.org/wiki/"
OUT_DIR = Path("wikipedia_pm_pages")
OUT_DIR.mkdir(exist_ok=True)

In [21]:
def download_and_extract_wiki(name):
    url = WIKI_BASE + name
    print(f"Downloading {url}")
    # Add a User-Agent header to mimic a browser, which can help bypass 403 errors
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    r = requests.get(url, timeout=30, headers=headers)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "html.parser")
    # Remove tables of contents, navboxes, references, and scripts/styles
    for el in soup(["script", "style", "aside", "footer", "nav", "sup", "table", "style"]):
        el.decompose()
    content = soup.find("div", {"class": "mw-parser-output"})
    if content is None:
        # fallback: grab the whole text
        text = soup.get_text(separator="\n")
        return text
    # Collect paragraph text under main content
    paras = []
    for p in content.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5']):
        txt = p.get_text(strip=True)
        if txt:
            paras.append(txt)
    return "\n\n".join(paras)

downloaded = {}
for name in PM_NAMES:
    try:
        text = download_and_extract_wiki(name)
        fn = OUT_DIR / f"{name}.txt"
        fn.write_text(text, encoding='utf-8')
        downloaded[name] = str(fn)
    except Exception as e:
        print(f"Failed to download {name}: {e}")
print("Downloaded pages saved to", OUT_DIR)

Downloading https://en.wikipedia.org/wiki/Jawaharlal_Nehru
Downloading https://en.wikipedia.org/wiki/Lal_Bahadur_Shastri
Downloading https://en.wikipedia.org/wiki/Gulzarilal_Nanda
Downloading https://en.wikipedia.org/wiki/Indira_Gandhi
Downloading https://en.wikipedia.org/wiki/Morarji_Desai
Downloading https://en.wikipedia.org/wiki/Charan_Singh
Downloading https://en.wikipedia.org/wiki/Rajiv_Gandhi
Downloading https://en.wikipedia.org/wiki/V._P._Singh
Downloading https://en.wikipedia.org/wiki/Chandra_Shekhar
Downloading https://en.wikipedia.org/wiki/P._V._Narasimha_Rao
Downloading https://en.wikipedia.org/wiki/Atal_Bihari_Vajpayee
Downloading https://en.wikipedia.org/wiki/H._D._Deve_Gowda
Downloading https://en.wikipedia.org/wiki/Inder_Kumar_Gujral
Downloading https://en.wikipedia.org/wiki/Manmohan_Singh
Downloading https://en.wikipedia.org/wiki/Narendra_Modi
Downloaded pages saved to wikipedia_pm_pages


In [22]:
# Indexing with LlamaIndex and counting chunks
# NOTE: llama-index API can vary by version. The snippet below follows modern llama-index patterns.
# If API differences arise, adapt to your installed llama-index version.

from llama_index.core import Document
from llama_index.core.node_parser import TokenTextSplitter
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import Settings
import json

# --- Configure embedding model (OpenAI text-embedding-3-large) ---
embed_model = OpenAIEmbedding(model="text-embedding-3-large")

# --- Configure LLM (llama-3.1-8b-instant) ---
# How to plug in llama-3.1-8b-instant depends on your LLM provider wrapper.
# Example (pseudo-code) if using an OpenAI-compatible provider:
# from llama_index.llms import OpenAI
# llm = OpenAI(model_name="llama-3.1-8b-instant", temperature=0)
#
# If you have a provider wrapper for 'llama' (for example, MosaicML or other),
# replace the LLMPredictor below accordingly.

# For safety, we set llm to None here; you should configure your LLM wrapper in your environment.
llm = None

# Configure global settings for LlamaIndex
Settings.llm = llm
Settings.embed_model = embed_model
Settings.chunk_size = 1024      # token chunk size (default in docs)
Settings.chunk_overlap = 20     # token overlap (default in docs)

# Read downloaded files into LlamaIndex Documents
docs = []
for name, path in downloaded.items():
    text = Path(path).read_text(encoding='utf-8')
    doc = Document(text_content=text) # Create Document with only the text content
    doc.metadata = {"source": name} # Assign metadata separately
    docs.append(doc)

# Use TokenTextSplitter (token-based) for splitting into nodes/chunks
token_splitter = TokenTextSplitter(chunk_size=1024, chunk_overlap=20)
all_nodes = []
for doc in docs:
    nodes = token_splitter.split_text(doc.get_content())
    # split_text returns list of dicts or nodes depending on version; adapt if necessary
    # We'll normalize to text chunks
    if isinstance(nodes, list) and len(nodes) and isinstance(nodes[0], dict) and 'text' in nodes[0]:
        chunk_texts = [n['text'] for n in nodes]
    else:
        # assume list of strings
        chunk_texts = nodes
    all_nodes.extend(chunk_texts)

print(f"Total chunks (by token splitter with chunk_size=1024, overlap=20): {len(all_nodes)}")

# --- Build index (optional) ---
# If you have LLM configured, you can build a vector index:
# index = GPTVectorStoreIndex.from_documents(docs, service_context=service_context)
# index.storage_context.persist(persist_dir='q2a_index')
#
# Save a small summary JSON with counts:
summary = {
    "num_documents": len(docs),
    "num_chunks": len(all_nodes),
    "chunk_size_tokens": 1024,
    "chunk_overlap_tokens": 20,
    "pm_list": list(downloaded.keys())
}
Path('q2a_index_summary.json').write_text(json.dumps(summary, indent=2), encoding='utf-8')
print('Wrote q2a_index_summary.json')

LLM is explicitly disabled. Using MockLLM.
Total chunks (by token splitter with chunk_size=1024, overlap=20): 15
Wrote q2a_index_summary.json
