In [84]:
import requests
from bs4 import BeautifulSoup

# Target Wikipedia URL
url = "https://en.wikipedia.org/wiki/Bangladesh"

# Add headers to avoid being blocked
headers = {"User-Agent": "Mozilla/5.0"}

# Fetch the page
response = requests.get(url, headers=headers)
print("Status Code:", response.status_code)
response.raise_for_status()

# Parse HTML
soup = BeautifulSoup(response.text, "html.parser")

# Extract all paragraphs
paragraphs = soup.find_all("p")

# Join into one string
text = " ".join(p.get_text(strip=True) for p in paragraphs)

# Save to file
with open("bangladesh_wikipedia.txt", "w", encoding="utf-8") as f:
    f.write(text)

print("Data saved to bangladesh_wikipedia.txt")


Status Code: 200
Data saved to bangladesh_wikipedia.txt


In [85]:
import re

with open("bangladesh_wikipedia.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

def clean_text(text):
    text = re.sub(r'\[\d+\]', '', text)  # remove [1], [23], etc.
    text = re.sub(r'\s+', ' ', text)     # normalize whitespace
    text = text.replace('\xa0', ' ').strip()
    return text

cleaned_text = clean_text(raw_text)

with open("bangladesh_wikipedia_cleaned.txt", "w", encoding="utf-8") as f:
    f.write(cleaned_text)

print("Cleaned text saved to bangladesh_wikipedia_cleaned.txt")


Cleaned text saved to bangladesh_wikipedia_cleaned.txt


In [86]:
def chunk_text(text, chunk_size=500, overlap=50):
    words = text.split()
    chunks = []
    for i in range(0, len(words), chunk_size - overlap):
        chunk = " ".join(words[i:i+chunk_size])
        chunks.append(chunk)
    return chunks

chunks = chunk_text(cleaned_text)
print(f"Created {len(chunks)} chunks")


Created 19 chunks


In [87]:
import uuid
from pinecone import Pinecone

In [None]:

# Connect to Pinecone
pc = Pinecone(api_key="")

# Use the index you already created
index = pc.Index("bangladesh-wiki")
print("Connected to index: bangladesh-wiki")

Connected to index: bangladesh-wiki


In [89]:
# Generate embeddings for your chunks
embed_model = "llama-text-embed-v2"

res = pc.inference.embed(
    model=embed_model,
    inputs=chunks,   # list of text chunks
    parameters={"input_type": "passage"}  # passage type for document text
)

embeddings = [r.values for r in res.data]
print(f"Created {len(embeddings)} embeddings")

Created 19 embeddings


In [91]:
vectors = [
    (
        str(uuid.uuid4()),  # unique ID
        emb,                # embedding vector
        {
            "text": chunk,                     # original text
            "source": "Wikipedia - Bangladesh",# where it came from
            "chunk_id": i+1,                   # chunk number
            "length": len(chunk)               # character length
        }
    )
    for i, (emb, chunk) in enumerate(zip(embeddings, chunks))
]

In [92]:
# Upload to Pinecone
index.upsert(vectors=vectors)
print(f"Uploaded {len(vectors)} vectors with embeddings + metadata into 'bangladesh-wiki'")

Uploaded 19 vectors with embeddings + metadata into 'bangladesh-wiki'


In [93]:
query = "Tell me about the history of Bangladesh."

q_res = pc.inference.embed(
    model=embed_model,
    inputs=[query],
    parameters={"input_type": "query"}
)
q_vec = q_res.data[0].values

results = index.query(vector=q_vec, top_k=3, include_metadata=True)

print("\n🔎 Search Results:")
for match in results["matches"]:
    print("Score:", round(match["score"], 3))
    print("Text:", match["metadata"]["text"])
    print("----")


🔎 Search Results:
Score: 0.641
Text: Bangladesh,[a]officially thePeople's Republic of Bangladesh,[b]is a country inSouth Asia. It is theeighth-most populous countryin the world and among themost densely populatedwith a population of over 171 million within an area of 148,460 square kilometres (57,320 sq mi). Bangladesh shares land borders withIndiato the north, west, and east, andMyanmarto the southeast. It has a coastline along theBay of Bengalto its south and is separated fromBhutanandNepalby theSiliguri Corridor, and from China by theIndian stateofSikkimto its north.Dhaka, the capital andlargest city, is the nation's political, financial, and cultural centre.Chittagongis the second-largest city and the busiest port of the country. The territory of modern Bangladesh was a stronghold of manyBuddhistandHindu dynastiesin ancient history. Following theMuslim conquestin 1204, the region sawSultanateandMughalrule. During theMughalperiod, particularly under theBengal Subah, the region emer