In [1]:
# run once
!pip install -q langchain langchain-community langchain-text-splitters langchain-openai \
chromadb sentence-transformers pypdf python-docx datasets huggingface_hub \
transformers accelerate torch streamlit pyngrok ipywidgets

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/67.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m47.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m74.5/74.5 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.8/19.8 MB[0m [31m80.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.5/310.5 kB[0m [31m25.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m253.0/253.0 kB[0m [31m20.4 MB/s[0m eta [36m0:00:

In [20]:
# 1. Install required libraries
!pip install -q langchain langchain-community langchain-huggingface chromadb sentence-transformers datasets transformers accelerate bitsandbytes tqdm

# 2. Imports
from datasets import load_dataset
from langchain.schema import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain_huggingface import HuggingFacePipeline
from transformers import pipeline
import itertools
import tqdm

# ---------------- CONFIG ----------------
PERSIST_DIR = "/content/chroma_db"
DATASET_NAME = "wikimedia/wikipedia"
DATASET_CONFIG = "20231101.en"
SAMPLE_LIMIT = 1000
BATCH_SIZE = 100
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 100
MAX_CHROMA_BATCH = 5000

# 3. Embeddings
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# 4. Chroma DB
db = Chroma(persist_directory=PERSIST_DIR, embedding_function=embeddings)

# 5. Splitter
splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)

# 6. Load dataset (streaming)
stream = load_dataset(DATASET_NAME, DATASET_CONFIG, split="train", streaming=True)

batch_docs = []
count = 0
batch_num = 0

# ---------------- LOAD & STORE ----------------
for example in tqdm.tqdm(itertools.islice(stream, SAMPLE_LIMIT), total=SAMPLE_LIMIT):
    text = example.get("text", "") or ""
    title = example.get("title", "") or ""
    source = example.get("url", f"{DATASET_NAME}:{example.get('id','-')}")
    full_text = (title + "\n\n" + text).strip()
    if not full_text:
        continue

    doc = Document(page_content=full_text, metadata={"source": source, "title": title, "id": example.get("id")})
    batch_docs.append(doc)
    count += 1

    if len(batch_docs) >= BATCH_SIZE:
        chunks = splitter.split_documents(batch_docs)
        for i in range(0, len(chunks), MAX_CHROMA_BATCH):
            small_batch = chunks[i:i+MAX_CHROMA_BATCH]
            db.add_documents(small_batch)
        db.persist()
        batch_num += 1
        print(f"✅ Persisted batch {batch_num} — total articles processed: {count}")
        batch_docs = []

if batch_docs:
    chunks = splitter.split_documents(batch_docs)
    for i in range(0, len(chunks), MAX_CHROMA_BATCH):
        small_batch = chunks[i:i+MAX_CHROMA_BATCH]
        db.add_documents(small_batch)
    db.persist()
    print(f"✅ Final persisted. Total articles processed: {count}")

# ---------------- FREE LLM (Hugging Face Pipeline) ----------------
# 7. Load a free model from Hugging Face
model_name = "tiiuae/falcon-7b-instruct"  # or "mistralai/Mistral-7B-Instruct-v0.2"
hf_pipeline = pipeline(
    "text-generation",
    model=model_name,
    torch_dtype="auto",
    device_map="auto",
    max_new_tokens=512,
    temperature=0.3
)

llm = HuggingFacePipeline(pipeline=hf_pipeline)

# 8. Build RAG Chain
retriever = db.as_retriever(search_kwargs={"k": 3})
chat_chain = RetrievalQA.from_chain_type(llm=llm, retriever=retriever, return_source_documents=True)

print("✅ Chatbot ready! Type 'exit' to quit.\n")
while True:
    q = input("You: ").strip()
    if q.lower() in ("exit", "quit"):
        break
    out = chat_chain.invoke({"query": q})
    print("\nBot:\n", out.get("result", "No answer"))
    sdocs = out.get("source_documents", [])
    if sdocs:
        print("\nSources:")
        for i, sd in enumerate(sdocs, 1):
            print(i, sd.metadata.get("source", "-"), "| title:", sd.metadata.get("title", "-"))
    print("\n---\n")


Resolving data files:   0%|          | 0/41 [00:00<?, ?it/s]

 10%|█         | 100/1000 [00:25<03:47,  3.96it/s]

✅ Persisted batch 1 — total articles processed: 100


 20%|██        | 200/1000 [00:42<02:41,  4.94it/s]

✅ Persisted batch 2 — total articles processed: 200


 30%|███       | 300/1000 [00:54<01:55,  6.07it/s]

✅ Persisted batch 3 — total articles processed: 300


 40%|████      | 400/1000 [01:06<01:28,  6.76it/s]

✅ Persisted batch 4 — total articles processed: 400


 50%|█████     | 500/1000 [01:15<01:04,  7.76it/s]

✅ Persisted batch 5 — total articles processed: 500


 60%|██████    | 600/1000 [01:24<00:45,  8.82it/s]

✅ Persisted batch 6 — total articles processed: 600


 70%|███████   | 700/1000 [01:34<00:32,  9.22it/s]

✅ Persisted batch 7 — total articles processed: 700


 80%|████████  | 800/1000 [01:47<00:23,  8.47it/s]

✅ Persisted batch 8 — total articles processed: 800


 90%|█████████ | 900/1000 [02:00<00:12,  8.22it/s]

✅ Persisted batch 9 — total articles processed: 900


100%|██████████| 1000/1000 [02:18<00:00,  7.20it/s]

✅ Persisted batch 10 — total articles processed: 1000





config.json: 0.00B [00:00, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.95G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.48G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/117 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/281 [00:00<?, ?B/s]

Device set to use cuda:0


✅ Chatbot ready! Type 'exit' to quit.

You: what is the capital of Pakistan?


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.



Bot:
 Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

Autonomous Region of China; and the Gilgit-Baltistan territory, Khyber Pakhtunkhwa province and Balochistan province of Pakistan.

Autonomous Region of China; and the Gilgit-Baltistan territory, Khyber Pakhtunkhwa province and Balochistan province of Pakistan.

Afghanistan

Afghanistan, officially the Islamic Emirate of Afghanistan, is a landlocked country located at the crossroads of Central Asia and South Asia. Referred to as the Heart of Asia, it is bordered by Pakistan to the east and south, Iran to the west, Turkmenistan to the northwest, Uzbekistan to the north, Tajikistan to the northeast, and China to the northeast and east. Occupying  of land, the country is predominantly mountainous with plains in the north and the southwest, which are separated by the Hindu Kush mountain range. Kabul is the country's larges