**Necessary Libraries**

In [None]:
!apt-get install -y poppler-utils tesseract-ocr tesseract-ocr-ben
!pip install gdown pytesseract pdf2image faiss-cpu sentence-transformers transformers gradio

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
The following NEW packages will be installed:
  poppler-utils tesseract-ocr-ben
0 upgraded, 2 newly installed, 0 to remove and 35 not upgraded.
Need to get 702 kB of archives.
After this operation, 1,568 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 poppler-utils amd64 22.02.0-2ubuntu0.8 [186 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-ben all 1:4.00~git30-7274cfa-1.1 [516 kB]
Fetched 702 kB in 1s (1,061 kB/s)
Selecting previously unselected package poppler-utils.
(Reading database ... 126284 files and directories currently installed.)
Preparing to unpack .../poppler-utils_22.02.0-2ubuntu0.8_amd64.deb ...
Unpacking poppler-utils (22.02.0-2ubuntu0.8) ...
Selecting previously unselected package tesseract-ocr-ben.
Preparing to unpack .../t

In [None]:
import gdown
from pdf2image import convert_from_path
import pytesseract
import unicodedata
import re
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss
from transformers import pipeline
import gradio as gr

**Load Dataset**

In [None]:
# Google Drive id of the source PDF and the local filename it is saved under.
file_id = "1YVPzZgXp6nyklR6SfNmLtV8tZJXqJtMH"
file_url = f"https://drive.google.com/uc?id={file_id}"
output = "HSC26-Bangla1st-Paper.pdf"

downloaded_path = gdown.download(file_url, output, quiet=False)
# Some gdown releases signal a failed download by returning None instead of
# raising; stop here rather than letting the OCR cell fail on a missing file.
if downloaded_path is None:
    raise RuntimeError(f"Could not download {file_url} to {output}")
downloaded_path

Downloading...
From: https://drive.google.com/uc?id=1YVPzZgXp6nyklR6SfNmLtV8tZJXqJtMH
To: /content/HSC26-Bangla1st-Paper.pdf
100%|██████████| 1.26M/1.26M [00:00<00:00, 13.4MB/s]


'HSC26-Bangla1st-Paper.pdf'

**OCR from PDF**

In [None]:
# Convert the downloaded PDF into a list of page images.
images = convert_from_path(output)
page_total = len(images)

# OCR each page with the Bengali model; every page's text is followed by a
# blank line in the accumulated string.
text = ""
for page_no, page_image in enumerate(images, start=1):
    print(f"OCR on page {page_no}/{page_total}...")
    page_text = pytesseract.image_to_string(page_image, lang='ben')
    text = text + page_text + "\n\n"

OCR on page 1/49...
OCR on page 2/49...
OCR on page 3/49...
OCR on page 4/49...
OCR on page 5/49...
OCR on page 6/49...
OCR on page 7/49...
OCR on page 8/49...
OCR on page 9/49...
OCR on page 10/49...
OCR on page 11/49...
OCR on page 12/49...
OCR on page 13/49...
OCR on page 14/49...
OCR on page 15/49...
OCR on page 16/49...
OCR on page 17/49...
OCR on page 18/49...
OCR on page 19/49...
OCR on page 20/49...
OCR on page 21/49...
OCR on page 22/49...
OCR on page 23/49...
OCR on page 24/49...
OCR on page 25/49...
OCR on page 26/49...
OCR on page 27/49...
OCR on page 28/49...
OCR on page 29/49...
OCR on page 30/49...
OCR on page 31/49...
OCR on page 32/49...
OCR on page 33/49...
OCR on page 34/49...
OCR on page 35/49...
OCR on page 36/49...
OCR on page 37/49...
OCR on page 38/49...
OCR on page 39/49...
OCR on page 40/49...
OCR on page 41/49...
OCR on page 42/49...
OCR on page 43/49...
OCR on page 44/49...
OCR on page 45/49...
OCR on page 46/49...
OCR on page 47/49...
OCR on page 48/49...
O

**Clean Text**

In [None]:
def clean_bangla_text(text):
    """Normalise OCR output while keeping paragraph boundaries.

    Steps:
      1. Unicode NFC normalisation.
      2. Removal of zero-width non-joiner / joiner characters (U+200C, U+200D).
      3. Whitespace normalisation: a whitespace run that contains a blank line
         (two or more newlines) becomes exactly "\\n\\n"; every other
         whitespace run (spaces, tabs, single line breaks, form feeds)
         becomes a single space.
      4. Leading and trailing whitespace is stripped.

    Collapsing *all* whitespace to a space would erase the "\\n\\n" paragraph
    separators, so a later split on "\\n\\n" would always yield one paragraph.

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned string.
    """
    def _normalise_run(match):
        # A run with at least two newlines encloses a blank line: keep it as a
        # paragraph break. Anything else is ordinary intra-paragraph spacing.
        return "\n\n" if match.group().count("\n") >= 2 else " "

    text = unicodedata.normalize("NFC", text)
    text = re.sub(r"[\u200c\u200d]", "", text)
    text = re.sub(r"\s+", _normalise_run, text)
    return text.strip()

# Rebind `text` to its cleaned form; the later cells (answer-key extraction,
# chunking, MCQ lookup in generate_answer) all read this cleaned string.
text = clean_bangla_text(text)

**Extract MCQ Answer**

In [None]:
def extract_mcq_answer_mapping(raw_text):
    """Build a {question number: answer letter} dict from the text.

    If the marker "৪১।" occurs in ``raw_text``, only the segment between its
    first and second occurrence (or from the first occurrence to the end when
    it appears once) is scanned; otherwise the whole text is scanned.

    Every "<digits><whitespace><ক|খ|গ|ঘ>" occurrence in the scanned segment is
    recorded; when a number appears more than once, the last letter wins.

    Args:
        raw_text: Text to scan.

    Returns:
        dict mapping int question numbers to a single Bangla option letter.
    """
    marker = "৪১।"
    if marker in raw_text:
        section = raw_text.split(marker)[1]
    else:
        section = raw_text

    answer_key = {}
    for hit in re.finditer(r"(\d+)\s+([কখগঘ])", section):
        # int() accepts Bangla as well as ASCII digits.
        answer_key[int(hit.group(1))] = hit.group(2)
    return answer_key

# Build the question-number -> option-letter lookup once from the cleaned text;
# generate_answer() reads it through mcq_answers.get(...).
mcq_answers = extract_mcq_answer_mapping(text)

**Chunk Text**

In [None]:
def split_into_chunks(passage, size, overlap):
    """Cut ``passage`` into windows of at most ``size`` characters.

    Consecutive windows share ``overlap`` characters. Iteration stops as soon
    as a window reaches the end of the passage, so no trailing window is
    emitted that lies entirely inside the previous one.

    Args:
        passage: Text to split.
        size: Maximum window length in characters (must be positive).
        overlap: Characters shared by neighbouring windows (0 <= overlap < size).

    Returns:
        List of substrings; empty when ``passage`` is empty.

    Raises:
        ValueError: If ``size``/``overlap`` would give a non-positive step.
    """
    if size <= 0 or not 0 <= overlap < size:
        raise ValueError("need size > 0 and 0 <= overlap < size")
    step = size - overlap
    pieces = []
    for start in range(0, len(passage), step):
        pieces.append(passage[start:start + size])
        if start + size >= len(passage):
            # This window already covers the tail; a further window would only
            # repeat characters that are in this one.
            break
    return pieces


# Paragraphs are the blank-line-separated pieces of the text; if the text has
# no "\n\n" separators the whole text is handled as one paragraph.
paragraphs = [p.strip() for p in text.split("\n\n") if p.strip()]
chunk_size = 800
chunk_overlap = 100

chunks = []
for para in paragraphs:
    chunks.extend(split_into_chunks(para, chunk_size, chunk_overlap))

**Embed Chunks and Build the FAISS Index**

In [None]:
# Sentence encoder shared by the chunk embeddings here and the query
# embedding in retrieve_chunks().
embedder = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")

# Encode all chunks and cast the result to float32 before indexing.
chunk_vectors = embedder.encode(chunks, show_progress_bar=True)
embeddings = np.array(chunk_vectors).astype("float32")

# Flat L2 index whose dimensionality is the width of the embedding matrix.
vector_dim = embeddings.shape[1]
index = faiss.IndexFlatL2(vector_dim)
index.add(embeddings)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/645 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/471M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

**Retrieve Relevant Chunks**

In [None]:
def retrieve_chunks(query, k=10):
    """Return up to ``k`` chunks nearest to ``query`` in the FAISS index.

    Args:
        query: Question text; embedded with the same encoder as the chunks.
        k: Number of neighbours requested from the index.

    Returns:
        List of chunk strings in the order the index returned them. It can be
        shorter than ``k`` when the index holds fewer than ``k`` vectors.
    """
    query_vec = embedder.encode([query]).astype("float32")
    _, neighbour_ids = index.search(query_vec, k)
    # FAISS fills missing result slots with -1; without this filter chunks[-1]
    # would silently return the last chunk for each empty slot.
    return [chunks[i] for i in neighbour_ids[0] if i >= 0]

**QA Model**

In [None]:
# Question-answering pipeline used by generate_answer(); it is called with
# question=/context= keywords and its result's "answer" field is read.
# NOTE(review): the model id suggests an XLM-RoBERTa checkpoint tuned on
# SQuAD 2.0 -- confirm on the model card.
qa_pipeline = pipeline("question-answering", model="deepset/xlm-roberta-base-squad2")

config.json:   0%|          | 0.00/605 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/79.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

Device set to use cpu


**Mapping Bangla Letter to Option**

In [None]:
# Zero-based position of each Bangla MCQ option letter (ক -> 0 ... ঘ -> 3).
bangla_option_map = {
    letter: position
    for position, letter in enumerate(("ক", "খ", "গ", "ঘ"))
}

**QA Answer Generator with MCQ support**

In [None]:
def _find_mcq_answer(query):
    """Return the answer-key option text for a numbered MCQ query, else None."""
    numbered = re.search(r"(\d+)।", query)
    if not numbered:
        return None

    q_no = int(numbered.group(1))
    option_letter = mcq_answers.get(q_no)
    if not option_letter:
        return None

    # int() normalises Bangla digits to ASCII, so the question number has to be
    # searched in both numeral systems; otherwise "৫।" in the text is never
    # found by a pattern built from the integer 5.
    ascii_no = str(q_no)
    bangla_no = ascii_no.translate(str.maketrans("0123456789", "০১২৩৪৫৬৭৮৯"))
    # (?<!\d) stops "5।" from matching inside "15।".
    pattern = rf"(?<!\d)(?:{ascii_no}|{bangla_no})।(.+?)(?=\d+।|$)"
    mcq_block = re.search(pattern, text, re.DOTALL)
    if not mcq_block:
        return None

    options = re.findall(r"\((ক|খ|গ|ঘ)\)\s*([^\(\)\n\r]+)", mcq_block.group(1))
    for opt, val in options:
        if opt == option_letter:
            return val.strip()
    return None


def generate_answer(query):
    """Answer ``query`` from the MCQ answer key or, failing that, the QA model.

    If the query contains a question number followed by "।" and that number is
    in ``mcq_answers``, the matching option text is looked up in ``text`` and
    returned. Otherwise the top retrieved chunks are joined into a context and
    the extractive QA pipeline's answer is returned.

    The answer-key lookup runs first so the retrieval and QA model are only
    invoked when their result is actually needed.

    Args:
        query: User question (may be empty when submitted from the UI).

    Returns:
        Answer string; "" for an empty or whitespace-only query.
    """
    if not query or not query.strip():
        # Nothing to answer; avoid calling the QA pipeline with an empty question.
        return ""

    mcq_answer = _find_mcq_answer(query)
    if mcq_answer is not None:
        return mcq_answer

    context = "\n\n".join(retrieve_chunks(query, k=10))
    result = qa_pipeline(question=query, context=context)
    return result["answer"]

**Live Demo App**

In [None]:
def qa_interface(query):
    """Gradio callback: pass the textbox contents to generate_answer()."""
    answer = generate_answer(query)
    return answer

# One text input for the question, one text output for the answer.
question_box = gr.Textbox(label="প্রশ্ন লিখুন (বাংলা বা English)")
answer_box = gr.Textbox(label="উত্তর")

demo = gr.Interface(
    fn=qa_interface,
    inputs=question_box,
    outputs=answer_box,
    title="10MS RAG Pipeline",
)
# share=True requests a public share URL in addition to the local server.
demo.launch(share=True)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://dd53fcb18de85b71a3.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


