In [1]:
# Install required packages for Bengali RAG system with OCR and Translation
# Run this cell first to install all necessary packages
# !pip install langchain-openai
# !pip install langchain-community
# !pip install chromadb
# !pip install pymupdf  # For PDF to image conversion
# !pip install pytesseract  # OCR engine
# !pip install Pillow  # Image processing
# !pip install opencv-python  # Image preprocessing
# !pip install numpy
# !pip install deep-translator  # Stable Google Translate API
# !pip install langdetect  # Language detection

# Note: You also need to install Tesseract OCR separately:
# Download from: https://github.com/UB-Mannheim/tesseract/wiki
# Make sure to install Bengali language data (ben.traineddata)

In [2]:
from langchain_community.vectorstores import Chroma
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain
from langchain.schema import Document
import sys
import os
import re
import fitz  # PyMuPDF for PDF to image
import pytesseract
from PIL import Image
import cv2
import numpy as np
from typing import List, Optional, Tuple
from deep_translator import GoogleTranslator
from langdetect import detect
import time
import json

In [None]:
# Set up OpenAI API key
# You can get your API key from: https://platform.openai.com/api-keys
openai_api_key = "KEY"
os.environ["OPENAI_API_KEY"] = openai_api_key

In [4]:
# Configure Tesseract OCR
# Make sure you have installed Tesseract and Bengali language data
# Download Tesseract: https://github.com/UB-Mannheim/tesseract/wiki
# Bengali language data should be in tessdata folder

# Set Tesseract path (adjust according to your installation)
# For Windows: 
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# For Linux/Mac, it's usually in PATH, so you might not need to set this
# pytesseract.pytesseract.tesseract_cmd = '/usr/bin/tesseract'

# Test if Bengali is available
try:
    available_langs = pytesseract.get_languages()
    if 'ben' in available_langs:
        print("тЬЕ Bengali language support is available in Tesseract")
    else:
        print("тЭМ Bengali language support not found. Please install Bengali traineddata.")
        print("Download ben.traineddata from: https://github.com/tesseract-ocr/tessdata")
except Exception as e:
    print(f"тЪая╕П Tesseract configuration issue: {e}")
    print("Please make sure Tesseract is properly installed and configured.")

тЬЕ Bengali language support is available in Tesseract


In [5]:
def preprocess_image_for_ocr(image_array: np.ndarray) -> np.ndarray:
    """
    Preprocess image for better OCR results
    """
    # Convert to grayscale if needed
    if len(image_array.shape) == 3:
        gray = cv2.cvtColor(image_array, cv2.COLOR_RGB2GRAY)
    else:
        gray = image_array
    
    # Increase contrast and brightness
    alpha = 1.2  # Contrast control
    beta = 10    # Brightness control
    adjusted = cv2.convertScaleAbs(gray, alpha=alpha, beta=beta)
    
    # Apply Gaussian blur to reduce noise
    blurred = cv2.GaussianBlur(adjusted, (1, 1), 0)
    
    # Apply threshold to get binary image
    _, thresh = cv2.threshold(blurred, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    
    # Morphological operations to clean up the image
    kernel = np.ones((1, 1), np.uint8)
    cleaned = cv2.morphologyEx(thresh, cv2.MORPH_CLOSE, kernel)
    
    return cleaned

def pdf_page_to_image(pdf_path: str, page_num: int, dpi: int = 300) -> Image.Image:
    """
    Convert a specific PDF page to high-resolution image
    """
    doc = fitz.open(pdf_path)
    page = doc[page_num]
    
    # Create transformation matrix for high DPI
    mat = fitz.Matrix(dpi/72, dpi/72)
    
    # Render page to pixmap
    pix = page.get_pixmap(matrix=mat)
    
    # Convert to PIL Image
    img_data = pix.tobytes("ppm")
    image = Image.open(io.BytesIO(img_data))
    
    doc.close()
    return image

def extract_text_with_ocr(pdf_path: str, page_num: int, dpi: int = 300) -> str:
    """
    Extract text from PDF page using OCR
    """
    try:
        # Convert PDF page to image
        image = pdf_page_to_image(pdf_path, page_num, dpi)
        
        # Convert PIL image to numpy array
        img_array = np.array(image)
        
        # Preprocess image for better OCR
        processed_img = preprocess_image_for_ocr(img_array)
        
        # Convert back to PIL Image for pytesseract
        pil_image = Image.fromarray(processed_img)
        
        # OCR configuration for Bengali
        custom_config = r'--oem 3 --psm 6 -l ben+eng'  # Bengali + English
        
        # Extract text using OCR
        text = pytesseract.image_to_string(pil_image, config=custom_config)
        
        return text.strip()
        
    except Exception as e:
        print(f"Error extracting text from page {page_num}: {e}")
        return ""

def preprocess_bengali_text(text: str) -> str:
    """
    Preprocess Bengali text for better processing
    """
    if not text:
        return ""
    
    # Remove extra whitespaces
    text = re.sub(r'\s+', ' ', text)
    
    # Remove common OCR artifacts
    text = re.sub(r'[^\u0980-\u09FF\u0020-\u007F\u2000-\u206F\u2E00-\u2E7F]', ' ', text)
    
    # Clean up punctuation spacing
    text = re.sub(r'\s+([ред,;:!?])', r'\1', text)
    text = re.sub(r'([ред,;:!?])\s*', r'\1 ', text)
    
    return text.strip()

In [6]:
import io  # For BytesIO
from deep_translator import GoogleTranslator

def detect_language(text: str) -> str:
    """
    Detect the language of the given text
    """
    try:
        if not text.strip():
            return 'unknown'
        
        # Use langdetect for initial detection
        detected = detect(text)
        
        # Bengali is often detected as 'bn'
        if detected == 'bn':
            return 'bengali'
        elif detected == 'en':
            return 'english'
        else:
            # Check for Bengali Unicode characters
            bengali_chars = sum(1 for char in text if '\u0980' <= char <= '\u09FF')
            total_chars = len([char for char in text if char.isalpha()])
            
            if total_chars > 0 and (bengali_chars / total_chars) > 0.3:
                return 'bengali'
            else:
                return 'english'
    except Exception as e:
        print(f"Language detection error: {e}")
        # Fallback: check for Bengali characters
        bengali_chars = sum(1 for char in text if '\u0980' <= char <= '\u09FF')
        if bengali_chars > 5:  # Arbitrary threshold
            return 'bengali'
        return 'english'

def translate_text_to_english(text: str) -> dict:
    """
    Translate Bengali text to English and return both versions with metadata
    
    Returns:
        dict: Contains original text, translated text, detected language, and confidence
    """
    if not text.strip():
        return {
            'original_text': text,
            'translated_text': text,
            'original_language': 'unknown',
            'translation_confidence': 0.0,
            'translation_method': 'none'
        }
    
    # Detect language
    detected_lang = detect_language(text)
    
    if detected_lang == 'english':
        # Already English, no translation needed
        return {
            'original_text': text,
            'translated_text': text,
            'original_language': 'english',
            'translation_confidence': 1.0,
            'translation_method': 'none'
        }
    
    # Translate Bengali to English
    try:
        # Add small delay to avoid rate limiting
        time.sleep(0.2)
        
        # Use deep-translator which is more stable
        translator = GoogleTranslator(source='bn', target='en')
        translated_text = translator.translate(text)
        
        return {
            'original_text': text,
            'translated_text': translated_text,
            'original_language': 'bengali',
            'translation_confidence': 0.9,  # deep-translator doesn't provide confidence, so we use default
            'translation_method': 'deep_translator'
        }
        
    except Exception as e:
        print(f"Translation error: {e}")
        # Fallback: return original text
        return {
            'original_text': text,
            'translated_text': text,
            'original_language': detected_lang,
            'translation_confidence': 0.0,
            'translation_method': 'failed'
        }

def translate_query_to_english(query: str) -> dict:
    """
    Translate user query to English for vector search
    """
    return translate_text_to_english(query)

def translate_response_to_bengali(response: str, target_language: str = 'bengali') -> str:
    """
    Translate English response back to Bengali if needed
    """
    if target_language == 'english':
        return response
    
    try:
        time.sleep(0.2)  # Rate limiting
        translator = GoogleTranslator(source='en', target='bn')
        translated_text = translator.translate(response)
        return translated_text
        
    except Exception as e:
        print(f"Response translation error: {e}")
        return response  # Return original if translation fails

In [7]:
# Test translation functionality
print("ЁЯзк Testing translation function...")

# Test with Bengali text
test_bengali = "ржЕржирзБржкржорзЗрж░ ржмржирзНржзрзБ рж╣рж░рж┐рж╢ ржХрзЛржерж╛ржпрж╝ ржХрж╛ржЬ ржХрж░рзЗ?"
result = translate_text_to_english(test_bengali)

print(f"Original: {result['original_text']}")
print(f"Translated: {result['translated_text']}")
print(f"Language: {result['original_language']}")
print(f"Confidence: {result['translation_confidence']}")
print(f"Method: {result['translation_method']}")

# Test with English text
test_english = "Where does Harish work?"
result2 = translate_text_to_english(test_english)

print(f"\nEnglish test:")
print(f"Original: {result2['original_text']}")
print(f"Translated: {result2['translated_text']}")
print(f"Language: {result2['original_language']}")
print(f"Method: {result2['translation_method']}")

ЁЯзк Testing translation function...
Original: ржЕржирзБржкржорзЗрж░ ржмржирзНржзрзБ рж╣рж░рж┐рж╢ ржХрзЛржерж╛ржпрж╝ ржХрж╛ржЬ ржХрж░рзЗ?
Translated: Where does Anupam's friend Harish work?
Language: bengali
Confidence: 0.9
Method: deep_translator

English test:
Original: Where does Harish work?
Translated: Where does Harish work?
Language: english
Method: none


In [8]:
# Fresh test with deep-translator directly
from deep_translator import GoogleTranslator
from langdetect import detect
import time

print("ЁЯзк Fresh translation test with deep-translator...")

# Test Bengali to English
test_bengali = "ржЕржирзБржкржорзЗрж░ ржмржирзНржзрзБ рж╣рж░рж┐рж╢ ржХрзЛржерж╛ржпрж╝ ржХрж╛ржЬ ржХрж░рзЗ?"
translator = GoogleTranslator(source='bn', target='en')
translated = translator.translate(test_bengali)

print(f"Bengali: {test_bengali}")
print(f"English: {translated}")

# Test language detection
detected_lang = detect(test_bengali)
print(f"Detected language: {detected_lang}")

# Test English to Bengali
english_text = "Where does Harish work?"
translator_back = GoogleTranslator(source='en', target='bn')
bengali_result = translator_back.translate(english_text)

print(f"\nReverse translation:")
print(f"English: {english_text}")
print(f"Bengali: {bengali_result}")

print("тЬЕ Deep-translator is working correctly!")

ЁЯзк Fresh translation test with deep-translator...
Bengali: ржЕржирзБржкржорзЗрж░ ржмржирзНржзрзБ рж╣рж░рж┐рж╢ ржХрзЛржерж╛ржпрж╝ ржХрж╛ржЬ ржХрж░рзЗ?
English: Where does Anupam's friend Harish work?
Detected language: bn

Reverse translation:
English: Where does Harish work?
Bengali: рж╣рж░рж┐рж╢ ржХрзЛржерж╛ржпрж╝ ржХрж╛ржЬ ржХрж░рзЗ?
тЬЕ Deep-translator is working correctly!


In [9]:
def ingest_bengali_documents_with_translation(
    pdf_path: str = "Data/HSC26-Bangla1st-Paper.pdf",
    start_page: Optional[int] = None,
    end_page: Optional[int] = None,
    dpi: int = 300,
    embedding_model: str = "text-embedding-3-small",
    save_translations: bool = True
):
    """
    Ingest Bengali PDF documents using OCR, translate to English, and create vector store
    
    Args:
        pdf_path: Path to the PDF file
        start_page: Starting page number (0-indexed). If None, starts from beginning
        end_page: Ending page number (0-indexed, inclusive). If None, goes to end
        dpi: DPI for image conversion (higher = better quality but slower)
        embedding_model: OpenAI embedding model to use
        save_translations: Whether to save translation metadata to files
    """
    
    # Open PDF to get page count
    doc = fitz.open(pdf_path)
    total_pages = len(doc)
    doc.close()
    
    # Set page range
    start_page = start_page if start_page is not None else 0
    end_page = end_page if end_page is not None else total_pages - 1
    
    # Validate page range
    start_page = max(0, min(start_page, total_pages - 1))
    end_page = max(start_page, min(end_page, total_pages - 1))
    
    print(f"ЁЯФД Processing pages {start_page + 1} to {end_page + 1} (total: {end_page - start_page + 1} pages)")
    print(f"ЁЯУД PDF has {total_pages} total pages")
    print(f"ЁЯМР Using OpenAI embedding model: {embedding_model}")
    print(f"ЁЯФд Translation: Bengali тЖТ English тЖТ Vector Store")
    
    # Extract and translate text from each page
    documents = []
    translation_log = []
    
    for page_num in range(start_page, end_page + 1):
        print(f"\nЁЯУЦ Processing page {page_num + 1}/{total_pages}...")
        
        # Extract text using OCR
        raw_text = extract_text_with_ocr(pdf_path, page_num, dpi)
        
        if raw_text.strip():
            # Preprocess Bengali text
            cleaned_text = preprocess_bengali_text(raw_text)
            
            if cleaned_text:
                print(f"   ЁЯУЭ Extracted: {len(cleaned_text)} characters")
                
                # Translate text to English
                print(f"   ЁЯМР Translating to English...")
                translation_result = translate_text_to_english(cleaned_text)
                
                translated_text = translation_result['translated_text']
                original_language = translation_result['original_language']
                confidence = translation_result['translation_confidence']
                
                print(f"   тЬЕ Translation complete (confidence: {confidence:.2f})")
                print(f"   ЁЯУК Language: {original_language}")
                print(f"   ЁЯУП English text: {len(translated_text)} characters")
                
                # Create document with English text for vector store
                doc = Document(
                    page_content=translated_text,  # Store English translation
                    metadata={
                        "source": pdf_path,
                        "page": page_num + 1,
                        "extraction_method": "OCR",
                        "dpi": dpi,
                        "embedding_model": embedding_model,
                        "original_language": original_language,
                        "translation_confidence": confidence,
                        "original_text_length": len(cleaned_text),
                        "translated_text_length": len(translated_text),
                        "has_translation": original_language == 'bengali'
                    }
                )
                documents.append(doc)
                
                # Keep translation log
                translation_log.append({
                    "page": page_num + 1,
                    "original_text": cleaned_text,
                    "translated_text": translated_text,
                    "original_language": original_language,
                    "confidence": confidence
                })
                
            else:
                print(f"   тЪая╕П No valid text after cleaning")
        else:
            print(f"   тЭМ No text extracted")
    
    if not documents:
        print("тЭМ No text was extracted from any pages!")
        return None
    
    print(f"\nЁЯУД Total documents created: {len(documents)}")
    
    # Save translation log if requested
    if save_translations and translation_log:
        translation_file = f"translation_log_pages_{start_page + 1}-{end_page + 1}.json"
        with open(translation_file, 'w', encoding='utf-8') as f:
            json.dump(translation_log, f, ensure_ascii=False, indent=2)
        print(f"ЁЯТ╛ Translation log saved: {translation_file}")
    
    # Split the translated documents
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,  # Optimized for English text
        chunk_overlap=200,
        length_function=len,
        add_start_index=True,
        separators=[
            "\n\n",  # Paragraph breaks
            "\n",    # Line breaks
            ". ",    # English sentences
            "? ",    # Questions
            "! ",    # Exclamations
            " ",     # Space
            "",      # Character level
        ],
    )
    
    chunks = text_splitter.split_documents(documents)
    print(f"ЁЯУЭ Split {len(documents)} documents into {len(chunks)} chunks.")
    
    # Create embeddings for English text
    print(f"ЁЯФД Creating English embeddings using {embedding_model}...")
    embedding = OpenAIEmbeddings(model=embedding_model)
    
    # Create vector store with English translations
    vector_store = Chroma.from_documents(
        documents=chunks, 
        embedding=embedding, 
        persist_directory="./bengali_translated_english_db"  # New directory for translated content
    )
    
    print("тЬЕ Bengali document ingestion with translation completed!")
    print(f"ЁЯОп Vector store contains English translations for multilingual search")
    
    return vector_store

In [10]:
# Run this to create the translated English vector store from Bengali documents
# This approach allows both Bengali and English queries to work on the same vector store

# Process pages and translate to English for unified vector store
ingest_bengali_documents_with_translation(
    pdf_path="Data/HSC26-Bangla1st-Paper.pdf",
    start_page=1,    # Same page range as before
    end_page=50,      # Same page range as before
    dpi=400,          # High DPI for better OCR
    embedding_model="text-embedding-3-small",
    save_translations=True  # Save translation log for reference
)

ЁЯФД Processing pages 2 to 49 (total: 48 pages)
ЁЯУД PDF has 49 total pages
ЁЯМР Using OpenAI embedding model: text-embedding-3-small
ЁЯФд Translation: Bengali тЖТ English тЖТ Vector Store

ЁЯУЦ Processing page 2/49...
   ЁЯУЭ Extracted: 1275 characters
   ЁЯМР Translating to English...
   тЬЕ Translation complete (confidence: 0.90)
   ЁЯУК Language: bengali
   ЁЯУП English text: 784 characters

ЁЯУЦ Processing page 3/49...
   ЁЯУЭ Extracted: 1162 characters
   ЁЯМР Translating to English...
   тЬЕ Translation complete (confidence: 0.90)
   ЁЯУК Language: bengali
   ЁЯУП English text: 939 characters

ЁЯУЦ Processing page 4/49...
   ЁЯУЭ Extracted: 471 characters
   ЁЯМР Translating to English...
   тЬЕ Translation complete (confidence: 0.90)
   ЁЯУК Language: bengali
   ЁЯУП English text: 370 characters

ЁЯУЦ Processing page 5/49...
   ЁЯУЭ Extracted: 185 characters
   ЁЯМР Translating to English...
   тЬЕ Translation complete (confidence: 0.90)
   ЁЯУК Language: bengali
   ЁЯУП Englis

<langchain_community.vectorstores.chroma.Chroma at 0x215272c3dd0>

In [11]:
# Utility function to preview OCR results before full processing
def preview_ocr_extraction(
    pdf_path: str = "Data/HSC26-Bangla1st-Paper.pdf", 
    page_num: int = 0, 
    dpi: int = 300
):
    """
    Preview OCR extraction results for a specific page
    """
    print(f"ЁЯФН Previewing OCR extraction for page {page_num + 1}")
    print("=" * 60)
    
    # Extract text using OCR
    raw_text = extract_text_with_ocr(pdf_path, page_num, dpi)
    cleaned_text = preprocess_bengali_text(raw_text)
    
    print(f"ЁЯУД Raw text length: {len(raw_text)} characters")
    print(f"ЁЯз╣ Cleaned text length: {len(cleaned_text)} characters")
    print("\n" + "=" * 60)
    print("ЁЯУЭ Raw OCR Output (first 500 chars):")
    print("-" * 40)
    print(raw_text[:500])
    print("\n" + "=" * 60)
    print("тЬи Cleaned Text (first 500 chars):")
    print("-" * 40)
    print(cleaned_text[:500])
    
    return raw_text, cleaned_text

# Test OCR on first page
preview_ocr_extraction("Data/HSC26-Bangla1st-Paper.pdf", page_num=42, dpi=300)

ЁЯФН Previewing OCR extraction for page 43
ЁЯУД Raw text length: 1914 characters
ЁЯз╣ Cleaned text length: 1912 characters

ЁЯУЭ Raw OCR Output (first 500 chars):
----------------------------------------
1911
ржкрзНрж░рж╢рзНрже- рзи: ржкржбрж╝рж╛рж╢рзБржирж╛ рж╢рзЗрж╖ ржХрж░рзЗ рж╕ржмрж┐рждрж╛ ржПржЦржи ржЧрзНрж░рж╛ржорзЗрж░ ржПржХржЯрж┐ рж╕рж░ржХрж╛рж░рж┐ ржкрзНрж░рж╛ржЗржорж╛рж░рж┐ рж╕рзНржХрзБрж▓рзЗ рж╢рж┐ржХрзНрж╖ржХрждрж╛ ржХрж░рзЗржиред ржмржЫрж░
ржХржпрж╝рзЗржХ ржЖржЧрзЗ рж╢рж╣рж░рзЗрж░ ржПржХ ржзржирзА ржмрзНржпржмрж╕рж╛ржпрж╝рзАрж░ ржЫрзЗрж▓рзЗрж░ рж╕рж╛ржерзЗ рждрж╛рж░ ржмрж┐ржмрж╛рж╣ рж╕рзНржерж┐рж░ рж╣ржпрж╝ред ржкрж╛рждрзНрж░ржкржХрзНрж╖ ржмрж┐ржпрж╝рзЗрждрзЗ ржорзЛржЯрж╛ ржЕржЩрзНржХрзЗрж░
ржпрзМрждрзБржХ ржжрж╛ржмрж┐ ржХрж░рж▓рзЗ рждрж╛рж░ ржЖрждрзНржорж╕ржорзНржорж╛ржирзЗ ржЖржШрж╛ржд рж▓рж╛ржЧрзЗред рж╕ржмрж┐рждрж╛ ржирж┐ржЬрзЗржЗ ржпрзМрждрзБржХржХрзЗ ржкрзНрж░рждрзНржпрж╛ржЦрзНржпрж╛ржи ржХрж░рзЗ ржмрж┐ржпрж╝рзЗ ржирж╛ ржХрж░рж╛рж░
рж╕рж┐ржжрзНржзрж╛ржирзНрждрзЗ ржЕр

('1911\nржкрзНрж░рж╢рзНрже- рзи: ржкржбрж╝рж╛рж╢рзБржирж╛ рж╢рзЗрж╖ ржХрж░рзЗ рж╕ржмрж┐рждрж╛ ржПржЦржи ржЧрзНрж░рж╛ржорзЗрж░ ржПржХржЯрж┐ рж╕рж░ржХрж╛рж░рж┐ ржкрзНрж░рж╛ржЗржорж╛рж░рж┐ рж╕рзНржХрзБрж▓рзЗ рж╢рж┐ржХрзНрж╖ржХрждрж╛ ржХрж░рзЗржиред ржмржЫрж░\nржХржпрж╝рзЗржХ ржЖржЧрзЗ рж╢рж╣рж░рзЗрж░ ржПржХ ржзржирзА ржмрзНржпржмрж╕рж╛ржпрж╝рзАрж░ ржЫрзЗрж▓рзЗрж░ рж╕рж╛ржерзЗ рждрж╛рж░ ржмрж┐ржмрж╛рж╣ рж╕рзНржерж┐рж░ рж╣ржпрж╝ред ржкрж╛рждрзНрж░ржкржХрзНрж╖ ржмрж┐ржпрж╝рзЗрждрзЗ ржорзЛржЯрж╛ ржЕржЩрзНржХрзЗрж░\nржпрзМрждрзБржХ ржжрж╛ржмрж┐ ржХрж░рж▓рзЗ рждрж╛рж░ ржЖрждрзНржорж╕ржорзНржорж╛ржирзЗ ржЖржШрж╛ржд рж▓рж╛ржЧрзЗред рж╕ржмрж┐рждрж╛ ржирж┐ржЬрзЗржЗ ржпрзМрждрзБржХржХрзЗ ржкрзНрж░рждрзНржпрж╛ржЦрзНржпрж╛ржи ржХрж░рзЗ ржмрж┐ржпрж╝рзЗ ржирж╛ ржХрж░рж╛рж░\nрж╕рж┐ржжрзНржзрж╛ржирзНрждрзЗ ржЕржЯрж▓ ржерж╛ржХрзЗржиред ржкрж┐рждрж╛ржорж╛рждрж╛ ржУ рж╕рж╣ржХрж░рзНржорзАржжрзЗрж░ ржЕржирзЗржХ ржЕржирзБрж░рзЛржз рж╕рждрзНрждрзНржмрзЗржУ рждрж┐ржирж┐ рждрж╛рж░ ржЪрж┐ржирзНрждрж╛-ржЪрзЗрждржирж

In [12]:
def create_multilingual_rag_chain(embedding_model: str = "text-embedding-3-small"):
    """
    Create RAG chain that works with translated English vector store but supports multilingual queries
    
    Args:
        embedding_model: OpenAI embedding model to use for retrieval
    """
    # Use OpenAI GPT model with multilingual capabilities
    model = ChatOpenAI(
        model="gpt-4",  # GPT-4 has excellent multilingual support
        temperature=0.2,
        max_tokens=1500
    )
    
    # Multilingual prompt template that can handle both Bengali and English
    prompt = PromptTemplate.from_template(
        """
        You are a helpful multilingual assistant. You have access to context that was originally in Bengali but has been translated to English for processing.
        
        IMPORTANT INSTRUCTIONS:
        1. Answer based only on the provided context
        2. If the user asks in Bengali, respond in Bengali
        3. If the user asks in English, respond in English  
        4. If you don't know the answer, say "No context available for this question" in the same language as the question
        5. The context provided is English translations of originally Bengali content
        
        User Question: {input}
        Context (English translations): {context}
        
        Answer (respond in the same language as the question):
        """
    )
    
    # Load vector store with English translations
    print(f"ЁЯФД Loading English vector store with OpenAI embeddings ({embedding_model})...")
    embedding = OpenAIEmbeddings(model=embedding_model)
    vector_store = Chroma(
        persist_directory="./bengali_translated_english_db", 
        embedding_function=embedding
    )

    # Create retriever
    retriever = vector_store.as_retriever(
        search_type="similarity_score_threshold",
        search_kwargs={
            "k": 5,  # Retrieve more documents for better context
            "score_threshold": 0.1,  # Lower threshold for OpenAI embeddings
        },
    )

    document_chain = create_stuff_documents_chain(model, prompt)
    chain = create_retrieval_chain(retriever, document_chain)
    
    return chain

In [13]:
def ask_multilingual(query: str, embedding_model: str = "text-embedding-3-small", show_translation_details: bool = True):
    """
    Ask questions in Bengali or English using the translated vector store
    
    Args:
        query: The question to ask (in Bengali or English)
        embedding_model: OpenAI embedding model to use for retrieval
        show_translation_details: Whether to show translation process details
    """
    
    print(f"ЁЯФН Processing query: '{query}'")
    
    # Detect and translate query if needed
    query_translation = translate_query_to_english(query)
    original_language = query_translation['original_language']
    english_query = query_translation['translated_text']
    
    if show_translation_details:
        print(f"ЁЯМР Detected language: {original_language}")
        if original_language == 'bengali':
            print(f"ЁЯФд English translation: '{english_query}'")
        print(f"ЁЯУК Translation confidence: {query_translation['translation_confidence']:.2f}")
    
    # Search vector store with English query
    print(f"ЁЯФН Searching English vector store...")
    chain = create_multilingual_rag_chain(embedding_model=embedding_model)
    
    # Use original query for context but search happens with English translation internally
    result = chain.invoke({"input": query})
    
    # The model should respond in the same language as the original query
    answer = result["answer"]
    
    # Print results
    print("\n" + "=" * 60)
    print("ржЙрждрзНрждрж░ / Answer:")
    print("-" * 50)
    print(answer)
    
    if show_translation_details:
        print(f"\nЁЯМР Response language detection: {detect_language(answer)}")
    
    print("\n" + "=" * 60)
    print("рж╕рзВрждрзНрж░ / Sources (from English translations):")
    print("-" * 50)
    
    for i, doc in enumerate(result["context"], 1):
        print(f"{i}. Source: {doc.metadata['source']}")
        if 'page' in doc.metadata:
            print(f"   Page: {doc.metadata['page']}")
        if 'original_language' in doc.metadata:
            print(f"   Original Language: {doc.metadata['original_language']}")
        if 'translation_confidence' in doc.metadata:
            print(f"   Translation Confidence: {doc.metadata['translation_confidence']:.2f}")
        print(f"   Content preview (English): {doc.page_content[:100]}...")
        print()
    
    return {
        'original_query': query,
        'english_query': english_query,
        'query_language': original_language,
        'answer': answer,
        'context': result["context"],
        'translation_details': query_translation
    }

In [14]:
# Example: Search for similar content using OpenAI embeddings
# search_similar_bengali_content("ржЖржорж┐ ржЖрж╢рж╛", k=3, embedding_model="text-embedding-3-small")

In [15]:
# Test with Bengali question - should work with English vector store via translation
ask_multilingual("ржЕржирзБржкржорзЗрж░ ржмржирзНржзрзБ рж╣рж░рж┐рж╢ ржХрзЛржерж╛ржпрж╝ ржХрж╛ржЬ ржХрж░рзЗ?", embedding_model="text-embedding-3-small")

ЁЯФН Processing query: 'ржЕржирзБржкржорзЗрж░ ржмржирзНржзрзБ рж╣рж░рж┐рж╢ ржХрзЛржерж╛ржпрж╝ ржХрж╛ржЬ ржХрж░рзЗ?'
ЁЯМР Detected language: bengali
ЁЯФд English translation: 'Where does Anupam's friend Harish work?'
ЁЯУК Translation confidence: 0.90
ЁЯФН Searching English vector store...
ЁЯФД Loading English vector store with OpenAI embeddings (text-embedding-3-small)...


  vector_store = Chroma(



ржЙрждрзНрждрж░ / Answer:
--------------------------------------------------
ржЕржирзБржкржорзЗрж░ ржмржирзНржзрзБ рж╣рж░рж┐рж╢ ржХрж╛ржиржкрзБрж░рзЗ ржХрж╛ржЬ ржХрж░рзЗред

ЁЯМР Response language detection: bengali

рж╕рзВрждрзНрж░ / Sources (from English translations):
--------------------------------------------------
1. Source: Data/HSC26-Bangla1st-Paper.pdf
   Page: 43
   Original Language: bengali
   Translation Confidence: 0.00
   Content preview (English): 19111] ржкрзНрж░рж╢рзНржи- рзи: ржкржбрж╝рж╛рж╢рзБржирж╛ рж╢рзЗрж╖ ржХрж░рзЗ рж╕ржмрж┐рждрж╛ ржПржЦржи ржЧрзНрж░рж╛ржорзЗрж░ ржПржХржЯрж┐ рж╕рж░ржХрж╛рж░рж┐ ржкрзНрж░рж╛ржЗржорж╛рж░рж┐ рж╕рзНржХрзБрж▓рзЗ рж╢рж┐ржХрзНрж╖ржХрждрж╛ ржХрж░рзЗржи  ржмржЫрж░...

2. Source: Data/HSC26-Bangla1st-Paper.pdf
   Page: 43
   Original Language: bengali
   Translation Confidence: 0.00
   Content preview (English): 19111] ржкрзНрж░рж╢рзНржи- рзи: ржкржбрж╝рж╛рж╢рзБржирж╛ рж╢рзЗрж╖ ржХрж░рзЗ рж╕ржмрж┐рждрж╛ ржПржЦржи ржЧрзНрж░рж╛ржорзЗрж░ рж

{'original_query': 'ржЕржирзБржкржорзЗрж░ ржмржирзНржзрзБ рж╣рж░рж┐рж╢ ржХрзЛржерж╛ржпрж╝ ржХрж╛ржЬ ржХрж░рзЗ?',
 'english_query': "Where does Anupam's friend Harish work?",
 'query_language': 'bengali',
 'answer': 'ржЕржирзБржкржорзЗрж░ ржмржирзНржзрзБ рж╣рж░рж┐рж╢ ржХрж╛ржиржкрзБрж░рзЗ ржХрж╛ржЬ ржХрж░рзЗред',
 'context': [Document(metadata={'dpi': 400, 'embedding_model': 'text-embedding-3-small', 'extraction_method': 'OCR', 'has_translation': True, 'original_language': 'bengali', 'original_text_length': 1906, 'page': 43, 'source': 'Data/HSC26-Bangla1st-Paper.pdf', 'start_index': 0, 'translated_text_length': 1906, 'translation_confidence': 0.0}, page_content='19111] ржкрзНрж░рж╢рзНржи- рзи: ржкржбрж╝рж╛рж╢рзБржирж╛ рж╢рзЗрж╖ ржХрж░рзЗ рж╕ржмрж┐рждрж╛ ржПржЦржи ржЧрзНрж░рж╛ржорзЗрж░ ржПржХржЯрж┐ рж╕рж░ржХрж╛рж░рж┐ ржкрзНрж░рж╛ржЗржорж╛рж░рж┐ рж╕рзНржХрзБрж▓рзЗ рж╢рж┐ржХрзНрж╖ржХрждрж╛ ржХрж░рзЗржи  ржмржЫрж░ ржХржпрж╝рзЗржХ ржЖржЧрзЗ рж╢рж╣рж░рзЗрж░ ржПржХ ржзржирзА ржмрзНржпржмрж╕рж╛

In [16]:
# Test with English question - should work directly with English vector store
ask_multilingual("Where does Anupam's friend Harish work?", embedding_model="text-embedding-3-small")

ЁЯФН Processing query: 'Where does Anupam's friend Harish work?'
ЁЯМР Detected language: english
ЁЯУК Translation confidence: 1.00
ЁЯФН Searching English vector store...
ЁЯФД Loading English vector store with OpenAI embeddings (text-embedding-3-small)...

ржЙрждрзНрждрж░ / Answer:
--------------------------------------------------
Harish works in Kolkata.

ЁЯМР Response language detection: english

рж╕рзВрждрзНрж░ / Sources (from English translations):
--------------------------------------------------
1. Source: Data/HSC26-Bangla1st-Paper.pdf
   Page: 33
   Original Language: bengali
   Translation Confidence: 0.90
   Content preview (English): ? (A) BA side (b) MA side (c) BSc side (d) MSc side 1 'girl am, but whose quote is? (A) Anupam (b) H...

2. Source: Data/HSC26-Bangla1st-Paper.pdf
   Page: 32
   Original Language: bengali
   Translation Confidence: 0.90
   Content preview (English): 281 Who is Anupam's friend in the stranger story? [D.B. C 21-5] (a) Binuda (b) Kalyani (c) Hari

{'original_query': "Where does Anupam's friend Harish work?",
 'english_query': "Where does Anupam's friend Harish work?",
 'query_language': 'english',
 'answer': 'Harish works in Kolkata.',
 'context': [Document(metadata={'dpi': 400, 'embedding_model': 'text-embedding-3-small', 'extraction_method': 'OCR', 'has_translation': True, 'original_language': 'bengali', 'original_text_length': 1361, 'page': 33, 'source': 'Data/HSC26-Bangla1st-Paper.pdf', 'start_index': 892, 'translated_text_length': 1536, 'translation_confidence': 0.9}, page_content="? (A) BA side (b) MA side (c) BSc side (d) MSc side 1 'girl am, but whose quote is? (A) Anupam (b) Harish (c) Wala (d) Mama's 3 'unfamiliar' story, who is a joke in the story? (A) Anupam (b) Ghatak (c) Harish (d) Mama 20 'Once the mama talk to the uncle, whose statement is? (A) Binuda (b) Shastunath (c) Harish (d) Where did the 20 Harish of Anupam work? (A) In Kolkata (b) Andaman (c) Rajpur (d) 22 'in Kanpur' at one time their clan was filled wit

In [17]:
ask_multilingual("ржмрж┐ржмрж╛рж╣ ржнрж╛ржЩрж╛рж░ ржкрж░ рж╣рждрзЗ ржХрж▓рзНржпрж╛ржгрзА ржХрзЛржи ржмрзНрж░ржд ржЧрзНрж░рж╣ржг ржХрж░рзЗ?", embedding_model="text-embedding-3-small")

ЁЯФН Processing query: 'ржмрж┐ржмрж╛рж╣ ржнрж╛ржЩрж╛рж░ ржкрж░ рж╣рждрзЗ ржХрж▓рзНржпрж╛ржгрзА ржХрзЛржи ржмрзНрж░ржд ржЧрзНрж░рж╣ржг ржХрж░рзЗ?'
ЁЯМР Detected language: bengali
ЁЯФд English translation: 'What is Kalyani accepts after marriage breaks?'
ЁЯУК Translation confidence: 0.90
ЁЯФН Searching English vector store...
ЁЯФД Loading English vector store with OpenAI embeddings (text-embedding-3-small)...

ржЙрждрзНрждрж░ / Answer:
--------------------------------------------------
ржмрж┐ржмрж╛рж╣ ржнрж╛ржЩрж╛рж░ ржкрж░ рж╣рждрзЗ ржХрж▓рзНржпрж╛ржгрзА ржорзЗржпрж╝рзЗржжрзЗрж░ рж╢рж┐ржХрзНрж╖рж╛рж░ ржмрзНрж░ржд ржЧрзНрж░рж╣ржг ржХрж░рзЗред

ЁЯМР Response language detection: bengali

рж╕рзВрждрзНрж░ / Sources (from English translations):
--------------------------------------------------
1. Source: Data/HSC26-Bangla1st-Paper.pdf
   Page: 46
   Original Language: bengali
   Translation Confidence: 0.00
   Content preview (English): . ржмрж┐ржмрж╛рж╣ ржнрж╛ржЩрж╛рж░ ржкрж░ рж╣рждрзЗ ржХр

{'original_query': 'ржмрж┐ржмрж╛рж╣ ржнрж╛ржЩрж╛рж░ ржкрж░ рж╣рждрзЗ ржХрж▓рзНржпрж╛ржгрзА ржХрзЛржи ржмрзНрж░ржд ржЧрзНрж░рж╣ржг ржХрж░рзЗ?',
 'english_query': 'What is Kalyani accepts after marriage breaks?',
 'query_language': 'bengali',
 'answer': 'ржмрж┐ржмрж╛рж╣ ржнрж╛ржЩрж╛рж░ ржкрж░ рж╣рждрзЗ ржХрж▓рзНржпрж╛ржгрзА ржорзЗржпрж╝рзЗржжрзЗрж░ рж╢рж┐ржХрзНрж╖рж╛рж░ ржмрзНрж░ржд ржЧрзНрж░рж╣ржг ржХрж░рзЗред',
 'context': [Document(metadata={'dpi': 400, 'embedding_model': 'text-embedding-3-small', 'extraction_method': 'OCR', 'has_translation': True, 'original_language': 'bengali', 'original_text_length': 2221, 'page': 46, 'source': 'Data/HSC26-Bangla1st-Paper.pdf', 'start_index': 1473, 'translated_text_length': 2221, 'translation_confidence': 0.0}, page_content='. ржмрж┐ржмрж╛рж╣ ржнрж╛ржЩрж╛рж░ ржкрж░ рж╣рждрзЗ ржХрж▓рзНржпрж╛ржгрзА ржХрзЛржи ржмрзНрж░ржд ржЧрзНрж░рж╣ржг ржХрж░рзЗ? ржЦ. "ржПржЯрж╛ ржЖржкржирж╛ржжрзЗрж░ ржЬрж┐ржирж┐рж╕, ржЖржкржирж╛ржжрзЗрж░ ржХрж╛ржЫрзЗржЗ ржерж╛ржХ 

In [18]:
ask_multilingual("ржмрж┐ржмрж╛рж╣рзЗрж░ рж╕ржорзЯ ржХрж▓рзНржпрж╛ржгрзАрж░ ржкрзНрж░ржХрзГржд ржмрзЯрж╕ ржХржд ржЫрж┐рж▓?", embedding_model="text-embedding-3-small")

ЁЯФН Processing query: 'ржмрж┐ржмрж╛рж╣рзЗрж░ рж╕ржорзЯ ржХрж▓рзНржпрж╛ржгрзАрж░ ржкрзНрж░ржХрзГржд ржмрзЯрж╕ ржХржд ржЫрж┐рж▓?'
ЁЯМР Detected language: bengali
ЁЯФд English translation: 'What was the actual age of the welfare at the time of marriage?'
ЁЯУК Translation confidence: 0.90
ЁЯФН Searching English vector store...
ЁЯФД Loading English vector store with OpenAI embeddings (text-embedding-3-small)...

ржЙрждрзНрждрж░ / Answer:
--------------------------------------------------
ржПржЗ ржкрзНрж░рж╢рзНржирзЗрж░ ржЬржирзНржп ржХрзЛржи ржкрзНрж░рж╕ржЩрзНржЧ ржЙржкрж▓ржмрзНржз ржирзЗржЗред

ЁЯМР Response language detection: bengali

рж╕рзВрждрзНрж░ / Sources (from English translations):
--------------------------------------------------
1. Source: Data/HSC26-Bangla1st-Paper.pdf
   Page: 8
   Original Language: bengali
   Translation Confidence: 0.00
   Content preview (English): ! ржпрзЗржЦрж╛ржирзЗ ржЖржорж░рж╛ ржмрж▓рж┐ 'ржЪржорзОржХрж╛рж░' рж╕рзЗржЦрж╛ржирзЗ рждрж┐ржирж┐ ржмрж▓рзЗр

{'original_query': 'ржмрж┐ржмрж╛рж╣рзЗрж░ рж╕ржорзЯ ржХрж▓рзНржпрж╛ржгрзАрж░ ржкрзНрж░ржХрзГржд ржмрзЯрж╕ ржХржд ржЫрж┐рж▓?',
 'english_query': 'What was the actual age of the welfare at the time of marriage?',
 'query_language': 'bengali',
 'answer': 'ржПржЗ ржкрзНрж░рж╢рзНржирзЗрж░ ржЬржирзНржп ржХрзЛржи ржкрзНрж░рж╕ржЩрзНржЧ ржЙржкрж▓ржмрзНржз ржирзЗржЗред',
 'context': [Document(metadata={'dpi': 400, 'embedding_model': 'text-embedding-3-small', 'extraction_method': 'OCR', 'has_translation': True, 'original_language': 'bengali', 'original_text_length': 2396, 'page': 8, 'source': 'Data/HSC26-Bangla1st-Paper.pdf', 'start_index': 59, 'translated_text_length': 2396, 'translation_confidence': 0.0}, page_content="! ржпрзЗржЦрж╛ржирзЗ ржЖржорж░рж╛ ржмрж▓рж┐ 'ржЪржорзОржХрж╛рж░' рж╕рзЗржЦрж╛ржирзЗ рждрж┐ржирж┐ ржмрж▓рзЗржи тАШBATHSтАЩ | ржЕрждржПржм ржмрзБржЭрж┐рж▓рж╛ржо, ржЖржорж╛рж░ ржнрж╛ржЧрзНржпрзЗ ржкрзНрж░ржЬрж╛ржкрждрж┐рж░ рж╕ржЩрзНржЧрзЗ ржкржЮрзНржЪрж╢рж░рзЗрж░ ржХрзЛржирзЛ ржмрж┐

In [19]:
ask_multilingual("ржЕржирзБржкрзЗржорж░ ржнрж╛рж╖рж╛рзЯ рж╕рзБржкрзБрж░рзБрж╖ ржХрж╛ржХрзЗ ржмрж▓рж╛ рж╣рзЯрзЗржЫрзЗ?")

ЁЯФН Processing query: 'ржЕржирзБржкрзЗржорж░ ржнрж╛рж╖рж╛рзЯ рж╕рзБржкрзБрж░рзБрж╖ ржХрж╛ржХрзЗ ржмрж▓рж╛ рж╣рзЯрзЗржЫрзЗ?'
ЁЯМР Detected language: bengali
ЁЯФд English translation: 'Who is said to be a gentleman in the language of Anupam?'
ЁЯУК Translation confidence: 0.90
ЁЯФН Searching English vector store...
ЁЯФД Loading English vector store with OpenAI embeddings (text-embedding-3-small)...

ржЙрждрзНрждрж░ / Answer:
--------------------------------------------------
рж╕рзБржкрзБрж░рзБрж╖ ржмржЯрзЗ ржмрж▓рж╛ рж╣рзЯрзЗржЫрзЗред

ЁЯМР Response language detection: bengali

рж╕рзВрждрзНрж░ / Sources (from English translations):
--------------------------------------------------
1. Source: Data/HSC26-Bangla1st-Paper.pdf
   Page: 8
   Original Language: bengali
   Translation Confidence: 0.00
   Content preview (English): ! ржпрзЗржЦрж╛ржирзЗ ржЖржорж░рж╛ ржмрж▓рж┐ 'ржЪржорзОржХрж╛рж░' рж╕рзЗржЦрж╛ржирзЗ рждрж┐ржирж┐ ржмрж▓рзЗржи тАШBATHSтАЩ | ржЕрждржПржм ржмрзБржЭрж┐рж▓рж╛ржо, ржЖржо

{'original_query': 'ржЕржирзБржкрзЗржорж░ ржнрж╛рж╖рж╛рзЯ рж╕рзБржкрзБрж░рзБрж╖ ржХрж╛ржХрзЗ ржмрж▓рж╛ рж╣рзЯрзЗржЫрзЗ?',
 'english_query': 'Who is said to be a gentleman in the language of Anupam?',
 'query_language': 'bengali',
 'answer': 'рж╕рзБржкрзБрж░рзБрж╖ ржмржЯрзЗ ржмрж▓рж╛ рж╣рзЯрзЗржЫрзЗред',
 'context': [Document(metadata={'dpi': 400, 'embedding_model': 'text-embedding-3-small', 'extraction_method': 'OCR', 'has_translation': True, 'original_language': 'bengali', 'original_text_length': 2396, 'page': 8, 'source': 'Data/HSC26-Bangla1st-Paper.pdf', 'start_index': 59, 'translated_text_length': 2396, 'translation_confidence': 0.0}, page_content="! ржпрзЗржЦрж╛ржирзЗ ржЖржорж░рж╛ ржмрж▓рж┐ 'ржЪржорзОржХрж╛рж░' рж╕рзЗржЦрж╛ржирзЗ рждрж┐ржирж┐ ржмрж▓рзЗржи тАШBATHSтАЩ | ржЕрждржПржм ржмрзБржЭрж┐рж▓рж╛ржо, ржЖржорж╛рж░ ржнрж╛ржЧрзНржпрзЗ ржкрзНрж░ржЬрж╛ржкрждрж┐рж░ рж╕ржЩрзНржЧрзЗ ржкржЮрзНржЪрж╢рж░рзЗрж░ ржХрзЛржирзЛ ржмрж┐рж░рзЛржз ржирж╛ржЗ  ржмрж▓рж╛ ржмрж╛рж╣рзБрж▓рзНржп, ржмрж

In [20]:
ask_multilingual("ржХрзЗ ржЕржирзБржкрзЗржорж░ ржнрж╛ржЧрзНржп ржжрзЗржмрждрж╛ ржмрж▓рзЗ ржЙрж▓рзНрж▓рзЗржЦ ржХрж░рж╛ рж╣рзЯрзЗржЫрзЗ?")

ЁЯФН Processing query: 'ржХрзЗ ржЕржирзБржкрзЗржорж░ ржнрж╛ржЧрзНржп ржжрзЗржмрждрж╛ ржмрж▓рзЗ ржЙрж▓рзНрж▓рзЗржЦ ржХрж░рж╛ рж╣рзЯрзЗржЫрзЗ?'
ЁЯМР Detected language: bengali
ЁЯФд English translation: 'Who is referred to as the god of the fate of Anupam?'
ЁЯУК Translation confidence: 0.90
ЁЯФН Searching English vector store...
ЁЯФД Loading English vector store with OpenAI embeddings (text-embedding-3-small)...

ржЙрждрзНрждрж░ / Answer:
--------------------------------------------------
ржПржЗ ржкрзНрж░рж╢рзНржирзЗрж░ ржЙрждрзНрждрж░рзЗрж░ ржЬржирзНржп ржкрзНрж░ржжрждрзНржд ржкрзНрж░рж╕ржЩрзНржЧ ржерзЗржХрзЗ ржХрзЛржи рждржерзНржп ржкрж╛ржУрзЯрж╛ ржпрж╛рзЯржирж┐ред

ЁЯМР Response language detection: bengali

рж╕рзВрждрзНрж░ / Sources (from English translations):
--------------------------------------------------
1. Source: Data/HSC26-Bangla1st-Paper.pdf
   Page: 49
   Original Language: bengali
   Translation Confidence: 0.00
   Content preview (English): . 'рж╕рзЗржЗ рж▓ржЧрзНржирзЗ рж

{'original_query': 'ржХрзЗ ржЕржирзБржкрзЗржорж░ ржнрж╛ржЧрзНржп ржжрзЗржмрждрж╛ ржмрж▓рзЗ ржЙрж▓рзНрж▓рзЗржЦ ржХрж░рж╛ рж╣рзЯрзЗржЫрзЗ?',
 'english_query': 'Who is referred to as the god of the fate of Anupam?',
 'query_language': 'bengali',
 'answer': 'ржПржЗ ржкрзНрж░рж╢рзНржирзЗрж░ ржЙрждрзНрждрж░рзЗрж░ ржЬржирзНржп ржкрзНрж░ржжрждрзНржд ржкрзНрж░рж╕ржЩрзНржЧ ржерзЗржХрзЗ ржХрзЛржи рждржерзНржп ржкрж╛ржУрзЯрж╛ ржпрж╛рзЯржирж┐ред',
 'context': [Document(metadata={'dpi': 400, 'embedding_model': 'text-embedding-3-small', 'extraction_method': 'OCR', 'has_translation': True, 'original_language': 'bengali', 'original_text_length': 1068, 'page': 49, 'source': 'Data/HSC26-Bangla1st-Paper.pdf', 'start_index': 184, 'translated_text_length': 1068, 'translation_confidence': 0.0}, page_content='. \'рж╕рзЗржЗ рж▓ржЧрзНржирзЗ ржПрж╕рзЗржЫрж┐ ржкрж╛рж▓рж┐ржпрж╝рзЗ\'- ржП ржЪрж░ржгрзЗрж░ ржЖрж▓рзЛржХрзЗ ржЙржжрзНржжрзАржкржХрзЗрж░ ржирж╛ржпрж╝ржХрзЗрж░ ржорждрзЛ ржЕржирзБржкржорзЗрж░ ржмрж┐рж░рж╣рз