In [19]:
# Required imports for PDF text extraction
import os
import pandas as pd  # For timestamp in text files
import fitz  # PyMuPDF for PDF handling
from typing import Optional, List, Tuple

# OCR and text processing imports
try:
    import pytesseract
    import cv2
    from PIL import Image
    import numpy as np
    print("✅ All OCR dependencies imported successfully")
except ImportError as e:
    print(f"⚠️ Missing OCR dependencies: {e}")
    print("Please install with: pip install pytesseract opencv-python pillow numpy")
    print("Also install Tesseract OCR: https://github.com/UB-Mannheim/tesseract/wiki")

✅ All OCR dependencies imported successfully


In [20]:
# OCR and text processing functions
import re
import io

def preprocess_image_for_ocr(image_array: np.ndarray) -> np.ndarray:
    """
    Preprocess image for better OCR results
    """
    # Convert to grayscale if needed
    if len(image_array.shape) == 3:
        gray = cv2.cvtColor(image_array, cv2.COLOR_RGB2GRAY)
    else:
        gray = image_array
    
    # Increase contrast and brightness
    alpha = 1.2  # Contrast control
    beta = 10    # Brightness control
    adjusted = cv2.convertScaleAbs(gray, alpha=alpha, beta=beta)
    
    # Apply Gaussian blur to reduce noise
    blurred = cv2.GaussianBlur(adjusted, (1, 1), 0)
    
    # Apply threshold to get binary image
    _, thresh = cv2.threshold(blurred, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    
    # Morphological operations to clean up the image
    kernel = np.ones((1, 1), np.uint8)
    cleaned = cv2.morphologyEx(thresh, cv2.MORPH_CLOSE, kernel)
    
    return cleaned

def pdf_page_to_image(pdf_path: str, page_num: int, dpi: int = 300) -> Image.Image:
    """
    Convert a specific PDF page to high-resolution image
    """
    doc = fitz.open(pdf_path)
    page = doc[page_num]
    
    # Create transformation matrix for high DPI
    mat = fitz.Matrix(dpi/72, dpi/72)
    
    # Render page to pixmap
    pix = page.get_pixmap(matrix=mat)
    
    # Convert to PIL Image
    img_data = pix.tobytes("ppm")
    image = Image.open(io.BytesIO(img_data))
    
    doc.close()
    return image

def extract_text_with_ocr(pdf_path: str, page_num: int, dpi: int = 300) -> str:
    """
    Extract text from PDF page using OCR
    """
    try:
        # Convert PDF page to image
        image = pdf_page_to_image(pdf_path, page_num, dpi)
        
        # Convert PIL image to numpy array
        img_array = np.array(image)
        
        # Preprocess image for better OCR
        processed_img = preprocess_image_for_ocr(img_array)
        
        # Convert back to PIL Image for pytesseract
        pil_image = Image.fromarray(processed_img)
        
        # OCR configuration for Bengali
        custom_config = r'--oem 3 --psm 6 -l ben+eng'  # Bengali + English
        
        # Extract text using OCR
        text = pytesseract.image_to_string(pil_image, config=custom_config)
        
        return text.strip()
        
    except Exception as e:
        print(f"Error extracting text from page {page_num}: {e}")
        return ""

def preprocess_bengali_text(text: str) -> str:
    """
    Preprocess Bengali text for better processing
    """
    if not text:
        return ""
    
    # Remove extra whitespaces
    text = re.sub(r'\s+', ' ', text)
    
    # Remove common OCR artifacts
    text = re.sub(r'[^\u0980-\u09FF\u0020-\u007F\u2000-\u206F\u2E00-\u2E7F]', ' ', text)
    
    # Clean up punctuation spacing
    text = re.sub(r'\s+([।,;:!?])', r'\1', text)
    text = re.sub(r'([।,;:!?])\s*', r'\1 ', text)
    
    return text.strip()

In [21]:
# Tesseract OCR Configuration
# Set the path to tesseract executable (Windows)
# Adjust this path based on your Tesseract installation
try:
    # Common Windows path
    pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
    
    # Alternative Windows paths (uncomment if needed)
    # pytesseract.pytesseract.tesseract_cmd = r'C:\Users\YourUsername\AppData\Local\Programs\Tesseract-OCR\tesseract.exe'
    
    # For Linux/Mac, it's usually in PATH, so you might not need to set this
    # pytesseract.pytesseract.tesseract_cmd = '/usr/bin/tesseract'
    
    # Test if Bengali is available
    available_langs = pytesseract.get_languages()
    if 'ben' in available_langs:
        print("✅ Bengali language support is available in Tesseract")
    else:
        print("❌ Bengali language support not found. Please install Bengali traineddata.")
        print("Download ben.traineddata from: https://github.com/tesseract-ocr/tessdata")
        print("Place it in your Tesseract tessdata directory")
    
    # Show available languages
    print(f"📋 Available languages: {', '.join(available_langs)}")
    
except Exception as e:
    print(f"⚠️ Tesseract configuration issue: {e}")
    print("Please make sure Tesseract is properly installed and configured.")
    print("Install Tesseract from: https://github.com/UB-Mannheim/tesseract/wiki")
    print("Or use: pip install tesseract (for some systems)")

✅ Bengali language support is available in Tesseract
📋 Available languages: afr, amh, ara, asm, aze, aze_cyrl, bel, ben, bod, bos, bre, bul, cat, ceb, ces, chi_sim, chi_sim_vert, chi_tra, chi_tra_vert, chr, cos, cym, dan, deu, eng, equ, hun, hye, iku, ind, isl, ita, ita_old, jav, jpn, jpn_vert, kan, kat, kat_old, kaz, khm, kir, kor, osd, urd, uzb


In [22]:
def save_extracted_text_to_files(
    pdf_path: str = "Data/HSC26-Bangla1st-Paper.pdf",
    start_page: Optional[int] = None,
    end_page: Optional[int] = None,
    dpi: int = 300,
    output_dir: str = "extracted_text"
):
    """
    Extract text from PDF using OCR and save to individual .txt files
    
    Args:
        pdf_path: Path to the PDF file
        start_page: Starting page number (0-indexed). If None, starts from beginning
        end_page: Ending page number (0-indexed, inclusive). If None, goes to end
        dpi: DPI for image conversion
        output_dir: Directory to save text files
    """
    
    # Create output directory if it doesn't exist
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
        print(f"📁 Created directory: {output_dir}")
    
    # Open PDF to get page count
    doc = fitz.open(pdf_path)
    total_pages = len(doc)
    doc.close()
    
    # Set page range
    start_page = start_page if start_page is not None else 0
    end_page = end_page if end_page is not None else total_pages - 1
    
    # Validate page range
    start_page = max(0, min(start_page, total_pages - 1))
    end_page = max(start_page, min(end_page, total_pages - 1))
    
    print(f"💾 Saving extracted text from pages {start_page + 1} to {end_page + 1}")
    print(f"📄 PDF has {total_pages} total pages")
    
    # Get base filename for naming
    base_name = os.path.splitext(os.path.basename(pdf_path))[0]
    
    # Extract and save text from each page
    all_text = []
    successful_pages = []
    
    for page_num in range(start_page, end_page + 1):
        print(f"Processing page {page_num + 1}/{total_pages}...")
        
        # Extract text using OCR
        raw_text = extract_text_with_ocr(pdf_path, page_num, dpi)
        cleaned_text = preprocess_bengali_text(raw_text)
        
        if cleaned_text.strip():
            # Save individual page text
            page_filename = f"{base_name}_page_{page_num + 1:03d}.txt"
            page_filepath = os.path.join(output_dir, page_filename)
            
            with open(page_filepath, 'w', encoding='utf-8') as f:
                f.write(f"Page {page_num + 1} - Raw Text:\n")
                f.write("=" * 50 + "\n")
                f.write(raw_text)
                f.write("\n\n" + "=" * 50 + "\n")
                f.write(f"Page {page_num + 1} - Cleaned Text:\n")
                f.write("=" * 50 + "\n")
                f.write(cleaned_text)
                f.write("\n\n" + "=" * 50 + "\n")
            
            # all_text.append(f"=== PAGE {page_num + 1} ===\n{cleaned_text}\n")
            all_text.append(f"{cleaned_text}\n")
            successful_pages.append(page_num + 1)
            
            print(f"✅ Saved: {page_filename} ({len(cleaned_text)} characters)")
        else:
            print(f"⚠️ Page {page_num + 1}: No text extracted, skipping file creation")
    
    # Save combined text file
    if all_text:
        combined_filename = f"{base_name}_combined_pages_{start_page + 1}-{end_page + 1}.txt"
        combined_filepath = os.path.join(output_dir, combined_filename)
        
        with open(combined_filepath, 'w', encoding='utf-8') as f:
            # f.write(f"Combined Text from {pdf_path}\n")
            # f.write(f"Pages: {start_page + 1} to {end_page + 1}\n")
            # f.write(f"Extraction Date: {pd.Timestamp.now()}\n")
            # f.write(f"DPI: {dpi}\n")
            # f.write(f"Total Pages Processed: {len(successful_pages)}\n")
            # f.write("=" * 80 + "\n\n")
            
            for text in all_text:
                f.write(text + "\n")
                # f.write("\n" + "-" * 80 + "\n\n")
        
        print(f"\n📄 Combined file saved: {combined_filename}")
        print(f"📊 Successfully processed {len(successful_pages)} pages")
        print(f"📁 All files saved in: {output_dir}")
        
        # Create summary file
        summary_filename = f"{base_name}_extraction_summary.txt"
        summary_filepath = os.path.join(output_dir, summary_filename)
        
        with open(summary_filepath, 'w', encoding='utf-8') as f:
            f.write(f"Text Extraction Summary\n")
            f.write("=" * 50 + "\n")
            f.write(f"Source PDF: {pdf_path}\n")
            f.write(f"Page Range: {start_page + 1} to {end_page + 1}\n")
            f.write(f"Total Pages in PDF: {total_pages}\n")
            f.write(f"Successfully Processed: {len(successful_pages)} pages\n")
            f.write(f"DPI Setting: {dpi}\n")
            f.write(f"Extraction Date: {pd.Timestamp.now()}\n")
            f.write(f"Output Directory: {output_dir}\n\n")
            
            f.write("Successfully Processed Pages:\n")
            f.write("-" * 30 + "\n")
            for page in successful_pages:
                f.write(f"Page {page}\n")
            
            f.write(f"\nFiles Created:\n")
            f.write("-" * 20 + "\n")
            f.write(f"- Individual page files: {len(successful_pages)} files\n")
            f.write(f"- Combined text file: {combined_filename}\n")
            f.write(f"- Summary file: {summary_filename}\n")
        
        print(f"📋 Summary saved: {summary_filename}")
        
        return successful_pages, output_dir
    else:
        print("❌ No text was extracted from any pages!")
        return [], output_dir

In [26]:
# 💾 Extract and save text to .txt files for backup and analysis
# This will create individual page files and a combined file

# Example 1: Save text from the same page range used for vector store
successful_pages, output_dir = save_extracted_text_to_files(
    pdf_path="Data/HSC26-Bangla1st-Paper.pdf",
    start_page=5,    # Same as vector store range
    end_page=19,      # Same as vector store range
    dpi=400,          # Same DPI for consistency
    output_dir="extracted_text_bengali"  # Custom directory name
)

print(f"\n🎉 Text extraction completed!")
print(f"📁 Files saved in: {output_dir}")
print(f"📄 Successfully processed: {len(successful_pages)} pages")

💾 Saving extracted text from pages 6 to 20
📄 PDF has 49 total pages
Processing page 6/49...
✅ Saved: HSC26-Bangla1st-Paper_page_006.txt (1764 characters)
Processing page 7/49...
✅ Saved: HSC26-Bangla1st-Paper_page_007.txt (1685 characters)
Processing page 8/49...
✅ Saved: HSC26-Bangla1st-Paper_page_008.txt (2396 characters)
Processing page 9/49...
✅ Saved: HSC26-Bangla1st-Paper_page_009.txt (1591 characters)
Processing page 10/49...
✅ Saved: HSC26-Bangla1st-Paper_page_010.txt (1275 characters)
Processing page 11/49...
✅ Saved: HSC26-Bangla1st-Paper_page_011.txt (1238 characters)
Processing page 12/49...
✅ Saved: HSC26-Bangla1st-Paper_page_012.txt (2078 characters)
Processing page 13/49...
✅ Saved: HSC26-Bangla1st-Paper_page_013.txt (1688 characters)
Processing page 14/49...
✅ Saved: HSC26-Bangla1st-Paper_page_014.txt (1739 characters)
Processing page 15/49...
✅ Saved: HSC26-Bangla1st-Paper_page_015.txt (2079 characters)
Processing page 16/49...
✅ Saved: HSC26-Bangla1st-Paper_page_016.t

In [24]:
# 📁 Additional utility functions for text file management

def list_extracted_text_files(output_dir: str = "extracted_text_bengali"):
    """
    List all extracted text files in the directory
    """
    if not os.path.exists(output_dir):
        print(f"❌ Directory {output_dir} does not exist")
        return
    
    files = [f for f in os.listdir(output_dir) if f.endswith('.txt')]
    
    if not files:
        print(f"📁 No .txt files found in {output_dir}")
        return
    
    print(f"📄 Found {len(files)} text files in {output_dir}:")
    print("=" * 60)
    
    # Separate different types of files
    page_files = [f for f in files if '_page_' in f]
    combined_files = [f for f in files if '_combined_' in f]
    summary_files = [f for f in files if '_summary' in f]
    
    if page_files:
        print(f"\n📄 Individual Page Files ({len(page_files)}):")
        for f in sorted(page_files):
            file_path = os.path.join(output_dir, f)
            size = os.path.getsize(file_path)
            print(f"  - {f} ({size:,} bytes)")
    
    if combined_files:
        print(f"\n📚 Combined Files ({len(combined_files)}):")
        for f in combined_files:
            file_path = os.path.join(output_dir, f)
            size = os.path.getsize(file_path)
            print(f"  - {f} ({size:,} bytes)")
    
    if summary_files:
        print(f"\n📋 Summary Files ({len(summary_files)}):")
        for f in summary_files:
            file_path = os.path.join(output_dir, f)
            size = os.path.getsize(file_path)
            print(f"  - {f} ({size:,} bytes)")

def read_extracted_page(page_num: int, output_dir: str = "extracted_text_bengali", show_raw: bool = False):
    """
    Read and display extracted text from a specific page
    """
    # Find the page file
    files = os.listdir(output_dir) if os.path.exists(output_dir) else []
    page_file = None
    
    for f in files:
        if f'_page_{page_num:03d}.txt' in f:
            page_file = f
            break
    
    if not page_file:
        print(f"❌ Page {page_num} text file not found in {output_dir}")
        return
    
    file_path = os.path.join(output_dir, page_file)
    
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()
    
    print(f"📄 Content from Page {page_num}:")
    print("=" * 60)
    
    if show_raw:
        print(content)
    else:
        # Show only cleaned text
        parts = content.split("Cleaned Text:")
        if len(parts) > 1:
            cleaned_part = parts[1].split("=" * 50)[0].strip()
            print(cleaned_part[:1000] + "..." if len(cleaned_part) > 1000 else cleaned_part)
        else:
            print(content[:1000] + "..." if len(content) > 1000 else content)

# Example usage:
# list_extracted_text_files()
# read_extracted_page(43)  # Read page 43 content