In [5]:
import os
import fitz  # PyMuPDF
import pytesseract
from PIL import Image
import io
import tempfile
from docx import Document

In [6]:
# Path to the Tesseract executable that pytesseract should launch. Set the
# TESSERACT_CMD environment variable to use a different install; when it is
# unset or empty, the default Windows installer location below is used.
pytesseract.pytesseract.tesseract_cmd = (
    os.environ.get("TESSERACT_CMD")
    or r"C:\Program Files\Tesseract-OCR\tesseract.exe"
)

In [None]:
def extract_text_from_image(image_path_or_bytes):
    """Run Tesseract OCR on an image and return the recognised text.

    Args:
        image_path_or_bytes: Either the encoded image data held in memory
            (``bytes`` or ``bytearray``) or a value that is handed to
            ``Image.open`` unchanged, such as a file path.

    Returns:
        The string produced by ``pytesseract.image_to_string``.
    """
    if isinstance(image_path_or_bytes, (bytes, bytearray)):
        # In-memory data must be wrapped in a file-like object for Image.open.
        source = io.BytesIO(image_path_or_bytes)
    else:
        source = image_path_or_bytes

    # The context manager closes the image as soon as OCR has finished rather
    # than leaving the underlying file open until garbage collection.
    with Image.open(source) as image:
        return pytesseract.image_to_string(image)

def extract_text_from_pdf(file_path):
    """Extract the text layer of a PDF and OCR the images on each page.

    Every page contributes a ``--- Page N ---`` header, the text returned by
    ``page.get_text()``, and one ``[Image K OCR]`` section for each entry of
    ``page.get_images(full=True)``.

    Args:
        file_path: Path of the PDF, passed to ``fitz.open``.

    Returns:
        The combined text with leading and trailing whitespace stripped.
    """
    parts = []

    # Opening the document as a context manager closes it even when text
    # extraction or OCR raises part-way through.
    with fitz.open(file_path) as doc:
        for page_num, page in enumerate(doc, start=1):
            parts.append(f"\n--- Page {page_num} ---\n")

            # Text layer of the page.
            page_text = page.get_text()
            if page_text:
                parts.append(page_text)

            # Images on the page, OCRed one by one.
            for img_index, img in enumerate(page.get_images(full=True), start=1):
                xref = img[0]  # first item of each image entry is its xref
                image_bytes = doc.extract_image(xref)["image"]
                ocr_text = extract_text_from_image(image_bytes)
                parts.append(f"\n[Image {img_index} OCR]:\n{ocr_text}\n")

    return "".join(parts).strip()

def extract_text_from_docx(file_path):
    """Extract paragraph text from a DOCX file and OCR its embedded images.

    Args:
        file_path: Path of the document, passed to ``Document``.

    Returns:
        The text of every paragraph, one per line, followed by one
        ``[Image OCR from DOCX]`` section per embedded image, with leading and
        trailing whitespace stripped.
    """
    doc = Document(file_path)

    # Plain paragraph text, one paragraph per line.
    parts = [para.text + "\n" for para in doc.paragraphs]

    # OCR every image part related to the main document part.
    for rel in doc.part.rels.values():
        # External relationships (e.g. hyperlinks) have no target part, so
        # reading ``target_part`` on them raises. A URL that merely contains
        # the word "image" must not be mistaken for an embedded picture.
        if rel.is_external:
            continue
        # Select pictures by relationship type instead of by target name.
        if not rel.reltype.endswith("/image"):
            continue
        ocr_text = extract_text_from_image(rel.target_part.blob)
        parts.append(f"\n[Image OCR from DOCX]:\n{ocr_text}\n")

    return "".join(parts).strip()

def extract_text_from_txt(file_path):
    """Return the contents of a text file decoded as UTF-8.

    The ``utf-8-sig`` codec consumes a leading UTF-8 byte-order mark, so a
    file saved with a BOM does not yield a stray U+FEFF character at the start
    of the result; files without a BOM decode exactly as plain UTF-8. Bytes
    that are not valid UTF-8 are dropped (``errors='ignore'``) instead of
    raising.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The decoded file contents as a string.
    """
    with open(file_path, 'r', encoding='utf-8-sig', errors='ignore') as f:
        return f.read()

def extract_text(file_path):
    """Pick an extractor for ``file_path`` from its extension and run it.

    The extension is compared case-insensitively. ``.pdf``, ``.docx`` and
    ``.txt`` files go to their dedicated extractors; ``.png``, ``.jpg``,
    ``.jpeg``, ``.bmp`` and ``.tiff`` files are OCRed directly.

    Returns:
        The extractor's result. For any other extension nothing is raised:
        the message ``"Unsupported file format: <ext>"`` is returned instead.
    """
    _, suffix = os.path.splitext(file_path)
    suffix = suffix.lower()

    if suffix == '.pdf':
        return extract_text_from_pdf(file_path)
    if suffix == '.docx':
        return extract_text_from_docx(file_path)
    if suffix == '.txt':
        return extract_text_from_txt(file_path)
    if suffix in ('.png', '.jpg', '.jpeg', '.bmp', '.tiff'):
        return extract_text_from_image(file_path)

    # Unknown extension: report it in the return value, as callers expect.
    return f"Unsupported file format: {suffix}"


In [None]:
# Demo: run the extractor on one document and print what it found.
# Point `file_path` at the file you want to process.
file_path = (
    "Initial Implementation Plan - AI-Driven Document Compliance Analysis System (3).pdf"
)
result_text = extract_text(file_path)
# Banner (which itself ends in a newline) and the text, one per print line.
print("📄 Extracted Text:\n", result_text, sep="\n")