In [None]:
!pip install pdf2image opencv-python-headless numpy scipy pytesseract
!apt-get install -y poppler-utils tesseract-ocr tesseract-ocr-kan

import cv2
import numpy as np
from pdf2image import convert_from_path
from google.colab import files
from google.colab.patches import cv2_imshow
import tempfile
import os
from scipy.spatial.distance import cosine
import pytesseract

pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'
OCR_LANGUAGE = 'kan'  # Kannada Language Code

## Utility Functions

def upload_file():
    """Prompt for a file upload in Colab and return the uploaded file's name.

    Returns:
        The first key of the mapping returned by ``files.upload()``. If
        several files are uploaded, only the first one is returned.
        NOTE(review): the caller uses this name as a path, which presumably
        relies on Colab also saving the upload into the working directory --
        confirm against the google.colab documentation.

    Raises:
        ValueError: If the upload dialog produced no files (for example when
            it was cancelled), instead of leaking a bare ``StopIteration``.
    """
    uploaded = files.upload()
    if not uploaded:
        raise ValueError("No file was uploaded.")
    return next(iter(uploaded))

def show_image(img):
    """Render ``img`` in the notebook output by delegating to ``cv2_imshow``."""
    cv2_imshow(img)

## Main Class

class KannadaHandwrittenTextSearch:
    """OCR-backed phrase search over the pages of PDF documents.

    Every indexed page is stored in ``document_index`` under its document id
    as a ``(page_text, page_number, page_image)`` tuple, where ``page_number``
    is zero-based and ``page_image`` is a BGR NumPy array.
    """

    def __init__(self):
        # document_id -> list of (OCR text, zero-based page index, BGR image)
        self.document_index = {}
        # PIL pages produced by the most recent PDF conversion
        self.page_images = []

    @staticmethod
    def _normalize(text):
        """Lower-case ``text`` and collapse every whitespace run to one space."""
        return " ".join(text.lower().split())

    @staticmethod
    def _matching_word_indices(words, needle):
        """Return sorted indices of OCR entries that take part in a match.

        ``words`` is the per-box text list from the OCR data and ``needle`` an
        already normalized phrase. A window as long as the phrase (in words)
        slides over the non-empty boxes; when the joined window text contains
        the phrase, every box of that window is reported.
        """
        span = len(needle.split())
        if span == 0:
            return []
        # Structural OCR rows (page/block/line) carry empty text; skip them.
        populated = [
            (idx, word.strip().lower())
            for idx, word in enumerate(words)
            if word and word.strip()
        ]
        hits = set()
        for start in range(len(populated) - span + 1):
            window = populated[start:start + span]
            if needle in " ".join(text for _, text in window):
                hits.update(idx for idx, _ in window)
        return sorted(hits)

    def preprocess_image(self, image):
        """Return an inverted, Otsu-thresholded binary version of ``image``.

        Three-dimensional input is treated as BGR and converted to grayscale
        first; any other input is thresholded as-is.
        """
        if len(image.shape) == 3:
            gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        else:
            gray = image
        # NOTE(review): THRESH_BINARY_INV yields light text on a dark
        # background; confirm this polarity suits the Tesseract model in use.
        _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
        return binary

    def extract_text(self, image):
        """Run Tesseract on ``image`` with ``OCR_LANGUAGE`` and return the text."""
        return pytesseract.image_to_string(image, lang=OCR_LANGUAGE)

    def index_document(self, document_path, document_id):
        """Convert a PDF to page images, OCR each page and store the results.

        Pages are appended to ``document_index[document_id]``. Any exception
        raised while converting or recognising is caught and reported with
        ``print`` rather than propagated, so the index may end up partial.

        Args:
            document_path: Path of the PDF file to index.
            document_id: Key under which the pages are stored.
        """
        try:
            with tempfile.TemporaryDirectory() as path:
                self.page_images = convert_from_path(document_path, output_folder=path)

                for i, image in enumerate(self.page_images):
                    print(f"Processing page {i+1}")
                    # PIL pages are RGB while every OpenCV call in this class
                    # (grayscale conversion, cv2_imshow) expects BGR.
                    image_np = cv2.cvtColor(np.array(image.convert("RGB")), cv2.COLOR_RGB2BGR)
                    preprocessed = self.preprocess_image(image_np)
                    text = self.extract_text(preprocessed)

                    # Append in place instead of rebuilding the list per page.
                    self.document_index.setdefault(document_id, []).append((text, i, image_np))
                    print(f"Extracted text from page {i+1}: {text[:100]}...")  # Print first 100 chars for debugging

                    # Display the first page for debugging purposes
                    if i == 0:
                        print("First page processed:")
                        show_image(image_np)

        except Exception as e:
            print(f"An error occurred while indexing the document: {str(e)}")

    def highlight_text(self, image, phrase):
        """Draw green rectangles around the OCR words that match ``phrase``.

        OCR runs on the same preprocessed form that indexing uses, so the
        words located here agree with the text that was searched. Matching is
        case-insensitive and whitespace-insensitive; a multi-word phrase
        highlights every word of each matching run.

        Args:
            image: Page image as a NumPy array. It is drawn on in place.
            phrase: Phrase to highlight. A blank phrase highlights nothing.

        Returns:
            The same ``image`` object, with rectangles drawn on it.
        """
        needle = self._normalize(phrase)
        if not needle:
            return image

        data = pytesseract.image_to_data(
            self.preprocess_image(image),
            lang=OCR_LANGUAGE,
            output_type=pytesseract.Output.DICT,
        )

        for i in self._matching_word_indices(data['text'], needle):
            (x, y, width, height) = (data['left'][i], data['top'][i], data['width'][i], data['height'][i])
            cv2.rectangle(image, (x, y), (x + width, y + height), (0, 255, 0), 2)

        return image

    def search_phrase(self, phrase, document_id):
        """Return the pages of an indexed document whose text contains ``phrase``.

        The comparison ignores case and treats any run of whitespace
        (including OCR line breaks) as a single space.

        Args:
            phrase: Phrase to look for. A blank phrase matches nothing.
            document_id: Id the document was indexed under.

        Returns:
            A list of ``(page_number, page_image)`` tuples in index order;
            empty when the id is unknown or nothing matches.
        """
        if document_id not in self.document_index:
            print(f"Document ID {document_id} not found.")
            return []

        needle = self._normalize(phrase)
        if not needle:
            return []

        return [
            (page_num, image)
            for text, page_num, image in self.document_index[document_id]
            if needle in self._normalize(text)
        ]

## Usage Example

# Create an instance of the search system
searcher = KannadaHandwrittenTextSearch()
DOCUMENT_ID = "doc1"

# Upload and index a document
print("Please upload a Kannada handwritten document (PDF):")
doc_path = upload_file()
print("Indexing the document. This may take a while...")
searcher.index_document(doc_path, DOCUMENT_ID)

# index_document reports failures by printing instead of raising, so look at
# what actually reached the index before claiming success.
indexed_pages = len(searcher.document_index.get(DOCUMENT_ID, []))
if indexed_pages:
    print("Indexing complete.")
else:
    print("Indexing produced no pages; searches will not return any results.")

# Print some stats about the indexed document
print(f"Total pages indexed: {indexed_pages}")

# Continuous search loop
while True:
    print("\nEnter a phrase to search in the document, or type 'quit' to exit:")
    try:
        phrase = input().strip()
    except EOFError:
        # Input stream closed: end the session instead of crashing.
        break
    if phrase.lower() == 'quit':
        break
    if not phrase:
        # An empty string is a substring of every page; ask again instead.
        print("Please enter a non-empty phrase.")
        continue

    print(f"Searching for the phrase '{phrase}'...")
    results = searcher.search_phrase(phrase, DOCUMENT_ID)

    if results:
        print("\nSearch results:")
        for page_num, image in results:
            print(f"Phrase found on page {page_num+1}")

            # Highlight on a copy so the indexed page image stays clean
            highlighted_image = searcher.highlight_text(np.array(image), phrase)

            # Display the page with highlighted text
            print(f"Showing page {page_num+1} with highlighted phrase:")
            show_image(highlighted_image)
    else:
        print("No matches found.")

    print("\n" + "-"*50 + "\n")

print("Search session ended.")

Collecting pdf2image
  Downloading pdf2image-1.17.0-py3-none-any.whl.metadata (6.2 kB)
Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Downloading pdf2image-1.17.0-py3-none-any.whl (11 kB)
Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Installing collected packages: pytesseract, pdf2image
Successfully installed pdf2image-1.17.0 pytesseract-0.3.13
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  tesseract-ocr-eng tesseract-ocr-osd
The following NEW packages will be installed:
  poppler-utils tesseract-ocr tesseract-ocr-eng tesseract-ocr-kan tesseract-ocr-osd
0 upgraded, 5 newly installed, 0 to remove and 49 not upgraded.
Need to get 6,661 kB of archives.
After this operation, 20.0 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 poppler-utils amd64 22.02.0-2ubuntu0.5 [186 kB]
Get