# In[3]:
import os
import cv2
import numpy as np
import fitz
from PIL import Image

# In[4]:
def preprocess_image(image):
    """Turn a color image into an inverted binary mask.

    The image is converted to grayscale with ``cv2.COLOR_BGR2GRAY``,
    smoothed with a 9x9 Gaussian kernel, and adaptively thresholded using
    a Gaussian-weighted 21x21 neighbourhood with an offset of 5. Because
    the threshold type is ``THRESH_BINARY_INV``, pixels at or below their
    local threshold become 255 and all others become 0.

    Args:
        image: Color image in the channel layout expected by
            ``cv2.COLOR_BGR2GRAY``.

    Returns:
        Single-channel image containing only the values 0 and 255.
    """
    blur_kernel = (9, 9)
    neighbourhood = 21  # side length of the local threshold window
    offset = 5          # constant subtracted from the weighted local mean

    # Blur before thresholding so isolated noisy pixels do not survive.
    smoothed = cv2.GaussianBlur(
        cv2.cvtColor(image, cv2.COLOR_BGR2GRAY),
        blur_kernel,
        0,
    )

    return cv2.adaptiveThreshold(
        smoothed,
        255,
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY_INV,
        neighbourhood,
        offset,
    )

def enlarge_image(image, scale_factor=3):
    """Upscale an image by the same factor along both axes.

    Args:
        image: Image array accepted by ``cv2.resize``.
        scale_factor: Multiplier applied to both width and height.

    Returns:
        The resized image, interpolated with Lanczos (``INTER_LANCZOS4``).
    """
    # A (0, 0) target size tells OpenCV to derive the output size from fx/fy.
    return cv2.resize(
        image,
        (0, 0),
        fx=scale_factor,
        fy=scale_factor,
        interpolation=cv2.INTER_LANCZOS4,
    )

def enhance_quality(image):
    """Sharpen a color image and then suppress the resulting noise.

    Args:
        image: Color image accepted by
            ``cv2.fastNlMeansDenoisingColored``.

    Returns:
        The sharpened and denoised image.
    """
    # 3x3 sharpening kernel: -1 everywhere with 9 in the centre, so the
    # weights sum to 1 and overall brightness is preserved.
    sharpen_kernel = np.full((3, 3), -1)
    sharpen_kernel[1, 1] = 9

    # ddepth=-1 keeps the output at the same depth as the input.
    crisp = cv2.filter2D(image, -1, sharpen_kernel)

    # Arguments: dst=None, h=10, hColor=10, templateWindowSize=7,
    # searchWindowSize=21.
    return cv2.fastNlMeansDenoisingColored(crisp, None, 10, 10, 7, 21)


def extract_alphabets(pdf_path, output_folder):
    """Segment each page of a PDF into glyph-like regions and save them as PNGs.

    Every page is rendered with ``page.get_pixmap()``, binarized with
    ``preprocess_image``, and split into external contours. The bounding
    box of each contour is cropped from the rendered page, upscaled with
    ``enlarge_image``, cleaned up with ``enhance_quality`` and written to
    ``<output_folder>/alphabet_<page>_<x>_<y>.png``, where ``<page>`` is the
    zero-based page index and ``<x>``/``<y>`` are the top-left corner of the
    bounding box in page-image pixels.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.
        output_folder: Directory that receives the PNG files; it is created
            if it does not exist.

    Returns:
        None. The results are the files written to ``output_folder``.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_folder, exist_ok=True)

    # The context manager closes the document even if a page fails midway.
    with fitz.open(pdf_path) as pdf_document:
        for page_num, page in enumerate(pdf_document):
            pixmap = page.get_pixmap()

            # Pixmap.samples is a flat byte buffer; view it as
            # (height, width, channels) without copying.
            page_rgb = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(
                (pixmap.height, pixmap.width, pixmap.n)
            )

            # PyMuPDF renders pixels in RGB order, while preprocess_image
            # converts with COLOR_BGR2GRAY. Swap the channels first so the
            # grayscale weights land on the right colors.
            page_bgr = cv2.cvtColor(page_rgb, cv2.COLOR_RGB2BGR)
            binary = preprocess_image(page_bgr)

            # RETR_EXTERNAL keeps only outermost contours, so holes inside a
            # glyph do not produce separate regions.
            contours, _ = cv2.findContours(
                binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
            )

            for contour in contours:
                x, y, w, h = cv2.boundingRect(contour)

                # Crop from the RGB rendering: the helpers below do not
                # reorder channels and PIL interprets the array as RGB.
                region = page_rgb[y:y + h, x:x + w]
                enhanced = enhance_quality(enlarge_image(region))

                out_path = os.path.join(
                    output_folder, f"alphabet_{page_num}_{x}_{y}.png"
                )
                Image.fromarray(enhanced).save(out_path)

# Example usage: segment the sample PDF into per-region PNG files.
output_folder = "segmented_data"
pdf_path = "semi_book.pdf"
extract_alphabets(pdf_path=pdf_path, output_folder=output_folder)