In [10]:
import os
import random
import string
import cv2
import numpy as np
import matplotlib.pyplot as plt

from typing import Tuple, Literal, List
from PIL import Image, ImageDraw, ImageFont, ImageFilter
from wonderwords import RandomWord
from tqdm.auto import tqdm

In [11]:
def generate_text_captcha(text, captcha_type: str = "easy", min_width: int = 512, height: int=224) -> Tuple[Image.Image, str]:
    """
    Generates a text-based CAPTCHA image.

    Args:
        text (str): The text to be included in the CAPTCHA.
        captcha_type (str): The type of CAPTCHA to generate ("easy", "hard", "bonus").
        min_width (int): Minimum width of the CAPTCHA image. The image is widened
            when the rendered text plus 80 pixels of padding does not fit.
        height (int): Height of the CAPTCHA image.

    Returns:
        Tuple[Image.Image, str]: The CAPTCHA image and the text exactly as it was
        drawn. For a "bonus" CAPTCHA whose text was reversed, the returned string
        is the reversed one.

    Raises:
        ValueError: If captcha_type is not "easy", "hard" or "bonus". The check
            runs before any font is loaded.

    Easy CAPTCHA: Renders black text with "fonts/arial.ttf" at a fixed size of 156
    in the center of a white background. No distortions are applied.

    Hard CAPTCHA: Renders text in a font picked at random from the "fonts" directory, with:
        - Random background color ((r, g, b), each channel drawn independently from [200, 255])
        - Random font color ((r, g, b), each channel drawn independently from [0, 100])
        - Random font size in [50%-70%] of image height
        - 5% rainbow noise: each channel value is independently replaced by a
          uniform random value in [0, 255] with probability 0.05
        - Gaussian blur with radius 0.5 pixels

    Bonus CAPTCHA: Same font, font color, noise and blur as the Hard CAPTCHA, but:
        - With 50% probability the character order of the text is reversed
          (text[::-1]); the glyphs themselves are not flipped.
        - The background is pure red (255, 0, 0) when the text was reversed and
          pure green (0, 255, 0) otherwise.
    """
    # Validate first so an invalid type never reaches the font lookup.
    if captcha_type not in ("easy", "hard", "bonus"):
        raise ValueError("Invalid captcha_type!")
    is_easy = captcha_type == "easy"

    font_height = 156 if is_easy else random.randint(int(0.5 * height), int(0.7 * height))
    font_face = "fonts/arial.ttf" if is_easy else "fonts/" + random.choice(os.listdir("fonts"))
    font = ImageFont.truetype(font_face, font_height)

    # Pick the background (and, for "bonus", possibly reverse the text) before
    # measuring, so the measured string is the one that is actually drawn.
    if is_easy:
        background = (255, 255, 255)
    elif captcha_type == "hard":
        background = (random.randint(200, 255), random.randint(200, 255), random.randint(200, 255))
    elif random.random() < 0.5:
        background = (255, 0, 0)
        text = text[::-1]
    else:
        background = (0, 255, 0)

    text_length = font.getlength(text)
    width = int(max(min_width, text_length + 80))
    image = Image.new("RGB", (width, height), background)

    draw = ImageDraw.Draw(image)
    x = (width - text_length) // 2
    # Vertical centering uses the nominal font size rather than the rendered glyph box.
    y = (height - font_height) // 2
    text_color = 'black' if is_easy else (random.randint(0, 100), random.randint(0, 100), random.randint(0, 100))
    draw.text((x, y), text=text, font=font, fill=text_color)

    if not is_easy:
        pixels = np.array(image)
        # randint's upper bound is exclusive: 256 is required for 255 to be reachable.
        noise = np.random.randint(0, 256, pixels.shape, dtype='uint8')
        pixels = np.where(np.random.rand(*pixels.shape) < 0.05, noise, pixels)
        image = Image.fromarray(pixels)
        image = image.filter(ImageFilter.GaussianBlur(radius=0.5))
    return image, text
    

In [13]:
def preprocess_image(image_path: str) -> np.ndarray:
    """
    Loads an image from disk and turns it into a cleaned-up binary mask.
    References:
        - /references/Pre-Processing in OCR!!!. A basic explanation of the most widely… _ by Susmith Reddy _ TDS Archive _ Medium.pdf

    Args:
        image_path (str): Path to the input image.

    Returns:
        np.ndarray: Preprocessed binary image, or None when the file could not
        be read by OpenCV.

    Pipeline:
        1. Convert the loaded BGR image to grayscale.
        2. Binarize with Otsu's threshold using the inverted mode, so pixels
           darker than the threshold become 255 and the rest become 0.
        3. Denoise with a median blur of kernel size 3.
        4. Apply a morphological opening (erosion -> dilation) with a 2x2 kernel
           to remove small specks, particularly near character edges.
    """
    source_bgr = cv2.imread(image_path)
    if source_bgr is None:
        # imread signals an unreadable/missing file with None; pass that on.
        return None

    grayscale = cv2.cvtColor(source_bgr, cv2.COLOR_BGR2GRAY)

    # The threshold value 0 is a placeholder: THRESH_OTSU picks the real one.
    otsu_inverted = cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU
    binarized = cv2.threshold(grayscale, 0, 255, otsu_inverted)[1]

    smoothed = cv2.medianBlur(binarized, 3)
    opening_kernel = np.ones((2, 2), np.uint8)
    return cv2.morphologyEx(smoothed, cv2.MORPH_OPEN, opening_kernel)

In [14]:
def segment_characters(binary_img: np.ndarray) -> List[np.ndarray]:
    """
    Segments the binary image using Vertical Histogram Projection.
    References:
        - /references/Pre-Processing in OCR!!!. A basic explanation of the most widely… _ by Susmith Reddy _ TDS Archive _ Medium.pdf
        - /references/Segmentation in OCR !!. A basic explanation of different levels… _ by Susmith Reddy _ TDS Archive _ Medium.pdf
        - /references/What is an OCR __. A basic theoretical overview of the… _ by Susmith Reddy _ TDS Archive _ Medium.pdf

    Args:
        binary_img (np.ndarray): Preprocessed binary image. Must be 2-D
            (height, width); the per-column counts assume foreground pixels
            are 255 and background pixels are 0.

    Returns:
        List[np.ndarray]: Segmented character images in left-to-right order.
        Each one is a full-height column slice of the de-sheared image.
        The list is empty when no column contains foreground pixels.

    Tries the following linear shear transformations to maximize a sharpness heuristic:
    [[1, shear, 0],
     [0,     1, 0]]
    where shear takes 21 evenly spaced values from -0.5 to 0.5 (step 0.05).
    In angular terms this spans approximately -26.57 to +26.57 degrees; the steps
    are uniform in shear, not in angle (about 2.86 degrees per step near zero,
    smaller towards the ends of the range).

    Sharpness heuristic:
        The sharpness score is the sum of squared differences between consecutive
        column sums of the sheared image. The first shear value that reaches the
        highest score is selected for the final segmentation.

    Nearest interpolation is used during the shear transformation to preserve binary values.

    After selecting the best shear, vertical histogram projection is performed on
    the sheared image to identify character boundaries. A column counts as "ink"
    when its foreground pixel count is above boundary_threshold (currently 0, i.e.
    any foreground pixel); a run of consecutive ink columns forms one segment.

    NOTE(review): the shear is applied about the top-left origin and rendered into
    a canvas of the original size, so pixels pushed outside the canvas are dropped.
    Confirm that the input always has enough horizontal margin for this.
    """
    h, w = binary_img.shape
    best_shear = 0
    max_score = -1
    shear_range = np.linspace(-0.5, 0.5, 21)
    for shear in shear_range:
        M = np.float32([[1, shear, 0], [0, 1, 0]])
        sheared_img = cv2.warpAffine(binary_img, M, (w, h), flags=cv2.INTER_NEAREST)
        # Accumulate in signed 64-bit: summing a uint8 image otherwise yields an
        # unsigned histogram, and negative column differences would wrap around.
        hist = np.sum(sheared_img, axis=0, dtype=np.int64)
        score = int(np.sum(np.diff(hist) ** 2))
        if score > max_score:
            max_score = score
            best_shear = shear

    # Maximum foreground-pixel count for a column to still be treated as a gap.
    boundary_threshold = 0
    M_best = np.float32([[1, best_shear, 0], [0, 1, 0]])
    final_img = cv2.warpAffine(binary_img, M_best, (w, h), flags=cv2.INTER_NEAREST)
    vertical_hist = np.sum(final_img, axis=0) / 255

    segments = []
    in_segment = False
    start_col = 0
    for col in range(w):
        # One predicate drives both transitions, so changing the threshold
        # moves segment starts and segment ends together.
        is_ink = vertical_hist[col] > boundary_threshold
        if is_ink and not in_segment:
            in_segment = True
            start_col = col
        elif not is_ink and in_segment:
            in_segment = False
            segments.append(final_img[:, start_col:col])
    if in_segment:
        # The last character touches the right edge of the image.
        segments.append(final_img[:, start_col:w])
    return segments

In [15]:
def process_and_save_dataset(source_folder: str, output_folder: str) -> None:
    """
    Orchestrator function to process all images and save characters.

    Args:
        source_folder (str): Folder containing source CAPTCHA images. Only files
            whose name ends in '.png' are processed; the filename without its
            extension is used as the ground-truth text.
        output_folder (str): Folder to save segmented character images. Created
            if it does not exist.

    Returns:
        None

    For each image, runs preprocess_image and segment_characters. When the number
    of segments equals the number of characters in the ground-truth text, segment
    i is labelled with character i and written to
    <output_folder>/<char>/<text>_<i>_<char>.png. Otherwise the image is counted
    as a mismatch and nothing is saved for it.

    Prints the number of mismatches at the end. If any image could not be read or
    any character crop could not be written, a warning line with both counts is
    printed as well.

    NOTE(review): on a case-insensitive filesystem, labels that differ only by
    case (e.g. 'a' and 'A') would resolve to the same directory — confirm whether
    that matters for the downstream consumer.
    """
    mismatch_count = 0
    unreadable_count = 0
    failed_write_count = 0
    os.makedirs(output_folder, exist_ok=True)
    files = [f for f in os.listdir(source_folder) if f.endswith('.png')]
    print(f"Processing {len(files)} images...")
    for filename in tqdm(files, desc="Processing images"):
        file_path = os.path.join(source_folder, filename)
        ground_truth_text = os.path.splitext(filename)[0]
        binary_img = preprocess_image(file_path)
        if binary_img is None:
            unreadable_count += 1
            continue
        char_imgs = segment_characters(binary_img)
        if len(char_imgs) != len(ground_truth_text):
            # Segmentation cannot be aligned with the label; skip the image.
            mismatch_count += 1
            continue
        for i, (char_label, char_img) in enumerate(zip(ground_truth_text, char_imgs)):
            char_dir = os.path.join(output_folder, f"{char_label}")
            os.makedirs(char_dir, exist_ok=True)
            save_path = os.path.join(char_dir, f"{ground_truth_text}_{i}_{char_label}.png")
            # cv2.imwrite reports failure through its return value.
            if not cv2.imwrite(save_path, char_img):
                failed_write_count += 1
    print(f"Processing complete. {mismatch_count} mismatches found.")
    if unreadable_count or failed_write_count:
        print(
            f"Warning: {unreadable_count} unreadable images skipped, "
            f"{failed_write_count} character crops failed to save."
        )

In [16]:
# NOTE(review): generate_dataset is not defined in any cell visible here (the
# execution counts skip In [12]) — confirm its defining cell runs before this
# one on a fresh kernel.
# The recorded progress bars show 5000 english + 5000 random images for "easy"
# and for "hard", and 2500 + 2500 for "bonus", so the tuple presumably holds
# the (easy, hard, bonus) totals and 0.5 the english/random split — TODO confirm.
generate_dataset("data/generated/", (10000, 10000, 5000), "mixed_captcha", 0.5)

english_easy: 100%|██████████| 5000/5000 [13:46<00:00,  6.05it/s]
english_hard: 100%|██████████| 5000/5000 [16:34<00:00,  5.03it/s]
english_bonus: 100%|██████████| 2500/2500 [08:32<00:00,  4.88it/s]
random_easy: 100%|██████████| 5000/5000 [00:24<00:00, 205.67it/s]
random_hard: 100%|██████████| 5000/5000 [03:21<00:00, 24.86it/s]
random_bonus: 100%|██████████| 2500/2500 [01:40<00:00, 24.98it/s]


In [17]:
# Segment the "hard" and then the "easy" CAPTCHAs into per-character crops.
# Both runs write into the same output folder, and the save name is derived only
# from the source filename stem, so a hard and an easy image with the same stem
# map to the same paths and the later run overwrites the earlier one.
# NOTE(review): verify that this overwrite is intended.
process_and_save_dataset('data/generated/hard/', 'data/processed/characters/')
process_and_save_dataset('data/generated/easy/', 'data/processed/characters/')

Processing 8703 images...


Processing images: 100%|██████████| 8703/8703 [01:04<00:00, 135.31it/s]


Processing complete. 909 mismatches found.
Processing 8698 images...


Processing images: 100%|██████████| 8698/8698 [00:49<00:00, 174.26it/s]

Processing complete. 563 mismatches found.



