# Detector (FAST)

In [2]:
import os
%matplotlib inline
import torch
from doctr.io import DocumentFile
from doctr.models import ocr_predictor, fast_base, detection_predictor

# NOTE(review): doctr presumably picks its backend from USE_TORCH while it is
# being imported; setting it here, after the doctr imports above, may have no
# effect -- confirm, and move this assignment above the imports if it matters.
os.environ['USE_TORCH'] = '1'
print(os.environ['USE_TORCH'])

# Initialize the default OCR predictor
# default_predictor = detection_predictor('fast_base',pretrained=True)

# Load custom detection model: build the FAST architecture without any
# pretrained weights, then restore the fine-tuned parameters from disk.
detector_model_path = "checkpoints/FAST/fast_base_20240926-134200_epoch100.pt"
det_model = fast_base(pretrained=False, pretrained_backbone=False)
# weights_only=True restricts unpickling to tensors and plain containers, which
# is all load_state_dict() needs, and avoids executing arbitrary pickled code.
# It also silences the warning torch emits for this call when the flag is unset.
det_params = torch.load(detector_model_path, map_location="cpu", weights_only=True)
det_model.load_state_dict(det_params)

# Full OCR pipeline: custom detector + stock pretrained CRNN recogniser.
trained_predictor = ocr_predictor(det_arch=det_model, reco_arch="crnn_vgg16_bn", pretrained=True)


1


  det_params = torch.load(detector_model_path, map_location="cpu")


### Visualise Random Images

In [None]:
import os
import random

def process_random_images_from_directory(directory, trained_predictor, num_images=5):
    """Run the predictor on a random sample of images from a directory.

    Args:
        directory: Folder whose image files (png/jpg/jpeg/bmp/tiff, matched
            case-insensitively) are candidates for sampling.
        trained_predictor: Callable applied to the loaded document; its result
            must provide a ``show()`` method.
        num_images: Maximum number of images to sample. When the directory
            holds fewer images, all of them are processed instead of raising.

    Returns:
        None. Results are displayed through ``result.show()``.
    """
    image_extensions = ('.png', '.jpg', '.jpeg', '.bmp', '.tiff')

    # List all image files in the directory
    image_filenames = [
        name for name in os.listdir(directory)
        if name.lower().endswith(image_extensions)
    ]
    if not image_filenames:
        print(f"No images found in: {directory}")
        return

    # random.sample raises ValueError when asked for more items than exist,
    # so cap the request at the number of available images.
    sample_size = min(num_images, len(image_filenames))
    random_filenames = random.sample(image_filenames, sample_size)

    # Process each randomly selected image
    for image_filename in random_filenames:
        image_path = os.path.join(directory, image_filename)
        print(f"Processing: {image_filename}")

        # Load and process the image using doctr's DocumentFile and trained_predictor
        doc = DocumentFile.from_images(image_path)
        result = trained_predictor(doc)

        # Display the result (or handle as needed)
        result.show()

# Example usage: run the OCR predictor on one randomly chosen image from the
# sample folder and display the result.
# `trained_predictor` is the notebook-level predictor built in the detector cell.
# Previously used dataset location, kept for reference:
# directory = "old datasets\Grayscaled nielit form\Processed"
directory = "files/input/Single Sample Inference"
process_random_images_from_directory(directory, trained_predictor, num_images=1)


## Extract BBOX in COCO Format

### Batch

In [None]:
import os
import json
from doctr.io import DocumentFile
from PIL import Image

# Export word-level detections for every image in `image_dir` as a single
# COCO-style JSON file (`output_file`).
image_dir = "files/input/Single Sample Inference"
output_file = "files/input/Single Sample Inference/detection.json"
# model = default_predictor
model = trained_predictor

# Same extension set as the random-image viewer, matched case-insensitively so
# files such as "SCAN.JPG" are not silently skipped.
image_extensions = ('.png', '.jpg', '.jpeg', '.bmp', '.tiff')

# Initialize COCO data structure
coco_output = {
    "images": [],
    "annotations": [],
    "categories": [{
        "id": 1,
        "name": "word",
        "supercategory": "text",
    }]
}

annotation_id = 1  # Unique annotation ID
image_id = 1  # Unique image ID

# Loop over all files in the directory. os.listdir() returns entries in
# arbitrary order; sorting makes the image ids reproducible between runs.
for filename in sorted(os.listdir(image_dir)):
    if not filename.lower().endswith(image_extensions):
        continue

    image_path = os.path.join(image_dir, filename)

    # Load the document for the model and read the pixel size for scaling.
    doc = DocumentFile.from_images(image_path)
    # Only the size is needed; the context manager closes the file right away.
    with Image.open(image_path) as image:
        width, height = image.size

    # Add image info to COCO structure
    coco_output['images'].append({
        "file_name": filename,
        "height": height,
        "width": width,
        "id": image_id
    })

    # Get the result from the model
    result = model(doc)
    # Walk pages -> blocks -> lines -> words of the prediction
    for page in result.pages:
        for block in page.blocks:
            for line in block.lines:
                for word in line.words:
                    # Indexed below as ((x_min, y_min), (x_max, y_max)).
                    # Coordinates are scaled by the image size, so they are
                    # presumably relative (0-1) -- TODO confirm against doctr.
                    bbox = word.geometry
                    bbox_coco = [
                        bbox[0][0] * width,  # x_min
                        bbox[0][1] * height,  # y_min
                        (bbox[1][0] - bbox[0][0]) * width,  # width
                        (bbox[1][1] - bbox[0][1]) * height  # height
                    ]

                    # Append annotation to COCO structure
                    coco_output['annotations'].append({
                        "id": annotation_id,
                        "image_id": image_id,
                        "category_id": 1,  # Word category
                        "bbox": bbox_coco,
                        "area": bbox_coco[2] * bbox_coco[3],
                        "iscrowd": 0
                    })

                    annotation_id += 1

    image_id += 1

# Write the COCO JSON structure to the output file
with open(output_file, "w") as file:
    json.dump(coco_output, file)

print(f"COCO annotations saved in {output_file}")


In [3]:
import os
import torch
import json
from PIL import Image
from doctr.io import DocumentFile
from doctr.models import ocr_predictor, fast_base, detection_predictor

# NOTE(review): doctr presumably picks its backend from USE_TORCH while it is
# being imported; setting it here, after the doctr imports above, may have no
# effect -- confirm, and move this assignment above the imports if it matters.
os.environ["USE_TORCH"] = "1"
print(os.environ["USE_TORCH"])

# Initialize the default OCR predictor
# default_predictor = detection_predictor('fast_base',pretrained=True)

# Load custom detection model: build the FAST architecture without any
# pretrained weights, then restore the fine-tuned parameters from disk.
detector_model_path = "checkpoints/FAST/fast_base_20240926-134200_epoch100.pt"
det_model = fast_base(pretrained=False, pretrained_backbone=False)
# weights_only=True restricts unpickling to tensors and plain containers, which
# is all load_state_dict() needs, and avoids executing arbitrary pickled code.
# It also silences the warning torch emits for this call when the flag is unset.
det_params = torch.load(
    detector_model_path, map_location="cpu", weights_only=True
)
det_model.load_state_dict(det_params)

# Full OCR pipeline: custom detector + stock pretrained CRNN recogniser.
trained_predictor = ocr_predictor(
    det_arch=det_model,
    reco_arch="crnn_vgg16_bn",
    pretrained=True,
)
# Export word-level detections for one image as a COCO-style JSON file.
# image_dir = "files/input/Single Sample Inference"
image_path = "files/input/Single Sample Inference/0001_front.jpg"
output_file = "files/input/Single Sample Inference/detection.json"
# model = default_predictor
model = trained_predictor

# Initialize COCO data structure
coco_output = {
    "images": [],
    "annotations": [],
    "categories": [{
        "id": 1,
        "name": "word",
        "supercategory": "text",
    }]
}

annotation_id = 1  # Unique annotation ID
image_id = 1  # Unique image ID

# Load the document for the model and read the pixel size for scaling.
doc = DocumentFile.from_images(image_path)
# Only the size is needed; the context manager closes the file right away
# instead of leaving the handle open for the rest of the session.
with Image.open(image_path) as image:
    width, height = image.size

# Add image info to COCO structure
coco_output["images"].append(
    {
        "file_name": os.path.basename(image_path),
        "height": height,
        "width": width,
        "id": image_id,
    }
)

# Get the result from the model
result = model(doc)
# Walk pages -> blocks -> lines -> words of the prediction
for page in result.pages:
    for block in page.blocks:
        for line in block.lines:
            for word in line.words:
                # Indexed below as ((x_min, y_min), (x_max, y_max)).
                # Coordinates are scaled by the image size, so they are
                # presumably relative (0-1) -- TODO confirm against doctr.
                bbox = word.geometry
                bbox_coco = [
                    bbox[0][0] * width,  # x_min
                    bbox[0][1] * height,  # y_min
                    (bbox[1][0] - bbox[0][0]) * width,  # width
                    (bbox[1][1] - bbox[0][1]) * height,  # height
                ]

                # Append annotation to COCO structure
                coco_output["annotations"].append(
                    {
                        "id": annotation_id,
                        "image_id": image_id,
                        "category_id": 1,  # Word category
                        "bbox": bbox_coco,
                        "area": bbox_coco[2] * bbox_coco[3],
                        "iscrowd": 0,
                    }
                )

                annotation_id += 1

# Write the COCO JSON structure to the output file
with open(output_file, "w") as file:
    json.dump(coco_output, file)

print(f"COCO annotations saved in {output_file}")

1


  det_params = torch.load(detector_model_path, map_location="cpu")


COCO annotations saved in files/input/Single Sample Inference/detection.json


# Recognizer (TrOCR)

### Box Merger

In [None]:
import json
from typing import List, Dict
from dataclasses import dataclass

@dataclass
class BoundingBox:
    """Axis-aligned rectangle stored as origin plus size (COCO ``bbox`` order).

    The four edge coordinates are exposed as read-only properties derived
    from ``x``, ``y``, ``width`` and ``height``.
    """

    x: float  # horizontal coordinate of the left edge
    y: float  # vertical coordinate of the top edge
    width: float
    height: float

    @property
    def left(self) -> float:
        """Coordinate of the left edge (same as ``x``)."""
        return self.x

    @property
    def top(self) -> float:
        """Coordinate of the top edge (same as ``y``)."""
        return self.y

    @property
    def right(self) -> float:
        """Coordinate of the right edge: ``x + width``."""
        return self.x + self.width

    @property
    def bottom(self) -> float:
        """Coordinate of the bottom edge: ``y + height``."""
        return self.y + self.height

def load_coco_annotations(file_path: str) -> Dict:
    """Read a COCO-style JSON file and return its parsed content.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The decoded JSON document.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # JSON is UTF-8 by specification; be explicit so that non-ASCII file names
    # load the same way regardless of the platform's default encoding.
    with open(file_path, "r", encoding="utf-8") as f:
        return json.load(f)

def merge_line_boxes(
    boxes: List[BoundingBox], distance_threshold: float = 20
) -> List[BoundingBox]:
    """Merge word boxes that sit next to each other on the same text row.

    The merge runs in two stages:

    1. Boxes are sorted by ``(top, left)`` and chained into rows: a box joins
       the current row when both its top and bottom edges are within
       ``distance_threshold`` of the previous box in that order.
    2. Each row is re-sorted left to right and split wherever the horizontal
       gap to the part of the row merged so far exceeds
       ``distance_threshold``.

    Doing the horizontal test on a left-to-right ordering (rather than on the
    top-sorted ordering) keeps the outcome independent of small ``top``
    differences between words of one row: a box far to the left is no longer
    merged unconditionally, and adjacent words are no longer split because a
    third box happened to sort between them.

    Args:
        boxes: Boxes to merge; the list itself is not modified.
        distance_threshold: Maximum vertical edge difference and maximum
            horizontal gap, in the same units as the box coordinates.

    Returns:
        One enclosing box per merged group, rows in top-sorted order and
        groups left to right within a row. Empty when ``boxes`` is empty.
    """
    if not boxes:
        return []

    # Stage 1: chain vertically compatible boxes into rows.
    ordered = sorted(boxes, key=lambda b: (b.top, b.left))
    rows = [[ordered[0]]]
    for previous, box in zip(ordered, ordered[1:]):
        same_row = (
            abs(box.top - previous.top) <= distance_threshold
            and abs(box.bottom - previous.bottom) <= distance_threshold
        )
        if same_row:
            rows[-1].append(box)
        else:
            rows.append([box])

    # Stage 2: inside each row, merge runs of horizontally close boxes.
    merged_boxes = []
    for row in rows:
        row.sort(key=lambda b: b.left)
        segment = [row[0]]
        segment_right = row[0].right  # right-most edge merged so far
        for box in row[1:]:
            # A negative gap means the box overlaps the current segment.
            if box.left - segment_right <= distance_threshold:
                segment.append(box)
                segment_right = max(segment_right, box.right)
            else:
                merged_boxes.append(merge_boxes(segment))
                segment = [box]
                segment_right = box.right
        merged_boxes.append(merge_boxes(segment))

    return merged_boxes

def merge_boxes(boxes: List[BoundingBox]) -> BoundingBox:
    """Return the smallest box that encloses every box in ``boxes``.

    Args:
        boxes: Boxes to enclose; must contain at least one element.

    Returns:
        A new ``BoundingBox`` spanning the extreme edges of the inputs.

    Raises:
        ValueError: If ``boxes`` is empty (raised by ``min``).
    """
    x_min = min([b.left for b in boxes])
    y_min = min([b.top for b in boxes])
    x_max = max([b.right for b in boxes])
    y_max = max([b.bottom for b in boxes])

    enclosing_width = x_max - x_min
    enclosing_height = y_max - y_min
    return BoundingBox(x_min, y_min, enclosing_width, enclosing_height)

def process_coco_annotations(annotations: List[Dict]) -> List[BoundingBox]:
    """Convert COCO annotations to boxes and merge them into line boxes.

    Args:
        annotations: COCO annotation dicts; only the ``"bbox"`` entry
            (``[x, y, width, height]``) of each one is read.

    Returns:
        The result of ``merge_line_boxes`` with its default threshold.
        Every annotation is merged in one pass; ``image_id`` is not consulted.
    """
    word_boxes = []
    for annotation in annotations:
        x, y, width, height = annotation["bbox"]
        word_boxes.append(BoundingBox(x, y, width, height))
    return merge_line_boxes(word_boxes)

def create_merged_coco_annotations(
    original_coco: Dict, merged_boxes: List[BoundingBox]
) -> Dict:
    """Build a COCO document whose annotations are the merged line boxes.

    Args:
        original_coco: Source COCO document. It is shallow-copied, so entries
            other than ``"annotations"`` and ``"categories"`` are shared with
            the result.
        merged_boxes: Boxes to emit, one annotation each, ids starting at 1.

    Returns:
        The new COCO dict with a single ``"text_line"`` category.

    Note:
        Every emitted annotation carries the ``image_id`` of the first
        annotation in ``original_coco``, so the result is only meaningful for
        single-image inputs.
    """
    line_annotations = []
    for annotation_id, box in enumerate(merged_boxes, start=1):
        # Looked up per box (not hoisted) so an input without annotations and
        # without merged boxes does not fail on the [0] index.
        source_image_id = original_coco["annotations"][0]["image_id"]
        line_annotations.append(
            {
                "id": annotation_id,
                "image_id": source_image_id,
                "category_id": 1,
                "bbox": [box.x, box.y, box.width, box.height],
                "area": box.width * box.height,
                "iscrowd": 0,
            }
        )

    merged_coco = original_coco.copy()
    merged_coco["annotations"] = line_annotations
    merged_coco["categories"] = [
        {"id": 1, "name": "text_line", "supercategory": "text"}
    ]
    return merged_coco

def save_coco_annotations(coco_data: Dict, output_file_path: str):
    """Write ``coco_data`` to ``output_file_path`` as 2-space indented JSON.

    Args:
        coco_data: JSON-serialisable COCO document.
        output_file_path: Destination path; an existing file is overwritten.
    """
    serialized = json.dumps(coco_data, indent=2)
    with open(output_file_path, "w") as out_file:
        out_file.write(serialized)

def main(input_coco_file_path: str, output_coco_file_path: str):
    """Merge word boxes of a COCO file into line boxes and save the result.

    Args:
        input_coco_file_path: COCO JSON file holding word-level annotations.
        output_coco_file_path: Where the merged COCO JSON is written.
    """
    original_coco = load_coco_annotations(input_coco_file_path)
    word_annotations = original_coco["annotations"]
    merged_boxes = process_coco_annotations(word_annotations)

    # Report how much the merge reduced the number of boxes.
    print(f"Number of original boxes: {len(word_annotations)}")
    print(f"Number of merged line boxes: {len(merged_boxes)}")

    save_coco_annotations(
        create_merged_coco_annotations(original_coco, merged_boxes),
        output_coco_file_path,
    )

    print(f"Merged COCO annotations saved to: {output_coco_file_path}")

# Entry point of the box-merger cell: read the word-level detections and write
# the merged line-level annotations next to them.
if __name__ == "__main__":
    input_coco_file_path = "./files/input/Single Sample Inference/detection.json"
    # NOTE: this module-level name is read again by the recogniser's entry
    # point further down, so it must stay defined under this name.
    output_coco_file_path = "./files/input/Single Sample Inference/merged_coco_result.json"
    main(input_coco_file_path, output_coco_file_path)


In [None]:
import os
import cv2
import json
import numpy as np
import matplotlib.pyplot as plt

# Function to convert COCO bbox to a polygon (rectangle)
def bbox_to_polygon(bbox):
    """Convert a COCO ``[x, y, width, height]`` box into its four corners.

    Args:
        bbox: Sequence of exactly four numbers ``x, y, width, height``.

    Returns:
        Corner points as ``[x, y]`` lists, ordered top-left, top-right,
        bottom-right, bottom-left.
    """
    x, y, width, height = bbox
    right = x + width
    bottom = y + height

    top_left = [x, y]
    top_right = [right, y]
    bottom_right = [right, bottom]
    bottom_left = [x, bottom]
    return [top_left, top_right, bottom_right, bottom_left]

# Core function to visualize the annotations for a single image
def visualize_single_image(json_file, image_folder, image_filename):
    """Draw the COCO boxes of one image and display it with Matplotlib.

    Args:
        json_file: Path of the COCO JSON file with ``images``/``annotations``.
        image_folder: Folder that contains the image file.
        image_filename: File name to look up in the JSON and to load.

    Returns:
        None. Prints a message and returns early when the image is not listed
        in the JSON file or cannot be read from disk.
    """
    with open(json_file, 'r') as handle:
        coco = json.load(handle)

    # Locate the first "images" entry that matches the requested file name.
    image_data = None
    for entry in coco['images']:
        if entry['file_name'] == image_filename:
            image_data = entry
            break

    if not image_data:
        print(f"Image {image_filename} not found in JSON file.")
        return

    image_id = image_data['id']

    # cv2.imread signals failure by returning None instead of raising.
    canvas = cv2.imread(os.path.join(image_folder, image_filename))
    if canvas is None:
        print(f"Failed to load image {image_filename}.")
        return

    # Keep only the annotations that belong to this image.
    matching = [ann for ann in coco['annotations'] if ann['image_id'] == image_id]

    for ann in matching:
        x, y, width, height = ann['bbox']
        corners = [
            [x, y],
            [x + width, y],
            [x + width, y + height],
            [x, y + height],
        ]
        # polylines expects integer points shaped (n_points, 1, 2).
        outline = np.array(corners, np.int32).reshape((-1, 1, 2))
        # The canvas is still in BGR channel order here, so (255, 0, 0) is blue.
        cv2.polylines(canvas, [outline], isClosed=True, color=(255, 0, 0), thickness=2)

    # Matplotlib expects RGB, OpenCV loads BGR.
    canvas_rgb = cv2.cvtColor(canvas, cv2.COLOR_BGR2RGB)

    plt.figure(figsize=(20, 12))
    plt.imshow(canvas_rgb)
    plt.axis('off')  # Hide axis
    plt.show()

# Example usage
# Example usage: overlay the merged line boxes produced by the box-merger cell
# on the sample image. The JSON path is the merger's output file.
json_file = "./files/input/Single Sample Inference/merged_coco_result.json"
image_folder = 'files/input/Single Sample Inference'
image_filename = '0001_front.jpg'
visualize_single_image(json_file, image_folder, image_filename)


## Main

### Pre-requisites

In [6]:
import json
import os
import cv2
import numpy as np
from PIL import Image
import torch
import torchvision.transforms as v2
from transformers import VisionEncoderDecoderModel, TrOCRProcessor
# import matplotlib.pyplot as plt
from typing import List, Dict, Tuple
from dataclasses import dataclass


In [None]:
def load_coco_annotations(file_path: str) -> dict:
    """Read a COCO-style JSON file and return its parsed content.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The decoded JSON document.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # JSON is UTF-8 by specification; be explicit so that non-ASCII file names
    # load the same way regardless of the platform's default encoding.
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)

def preprocess_image(image):
    """Binarise and denoise an RGB crop before recognition.

    Args:
        image: Image convertible with ``np.array``; it is converted with
            ``COLOR_RGB2GRAY``, so RGB channel order is expected.

    Returns:
        A PIL image with three identical channels holding the cleaned,
        thresholded crop.
    """
    # Adaptive-threshold neighbourhood size and the constant subtracted from
    # the local weighted mean.
    block_size, offset = 15, 8
    # Denoising filter strength, template window and search window sizes.
    strength, template_window, search_window = 10, 7, 21

    pixels = np.array(image)
    gray = cv2.cvtColor(pixels, cv2.COLOR_RGB2GRAY)
    binary = cv2.adaptiveThreshold(
        gray,
        255,
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY,
        block_size,
        offset,
    )
    cleaned = cv2.fastNlMeansDenoising(
        binary, None, strength, template_window, search_window
    )
    # Expand back to three channels before wrapping as a PIL image.
    return Image.fromarray(cv2.cvtColor(cleaned, cv2.COLOR_GRAY2RGB))

def load_model_and_processor(model_path):
    """Load the TrOCR processor and the recognition model for inference.

    Args:
        model_path: Location passed to
            ``VisionEncoderDecoderModel.from_pretrained``.

    Returns:
        Tuple ``(processor, model, device)``. The processor always comes from
        ``microsoft/trocr-large-handwritten``; the model is moved to CUDA when
        available (CPU otherwise) and switched to eval mode.
    """
    processor = TrOCRProcessor.from_pretrained("microsoft/trocr-large-handwritten")

    if torch.cuda.is_available():
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')

    model = VisionEncoderDecoderModel.from_pretrained(model_path).to(device)
    model.eval()
    return processor, model, device

def process_image(img, bbox, processor, model, device, transform):
    """Recognise the text inside one bounding box of an image.

    Args:
        img: Source image offering ``crop``.
        bbox: ``[x, y, width, height]``; each value is truncated with ``int``.
        processor: Object whose ``batch_decode`` turns token ids into text.
        model: Model whose ``generate`` produces token ids from the tensor.
        device: Device the input tensor is moved to.
        transform: Callable converting the preprocessed crop into a tensor.

    Returns:
        The decoded text of the first (only) sequence in the batch.
    """
    left, top, box_w, box_h = (int(value) for value in bbox)

    # Cut out the region, then clean it up.
    region = img.crop((left, top, left + box_w, top + box_h))
    cleaned = preprocess_image(region)

    # Add the batch dimension and move the tensor to the model's device.
    batch = transform(cleaned).unsqueeze(0).to(device)

    # No gradients are needed for generation.
    with torch.no_grad():
        token_ids = model.generate(batch)

    decoded = processor.batch_decode(token_ids, skip_special_tokens=True)
    return decoded[0]

def main(coco_file_path, image_folder, model_path):
    """Recognise the text of every annotated box and print it per image.

    Args:
        coco_file_path: COCO JSON file with ``images`` and ``annotations``.
        image_folder: Folder containing the images named in the COCO file.
        model_path: Location of the recognition model checkpoint.
    """
    # Load COCO annotations
    coco_data = load_coco_annotations(coco_file_path)

    # Load model and processor
    processor, model, device = load_model_and_processor(model_path)

    # Define transform (`v2` is the file-level alias of torchvision.transforms):
    # fixed 384x384 input, scaled to [-1, 1] per channel.
    transform = v2.Compose([
        v2.Resize((384,384)),
        v2.ToTensor(),
        v2.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5]),
    ])

    # Index annotations by image once, instead of re-scanning the full
    # annotation list for every image.
    annotations_by_image = {}
    for ann in coco_data['annotations']:
        annotations_by_image.setdefault(ann['image_id'], []).append(ann)

    # Process each image
    for image_info in coco_data['images']:
        image_path = os.path.join(image_folder, image_info['file_name'])
        # convert() returns an independent in-memory copy, so the file handle
        # can be released immediately.
        with Image.open(image_path) as raw_image:
            img = raw_image.convert("RGB")

        # Sort annotations from top-left to bottom-right
        image_annotations = sorted(
            annotations_by_image.get(image_info['id'], []),
            key=lambda ann: (ann['bbox'][1], ann['bbox'][0]),
        )

        print(f"Processing image: {image_info['file_name']}")

        # # Display the full image
        # plt.figure(figsize=(12, 12))
        # plt.imshow(img)
        # plt.axis('off')
        # plt.title(f"Full Image: {image_info['file_name']}")
        # plt.show()

        # Process each bounding box and print recognized text
        for ann in image_annotations:
            generated_text = process_image(img, ann['bbox'], processor, model, device, transform)
            print(f"Recognized Text: {generated_text}")

        print("\n")

# Entry point of the recogniser cell.
if __name__ == "__main__":
    # NOTE(review): `output_coco_file_path` is not defined in this cell; it is
    # the module-level variable set by the box-merger cell's entry point, so
    # that cell has to be run first or this line raises NameError.
    coco_file_path = output_coco_file_path
    image_folder = "./files/input/Single Sample Inference"
    model_path = "./checkpoints/TrOCR"
    main(coco_file_path, image_folder, model_path)