In [2]:
import cv2
import numpy as np
from ultralytics import YOLO
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
from PIL import Image
import json

def extract_text_from_boxes(boxes, image, trocr_processor, trocr_model):
    """Run TrOCR on every bounding-box crop of ``image`` and return the decoded text.

    Args:
        boxes: Iterable of detection dicts. Each must have a ``"box"`` mapping
            with ``x1``, ``y1``, ``x2``, ``y2`` pixel coordinates (the structure
            ``main`` obtains from ``json.loads(results[0].tojson())``).
        image: Image array indexed as ``image[y, x]``. ``main`` passes the
            result of ``cv2.imread``, so 3-channel input is treated as BGR.
        trocr_processor: Called on a PIL image with ``return_tensors="pt"``;
            its result must expose ``pixel_values``. Also used for
            ``batch_decode``.
        trocr_model: Object whose ``generate(pixel_values)`` returns token ids.

    Returns:
        list[str]: One entry per box, in input order. Boxes whose crop is
        empty after clamping to the image bounds yield ``""`` so the result
        stays index-aligned with ``boxes``.
    """
    boxes = list(boxes)
    if not boxes:
        return []

    img_h, img_w = image.shape[:2]
    extracted_text = []

    for box in boxes:
        coords = box["box"]
        # Clamp to the image: negative indices would otherwise wrap around in
        # NumPy slicing and silently crop the wrong region.
        x1 = min(max(int(coords["x1"]), 0), img_w)
        y1 = min(max(int(coords["y1"]), 0), img_h)
        x2 = min(max(int(coords["x2"]), 0), img_w)
        y2 = min(max(int(coords["y2"]), 0), img_h)

        # int() truncation can collapse a thin box to zero width/height; an
        # empty crop cannot be OCR'd, so record an empty string instead.
        if x2 <= x1 or y2 <= y1:
            extracted_text.append("")
            continue

        cropped_image = image[y1:y2, x1:x2]

        # cv2.imread yields BGR, while PIL (and therefore TrOCR) expects RGB.
        if cropped_image.ndim == 3 and cropped_image.shape[2] == 3:
            cropped_image = cv2.cvtColor(cropped_image, cv2.COLOR_BGR2RGB)

        cropped_pil_image = Image.fromarray(cropped_image)

        # Preprocess the crop and decode the generated token ids into text.
        pixel_values = trocr_processor(cropped_pil_image, return_tensors="pt").pixel_values
        generated_ids = trocr_model.generate(pixel_values)
        generated_text = trocr_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

        extracted_text.append(generated_text)

    return extracted_text

def main():
    """Detect text regions with YOLO, OCR each one with TrOCR and print the text.

    Raises:
        FileNotFoundError: If ``cv2.imread`` cannot read the input image.
    """
    input_path = 'images/handwritten7.png'

    # cv2.imread returns None instead of raising when the file is missing or
    # cannot be decoded; fail here with a clear message before loading models.
    image = cv2.imread(input_path)
    if image is None:
        raise FileNotFoundError(f"Could not read image: {input_path}")

    # Load the YOLO model
    yolo_model = YOLO("best.pt")

    # Load the TrOCR model
    trocr_processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
    trocr_model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")

    # Run inference using the YOLO model and parse the first result's JSON
    # into a list of detection dicts.
    results = yolo_model(input_path)
    detections = json.loads(results[0].tojson())

    # Boxes are processed in the order the detector returned them.
    extracted_text = extract_text_from_boxes(detections, image, trocr_processor, trocr_model)

    # Print the extracted text
    for i, text in enumerate(extracted_text, start=1):
        print(f"Text from Bounding Box {i}: {text}")

# Entry point: call main() when this code runs with __name__ set to "__main__".
if __name__ == "__main__":
    main()

Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.
Some weights of VisionEncoderDecoderModel were not initialized from the model checkpoint at microsoft/trocr-base-handwritten and are newly initialized: ['encoder.pooler.dense.bias', 'encoder.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.

image 1/1 c:\Users\s.j\Desktop\Ali Asghar\Automatic-Grading\POC\Line Extractor\images\handwritten7.png: 480x800 4 lines, 402.0ms
Speed: 4.0ms preprocess, 402.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 800)


Text from Bounding Box 1: " our team for this project. " it owns kills will "
Text from Bounding Box 2: rainly be survived assistant firm looking -
Text from Bounding Box 3: Welcome onboard. I'm very excited to have you
Text from Bounding Box 4: ward to seeing what you come up with


In [13]:
import cv2
import numpy as np
from ultralytics import YOLO
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
from PIL import Image
import json

# Sort bounding boxes top-to-bottom, optionally dropping low-confidence ones
def sort_and_filter_boxes(boxes, min_confidence=None):
    """Return detections ordered by the top edge of their box.

    Args:
        boxes: Iterable of detection dicts, each with ``box["y1"]``. When
            ``min_confidence`` is given, each must also have ``"confidence"``.
        min_confidence: Optional threshold. When not ``None``, only boxes with
            ``confidence > min_confidence`` are kept. The default (``None``)
            keeps every box.

    Returns:
        list: A new list sorted by ascending ``box["y1"]``; the input is not
        modified. Boxes with equal ``y1`` keep their relative input order.
    """
    if min_confidence is not None:
        boxes = [box for box in boxes if box["confidence"] > min_confidence]

    return sorted(boxes, key=lambda box: box["box"]["y1"])

def extract_text_from_boxes(boxes, image, trocr_processor, trocr_model):
    """Run TrOCR on every bounding-box crop of ``image`` and return the decoded text.

    Args:
        boxes: Iterable of detection dicts. Each must have a ``"box"`` mapping
            with ``x1``, ``y1``, ``x2``, ``y2`` pixel coordinates (the structure
            ``main`` obtains from ``json.loads(results[0].tojson())``).
        image: Image array indexed as ``image[y, x]``. ``main`` passes the
            result of ``cv2.imread``, so 3-channel input is treated as BGR.
        trocr_processor: Called on a PIL image with ``return_tensors="pt"``;
            its result must expose ``pixel_values``. Also used for
            ``batch_decode``.
        trocr_model: Object whose ``generate(pixel_values)`` returns token ids.

    Returns:
        list[str]: One entry per box, in input order. Boxes whose crop is
        empty after clamping to the image bounds yield ``""`` so the result
        stays index-aligned with ``boxes``.
    """
    boxes = list(boxes)
    if not boxes:
        return []

    img_h, img_w = image.shape[:2]
    extracted_text = []

    for box in boxes:
        coords = box["box"]
        # Clamp to the image: negative indices would otherwise wrap around in
        # NumPy slicing and silently crop the wrong region.
        x1 = min(max(int(coords["x1"]), 0), img_w)
        y1 = min(max(int(coords["y1"]), 0), img_h)
        x2 = min(max(int(coords["x2"]), 0), img_w)
        y2 = min(max(int(coords["y2"]), 0), img_h)

        # int() truncation can collapse a thin box to zero width/height; an
        # empty crop cannot be OCR'd, so record an empty string instead.
        if x2 <= x1 or y2 <= y1:
            extracted_text.append("")
            continue

        cropped_image = image[y1:y2, x1:x2]

        # cv2.imread yields BGR, while PIL (and therefore TrOCR) expects RGB.
        if cropped_image.ndim == 3 and cropped_image.shape[2] == 3:
            cropped_image = cv2.cvtColor(cropped_image, cv2.COLOR_BGR2RGB)

        cropped_pil_image = Image.fromarray(cropped_image)

        # Preprocess the crop and decode the generated token ids into text.
        pixel_values = trocr_processor(cropped_pil_image, return_tensors="pt").pixel_values
        generated_ids = trocr_model.generate(pixel_values)
        generated_text = trocr_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

        extracted_text.append(generated_text)

    return extracted_text

def main():
    """Detect text lines with YOLO, sort them top-to-bottom, OCR each and print.

    Raises:
        FileNotFoundError: If ``cv2.imread`` cannot read the input image.
    """
    input_path = 'images/not_working.jpg'

    # cv2.imread returns None instead of raising when the file is missing or
    # cannot be decoded; fail here with a clear message before loading models.
    image = cv2.imread(input_path)
    if image is None:
        raise FileNotFoundError(f"Could not read image: {input_path}")

    # Load the YOLO model
    yolo_model = YOLO("model/bangla+iam.pt")

    # Load the TrOCR model
    trocr_processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
    trocr_model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")

    # Run inference with a 0.45 confidence threshold and parse the first
    # result's JSON into a list of detection dicts.
    results = yolo_model(input_path, conf=0.45)
    detections = json.loads(results[0].tojson())

    # Order the boxes by their top edge so the text prints in reading order.
    sorted_and_filtered_boxes = sort_and_filter_boxes(detections)

    # Extract text from the sorted bounding boxes
    extracted_text = extract_text_from_boxes(sorted_and_filtered_boxes, image, trocr_processor, trocr_model)

    # Print the extracted text
    for i, text in enumerate(extracted_text, start=1):
        print(f"Text from Bounding Box {i}: {text}")

# Entry point: call main() when this code runs with __name__ set to "__main__".
if __name__ == "__main__":
    main()

Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.
Some weights of VisionEncoderDecoderModel were not initialized from the model checkpoint at microsoft/trocr-base-handwritten and are newly initialized: ['encoder.pooler.dense.weight', 'encoder.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.

image 1/1 c:\Users\s.j\Desktop\Ali Asghar\Automatic-Grading\POC\Line Extractor\images\not_working.jpg: 768x800 12 0s, 538.0ms
Speed: 9.0ms preprocess, 538.0ms inference, 2.0ms postprocess per image at shape (1, 3, 768, 800)


Text from Bounding Box 1: degree the Germans wife members of eligibility
Text from Bounding Box 2: future, things with state, care of hope. Children
Text from Bounding Box 3: " He'll smell, in this children, the fighting
Text from Bounding Box 4: fabriate which has husband, elected : "
Text from Bounding Box 5: " She does not, want to abandon them.
Text from Bounding Box 6: fate she wind like worn - later cotton
Text from Bounding Box 7: " foods. She, with her all, courage, embraces
Text from Bounding Box 8: jimbabwe, death, but, to bravely face,
Text from Bounding Box 9: " be alive " etc teach her children not
Text from Bounding Box 10: its fight with a clenched fist for not
Text from Bounding Box 11: only like basic need of food but also,
Text from Bounding Box 12: to tell the movement from about


In [18]:
import cv2
import numpy as np
from ultralytics import YOLO
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
from PIL import Image
import json

class ImgCorrect():
    """Estimate and undo the skew of an image using Hough line segments.

    Usage as in ``dskew``: construct with an image, call ``img_lines()`` to
    detect segments, then ``search_lines()`` for the dominant angle and
    ``rotate_image()`` to produce the corrected image.
    """

    # Length in pixels the shorter image side is scaled to before analysis.
    TARGET_SHORT_SIDE = 700

    def __init__(self, img):
        """Resize ``img`` so its shorter side is 700 px and cache a grayscale copy.

        Args:
            img: 3-channel image array; its shape is unpacked as
                ``(h, w, channel)`` and it is converted with ``COLOR_BGR2GRAY``.
        """
        self.img = img
        # h / w / channel describe the image as passed in; they are not
        # updated after the resize below.
        self.h, self.w, self.channel = self.img.shape
        self.scale = self.TARGET_SHORT_SIDE / min(self.w, self.h)
        self.img = cv2.resize(self.img, (0, 0), fx=self.scale, fy=self.scale, interpolation=cv2.INTER_NEAREST)
        self.gray = cv2.cvtColor(self.img, cv2.COLOR_BGR2GRAY)
        # Filled in by img_lines(); None means "no segments detected (yet)".
        self.lines = None

    def img_lines(self):
        """Detect line segments and return a copy of the image with them drawn.

        Stores the raw ``cv2.HoughLinesP`` result in ``self.lines``.

        Returns:
            A copy of the resized image with each segment drawn in green, or
            ``None`` when no segment was found.
        """
        # Inverted Otsu binarisation, dilated once, then edge detection.
        _, binary = cv2.threshold(self.gray, 0, 255, cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU)
        kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
        binary = cv2.dilate(binary, kernel)
        edges = cv2.Canny(binary, 50, 200)

        self.lines = cv2.HoughLinesP(edges, 1, np.pi / 180, 100, minLineLength=100, maxLineGap=20)

        if self.lines is None:
            return None

        imglines = self.img.copy()
        for x1, y1, x2, y2 in self.lines[:, 0, :]:
            # Plain ints keep cv2.line independent of the NumPy scalar type.
            cv2.line(imglines, (int(x1), int(y1)), (int(x2), int(y2)), (0, 255, 0), 3)
        return imglines

    def search_lines(self):
        """Return the dominant segment angle in degrees.

        Segments are grouped into six buckets: vertical, (0, 45), [45, 90),
        (-45, 0), (-90, -45] and horizontal. The most populated bucket wins,
        with ties resolved in that order.

        Returns:
            ``90`` for the vertical bucket, ``0`` for the horizontal bucket,
            otherwise the mean angle of the winning bucket. Returns ``0`` when
            no segments are available (``self.lines`` is ``None``, unset or
            empty), matching the zero rotation ``dskew`` applies in that case.
        """
        lines = getattr(self, "lines", None)
        if lines is None or len(lines) == 0:
            return 0

        vertical_count = 0
        horizontal_count = 0
        pos_shallow, pos_steep, neg_shallow, neg_steep = [], [], [], []

        for x1, y1, x2, y2 in lines[:, 0, :]:
            if x2 == x1:
                # Slope is undefined for a vertical segment.
                vertical_count += 1
                continue
            degree = np.degrees(np.arctan((y2 - y1) / (x2 - x1)))
            if y2 == y1:
                horizontal_count += 1
            elif 0 < degree < 45:
                pos_shallow.append(degree)
            elif 45 <= degree < 90:
                pos_steep.append(degree)
            elif -45 < degree < 0:
                neg_shallow.append(degree)
            elif -90 < degree <= -45:
                neg_steep.append(degree)

        max_number = max(
            vertical_count,
            len(pos_shallow),
            len(pos_steep),
            len(neg_shallow),
            len(neg_steep),
            horizontal_count,
        )

        # max_number >= 1 here because every segment lands in exactly one
        # bucket, so the winning bucket below is never empty.
        if max_number == vertical_count:
            return 90
        for bucket in (pos_shallow, pos_steep, neg_shallow, neg_steep):
            if max_number == len(bucket):
                return sum(bucket) / len(bucket)
        return 0

    def rotate_image(self, degree, border_value=(255, 255, 255)):
        """Rotate the resized image by ``degree`` on an enlarged canvas.

        Angles in [-90, -45) and (45, 90] are first folded into [-45, 45] by
        adding or subtracting 90.

        Args:
            degree: Skew angle in degrees, e.g. from ``search_lines()``.
            border_value: Colour for the area uncovered by the rotation.
                Defaults to white, matching the white padding ``dskew`` adds.
                Previously a white-bordered warp was computed and discarded
                and a green-bordered one returned; pass ``(0, 255, 0)`` to get
                that output.

        Returns:
            The rotated image, sized to contain the whole rotated frame.
        """
        if -90 <= degree < -45:
            degree = 90 + degree
        elif 45 < degree <= 90:
            degree = degree - 90

        height, width = self.img.shape[:2]
        angle = np.radians(degree)
        abs_sin = np.abs(np.sin(angle))
        abs_cos = np.abs(np.cos(angle))
        heightNew = int(width * abs_sin + height * abs_cos)
        widthNew = int(height * abs_sin + width * abs_cos)

        # Rotate about the centre, then shift so the result is centred in the
        # enlarged canvas.
        matRotation = cv2.getRotationMatrix2D((width / 2, height / 2), degree, 1)
        matRotation[0, 2] += (widthNew - width) / 2
        matRotation[1, 2] += (heightNew - height) / 2

        return cv2.warpAffine(self.img, matRotation, (widthNew, heightNew), borderValue=border_value)

def dskew(line_path, img):
    """Load an image, pad it with a white border and return a deskewed copy.

    Args:
        line_path: Path prefix that is concatenated directly with ``img``, so
            a directory must include its trailing separator (``"images/"``).
        img: Image file name appended to ``line_path``.

    Returns:
        The rotated image from ``ImgCorrect.rotate_image``. When no line
        segments are detected the rotation angle is 0.

    Raises:
        FileNotFoundError: If ``cv2.imread`` cannot read the image.
    """
    img_loc = line_path + img
    im = cv2.imread(img_loc)
    # cv2.imread signals failure by returning None rather than raising.
    if im is None:
        raise FileNotFoundError(f"Could not read image: {img_loc}")

    # 100 px of white on every side before line detection and rotation.
    pad_img = cv2.copyMakeBorder(im, 100, 100, 100, 100, cv2.BORDER_CONSTANT, value=[255, 255, 255])
    imgcorrect = ImgCorrect(pad_img)

    # img_lines() must run first: it populates the segments search_lines() reads.
    if imgcorrect.img_lines() is None:
        degree = 0
    else:
        degree = imgcorrect.search_lines()

    return imgcorrect.rotate_image(degree)

# Order detections from the top of the image to the bottom
def sort_and_filter_boxes(boxes):
    """Return a new list of ``boxes`` ordered by ascending ``box["y1"]``.

    Despite the name, no filtering is applied: every input box is returned.
    Boxes with equal ``y1`` keep their relative input order.
    """
    ordered = list(boxes)
    ordered.sort(key=lambda detection: detection["box"]["y1"])
    return ordered

def extract_text_from_boxes(boxes, image, trocr_processor, trocr_model):
    """Run TrOCR on every bounding-box crop of ``image`` and return the decoded text.

    Args:
        boxes: Iterable of detection dicts. Each must have a ``"box"`` mapping
            with ``x1``, ``y1``, ``x2``, ``y2`` pixel coordinates (the structure
            ``main`` obtains from ``json.loads(results[0].tojson())``).
        image: Image array indexed as ``image[y, x]``. ``main`` passes the
            result of ``cv2.imread``, so 3-channel input is treated as BGR.
        trocr_processor: Called on a PIL image with ``return_tensors="pt"``;
            its result must expose ``pixel_values``. Also used for
            ``batch_decode``.
        trocr_model: Object whose ``generate(pixel_values)`` returns token ids.

    Returns:
        list[str]: One entry per box, in input order. Boxes whose crop is
        empty after clamping to the image bounds yield ``""`` so the result
        stays index-aligned with ``boxes``.
    """
    boxes = list(boxes)
    if not boxes:
        return []

    img_h, img_w = image.shape[:2]
    extracted_text = []

    for box in boxes:
        coords = box["box"]
        # Clamp to the image: negative indices would otherwise wrap around in
        # NumPy slicing and silently crop the wrong region.
        x1 = min(max(int(coords["x1"]), 0), img_w)
        y1 = min(max(int(coords["y1"]), 0), img_h)
        x2 = min(max(int(coords["x2"]), 0), img_w)
        y2 = min(max(int(coords["y2"]), 0), img_h)

        # int() truncation can collapse a thin box to zero width/height; an
        # empty crop cannot be OCR'd, so record an empty string instead.
        if x2 <= x1 or y2 <= y1:
            extracted_text.append("")
            continue

        cropped_image = image[y1:y2, x1:x2]

        # cv2.imread yields BGR, while PIL (and therefore TrOCR) expects RGB.
        if cropped_image.ndim == 3 and cropped_image.shape[2] == 3:
            cropped_image = cv2.cvtColor(cropped_image, cv2.COLOR_BGR2RGB)

        cropped_pil_image = Image.fromarray(cropped_image)

        # Preprocess the crop and decode the generated token ids into text.
        pixel_values = trocr_processor(cropped_pil_image, return_tensors="pt").pixel_values
        generated_ids = trocr_model.generate(pixel_values)
        generated_text = trocr_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

        extracted_text.append(generated_text)

    return extracted_text

def main():
    """Detect and OCR text lines, print them, then show a deskewed copy of the image.

    Raises:
        FileNotFoundError: If ``cv2.imread`` cannot read the input image.
    """
    import os  # local import: only needed to split the input path below

    input_path = 'images/not_working.jpg'

    # cv2.imread returns None instead of raising when the file is missing or
    # cannot be decoded; fail here with a clear message before loading models.
    image = cv2.imread(input_path)
    if image is None:
        raise FileNotFoundError(f"Could not read image: {input_path}")

    # Load the YOLO model
    yolo_model = YOLO("model/bangla+iam.pt")

    # Load the TrOCR model
    trocr_processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
    trocr_model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")

    # Run inference and parse the first result's JSON into detection dicts.
    # NOTE(review): the earlier cell calls the model with conf=0.45; without it
    # the recorded run reports 21 detections instead of 12, including
    # near-duplicate lines. Confirm which threshold is intended.
    results = yolo_model(input_path)
    detections = json.loads(results[0].tojson())

    # Order the boxes by their top edge so the text prints in reading order.
    sorted_and_filtered_boxes = sort_and_filter_boxes(detections)

    # Extract text from the sorted bounding boxes
    extracted_text = extract_text_from_boxes(sorted_and_filtered_boxes, image, trocr_processor, trocr_model)

    # Print the extracted text
    for i, text in enumerate(extracted_text, start=1):
        print(f"Text from Bounding Box {i}: {text}")

    # dskew() concatenates prefix + file name, so the directory part must keep
    # its trailing separator; os.path.join(directory, "") adds it, and yields
    # "" when the path has no directory part.
    directory, img_name = os.path.split(input_path)
    line_path = os.path.join(directory, "")

    # Perform skew correction
    corrected_image = dskew(line_path, img_name)

    # Display the corrected image in a native window until a key is pressed.
    cv2.imshow('Corrected Image', corrected_image)
    cv2.waitKey(0)
    cv2.destroyAllWindows()

# Entry point: call main() when this code runs with __name__ set to "__main__".
if __name__ == "__main__":
    main()

Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.
Some weights of VisionEncoderDecoderModel were not initialized from the model checkpoint at microsoft/trocr-base-handwritten and are newly initialized: ['encoder.pooler.dense.weight', 'encoder.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.

image 1/1 c:\Users\s.j\Desktop\Ali Asghar\Automatic-Grading\POC\Line Extractor\images\not_working.jpg: 768x800 21 0s, 547.0ms
Speed: 7.0ms preprocess, 547.0ms inference, 1.0ms postprocess per image at shape (1, 3, 768, 800)


Text from Bounding Box 1: degree the Germans wife members of eligibility
Text from Bounding Box 2: future, things with state, care of hope. Children
Text from Bounding Box 3: gate, believe that state care of huge child
Text from Bounding Box 4: " What thou know it of the twentful thing caught
Text from Bounding Box 5: " He'll smell, in this children, the fighting
Text from Bounding Box 6: fabriate which has husband, elected : "
Text from Bounding Box 7: " She does not, want to abandon them.
Text from Bounding Box 8: fate she wind like worn - later cotton
Text from Bounding Box 9: " foods. She, with her all, courage, embraces
Text from Bounding Box 10: flipped and the twelfth tier children's most notorious
Text from Bounding Box 11: " Life and " teach her children. " not to
Text from Bounding Box 12: jimbabwe, death, but, to bravely face,
Text from Bounding Box 13: flifle and its struggles that " is why
Text from Bounding Box 14: she says that she would continue to
Text from Bounding Bo