# End-to-End Pipeline with YOLO + OCR Engines

## Load Detector

In [37]:
from ultralytics import YOLO

# Load the YOLO detector from the local weights file.
# NOTE(review): presumably fine-tuned to locate the fields of a vehicle
# registration book (see the class names printed by `predict`) — confirm.
model = YOLO("../assets/carbook.pt")

## Utility functions for evaluation

In [97]:
import pandas as pd
import numpy as np
from torchmetrics.text import CharErrorRate

def calculate_cer(predictions: list[str], labels: list[str]) -> float:
    """Return the Character Error Rate (CER) between the predicted and target strings.

    Args:
        predictions: Predicted strings, one per sample.
        labels: Reference strings, aligned one-to-one with ``predictions``.

    Returns:
        The value produced by ``CharErrorRate`` for the whole batch, converted
        with ``Tensor.tolist()``.
        NOTE(review): torchmetrics documents the metric as a single scalar
        (total edit errors / total reference characters), so this is one float
        rather than a per-sample list — confirm against the installed version.
    """
    # A fresh metric object per call, so no state carries over between calls.
    cer = CharErrorRate()
    cer_val = cer(predictions, labels)
    return cer_val.tolist()

def evaluate(label_df: pd.DataFrame, prediction_df: pd.DataFrame) -> pd.DataFrame:
    """Score OCR predictions against the annotations, one field at a time.

    The annotation frame is first reshaped to the prediction schema:
    ``plate1``/``plate2`` are joined into ``registration_no``;
    ``axles_wheels_no``/``wheels``/``tires`` are joined into one sentence
    (``'x x x' -> 'x เพลา x ล้อ ยาง x เส้น'``); and a handful of columns are
    renamed. Rows are then paired **by position** (first annotation with first
    prediction, and so on), so both frames must list the documents in the same
    order. Neither input frame is modified.

    Args:
        label_df: Annotations. Must contain ``plate1``, ``plate2``, ``wheels``,
            ``tires`` and the annotation-side names of the evaluated fields.
            Values are expected to be strings.
        prediction_df: OCR output with one column per evaluated field.

    Returns:
        A DataFrame with one row per evaluated field and the columns
        ``column_name``, ``cer`` (as returned by ``calculate_cer``) and
        ``accuracy`` (percentage of rows whose prediction equals the
        annotation exactly).

    Raises:
        KeyError: If a required column is missing from either frame.
    """
    import warnings

    # Work on copies so the caller's frames are never mutated.
    label_df = label_df.copy()
    prediction_df = prediction_df.copy()

    # Merge 'plate1' and 'plate2' into 'registration_no'
    label_df["registration_no"] = label_df["plate1"] + " " + label_df["plate2"]

    # Merge 'axles_wheels_no', 'wheels', 'tires' into 'axles_wheels_no'
    # e.g. 'x x x' -> 'x เพลา x ล้อ ยาง x เส้น'
    axles = label_df["axles_wheels_no"] + " เพลา"
    wheels = label_df["wheels"] + " ล้อ"
    tires = "ยาง " + label_df["tires"] + " เส้น"
    label_df["axles_wheels_no"] = axles + " " + wheels + " " + tires

    label_df = label_df.drop(columns=["plate1", "plate2", "wheels", "tires"])

    # Rename the remaining annotation columns to the prediction column names
    rename_dict = {
        "province": "car_province",
        "type_car": "vehicle_use",
        "kind": "body_style",
        "num_body": "chassis_number",
        "brand": "manufacturer",
        "num_engine": "engine_number",
    }
    label_df = label_df.rename(columns=rename_dict)

    # Replace any NaN with ''
    label_df = label_df.fillna("")
    prediction_df = prediction_df.fillna("")

    # Replace 'ไม่พบข้อมูล' with '' in the year column of both frames
    prediction_df["year"] = prediction_df["year"].map(lambda x: x.replace("ไม่พบข้อมูล", ""))
    label_df["year"] = label_df["year"].map(lambda x: x.replace("ไม่พบข้อมูล", ""))

    columns_of_interest = [
        'date_of_registration', 'registration_no', 'car_province', 'vehicle_use', 'type', 'body_style',
        'manufacturer', 'model', 'year', 'color', 'chassis_number', 'chassis_location', 'engine_manufacturer',
        'engine_number', 'engine_location', 'fuel_type', 'fuel_tank_number', 'cylinders', 'cubic_capacity',
        'horse_power', 'axles_wheels_no', 'unladen_weight', 'load_capacity', 'gross_weight', 'seats'
    ]

    # Rows are paired by position; a length mismatch means some rows are
    # silently left out of the inner merge below, so make that visible.
    if len(label_df) != len(prediction_df):
        warnings.warn(
            f"label_df has {len(label_df)} rows but prediction_df has "
            f"{len(prediction_df)}; only the overlapping leading rows are compared.",
            stacklevel=2,
        )

    # Keep the evaluated columns and attach a positional key to both frames.
    # `assign` builds new frames, so no write lands on a slice of the input.
    label_df = label_df[columns_of_interest].assign(index=range(len(label_df)))
    prediction_df = prediction_df[columns_of_interest].assign(index=range(len(prediction_df)))

    # Every evaluated column exists on both sides, so each one receives a suffix.
    merged_df = pd.merge(label_df, prediction_df, on="index", suffixes=('_annotation', '_prediction'))

    eval_list = []
    for col in columns_of_interest:
        predicted = merged_df[f"{col}_prediction"]
        annotated = merged_df[f"{col}_annotation"]
        eval_list.append({
            "column_name": col,
            "cer": np.mean(calculate_cer(predicted, annotated)),
            "accuracy": (predicted == annotated).mean() * 100,
        })
    return pd.DataFrame(eval_list)

def post_process_text(text: str) -> str:
    """Normalise whitespace in *text*.

    Every run of whitespace becomes a single space, and whitespace at either
    end is removed.
    """
    # Splitting on whitespace discards leading, trailing and repeated
    # separators, so re-joining the tokens leaves nothing further to strip.
    tokens = text.split()
    return " ".join(tokens)

## Determine dataset directory

Set `dataset_dir` to the folder where the dataset images are stored.

In [None]:
from pathlib import Path

# Folder that holds the document photos
dataset_dir = Path("../datasets/Data/Srisawad_Dataset_100")

# Collect every .jpg directly inside the folder, sorted by path.
# NOTE(review): evaluate() pairs predictions with annotation rows by position,
# so this order presumably has to match the annotation sheet — confirm.
image_paths = sorted(dataset_dir.glob("*.jpg"))

# Show the first path as the cell output (quick sanity check)
image_paths[0]

## Get bboxes from YOLO first

In [143]:
# Run the detector over all images in a single call and keep the returned
# results in `yolo_results`.
# NOTE(review): half=True together with device="cuda" presumably requires a
# CUDA-capable GPU — confirm before running elsewhere.
yolo_results = model.predict(image_paths, imgsz=640, conf=0.25, half=True, device="cuda")


0: 640x640 1  (cylinders), 1  (unladen_weight), 1  (horse_power), 1  (color), 1  (model), 1 / (load_capacity), 1  (chassis_number), 1  (engine_manufacturer), 1  (manufacturer), 1  (cubic_capacity), 1  (chassis_location), 1  (car_province), 1  (body_style), 1  (type), 1  (date_of_registration), 1  (vehicle_use), 1  (gross_weight), 1  (engine_number), 1   (year), 1  (registration_no), 1  (engine_location), 1  (axles_wheels_no), 1  (fuel_type), 1.6ms
1: 640x640 1  (cylinders), 1  (unladen_weight), 1  (color), 1 / (load_capacity), 1  (chassis_number), 1  (engine_manufacturer), 1  (manufacturer), 1  (cubic_capacity), 1  (chassis_location), 1  (car_province), 1  (body_style), 1  (type), 1  (date_of_registration), 1  (vehicle_use), 1  (gross_weight), 1  (engine_number), 1  (registration_no), 1  (engine_location), 1  (axles_wheels_no), 1  (fuel_type), 1.6ms
2: 640x640 1  (cylinders), 1  (unladen_weight), 1  (horse_power), 1  (color), 1  (model), 1 / (load_capacity), 1  (chassis_number), 1  (e

## Crop textboxes from YOLO results

In [None]:
from PIL import Image

def get_textboxes(result) -> list:
    """Crop every detected bounding box out of one YOLO result.

    Args:
        result: One element of the list returned by ``model.predict``; its
            ``orig_img`` and ``boxes`` attributes are read.

    Returns:
        A list of PIL images, one crop per detected box, in the order of
        ``result.boxes``.
    """
    # NOTE(review): the crops keep the channel order of `result.orig_img`
    # (presumably OpenCV-style BGR) — confirm what the consumer expects.
    image = Image.fromarray(result.orig_img)

    # The first four columns of each box row are the corner coordinates
    # x1, y1, x2, y2; round them to integer pixels for cropping.
    bboxes = result.boxes.data[:, :4].round().cpu().int().numpy().tolist()
    return [image.crop(tuple(bbox)) for bbox in bboxes]


# Get all textbox images, across every YOLO result
textbox_images = []
for result in yolo_results:
    textbox_images.extend(get_textboxes(result))

## 1. Surya

In [None]:
from surya.model.recognition.model import load_model as load_recognizer
from surya.model.recognition.processor import load_processor as load_recognizer_processor

# Load the Surya recognition model and its processor once, so the
# recognition loop can reuse the same objects for every document.
recognizer = load_recognizer()
recognizer_processor = load_recognizer_processor()

In [None]:
from tqdm import tqdm
from surya.recognition import batch_recognition
from PIL import Image

document_data = []
class_names = {}  # replaced by `result.names` as soon as a document is processed
# Detect bboxes for each document
for image_path in tqdm(image_paths):
    result = model.predict(image_path, imgsz=640, conf=0.25, half=True, device="cuda")[0]

    # Get image.
    # Ultralytics keeps `orig_img` in OpenCV's BGR channel order, while PIL
    # (and therefore Surya) interprets the array as RGB, so reverse the
    # channel axis before building the image.
    image = Image.fromarray(result.orig_img[:, :, ::-1])

    # Get bounding boxes and class labels
    class_names = result.names
    bboxes = result.boxes.data[:, :4].round().cpu().int().numpy().tolist()
    class_predictions = result.boxes.cls.cpu().int().numpy()

    # Crop one textbox image per detected box (x1, y1, x2, y2)
    textbox_images = [image.crop(tuple(bbox)) for bbox in bboxes]

    # Get all text predictions; skip the recogniser when nothing was detected
    if textbox_images:
        texts = batch_recognition(
            images=textbox_images,
            # Weird, but we need to tell the model that every
            # image is in Thai and English
            languages=[["th", "en"]] * len(textbox_images),
            model=recognizer,
            processor=recognizer_processor,
            batch_size=4
        )[0]
    else:
        texts = []

    document_info = {}
    for text, class_prediction in zip(texts, class_predictions):
        # Get class name
        predicted_class = class_names[int(class_prediction)]
        # If a class is detected more than once, the later box overwrites
        # the earlier one.
        document_info[predicted_class] = post_process_text(text)

    document_data.append(document_info)

# Fill in missing keys.
# `result.names` maps class id -> class name in Ultralytics, so iterating it
# directly would yield the integer ids; the names are what the documents use.
all_class_names = class_names.values() if isinstance(class_names, dict) else class_names
for document_info in document_data:
    for class_name in all_class_names:
        document_info.setdefault(class_name, "")

In [86]:
import pandas as pd

# Map each YOLO class name to the plain column name that evaluate() selects
convert_column_name_dict = {
    'จำนวน (cylinders)': 'cylinders',
    'น้ำหนักรถ (unladen_weight)': 'unladen_weight',
    'เลขถังแก๊ส (fuel_tank_number)': 'fuel_tank_number',
    'แรงม้า (horse_power)': 'horse_power',
    'สี (color)': 'color',
    'แบบ (model)': 'model',
    'น้ำหนักบรรทุก/น้ำหนักเพลา (load_capacity)': 'load_capacity',
    'เลขตัวรถ (chassis_number)': 'chassis_number',
    'ยี่ห้อเครื่องยนต์ (engine_manufacturer)': 'engine_manufacturer',
    'ยี่ห้อรถ (manufacturer)': 'manufacturer',
    'ซีซี (cubic_capacity)': 'cubic_capacity',
    'อยู่ที่ (chassis_location)': 'chassis_location',
    'จังหวัด (car_province)': 'car_province',
    'ลักษณะ (body_style)': 'body_style',
    'รย (type)': 'type',
    'วันจดทะเบียน (date_of_registration)': 'date_of_registration',
    'ประเภท (vehicle_use)': 'vehicle_use',
    'น้ำหนักรวม (gross_weight)': 'gross_weight',
    'เลขเครื่องยนต์ (engine_number)': 'engine_number',
    'รุ่นปี คศ (year)': 'year',
    'ที่นั่ง (seats)': 'seats',
    'เลขทะเบียน (registration_no)': 'registration_no',
    'อยู่ที่ (engine_location)': 'engine_location',
    'จำนวนเพลาและล้อ (axles_wheels_no)': 'axles_wheels_no',
    'เชื้อเพลิง (fuel_type)': 'fuel_type'
}

# Build the dataframe (one row per document) and rename its columns in one step
prediction_df = pd.DataFrame(document_data).rename(columns=convert_column_name_dict)

In [None]:
# Load the annotations as strings, blank out missing cells and drop the
# identifier columns that are not compared.
annotated_df = (
    pd.read_excel('../datasets/Data/annotation_sawad_100.xlsx', dtype=str)
    .fillna("")
    .drop(columns=["document_id", "document_name"])
)

# Score the predictions field by field; the last line shows the table
eval_df = evaluate(annotated_df, prediction_df)
eval_df

## 2. EasyOCR

In [81]:
from easyocr import Reader

# Create the EasyOCR reader with the language codes "th" and "en";
# it is reused for every textbox in the recognition loop.
reader = Reader(["th", "en"])

In [None]:
from PIL import Image

document_data = []
class_names = {}  # replaced by `result.names` as soon as a document is processed
# Detect bboxes for each document
for image_path in tqdm(image_paths):
    result = model.predict(image_path, imgsz=640, conf=0.25, half=True, device="cuda")[0]

    # Get image.
    # NOTE(review): the crops keep the channel order of `result.orig_img`;
    # EasyOCR appears to treat 3-channel arrays as BGR, which would match —
    # confirm before adding any colour conversion here.
    image = Image.fromarray(result.orig_img)

    # Get bounding boxes and class labels
    class_names = result.names
    bboxes = result.boxes.data[:, :4].round().cpu().int().numpy().tolist()
    class_predictions = result.boxes.cls.cpu().int().numpy()

    # Crop one textbox image per detected box (x1, y1, x2, y2)
    textbox_images = [image.crop(tuple(bbox)) for bbox in bboxes]

    # Get all text predictions: one list of (box, text, confidence) per crop
    recognitions = [reader.recognize(np.array(textbox)) for textbox in textbox_images]

    document_info = {}
    for recognition, class_prediction in zip(recognitions, class_predictions):
        # Get class name
        predicted_class = class_names[int(class_prediction)]
        # The recogniser can come back empty for a crop; treat that as no
        # text instead of failing on the missing first element.
        text = recognition[0][1] if recognition else ""
        # If a class is detected more than once, the later box overwrites
        # the earlier one.
        document_info[predicted_class] = post_process_text(text)

    document_data.append(document_info)

# Fill in missing keys.
# `result.names` maps class id -> class name in Ultralytics, so iterating it
# directly would yield the integer ids; the names are what the documents use.
all_class_names = class_names.values() if isinstance(class_names, dict) else class_names
for document_info in document_data:
    for class_name in all_class_names:
        document_info.setdefault(class_name, "")

In [95]:
import pandas as pd

# Map each YOLO class name to the plain column name that evaluate() selects
convert_column_name_dict = {
    'จำนวน (cylinders)': 'cylinders',
    'น้ำหนักรถ (unladen_weight)': 'unladen_weight',
    'เลขถังแก๊ส (fuel_tank_number)': 'fuel_tank_number',
    'แรงม้า (horse_power)': 'horse_power',
    'สี (color)': 'color',
    'แบบ (model)': 'model',
    'น้ำหนักบรรทุก/น้ำหนักเพลา (load_capacity)': 'load_capacity',
    'เลขตัวรถ (chassis_number)': 'chassis_number',
    'ยี่ห้อเครื่องยนต์ (engine_manufacturer)': 'engine_manufacturer',
    'ยี่ห้อรถ (manufacturer)': 'manufacturer',
    'ซีซี (cubic_capacity)': 'cubic_capacity',
    'อยู่ที่ (chassis_location)': 'chassis_location',
    'จังหวัด (car_province)': 'car_province',
    'ลักษณะ (body_style)': 'body_style',
    'รย (type)': 'type',
    'วันจดทะเบียน (date_of_registration)': 'date_of_registration',
    'ประเภท (vehicle_use)': 'vehicle_use',
    'น้ำหนักรวม (gross_weight)': 'gross_weight',
    'เลขเครื่องยนต์ (engine_number)': 'engine_number',
    'รุ่นปี คศ (year)': 'year',
    'ที่นั่ง (seats)': 'seats',
    'เลขทะเบียน (registration_no)': 'registration_no',
    'อยู่ที่ (engine_location)': 'engine_location',
    'จำนวนเพลาและล้อ (axles_wheels_no)': 'axles_wheels_no',
    'เชื้อเพลิง (fuel_type)': 'fuel_type'
}

# Build the dataframe (one row per document) and rename its columns in one step
prediction_df = pd.DataFrame(document_data).rename(columns=convert_column_name_dict)

In [None]:
# Load the annotations as strings, blank out missing cells and drop the
# identifier columns that are not compared.
annotated_df = (
    pd.read_excel('../datasets/Data/annotation_sawad_100.xlsx', dtype=str)
    .fillna("")
    .drop(columns=["document_id", "document_name"])
)

# Score the predictions field by field; the last line shows the table
eval_df = evaluate(annotated_df, prediction_df)
eval_df