# Preprocessing for pcb-oriented-detection

From: https://www.kaggle.com/datasets/yuyi1005/pcb-oriented-detection

The dataset contains oriented bounding boxes, so we need to convert them to regular (axis-aligned) bounding boxes.

The file performs the following pre-processing steps:
 - Finds the tightest regular bounding boxes that fit the oriented bounding boxes and saves them.
 - The images are (also) dark here, so we perform some basic color correction by clipping the values in each color channel at that channel's 97.5th percentile and rescaling so that this value becomes full brightness.

In [1]:
import numpy as np
import pandas as pd

from pathlib import Path
import xml.etree.ElementTree as ET
from shutil import copyfile
import os
import os.path as path
import shutil
import pathlib
from pathlib import Path
from tqdm.std import tqdm
import random
import json
import random
import glob
import cv2
import regex as re
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
from matplotlib.axes import Axes
import seaborn as sns
from PIL import Image, ImageFont, ImageDraw, ImageEnhance
import enum
from enum import Enum, auto
from matplotlib.offsetbox import AnnotationBbox, AuxTransformBox

import sys

sys.path.insert(0, "..")
from yolo_visualize import (
    sample_yolo_dataset,
    get_class_colors,
    show_image_with_bounding_box,
    label_str_to_num,
)

In [2]:
# Component class labels. A label's position in this tuple is its integer
# class id, so the order must not change.
CLASSES = (
    "C", "J", "RS", "CE", "IC-SOT23", "IC-SOP", "IC-TO252", "IC-SOT223",
    "D", "JW", "X", "R", "IC-BGA", "IC-QFN", "SW", "SW-S",
    "IC-SOT235", "IC-SOT89", "IC-QFP", "IC-SOT234", "LED", "IC-SON", "CA", "LR",
    "IC-SOT236", "JN-FFC", "RN-N", "JN-XHH", "CN", "RN", "JN-DF", "JN-DM",
    "JN-XHV", "IC", "P", "DC", "LA", "LB", "X-HC49", "JN",
    "F",
)

# Reverse lookup: class label -> integer class id.
CLASS_NAME_TO_INT = dict(zip(CLASSES, range(len(CLASSES))))

In [3]:
CLASS_NAME_TO_INT

{'C': 0,
 'J': 1,
 'RS': 2,
 'CE': 3,
 'IC-SOT23': 4,
 'IC-SOP': 5,
 'IC-TO252': 6,
 'IC-SOT223': 7,
 'D': 8,
 'JW': 9,
 'X': 10,
 'R': 11,
 'IC-BGA': 12,
 'IC-QFN': 13,
 'SW': 14,
 'SW-S': 15,
 'IC-SOT235': 16,
 'IC-SOT89': 17,
 'IC-QFP': 18,
 'IC-SOT234': 19,
 'LED': 20,
 'IC-SON': 21,
 'CA': 22,
 'LR': 23,
 'IC-SOT236': 24,
 'JN-FFC': 25,
 'RN-N': 26,
 'JN-XHH': 27,
 'CN': 28,
 'RN': 29,
 'JN-DF': 30,
 'JN-DM': 31,
 'JN-XHV': 32,
 'IC': 33,
 'P': 34,
 'DC': 35,
 'LA': 36,
 'LB': 37,
 'X-HC49': 38,
 'JN': 39,
 'F': 40}

In [4]:
# Root of the source dataset with oriented boxes (inputs are read from here).
LOAD_DATA_DIR = Path("./pcb-oriented-detection/")
# Root of the converted dataset with regular boxes (outputs are written here).
SAVE_DATA_DIR = Path("./pcb-un-oriented-detection/")

In [5]:
def sane_mkdir(v):
    """Create directory ``v`` together with any missing parent directories.

    Calling it for a directory that already exists is not an error.
    """
    target = Path(v)
    target.mkdir(exist_ok=True, parents=True)

In [6]:
# Make sure the temporary "labels" and "images" folders exist inside the
# source dataset directory.
for _subdir in ("labels", "images"):
    sane_mkdir(LOAD_DATA_DIR / _subdir)

In [7]:
# Set to True to wipe any previous conversion output before regenerating it.
clear_old_data = False
if clear_old_data:
    if path.exists(SAVE_DATA_DIR):
        shutil.rmtree(SAVE_DATA_DIR)
# The output root must exist either way.
sane_mkdir(SAVE_DATA_DIR)

In [8]:
# Dataset splits; each one gets its own images/ and labels/ folder.
DATASET_GROUPS = ["train", "test", "valid"]
for g in DATASET_GROUPS:
    group_root = SAVE_DATA_DIR / g
    sane_mkdir(group_root)
    for leaf in ("images", "labels"):
        sane_mkdir(group_root / leaf)

In [9]:
# Dataset metadata file: image folders of each split, the number of classes
# and the quoted class names in class-id order.
yaml_body = f"""train: ../train/images
val: ../valid/images
test: ../test/images

nc: {len(CLASSES)}
names: [{",".join(f"'{s}'" for s in CLASSES)}]
"""
with open(SAVE_DATA_DIR / "data.yaml", "w") as f:
    f.write(yaml_body)

In [11]:
def _white_balance(img, percentile_value=97.5):
    """Brighten an image by rescaling each colour channel to its own percentile.

    Every channel is divided by its ``percentile_value``-th percentile, clipped
    to [0, 1] and mapped back to 8 bit, so values at or above the percentile
    saturate at 255.

    Args:
        img: Image array with the channels on the last axis.
        percentile_value: Percentile (0-100) that is mapped to full brightness.

    Returns:
        A ``uint8`` array with the same shape as ``img``.
    """
    # axis=(0, 1) reduces over rows and columns, leaving one value per channel.
    channel_scale = np.percentile(img, percentile_value, axis=(0, 1))
    # A (nearly) all-black channel has percentile 0; dividing by it would give
    # NaN/inf and an undefined uint8 cast. Flooring at 1 keeps zeros at zero
    # and still saturates every non-zero value.
    channel_scale = np.maximum(channel_scale, 1.0)
    return ((img * 1.0 / channel_scale).clip(0, 1) * 255).astype(np.uint8)


def _to_yolo_box(coordinates, w, h):
    """Convert polygon corners to a normalised axis-aligned box.

    Args:
        coordinates: Flat ``[x1, y1, x2, y2, ...]`` corner list in pixels.
        w: Image width in pixels.
        h: Image height in pixels.

    Returns:
        ``(centerX, centerY, boundingWidth, boundingHeight)``, each divided by
        the image width/height, for the tightest axis-aligned box around the
        corners after clipping it to the image; ``None`` when no part of the
        box with positive area lies inside the image.
    """
    coordsX = coordinates[::2]  # every other element from the first: x values
    coordsY = coordinates[1::2]  # every other element from the second: y values

    # Rotated boxes may poke outside the image; clip so that the normalised
    # values always stay within [0, 1].
    minX = max(min(coordsX), 0.0)
    maxX = min(max(coordsX), float(w))
    minY = max(min(coordsY), 0.0)
    maxY = min(max(coordsY), float(h))
    if maxX <= minX or maxY <= minY:
        return None

    return (
        (maxX + minX) / 2 / w,
        (maxY + minY) / 2 / h,
        (maxX - minX) / w,
        (maxY - minY) / h,
    )


# os.listdir returns entries in arbitrary order, so sort first; otherwise the
# seeded shuffle below would not give the same split on every machine/run.
images_files = sorted(os.listdir(LOAD_DATA_DIR / "images_old"))
rng = random.Random(x=42)
rng.shuffle(images_files)

# 70% train / 15% test / remaining ~15% validation.
train_end_idx = int(len(images_files) * 0.7)
test_end_idx = int(len(images_files) * (0.7 + 0.15))
train_files = images_files[0:train_end_idx]
test_files = images_files[train_end_idx:test_end_idx]
val_files = images_files[test_end_idx:]
# Same order as DATASET_GROUPS ("train", "test", "valid").
vs = [train_files, test_files, val_files]

percentile_value = 97.5

for group, group_files in zip(DATASET_GROUPS, vs):
    print(group)
    for image_file_rel in tqdm(group_files):
        image_file = LOAD_DATA_DIR / "images_old" / image_file_rel
        text_file = (LOAD_DATA_DIR / "annfiles" / image_file_rel).with_suffix(".txt")

        # cv2.imread wants a plain string path and signals failure by
        # returning None instead of raising.
        img = cv2.imread(str(image_file))
        if img is None:
            raise ValueError(f"Could not read image: {image_file}")

        # One annotation per line: corner coordinates, then the class name,
        # then one trailing field that is not used here. Blank lines are
        # skipped.
        with open(text_file, "r") as f:
            annotations = [
                line.split() for line in f.read().splitlines() if line.strip()
            ]

        h, w, _ = img.shape

        whitebalanced = _white_balance(img, percentile_value)

        # OpenCV loads images as BGR while Pillow expects RGB; convert before
        # saving so red and blue are not swapped in the output file.
        j = Image.fromarray(cv2.cvtColor(whitebalanced, cv2.COLOR_BGR2RGB))
        j.save(SAVE_DATA_DIR / group / "images" / image_file_rel)

        with open(SAVE_DATA_DIR / group / "labels" / text_file.name, "w") as f:
            for annotation in annotations:
                class_id = CLASS_NAME_TO_INT[annotation[-2]]
                box = _to_yolo_box([float(i) for i in annotation[:-2]], w, h)
                if box is None:
                    # Box lies completely outside the image: nothing to label.
                    continue
                centerX, centerY, boundingWidth, boundingHeight = box
                f.write(
                    f"{class_id} {centerX} {centerY} {boundingWidth} {boundingHeight}\n"
                )

train


100%|██████████| 133/133 [01:25<00:00,  1.55it/s]


test


100%|██████████| 28/28 [00:18<00:00,  1.55it/s]


valid


100%|██████████| 29/29 [00:17<00:00,  1.64it/s]
