In [2]:
import os
import json
import shutil
from sklearn.model_selection import train_test_split
from pycocotools.coco import COCO
import pandas as pd

통합 데이터를 생성합니다.

In [3]:
# Preprocess the data for YOLO

# Two source datasets, each with its own image directory and a COCO-style train.json.
BASE1 = "./ai06-level1-project/"
BASE2 = "./additional_training_data/"
IMG_DIR1 = os.path.join(BASE1, "train_output")
IMG_DIR2 = os.path.join(BASE2, "train_cleaned")
ANN_FILE1 = os.path.join(BASE1, "train.json")
ANN_FILE2 = os.path.join(BASE2, "train.json")
TEST_IMG_DIR = os.path.join(BASE1, "test_images")

OUT_DIR = "yolo_dataset"

# Create the images/labels x train/val directory tree (no error if it already exists).
for _subdir in ("images/train", "images/val", "labels/train", "labels/val"):
    os.makedirs(os.path.join(OUT_DIR, _subdir), exist_ok=True)

with open(ANN_FILE1, "r", encoding="utf-8") as f:
    dataset1 = json.load(f)

with open(ANN_FILE2, "r", encoding="utf-8") as f:
    dataset2 = json.load(f)

# Concatenate the image records of both datasets and drop rows that are
# exact duplicates (every field equal), then convert back to a list of dicts.
i = (
    pd.DataFrame(dataset1["images"] + dataset2["images"])
    .drop_duplicates()
    .to_dict(orient='records')
)

# Same exact-row de-duplication for the category records.
cat = (
    pd.DataFrame(dataset1["categories"] + dataset2["categories"])
    .drop_duplicates()
    .to_dict(orient='records')
)

# Merged COCO-style dictionary. Annotations are simply concatenated; they are
# NOT de-duplicated here.
# NOTE(review): if annotation ids overlap between the two files, an id-keyed
# index would collide - confirm the ids are disjoint.
dataset = dict(
    images=i,
    annotations=dataset1["annotations"] + dataset2["annotations"],
    categories=cat,
)

dataset['categories']

[{'supercategory': 'pill', 'id': 1899, 'name': '보령부스파정 5mg'},
 {'supercategory': 'pill', 'id': 16547, 'name': '가바토파정 100mg'},
 {'supercategory': 'pill', 'id': 19606, 'name': '스토가정 10mg'},
 {'supercategory': 'pill', 'id': 29450, 'name': '레일라정'},
 {'supercategory': 'pill', 'id': 33008, 'name': '신바로정'},
 {'supercategory': 'pill', 'id': 21770, 'name': '라비에트정 20mg'},
 {'supercategory': 'pill', 'id': 27925, 'name': '울트라셋이알서방정'},
 {'supercategory': 'pill', 'id': 24849, 'name': '놀텍정 10mg'},
 {'supercategory': 'pill', 'id': 29344, 'name': '비모보정 500/20mg'},
 {'supercategory': 'pill', 'id': 16550, 'name': '동아가바펜틴정 800mg'},
 {'supercategory': 'pill', 'id': 33207, 'name': '에스원엠프정 20mg'},
 {'supercategory': 'pill', 'id': 2482, 'name': '뮤테란캡슐 100mg'},
 {'supercategory': 'pill', 'id': 3742, 'name': '알드린정'},
 {'supercategory': 'pill', 'id': 12777, 'name': '다보타민큐정 10mg/병'},
 {'supercategory': 'pill', 'id': 13394, 'name': '써스펜8시간이알서방정 650mg'},
 {'supercategory': 'pill', 'id': 12080, 'name': '리렉스펜정 300mg/

In [4]:
# Show the number of images, annotations and categories in the merged dataset.
tuple(len(dataset[section]) for section in ('images', 'annotations', 'categories'))

(8465, 24314, 74)

이제 통합 데이터를 바탕으로 YOLO 데이터를 만듭니다.

In [None]:
# Preprocess the data for YOLO

OUT_DIR = "./yolo_dataset"

# Create the images/labels x train/val directory tree (no error if it already exists).
for _split_dir in ("images/train", "images/val", "labels/train", "labels/val"):
    os.makedirs(os.path.join(OUT_DIR, _split_dir), exist_ok=True)

# Build a COCO index straight from the in-memory merged dictionary
# instead of loading an annotation file from disk.
coco = COCO()
coco.dataset = dataset
coco.createIndex()

img_ids = list(coco.imgs)

# 80/20 train/validation split over image ids; the fixed seed keeps it reproducible.
train_ids, val_ids = train_test_split(img_ids, random_state=42, test_size=0.2)


def convert_to_yolo_bbox(box, img_w, img_h):
    """Convert an (x, y, w, h) box into a normalized center-based box.

    Args:
        box: Sequence of four numbers ``(x, y, w, h)``; ``x, y`` is treated as
            the top-left corner and ``w, h`` as the box size.
        img_w: Image width used to normalize the horizontal values.
        img_h: Image height used to normalize the vertical values.

    Returns:
        Tuple ``(cx, cy, w, h)`` where the center and size are each divided by
        the corresponding image dimension.
    """
    left, top, box_w, box_h = box
    center_x = (left + box_w / 2) / img_w
    center_y = (top + box_h / 2) / img_h
    return center_x, center_y, box_w / img_w, box_h / img_h


def process_image(img_id, split="train"):
    """Copy one image into the YOLO tree and write its label file.

    The image is looked up in ``IMG_DIR1`` first and then in ``IMG_DIR2``; the
    first existing file is copied to ``OUT_DIR/images/<split>/``. If neither
    directory contains it, a message is printed and nothing is written.

    One line per annotation is written to ``OUT_DIR/labels/<split>/<stem>.txt``
    in the form ``<class> <cx> <cy> <w> <h>`` with coordinates rounded to six
    decimals.

    Args:
        img_id: Image id as known to the module-level ``coco`` index.
        split: Sub-directory name under ``images/`` and ``labels/``
            (``"train"`` or ``"val"`` in this notebook).

    Raises:
        KeyError: If an annotation refers to a category id that is not in
            ``coco.cats``.
    """
    img_info = coco.loadImgs(img_id)[0]
    file_name = img_info["file_name"]
    width, height = img_info["width"], img_info["height"]

    dst_img_path = os.path.join(OUT_DIR, f"images/{split}/{file_name}")

    # The first directory that actually holds the file wins (IMG_DIR1 before IMG_DIR2).
    for src_dir in (IMG_DIR1, IMG_DIR2):
        src_img_path = os.path.join(src_dir, file_name)
        if os.path.exists(src_img_path):
            shutil.copy(src_img_path, dst_img_path)
            break
    else:
        # Include the file name so the missing image can be identified.
        print("이미지 없음", file_name)
        return

    # Derive the label name from the file stem so that any image extension
    # (not only ".png") yields "<stem>.txt".
    label_name = os.path.splitext(file_name)[0] + ".txt"
    label_path = os.path.join(OUT_DIR, f"labels/{split}/{label_name}")

    # Class indices follow the category ids in ascending order, which is the
    # same order used for the `names` list written to data.yaml. Using the
    # insertion order of coco.cats here would mislabel classes whenever the
    # categories are not already sorted by id.
    class_index = {cat_id: idx for idx, cat_id in enumerate(sorted(coco.cats))}

    anns = coco.loadAnns(coco.getAnnIds(imgIds=img_id))

    with open(label_path, "w", encoding="utf-8") as f:
        for ann in anns:
            yolo_class = class_index[ann["category_id"]]
            yolo_box = convert_to_yolo_bbox(ann["bbox"], width, height)
            coords = " ".join(str(round(v, 6)) for v in yolo_box)
            f.write(f"{yolo_class} {coords}\n")


# Export every image of each split (image copy + label file).
for split_name, split_ids in (("train", train_ids), ("val", val_ids)):
    for img_id in split_ids:
        process_image(img_id, split=split_name)

print("YOLO dataset 생성 완료")


# Write the dataset description file next to the images/labels directories.
yaml_path = os.path.join(OUT_DIR, "data.yaml")
num_classes = len(coco.cats)
# Class names listed in ascending category-id order.
names = [coco.cats[cat_id]["name"] for cat_id in sorted(coco.cats)]

yaml_lines = [
    f"path: {OUT_DIR}\n",
    "train: images/train\n",
    "val: images/val\n\n",
    f"nc: {num_classes}\n",
    f"names: {names}\n",
]

with open(yaml_path, "w", encoding="utf-8") as f:
    f.writelines(yaml_lines)

print("data.yaml 파일 생성 완료")

creating index...
index created!
YOLO dataset 생성 완료
data.yaml 파일 생성 완료
