In [8]:
import cv2
import json
import os
import easyocr

In [30]:
def generate_datumaro_json_with_easyocr(
    image_paths,
    output_json_path,
    language="jp",
    min_confidence=0.5,
    gpu=True,
):
    """Run EasyOCR over images and save the detections as a Datumaro-style JSON file.

    Every detection whose confidence is at least ``min_confidence`` becomes a
    ``polygon`` annotation with label ``"text"``; the recognised string and its
    confidence are stored in the annotation's ``attributes``.

    Args:
        image_paths: Iterable of image file paths to process.
        output_json_path: Path of the JSON file to write (UTF-8).
        language: One EasyOCR language code or a list of codes
            (e.g. ``["vi", "en"]``). A bare string is wrapped in a list, and
            the legacy default ``"jp"`` is translated to ``"ja"``.
        min_confidence: Detections with a confidence below this value are
            dropped. Defaults to 0.5.
        gpu: Forwarded to ``easyocr.Reader``; set to ``False`` on CPU-only
            machines.

    Returns:
        None. The dataset is written to ``output_json_path`` and a
        confirmation line is printed.
    """
    dataset = {
        "info": {},
        "categories": {
            "label": {
                "labels": [
                    {
                        "name": "text",
                        "parent": "",
                        "attributes": []
                    }
                ],
                "attributes": []
            }
        },
        "items": []
    }

    label_name_to_id = {"text": 0}

    # Language selection. easyocr.Reader takes a *list* of codes, so a bare
    # string such as "vi" is wrapped instead of being passed through as-is.
    if isinstance(language, str):
        language = [language]
    # "jp" (this function's historical default) is not an EasyOCR code;
    # Japanese is registered as "ja".
    language_aliases = {"jp": "ja"}
    languages = [language_aliases.get(code, code) for code in language]

    reader = easyocr.Reader(languages, gpu=gpu)

    for image_path in image_paths:
        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread signals a missing/corrupt file by returning None.
            # Skip it so one bad file does not abort the whole batch.
            print(f"Skipping unreadable image: {image_path}")
            continue
        # Only the first two dimensions are needed; slicing also works for
        # arrays without a channel axis.
        h, w = image.shape[:2]

        image_name = os.path.basename(image_path)
        item_id = os.path.splitext(image_name)[0]

        # Extract text detections with EasyOCR.
        results = reader.readtext(image)

        annotations = []
        for bbox, text, confidence in results:
            if confidence < min_confidence:
                continue  # drop low-confidence detections

            # Flatten the corner points into [x1, y1, x2, y2, ...] floats.
            flattened_points = [float(coord) for point in bbox for coord in point]

            annotations.append({
                # Ids are consecutive among the *kept* annotations of this image.
                "id": len(annotations),
                "type": "polygon",
                "attributes": {
                    "text": text,
                    "confidence": float(confidence),
                    "occluded": False  # adjust if needed
                },
                "group": 0,
                "label_id": label_name_to_id["text"],
                "points": flattened_points,
                "z_order": 0
            })

        # Register the image together with its annotations.
        dataset["items"].append({
            "id": item_id,
            "annotations": annotations,
            "image": {
                "size": [int(h), int(w)],
                "path": image_path  # adjust the path if needed
            }
        })

    # Save as JSON; ensure_ascii=False keeps recognised text human-readable.
    with open(output_json_path, 'w', encoding='utf-8') as f:
        json.dump(dataset, f, ensure_ascii=False, indent=4)

    print(f"Datumaro 포맷의 JSON 파일이 생성되었습니다: {output_json_path}")

In [None]:
# Folder that holds the images to run OCR on.
image_folder = '/data/ephemeral/home/cleansing'
# os.listdir returns entries in arbitrary order; sort them so the item order
# in the generated JSON is reproducible across runs and machines.
image_paths = [
    os.path.join(image_folder, f)
    for f in sorted(os.listdir(image_folder))
    if f.lower().endswith(('.png', '.jpg', '.jpeg'))
]
# NOTE(review): the CVAT-formatting cell further down loads
# '/data/ephemeral/home/cord_zh.json', not this file — confirm which is intended.
output_json_path = 'ocr_vi.json'

In [None]:
# Run OCR with the Vietnamese and English models and write the Datumaro JSON.
generate_datumaro_json_with_easyocr(
    image_paths=image_paths,
    output_json_path=output_json_path,
    language=["vi", "en"],
)

Downloading recognition model, please wait. This may take several minutes depending upon your network connection.


Progress: |██████████████████████████████████████████████████| 100.0% CompleteDatumaro 포맷의 JSON 파일이 생성되었습니다: cord_zh.json


# CVAT 포맷 맞추기

In [46]:
import json

task_name = "cord"
# Prefix that a later cell prepends to every item id.
plus_name = "images/" + "train/"

json_path = "/data/ephemeral/home/cord_zh.json"

# The generator function in this notebook writes its JSON as UTF-8 with
# ensure_ascii=False, so read it back explicitly as UTF-8 rather than relying
# on the platform's default encoding.
with open(json_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

In [47]:
# Replace the name of the first label with 1 and echo the stored value.
# NOTE(review): the value is the integer 1, not the string "1" — confirm this
# is what the target CVAT task expects.
first_label = data["categories"]["label"]["labels"][0]
first_label["name"] = 1
first_label["name"]

1

In [48]:
# Prefix every item id with plus_name (e.g. "images/train/image_5736").
# The startswith guard keeps this cell idempotent: re-running it must not
# stack the prefix a second time.
for d in data['items']:
    file = d['id']
    if not file.startswith(plus_name):
        d['id'] = plus_name + file
    print(d['id'])

images/train/image_5736
images/train/image_3242
images/train/image_683
images/train/image_719
images/train/image_1777
images/train/image_2833
images/train/image_294
images/train/image_2411
images/train/image_95
images/train/image_2132
images/train/image_712
images/train/image_4210
images/train/image_497
images/train/image_2180
images/train/image_1219
images/train/image_761
images/train/image_4
images/train/image_3514
images/train/image_564
images/train/image_993
images/train/image_2026
images/train/image_4356
images/train/image_421
images/train/image_4261
images/train/image_3141
images/train/image_4331
images/train/image_1227
images/train/image_4896
images/train/image_5720
images/train/image_5631
images/train/image_1384
images/train/image_1758
images/train/image_1637
images/train/image_3690
images/train/image_159
images/train/image_3081
images/train/image_4876
images/train/image_1381
images/train/image_3204
images/train/image_2094
images/train/image_5233
images/train/image_1820
images/

In [49]:
# ensure_ascii=False emits raw non-ASCII characters, so pin the file encoding
# to UTF-8 instead of depending on the platform default.
with open("syzh.json", 'w', encoding='utf-8') as f:
    json.dump(data, f, ensure_ascii=False, indent=4)