## 기존 ufo json파일 -> coco

In [17]:
from typing import Dict
import json
import datetime
import os

# Local time at second resolution; reused as the creation/capture date in the
# generated COCO metadata.
now = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

# UFO annotation file to convert.
# Alternative input kept for reference: '../../data/medical/ufo/train.json'
input_path = '../data/medical/ufo/divided_train.json'
# Destination of the converted COCO file.
output_path = '../data/medical/ufo/train_coco.json'

In [18]:
# Dataset-level metadata stored under the COCO "info" key.
info = {
    'year': 2024,
    'version': '1.0',
    'description': 'OCR Competition Data',
    'contributor': 'Naver Boostcamp',
    'url': 'https://aistages-api-public-prod.s3.amazonaws.com/app/Competitions/000273/data/data.tar.gz',
    'date_created': now,
}
# COCO stores licenses as a list of records. The id is an integer so that it
# matches the `"license": 1` reference written on every image.
licenses = [{
    'id': 1,
    'name': 'For Naver Boostcamp Competition',
    'url': None,
}]
# Single category: every annotation is a word region.
categories = [{
    'id': 1,
    'name': 'word',
}]

In [19]:
def ufo_to_coco(file: Dict, output_path: str) -> None:
    """Convert UFO-format image annotations into a COCO JSON file.

    Args:
        file: Mapping of image file name to its UFO record. Each record is
            read for ``img_w``, ``img_h`` and ``words``; each word is read
            for ``points`` (a sequence of ``[x, y]`` pairs), ``illegibility``
            and ``tags``.
        output_path: Path of the COCO JSON file to write.

    Words flagged as illegible and words without any points are left out.
    Image and annotation ids are assigned sequentially starting at 1. The
    module-level ``info``, ``licenses``, ``categories`` and ``now`` values
    are embedded in the output.
    """
    images = []
    annotations = []
    annotation_id = 1  # COCO ids start at 1
    for img_id, (fname, data) in enumerate(file.items(), start=1):
        images.append({
            "id": img_id,
            "width": data['img_w'],
            "height": data['img_h'],
            "file_name": fname,
            "license": 1,
            "flickr_url": None,
            "coco_url": None,
            "date_captured": now,
        })
        for annotation in data['words'].values():
            if annotation.get('illegibility'):
                continue
            points = annotation['points']
            if not points:
                # A word without geometry has no bounding box to export.
                continue
            xs = [point[0] for point in points]
            ys = [point[1] for point in points]
            min_x, min_y = min(xs), min(ys)
            width = max(xs) - min_x
            height = max(ys) - min_y

            annotations.append({
                "id": annotation_id,
                "image_id": img_id,
                "category_id": 1,
                # COCO polygons are flat [x1, y1, x2, y2, ...] lists.
                "segmentation": [[value for point in points for value in point]],
                # Area of the axis-aligned bounding box, not of the polygon.
                "area": width * height,
                "bbox": [min_x, min_y, width, height],
                "iscrowd": 0,
                'tags': annotation.get('tags', []),
            })
            annotation_id += 1
    coco = {
        'info': info,
        'images': images,
        'annotations': annotations,
        'licenses': licenses,
        'categories': categories,
    }
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(coco, f, indent=4)

In [20]:
# Read the UFO annotation file, then convert its per-image records to COCO.
with open(input_path, 'r') as f:
    file = json.loads(f.read())
ufo_to_coco(file['images'], output_path)

## 기존 파일 coco -> ufo

## 옮겨둔 이미지와 일치하는 json파일 가져오기

In [24]:
import os

# Folder holding the image files.
folder_path = '/data/ephemeral/home/level2-cv-datacentric-cv-10/data/medical/OCR/img'
# Names of every entry in that folder, extensions included.
files_ext = os.listdir(folder_path)
# Keep only the part of each name before the extension.
img_files = [stem for stem, _ in map(os.path.splitext, files_ext)]
len(img_files)

54

In [25]:
import os
import shutil

# Folder tree that holds the source label JSON files.
json_source_folder = "../data/medical/OCR/01.라벨링데이터(Json)"

# Folder that receives the matching JSON files.
destination_json_folder = "../data/medical/OCR/json"

# shutil.copy does not create missing parent folders, so make sure the
# destination exists before the first copy.
os.makedirs(destination_json_folder, exist_ok=True)

# A set keeps the per-file membership test O(1) instead of scanning a list.
img_file_names = set(img_files)

# Walk the source tree and copy only the JSON files whose name (without the
# extension) matches one of the image files.
for root, dirs, files in os.walk(json_source_folder):
    for file in files:
        if not file.endswith(".json"):
            continue
        destination_json_name = os.path.splitext(file)[0]
        if destination_json_name not in img_file_names:
            continue
        source_file_path = os.path.join(root, file)
        destination_json_path = os.path.join(destination_json_folder, file)
        shutil.copy(source_file_path, destination_json_path)

In [26]:
# Count the entries in the folder of label JSON files.
json_folder_path = '/data/ephemeral/home/level2-cv-datacentric-cv-10/data/medical/OCR/json'  # folder path
json_files_ext = os.listdir(json_folder_path)  # names of the entries in the folder
len(json_files_ext)

56

## custom -> coco json파일 합치기

In [5]:
# Dataset-level metadata stored under the COCO "info" key.
info = {
    'year': 2024,
    'version': '1.0',
    'description': 'OCR Competition Data',
    'contributor': 'Naver Boostcamp',
    'url': 'https://aistages-api-public-prod.s3.amazonaws.com/app/Competitions/000273/data/data.tar.gz',
}
# COCO stores licenses as a list of records. The id is an integer so that it
# matches the `"license": 1` reference written on every image.
licenses = [{
    'id': 1,
    'name': 'For Naver Boostcamp Competition',
    'url': None,
}]
# Single category: every annotation is a word region.
categories = [{
    'id': 1,
    'name': 'word',
}]

In [6]:
import json
import os

# Folder with the per-image label JSON files, and the merged COCO output path.
json_folder = '/data/ephemeral/home/level2-cv-datacentric-cv-10/data/medical/OCR/json/'
output_path = '/data/ephemeral/home/level2-cv-datacentric-cv-10/data/medical/ufo/new_json.json'

# Dataset-level COCO metadata.
info = {
    'year': 2024,
    'version': '1.0',
    'description': 'OCR Competition Data',
    'contributor': 'Naver Boostcamp',
    'url': 'https://aistages-api-public-prod.s3.amazonaws.com/app/Competitions/000273/data/data.tar.gz',
}
# COCO stores licenses as a list of records. The id is an integer so that it
# matches the `"license": 1` reference written on every image.
licenses = [{
    'id': 1,
    'name': 'For Naver Boostcamp Competition',
    'url': None,
}]
categories = [{'id': 1, 'name': 'word'}]

# Running COCO ids (they start at 1) and the collected records.
img_id = 1
annotation_id = 1
images = []
annotations = []

# Sorted so that image ids are assigned in the same order on every run;
# os.listdir returns entries in arbitrary order.
for file_name in sorted(os.listdir(json_folder)):
    if not file_name.endswith('.json'):
        continue
    input_path = os.path.join(json_folder, file_name)

    with open(input_path, 'r', encoding='utf-8') as f:
        file = json.load(f)

    # Per-image label files carry "images" as a non-empty list. Anything else
    # in the folder (for example a UFO-format file, whose "images" is a dict
    # keyed by file name) used to abort the whole run with "KeyError: 0".
    image_records = file.get('images') if isinstance(file, dict) else None
    if not isinstance(image_records, list) or not image_records:
        print(f'Skipping {file_name}: not a per-image label file')
        continue
    image_info = image_records[0]

    image = {
        'id': img_id,
        'width': image_info['image.width'],
        'height': image_info['image.height'],
        'file_name': image_info["image.file.name"],
        "license": 1,
        "flickr_url": None,
        "coco_url": None,
        # COCO's key is "date_captured" (this was misspelled "data_captured").
        'date_captured': image_info["image.create.time"],
    }
    images.append(image)

    for ann_info in file.get('annotations', []):
        # The source bbox is already [x, y, width, height].
        min_x, min_y, width, height = ann_info["annotation.bbox"][:4]

        # Rectangle polygon built from the four corners of the bbox.
        segmentation = [[
            min_x, min_y,
            min_x + width, min_y,
            min_x + width, min_y + height,
            min_x, min_y + height,
        ]]

        coco_annotation = {
            "id": annotation_id,
            "image_id": img_id,
            "category_id": 1,
            "segmentation": segmentation,
            "area": width * height,
            "bbox": [min_x, min_y, width, height],
            "iscrowd": 0,
            'tags': ['Auto'],
        }
        annotations.append(coco_annotation)
        annotation_id += 1

    img_id += 1

# Assemble everything into a single COCO document.
coco = {
    'info': info,
    'images': images,
    'annotations': annotations,
    'licenses': licenses,
    'categories': categories,
}

# Save as JSON.
with open(output_path, 'w', encoding='utf-8') as f:
    json.dump(coco, f, indent=4)

# NOTE: files in json_folder that are not per-image label files (for example a
# UFO-format _new_json.json written into this folder by a later step) are
# skipped by the loop instead of raising "KeyError: 0".

## coco -> ufo

In [11]:
from typing import Dict
import json
import datetime


# Local time at second resolution; written as the annotation timestamp of the
# converted UFO records.
now = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

# Merged COCO file to read.
input_path = '/data/ephemeral/home/level2-cv-datacentric-cv-10/data/medical/ufo/new_json.json'
# Destination of the UFO-format result.
output_path = '/data/ephemeral/home/level2-cv-datacentric-cv-10/data/medical/OCR/json/_new_json.json'

In [12]:
# Accumulator for the converted annotations: image file name -> UFO record.
ufo = dict(images={})

In [13]:
def coco_bbox_to_ufo(bbox):
    """Expand a ``[x, y, width, height]`` box into its four corner points.

    The corners are returned as ``[x, y]`` pairs in the order
    ``(x, y)``, ``(x + width, y)``, ``(x + width, y + height)``,
    ``(x, y + height)``.
    """
    left, top, box_w, box_h = bbox
    right = left + box_w
    bottom = top + box_h
    return [[left, top], [right, top], [right, bottom], [left, bottom]]

def coco_to_ufo(file: Dict, output_path: str) -> None:
    """Convert a COCO annotation dict to UFO format and write it as JSON.

    Args:
        file: COCO dict. ``images`` entries are read for ``id``,
            ``file_name``, ``width`` and ``height``; ``annotations`` entries
            are read for ``image_id`` and ``bbox``.
        output_path: Path of the UFO JSON file to write.

    Raises:
        KeyError: If an annotation refers to an image id that is not present
            in ``file['images']``.

    Converted records are stored in the module-level ``ufo`` dict, which is
    then dumped in full, so entries from earlier calls are kept. Words are
    keyed per image with zero-padded counters starting at ``"0001"``.
    """
    # Resolve images by their id instead of by list position, so the result
    # does not depend on how the image list is ordered. Entries without an
    # id fall back to their 1-based position.
    images_by_id = {
        int(image.get('id', position)): image
        for position, image in enumerate(file['images'], start=1)
    }
    # Per-image word counters for this call, so annotations of different
    # images may be interleaved without overwriting each other's keys.
    next_word_id = {}

    for annotation in file['annotations']:
        file_info = images_by_id[int(annotation['image_id'])]
        image_name = file_info['file_name']
        if image_name not in ufo['images']:
            ufo['images'][image_name] = {
                "paragraphs": {},
                "words": {},
                "chars": {},
                "img_w": file_info["width"],
                "img_h": file_info["height"],
                "tags": ["autoannotated"],
                "relations": {},
                "annotation_log": {
                    "worker": "",
                    "timestamp": now,
                    "tool_version": "LabelMe or CVAT",
                    "source": None,
                },
                "license_tag": {
                    "usability": True,
                    "public": False,
                    "commercial": True,
                    "type": None,
                    "holder": "Upstage",
                },
            }

        anno_id = next_word_id.get(image_name, 1)
        next_word_id[image_name] = anno_id + 1
        ufo['images'][image_name]['words'][str(anno_id).zfill(4)] = {
            "transcription": "",
            "points": coco_bbox_to_ufo(annotation["bbox"]),
            "orientation": "Horizontal",
            "language": None,
            # A list, like the image-level tags and the COCO-side ['Auto'].
            "tags": ["Auto"],
            "confidence": None,
            "illegibility": False,
        }

    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(ufo, f, indent=4)

In [14]:
with open(input_path, 'r') as f:
    file = json.load(f)
coco_to_ufo(file, output_path)

## 기존 json 파일과 합치기

In [33]:
# Existing UFO annotation file
original_json_path = '/data/ephemeral/home/level2-cv-datacentric-cv-10/data/medical/ufo/divided_train.json'
# UFO file generated from the custom label JSON files
made_json_path = '/data/ephemeral/home/level2-cv-datacentric-cv-10/data/medical/OCR/json/_new_json.json'

In [34]:
import json
# file_1: the existing annotations; file_2: the newly generated ones.
with open(original_json_path, 'r') as f:
    file_1 = json.loads(f.read())
with open(made_json_path, 'r') as f:
    file_2 = json.loads(f.read())

In [36]:
output_path = '/data/ephemeral/home/level2-cv-datacentric-cv-10/data/medical/ufo/_newest_.json'

# An image name present in both inputs is silently replaced by the file_2
# entry during the merge, so report any overlap instead of losing data quietly.
duplicate_names = sorted(file_1['images'].keys() & file_2['images'].keys())
if duplicate_names:
    print(
        f'Warning: {len(duplicate_names)} image(s) appear in both files; '
        f'entries from the generated file take precedence: {duplicate_names}'
    )

# Later mappings win on key collisions: file_2 overrides file_1.
combined_images = {**file_1['images'], **file_2['images']}

combined_json = {
    'images': combined_images,
}
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(combined_json, f, indent=4)

In [37]:
# Sanity check: number of images in the existing annotation file.
len(file_1['images'])

80

In [38]:
# Sanity check: number of images in the generated annotation file.
len(file_2['images'])

54

In [39]:
# Sanity check: number of images after merging; it equals the sum of the two
# counts above only when no image name occurs in both files.
len(combined_json['images'])

134

## 외부 json 파일 합치기