In [16]:
import json
import os
import random
from collections import defaultdict, Counter

def split_coco_dataset_balanced(json_path, output_dir, split_ratio=0.95, seed=None):
    """Split a COCO-format annotation file into class-balanced train/val files.

    Images are shuffled and then assigned greedily: an image goes to the train
    split while every category it contains is still below ``split_ratio`` of
    that category's total image count; otherwise it goes to the validation
    split. Images without any annotation are distributed with the same quota
    rule applied to the number of un-annotated images, so they no longer all
    end up in the train split.

    Two files are written into ``output_dir``: ``train_<P>.json`` and
    ``val_<Q>.json``, where ``P`` and ``Q`` are ``split_ratio`` and
    ``1 - split_ratio`` expressed as whole percentages. Per-class image counts
    and the split sizes are printed.

    Args:
        json_path: Path of the source JSON file. It must contain ``images``
            and ``annotations`` lists; ``info``, ``licenses`` and
            ``categories`` are copied into both outputs when present.
        output_dir: Directory for the two output files (created if missing).
        split_ratio: Target fraction of each category's images that goes to
            the train split.
        seed: Optional seed for the shuffle. When ``None`` (the default) the
            module-level ``random`` generator is used, so callers that seed
            it globally keep their existing behaviour.

    Returns:
        None. The results are the written files and the printed summary.
    """
    # JSON is read as UTF-8 instead of relying on the platform locale encoding.
    with open(json_path, 'r', encoding='utf-8') as f:
        coco_data = json.load(f)

    # Per image id: the set of categories it contains and its annotation records.
    image_id_to_categories = defaultdict(set)
    image_id_to_annotations = defaultdict(list)
    for ann in coco_data['annotations']:
        image_id = ann['image_id']
        image_id_to_categories[image_id].add(ann['category_id'])
        image_id_to_annotations[image_id].append(ann)

    # Number of distinct images that contain each category.
    class_counts = {}
    for categories_in_image in image_id_to_categories.values():
        for category_id in categories_in_image:
            class_counts[category_id] = class_counts.get(category_id, 0) + 1
    print("Class counts before splitting:", class_counts)

    # Pair every image record with the categories it contains (empty set when
    # the image has no annotations).
    images_with_classes = [
        (img, image_id_to_categories.get(img['id'], set()))
        for img in coco_data['images']
    ]

    # Shuffle so the greedy assignment below is not biased by file order.
    rng = random if seed is None else random.Random(seed)
    rng.shuffle(images_with_classes)

    train_images, val_images = [], []
    train_class_counts = Counter()
    val_class_counts = Counter()

    # Un-annotated images get their own quota; with an empty category set the
    # all(...) test below would be vacuously true and send every one to train.
    unlabeled_total = sum(1 for _, cats in images_with_classes if not cats)
    unlabeled_in_train = 0

    for img, cats in images_with_classes:
        if cats:
            # The small epsilon is kept from the original quota formula so the
            # resulting splits for annotated images are unchanged.
            goes_to_train = all(
                train_class_counts[cat] / (class_counts[cat] + 1e-5) < split_ratio
                for cat in cats
            )
        else:
            goes_to_train = unlabeled_in_train / (unlabeled_total + 1e-5) < split_ratio

        if goes_to_train:
            train_images.append(img)
            train_class_counts.update(cats)
            if not cats:
                unlabeled_in_train += 1
        else:
            val_images.append(img)
            val_class_counts.update(cats)

    # Collect the annotations that belong to the images of each split.
    train_annotations = [ann for img in train_images for ann in image_id_to_annotations[img['id']]]
    val_annotations = [ann for img in val_images for ann in image_id_to_annotations[img['id']]]

    # info / licenses / categories are copied unchanged into both outputs.
    info = coco_data.get('info', {})
    licenses = coco_data.get('licenses', [])
    categories = coco_data.get('categories', [])

    train_data = {
        'info': info,
        'licenses': licenses,
        'images': train_images,
        'annotations': train_annotations,
        'categories': categories
    }

    val_data = {
        'info': info,
        'licenses': licenses,
        'images': val_images,
        'annotations': val_annotations,
        'categories': categories
    }

    os.makedirs(output_dir, exist_ok=True)

    # Round away binary floating-point noise before truncating to a whole
    # percentage: (1 - 0.8) * 100 evaluates to 19.999999999999996, which a bare
    # int() would turn into 19 and produce a misleading "val_19.json".
    train_percent = int(round(split_ratio * 100, 6))
    val_percent = int(round((1 - split_ratio) * 100, 6))
    train_output_path = os.path.join(output_dir, f'train_{train_percent}.json')
    val_output_path = os.path.join(output_dir, f'val_{val_percent}.json')

    with open(train_output_path, 'w', encoding='utf-8') as f:
        json.dump(train_data, f)

    with open(val_output_path, 'w', encoding='utf-8') as f:
        json.dump(val_data, f)

    # Report the per-class distribution so the balance can be checked by eye.
    print("Class counts after splitting:")
    print("Train class counts:", dict(train_class_counts))
    print("Validation class counts:", dict(val_class_counts))

    print(f"Train set: {len(train_images)} images, {len(train_annotations)} annotations")
    print(f"Validation set: {len(val_images)} images, {len(val_annotations)} annotations")
    
# Write the class-balanced splits of the training annotations, once with
# split_ratio 0.80 and once with split_ratio 0.95.
split_coco_dataset_balanced(
    json_path='../../dataset/train.json',
    output_dir='../../dataset',
    split_ratio=0.80,
)
split_coco_dataset_balanced(
    json_path='../../dataset/train.json',
    output_dir='../../dataset',
    split_ratio=0.95,
)

Class counts before splitting: {0: 2105, 3: 598, 4: 340, 5: 1369, 7: 1893, 2: 642, 6: 512, 1: 1714, 9: 229, 8: 46}
Class counts after splitting:
Train class counts: {3: 479, 5: 1096, 0: 1685, 6: 410, 7: 1515, 9: 184, 2: 514, 1: 1372, 4: 273, 8: 37}
Validation class counts: {8: 9, 0: 420, 7: 378, 9: 45, 4: 67, 1: 342, 2: 128, 5: 273, 3: 119, 6: 102}
Train set: 3915 images, 18351 annotations
Validation set: 968 images, 4793 annotations
Class counts before splitting: {0: 2105, 3: 598, 4: 340, 5: 1369, 7: 1893, 2: 642, 6: 512, 1: 1714, 9: 229, 8: 46}
Class counts after splitting:
Train class counts: {1: 1629, 0: 2000, 6: 487, 7: 1799, 3: 569, 2: 610, 5: 1301, 9: 218, 4: 324, 8: 44}
Validation class counts: {4: 16, 5: 68, 7: 94, 1: 85, 6: 25, 2: 32, 0: 105, 3: 29, 9: 11, 8: 2}
Train set: 4656 images, 22019 annotations
Validation set: 227 images, 1125 annotations


In [2]:
import json
import os
import random
from collections import defaultdict

def split_coco_dataset_ni(json_path, output_dir, split_ratio=0.95, seed=None):
    """Split a COCO-format annotation file into train/val files at random.

    The image list is shuffled and cut at ``int(len(images) * split_ratio)``;
    no per-class balancing is applied. Each image's annotations follow the
    image into its split. Two files are written into ``output_dir``:
    ``train_<P>_ni.json`` and ``val_<Q>_ni.json``, where ``P`` and ``Q`` are
    ``split_ratio`` and ``1 - split_ratio`` expressed as whole percentages.
    The split sizes are printed.

    Args:
        json_path: Path of the source JSON file. It must contain ``images``
            and ``annotations`` lists; ``info``, ``licenses`` and
            ``categories`` are copied into both outputs when present.
        output_dir: Directory for the two output files (created if missing).
        split_ratio: Fraction of the images that goes to the train split.
        seed: Optional seed for the shuffle. When ``None`` (the default) the
            module-level ``random`` generator is used, so callers that seed
            it globally keep their existing behaviour.

    Returns:
        None. The results are the written files and the printed summary.
    """
    # JSON is read as UTF-8 instead of relying on the platform locale encoding.
    with open(json_path, 'r', encoding='utf-8') as f:
        coco_data = json.load(f)

    # Map each image id to the annotation records that reference it.
    image_id_to_annotations = defaultdict(list)
    for ann in coco_data['annotations']:
        image_id_to_annotations[ann['image_id']].append(ann)

    # Shuffle a copy so the loaded document itself is left in file order.
    images = list(coco_data['images'])
    rng = random if seed is None else random.Random(seed)
    rng.shuffle(images)
    split_index = int(len(images) * split_ratio)

    train_images = images[:split_index]
    val_images = images[split_index:]

    # Collect the annotations that belong to the images of each split.
    train_annotations = [ann for img in train_images for ann in image_id_to_annotations[img['id']]]
    val_annotations = [ann for img in val_images for ann in image_id_to_annotations[img['id']]]

    # info / licenses / categories are copied unchanged into both outputs.
    info = coco_data.get('info', {})
    licenses = coco_data.get('licenses', [])
    categories = coco_data.get('categories', [])

    train_data = {
        'info': info,
        'licenses': licenses,
        'images': train_images,
        'annotations': train_annotations,
        'categories': categories
    }

    val_data = {
        'info': info,
        'licenses': licenses,
        'images': val_images,
        'annotations': val_annotations,
        'categories': categories
    }

    os.makedirs(output_dir, exist_ok=True)

    # Round away binary floating-point noise before truncating to a whole
    # percentage: (1 - 0.8) * 100 evaluates to 19.999999999999996, which a bare
    # int() would turn into 19 and produce a misleading "val_19_ni.json".
    train_percent = int(round(split_ratio * 100, 6))
    val_percent = int(round((1 - split_ratio) * 100, 6))
    train_output_path = os.path.join(output_dir, f'train_{train_percent}_ni.json')
    val_output_path = os.path.join(output_dir, f'val_{val_percent}_ni.json')

    with open(train_output_path, 'w', encoding='utf-8') as f:
        json.dump(train_data, f)

    with open(val_output_path, 'w', encoding='utf-8') as f:
        json.dump(val_data, f)

    print(f"Train set: {len(train_images)} images, {len(train_annotations)} annotations")
    print(f"Validation set: {len(val_images)} images, {len(val_annotations)} annotations")

# Write the purely random splits of the training annotations, once with
# split_ratio 0.95 and once with split_ratio 0.8.
split_coco_dataset_ni(
    json_path='../../dataset/train.json',
    output_dir='../../dataset',
    split_ratio=0.95,
)
split_coco_dataset_ni(
    json_path='../../dataset/train.json',
    output_dir='../../dataset',
    split_ratio=0.8,
)

Train set: 4638 images, 21939 annotations
Validation set: 245 images, 1205 annotations
Train set: 3906 images, 18479 annotations
Validation set: 977 images, 4665 annotations


In [6]:
import json

def count_unique_image_ids(json_path):
    """Print the number of distinct ``id`` values in a JSON file's ``images`` list.

    Args:
        json_path: Path of a JSON file whose top-level object has an
            ``images`` list of records that each carry an ``id`` key.

    Returns:
        None. The count is only printed.
    """
    with open(json_path, 'r') as handle:
        dataset = json.load(handle)

    # A set comprehension collapses repeated ids, so its size is the number
    # of distinct images.
    distinct_ids = {entry['id'] for entry in dataset['images']}

    print(f"Unique image_ids count: {len(distinct_ids)}")

# Usage example: report how many distinct images train_80.json lists.
count_unique_image_ids(json_path='../../dataset/train_80.json')

Unique image_ids count: 4327
