In [1]:
import os
import json

In [2]:
def verify_dataset(image_dir, label_dir, categories, mode='train'):
    """Cross-check the per-category label JSON files against the image folders.

    For every category, ``<label_dir>/<mode>_crop_<category>.json`` is loaded and
    each entry's ``'filename'`` is compared with the ``.jpg`` files found in
    ``<image_dir>/<category>``. A file counts as a match when its name starts
    with the entry's filename minus the trailing ``.jpg``. A summary and up to
    ten unmatched paths are printed to stdout.

    Args:
        image_dir: Directory holding one sub-folder of images per category.
        label_dir: Directory holding the label JSON files.
        categories: Iterable of category names (used as sub-folder names and
            as part of the JSON file name).
        mode: Prefix of the JSON file name. Defaults to ``'train'``.

    Returns:
        None. All results are printed.

    Raises:
        FileNotFoundError: If a label JSON file does not exist.
        KeyError: If a label entry has no ``'filename'`` key.
    """
    jpg_ext = '.jpg'
    total_json_count = 0
    total_matched_images = 0
    missing_entries = []

    for category in categories:
        json_path = os.path.join(label_dir, f'{mode}_crop_{category}.json')
        img_folder = os.path.join(image_dir, category)

        with open(json_path, 'r', encoding='utf-8') as f:
            label_data = json.load(f)

        print(f"[{category}] JSON entries: {len(label_data)}")
        total_json_count += len(label_data)

        # List the folder once per category instead of once per label entry.
        # A missing folder is treated as empty so that its entries are
        # reported as missing rather than aborting the whole verification.
        if os.path.isdir(img_folder):
            jpg_files = [name for name in os.listdir(img_folder) if name.endswith(jpg_ext)]
        else:
            jpg_files = []

        for row in label_data:
            filename = row['filename']
            # Strip only the trailing extension; str.replace would also remove
            # a '.jpg' that appears in the middle of the name.
            if filename.endswith(jpg_ext):
                base_filename = filename[:-len(jpg_ext)]
            else:
                base_filename = filename

            # NOTE(review): prefix matching also counts files of other entries
            # that share the prefix (e.g. 'img_1' vs 'img_10.jpg'); tightening
            # this needs the augmentation naming scheme - confirm.
            match_count = sum(1 for name in jpg_files if name.startswith(base_filename))

            if match_count == 0:
                missing_entries.append(os.path.join(img_folder, filename))

            total_matched_images += match_count

    print("\n[검증 결과]")
    print(f"총 JSON entries 수: {total_json_count}")
    print(f"총 매칭된 이미지 수 (원본 + 증강 포함): {total_matched_images}")
    print(f"누락된 원본 이미지 수: {len(missing_entries)}")

    if missing_entries:
        print("\n[누락 이미지 목록 예시]")
        for path in missing_entries[:10]:  # print at most 10 examples
            print(" -", path)

In [None]:
# Resolve the project root. `__file__` is normally not defined inside a
# notebook kernel, so fall back to the current working directory there;
# when the code runs as a script the script's own directory is used.
try:
    base_dir = os.path.dirname(os.path.abspath(__file__))
except NameError:
    base_dir = os.getcwd()

train_image_dir = os.path.join(base_dir, 'augment')  # Final/augment
train_label_dir = os.path.join(base_dir, 'CropData2', 'label', 'train')  # Final/CropData2/label/train
categories = ['anger', 'happy', 'panic', 'sadness']


In [None]:
# Run the label/image consistency check on the training split.
verify_dataset(
    image_dir=train_image_dir,
    label_dir=train_label_dir,
    categories=categories,
    mode='train',
)

[anger] JSON entries: 1512
[happy] JSON entries: 1508
[panic] JSON entries: 1512
[sadness] JSON entries: 1521

[검증 결과]
총 JSON entries 수: 6053
총 매칭된 이미지 수 (원본 + 증강 포함): 10427
누락된 원본 이미지 수: 0
