In [36]:
## 1. 기존 json 파일 수정

import json
import os
import shutil
from tqdm import tqdm
import random

# Path of the source annotation file. dataset_path is the folder containing it
# and is used as the root for the files written by the code below.
json_file_path = '../train.json'
dataset_path = os.path.dirname(json_file_path)
print(dataset_path)

# JSON is read as UTF-8 explicitly so the result does not depend on the
# platform's default text encoding.
with open(json_file_path, 'r', encoding='utf-8') as file:
    json_data = json.load(file)

# Give every annotation an empty "segmentation" list.
for annotation in json_data['annotations']:
    annotation['segmentation'] = []

# Strip the leading 'train/' folder from every image file name.
# Only a leading prefix is removed: str.replace() would also delete any later
# 'train/' occurrence inside the name and silently change it.
image_prefix = 'train/'
for image in json_data['images']:
    if image['file_name'].startswith(image_prefix):
        image['file_name'] = image['file_name'][len(image_prefix):]

# The modified JSON is saved next to the original under a new name.
new_json_file_name = 'train_for_label.json'
new_json_file_path = os.path.join(dataset_path, new_json_file_name)

with open(new_json_file_path, 'w', encoding='utf-8') as file:
    json.dump(json_data, file, indent=4)

D:/level2/data/dataset


In [37]:
# 2. Organize files and folders
#
# Builds the following layout under dataset_path:
#   train_for_label/images/                     <- image files copied from train/
#   train_for_label/annotations/instances.json  <- copy of the JSON at new_json_file_path

train_folder_path = os.path.join(dataset_path, 'train')

new_file_name = "instances.json"
new_folder_name = "train_for_label"
new_folder_path_images = os.path.join(dataset_path, new_folder_name, 'images')
new_folder_path_annotations = os.path.join(dataset_path, new_folder_name, 'annotations')

# exist_ok=True makes the cell safe to re-run and avoids the race between a
# separate os.path.exists() check and the directory creation.
os.makedirs(new_folder_path_images, exist_ok=True)
os.makedirs(new_folder_path_annotations, exist_ok=True)

# Only files whose (case-insensitive) extension is in this tuple are copied.
image_extensions = ('.png', '.jpg', '.jpeg', '.bmp', '.gif')

for file_name in tqdm(os.listdir(train_folder_path), desc="Copying images"):
    if file_name.lower().endswith(image_extensions):
        shutil.copy(os.path.join(train_folder_path, file_name), new_folder_path_images)

new_file_path_for_json = os.path.join(new_folder_path_annotations, new_file_name)
# Left as the cell's last bare expression on purpose: shutil.copy returns the
# destination path, which the notebook echoes as the cell output.
shutil.copy(new_json_file_path, new_file_path_for_json)

Copying images: 100%|██████████| 4884/4884 [00:06<00:00, 699.62it/s]


'D:/level2/data/dataset\\train_for_label\\annotations\\instances.json'

In [40]:
# 3. Random sampling for EDA
#
# Picks a random subset of the images listed in the annotation JSON and writes
# it out as a small dataset of its own:
#   train_random_filtered_200/images/                     <- the sampled image files
#   train_random_filtered_200/annotations/instances.json  <- JSON restricted to the sample

sample_size = 200
# Fixed seed so that re-running the cell selects the same subset.
sample_seed = 42

# Load the annotation file once.
with open(new_file_path_for_json, 'r', encoding='utf-8') as file:
    data = json.load(file)

# Sample from the image records actually present in the JSON instead of from a
# hard-coded count, and never ask for more images than exist.
total_images = len(data['images'])
sample_size = min(sample_size, total_images)

# A private Random instance keeps the global random state untouched.
rng = random.Random(sample_seed)
# Sorting the sampled positions keeps the images in their original JSON order.
sampled_positions = sorted(rng.sample(range(total_images), sample_size))
filtered_images = [data['images'][position] for position in sampled_positions]

# NOTE(review): relies on each image record carrying an 'id' key that
# annotations reference through 'image_id' (COCO layout) - confirm for this file.
selected_image_ids = [image['id'] for image in filtered_images]
selected_filenames = [image['file_name'] for image in filtered_images]

# Set lookup keeps the annotation filter linear in the number of annotations.
selected_image_id_set = set(selected_image_ids)
filtered_annotations = [
    annotation
    for annotation in data['annotations']
    if annotation['image_id'] in selected_image_id_set
]

data['images'] = filtered_images
data['annotations'] = filtered_annotations


# Copy the sampled image files.
destination_folder_for_images = os.path.join(dataset_path, 'train_random_filtered_200/images')
os.makedirs(destination_folder_for_images, exist_ok=True)

destination_folder_for_annotations = os.path.join(dataset_path, 'train_random_filtered_200/annotations')
os.makedirs(destination_folder_for_annotations, exist_ok=True)

missing_filenames = []
for filename in selected_filenames:
    source_file = os.path.join(new_folder_path_images, filename)
    destination_file = os.path.join(destination_folder_for_images, filename)

    # Copy only files that exist; remember the rest so they can be reported
    # instead of being skipped silently.
    if os.path.exists(source_file):
        shutil.copy(source_file, destination_file)
    else:
        missing_filenames.append(filename)

if missing_filenames:
    print(f"Warning: {len(missing_filenames)} sampled image(s) not found in "
          f"{new_folder_path_images}: {missing_filenames[:10]}")

filtered_json_file_name = 'instances.json'
filtered_json_file_path = os.path.join(destination_folder_for_annotations, filtered_json_file_name)

# Save the filtered JSON.
with open(filtered_json_file_path, 'w', encoding='utf-8') as file:
    json.dump(data, file, indent=4)