1. Label 수정 및 EDA 용 200 data random sampling

In [54]:
## 1. 기존 json 파일 수정

import json
import os
import shutil
from tqdm import tqdm
import random

# Location of the original annotation file; its parent folder is the dataset root.
json_file_path = '../../dataset/train.json'
dataset_path = os.path.dirname(json_file_path)
print(dataset_path)

# Read explicitly as UTF-8: without `encoding`, open() falls back to the
# platform's locale encoding, which is not UTF-8 on every system.
with open(json_file_path, 'r', encoding='utf-8') as file:
    json_data = json.load(file)

# Give every annotation an empty "segmentation" list
# (any value already stored under that key is overwritten).
for annotation in json_data['annotations']:
    annotation['segmentation'] = []

# Drop 'train/' from every image file name.
for image in json_data['images']:
    image['file_name'] = image['file_name'].replace('train/', '')

# The modified copy is written next to the original file.
new_json_file_name = 'train_for_label.json'
new_json_file_path = os.path.join(dataset_path, new_json_file_name)

with open(new_json_file_path, 'w', encoding='utf-8') as file:
    json.dump(json_data, file, indent=4)

../../dataset


In [55]:
# 2. Organize files and folders

train_folder_path = os.path.join(dataset_path, 'train')

new_file_name = "instances.json"
new_folder_name = "train_for_label"
new_folder_path_images = os.path.join(dataset_path, new_folder_name, 'images')
new_folder_path_annotations = os.path.join(dataset_path, new_folder_name, 'annotations')

# Create the images/ and annotations/ folders if they are not there yet.
for output_folder in (new_folder_path_images, new_folder_path_annotations):
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

# Copy every file with an image extension from train/ into the new images folder.
image_extensions = ('.png', '.jpg', '.jpeg', '.bmp', '.gif')
for file_name in tqdm(os.listdir(train_folder_path), desc="Copying images"):
    if not file_name.lower().endswith(image_extensions):
        continue
    source_image_path = os.path.join(train_folder_path, file_name)
    shutil.copy(source_image_path, new_folder_path_images)

# Place the modified JSON in the annotations folder under its new name;
# the bare call is the cell's last expression, so its return value is displayed.
new_file_path_for_json = os.path.join(new_folder_path_annotations, new_file_name)
shutil.copy(new_json_file_path, new_file_path_for_json)

Copying images: 100%|██████████| 4884/4884 [01:03<00:00, 77.42it/s]


'../../dataset\\train_for_label\\annotations\\instances.json'

In [56]:
# 3. Random sampling for EDA

sample_size = 200
# Fixed seed: re-running the cell selects the same images, so the output folder
# does not accumulate images from different random draws that the JSON no longer lists.
sample_seed = 42

# Load the labeling JSON (once).
with open(new_file_path_for_json, 'r', encoding='utf-8') as file:
    data = json.load(file)

# Take the population size from the annotation file instead of hard-coding it.
# NOTE(review): like the original cell, this assumes image ids are 0..N-1 and that
# each file is named after its zero-padded id ("0000.jpg") -- confirm against the data.
total_images = len(data['images'])

# Draw `sample_size` distinct image ids; a private Random instance keeps the
# global `random` state untouched. Raises ValueError if sample_size > total_images.
selected_image_ids = random.Random(sample_seed).sample(range(total_images), sample_size)
selected_filenames = [f"{image_id:04d}.jpg" for image_id in selected_image_ids]

# Sets give constant-time membership tests inside the comprehensions below.
selected_id_lookup = set(selected_image_ids)
selected_filename_lookup = set(selected_filenames)

filtered_images = [image for image in data['images'] if image['file_name'] in selected_filename_lookup]
filtered_annotations = [annotation for annotation in data['annotations'] if annotation['image_id'] in selected_id_lookup]

data['images'] = filtered_images
data['annotations'] = filtered_annotations

# Output folders for the sampled subset.
destination_folder_for_images = os.path.join(dataset_path, 'train_random_filtered_200', 'images')
os.makedirs(destination_folder_for_images, exist_ok=True)

destination_folder_for_annotations = os.path.join(dataset_path, 'train_random_filtered_200', 'annotations')
os.makedirs(destination_folder_for_annotations, exist_ok=True)

# Copy the sampled images; remember the ones that are missing instead of skipping silently.
missing_filenames = []
for filename in selected_filenames:
    source_file = os.path.join(new_folder_path_images, filename)
    destination_file = os.path.join(destination_folder_for_images, filename)

    if os.path.exists(source_file):
        shutil.copy(source_file, destination_file)
    else:
        missing_filenames.append(filename)

if missing_filenames:
    print(f"{len(missing_filenames)} sampled image(s) not found in {new_folder_path_images}: {missing_filenames}")

filtered_json_file_name = 'instances.json'
filtered_json_file_path = os.path.join(destination_folder_for_annotations, filtered_json_file_name)

# Save the filtered JSON.
with open(filtered_json_file_path, 'w', encoding='utf-8') as file:
    json.dump(data, file, indent=4)

2. Labeling 용 데이터 분할 n/6

In [57]:
import json
import os
import shutil
from tqdm import tqdm
import random

# Root folder of the dataset
dataset_path = '../../dataset'

# Labeler whose share of the images should be extracted
name = '종욱'

# Stop right away when the dataset root is missing; carrying on would only
# fail later with a less helpful error.
if not os.path.exists(dataset_path):
    raise FileNotFoundError("Reset the dataset path")

train_for_label_path = os.path.join(dataset_path, 'train_for_label')
json_file_path = os.path.join(train_for_label_path, 'annotations/instances.json')
image_folder_path = os.path.join(train_for_label_path, 'images')

# The train_for_label folder is produced by the earlier cells of this notebook.
if not os.path.exists(train_for_label_path):
    raise FileNotFoundError('위에 셀들 재실행')

num_files = len(os.listdir(image_folder_path))

# Split the index range 0..num_files-1 into `parts` contiguous, inclusive
# (start, end) ranges; the first `remainder` ranges hold one extra index.
parts = 6
images_per_part, remainder = divmod(num_files, parts)

divisions = []
start = 0
for i in range(parts):
    end = start + images_per_part + (1 if i < remainder else 0) - 1
    divisions.append((start, end))
    start = end + 1

name_lst = ['채아','민윤', '종욱', '찬종','명현','시현']

# An unknown name must abort the cell: otherwise `label_range` would be undefined,
# or silently reused from an earlier run in the same kernel.
if name not in name_lst:
    raise ValueError('입력 이름 오류, 이름 확인')

label_range = divisions[name_lst.index(name)]
print("레이블러:", name, "/ Label range:", label_range)

target_folder_name = 'splited_'+str(label_range[0])+'_'+str(label_range[1])
target_path = os.path.join(dataset_path, target_folder_name)
fitlered_image_folder_path = os.path.join(target_path, 'images')
fitlered_annotation_folder_path = os.path.join(target_path, 'annotations')

# exist_ok=True also covers a partially created target folder, where a plain
# os.makedirs(target_path) would raise FileExistsError. Creating the two
# sub-folders creates target_path as well.
os.makedirs(fitlered_image_folder_path, exist_ok=True)
os.makedirs(fitlered_annotation_folder_path, exist_ok=True)

for i in tqdm(range(label_range[0], label_range[1] + 1)):
    file_name = f"{i:04d}.jpg"  # Assuming file names are in the format '0000.jpg', '0001.jpg', etc.
    source_file = os.path.join(image_folder_path, file_name)
    target_file = os.path.join(fitlered_image_folder_path, file_name)

    # Copy the file if it exists (the source image is left in place)
    if os.path.exists(source_file):
        shutil.copy(source_file, target_file)
    else:
        print(f"{file_name} does not exist in the source path")

with open(json_file_path, 'r', encoding='utf-8') as file:
    data = json.load(file)

selected_image_ids = list(range(label_range[0], label_range[1] + 1))
selected_filenames = [f"{image_id:04d}.jpg" for image_id in selected_image_ids]

# Sets give constant-time membership tests inside the comprehensions below.
selected_id_lookup = set(selected_image_ids)
selected_filename_lookup = set(selected_filenames)

filtered_images = [image for image in data['images'] if image['file_name'] in selected_filename_lookup]
filtered_annotations = [annotation for annotation in data['annotations'] if annotation['image_id'] in selected_id_lookup]

data['images'] = filtered_images
data['annotations'] = filtered_annotations

filtered_json_file_name = 'instances.json'
filtered_json_file_path = os.path.join(fitlered_annotation_folder_path, filtered_json_file_name)

# Save the filtered JSON
with open(filtered_json_file_path, 'w', encoding='utf-8') as file:
    json.dump(data, file, indent=4)

print("Done")

레이블러: 종욱 / Label range: (1628, 2441)


100%|██████████| 814/814 [00:04<00:00, 178.19it/s]


Done
