In [None]:
#사용할 모든 데이터를 합쳐, KFold를 진행

import os
import json
import shutil
import random
from sklearn.model_selection import KFold

# Image directories and the JSON annotation files that are merged into one
# dataset before splitting.
image_dirs = [
    "/data/ephemeral/home/MCG/NewData/SROIE2019/combined/images",
    "/data/ephemeral/home/MCG/BaseData/chi/train",
    "/data/ephemeral/home/MCG/BaseData/jap/train",
    "/data/ephemeral/home/MCG/BaseData/thai/train",
    "/data/ephemeral/home/MCG/BaseData/vie/train"
]
json_paths = [
    "/data/ephemeral/home/MCG/NewData/SROIE2019/combined/annotations.json",
    "/data/ephemeral/home/MCG/code/data/chinese_receipt/ufo/train.json",
    "/data/ephemeral/home/MCG/code/data/japanese_receipt/ufo/train.json",
    "/data/ephemeral/home/MCG/code/data/thai_receipt/ufo/train.json",
    "/data/ephemeral/home/MCG/code/data/vietnamese_receipt/ufo/train.json"
]

# Folder that receives one `fold_<k>/train` and `fold_<k>/val` directory per fold.
output_dir = "/data/ephemeral/home/MCG/KFold_NoClean_Output"
os.makedirs(output_dir, exist_ok=True)

# Merge the "images" mapping of every JSON file into a single dictionary keyed
# by image file name.
all_annotations = {}
for json_path in json_paths:
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    images = data['images']
    # dict.update() overwrites silently, so report names that occur in more
    # than one annotation file. The later file still wins, as before.
    duplicate_keys = sorted(all_annotations.keys() & images.keys())
    if duplicate_keys:
        print(
            f"Warning: {len(duplicate_keys)} annotation key(s) in {json_path} "
            f"overwrite earlier entries (e.g. {duplicate_keys[0]})"
        )
    all_annotations.update(images)

# Collect every annotated .jpg as a (source path, file name) pair.
image_paths = {}
for image_dir in image_dirs:
    # os.listdir() returns entries in arbitrary order; sorting makes the input
    # order -- and therefore the folds produced with the fixed random_state --
    # reproducible.
    for file_name in sorted(os.listdir(image_dir)):
        if file_name.endswith('.jpg') and file_name in all_annotations:
            if file_name in image_paths:
                # The same name in two directories would otherwise be listed
                # twice and could end up in both train and val of one fold.
                # Keep the later directory, matching the annotation merge.
                print(
                    f"Warning: {file_name} exists in more than one image "
                    f"directory; using the copy in {image_dir}"
                )
            image_paths[file_name] = os.path.join(image_dir, file_name)
image_files = [(path, name) for name, path in image_paths.items()]

# Split the dataset with KFold.
n_splits = 4
if len(image_files) < n_splits:
    # KFold would raise the same exception type; fail early with the counts.
    raise ValueError(
        f"Found {len(image_files)} annotated image(s), "
        f"but {n_splits} folds were requested"
    )

kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
for fold, (train_index, val_index) in enumerate(kf.split(image_files), 1):
    fold_dir = os.path.join(output_dir, f'fold_{fold}')
    train_dir = os.path.join(fold_dir, 'train')
    val_dir = os.path.join(fold_dir, 'val')
    os.makedirs(train_dir, exist_ok=True)
    os.makedirs(val_dir, exist_ok=True)

    # Build the training and validation sets for this fold.
    train_images = [image_files[i] for i in train_index]
    val_images = [image_files[i] for i in val_index]

    # Build the per-split annotation dictionaries in the same {"images": ...}
    # layout as the input files.
    train_annotations = {"images": {img[1]: all_annotations[img[1]] for img in train_images}}
    val_annotations = {"images": {img[1]: all_annotations[img[1]] for img in val_images}}

    # Copy the images and write the annotations next to them.
    for img_path, img_name in train_images:
        shutil.copy(img_path, os.path.join(train_dir, img_name))
    for img_path, img_name in val_images:
        shutil.copy(img_path, os.path.join(val_dir, img_name))

    with open(os.path.join(train_dir, 'annotations.json'), 'w', encoding='utf-8') as f:
        json.dump(train_annotations, f, ensure_ascii=False, indent=4)
    with open(os.path.join(val_dir, 'annotations.json'), 'w', encoding='utf-8') as f:
        json.dump(val_annotations, f, ensure_ascii=False, indent=4)

print("데이터셋이 성공적으로 KFold로 분할되었습니다.")


데이터셋이 성공적으로 KFold로 분할되었습니다.


In [None]:
# Review the KFold data: verify that no file appears in both the train and validation sets.

# Paths of the two text files that are passed to find_overlapping_files below,
# which reads each line of a file as one image file name.
val_images_path = '/data/ephemeral/home/MCG/code/val_images.txt'
train_images_path = '/data/ephemeral/home/MCG/code/train_images.txt'

def _read_file_names(path):
    """Return the set of non-empty, whitespace-stripped lines in the file at *path*."""
    with open(path, 'r', encoding='utf-8') as name_file:
        stripped_lines = (line.strip() for line in name_file)
        # Skip blank lines: otherwise "" would count as a file name and be
        # reported as an overlap whenever both lists contain an empty line.
        return {name for name in stripped_lines if name}


def find_overlapping_files(val_path, train_path):
    """Print the file names that are listed in both text files.

    Each file is read as UTF-8 with one file name per line; surrounding
    whitespace is stripped and blank lines are ignored.

    Args:
        val_path: Path of the text file listing the validation file names.
        train_path: Path of the text file listing the training file names.

    Returns:
        None. The overlapping names are printed in sorted order, one per
        line, after a header line; a single message is printed when there
        is no overlap.

    Raises:
        OSError: If either file cannot be opened.
    """
    overlapping_files = _read_file_names(val_path) & _read_file_names(train_path)

    if overlapping_files:
        print("겹치는 파일명:")
        # Sort so the report is stable from run to run.
        for file_name in sorted(overlapping_files):
            print(file_name)
    else:
        print("겹치는 파일명이 없습니다.")

# Run the overlap check on the validation and training lists.
find_overlapping_files(val_images_path, train_images_path)
