### 중국어 mislabel 수정 (좌표 순서 -- 좌상단 우상단 우하단 좌하단)

In [7]:
import json
import numpy as np
import os

# JSON 파일 경로
json_path = '/data/ephemeral/data/chinese_receipt/ufo/train.json'

# JSON 파일 읽기
with open(json_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

def correct_points(points):
    # points를 NumPy 배열로 변환
    points = np.array(points)

    # 좌상단 점 찾기
    top_left_idx = np.argmin(points[:, 1])  # y 좌표가 가장 작은 인덱스
    top_left = points[top_left_idx]

    # 나머지 점들의 인덱스
    remaining_indices = [i for i in range(4) if i != top_left_idx]

    # 나머지 점들을 오른쪽으로 정렬
    top_right_idx = remaining_indices[0] if points[remaining_indices[0], 0] > top_left[0] else remaining_indices[1]
    bottom_right_idx = remaining_indices[1] if top_right_idx == remaining_indices[0] else remaining_indices[0]

    # 좌하단 점 찾기
    bottom_left_idx = [i for i in remaining_indices if i not in (top_right_idx, bottom_right_idx)][0]

    # 올바른 순서로 재배치
    corrected_points = [top_left.tolist(), points[top_right_idx].tolist(), points[bottom_right_idx].tolist(), points[bottom_left_idx].tolist()]
    
    return corrected_points

# 레이블 순서 확인 및 수정
for image_id, image_info in data.get('images', {}).items():
    for word_id, word_info in image_info.get('words', {}).items():
        original_points = word_info['points']
        corrected_points = correct_points(original_points)
        
        # 원래 점과 수정된 점을 비교하여 변경사항이 있는지 확인
        if not np.array_equal(np.array(original_points), np.array(corrected_points)):
            print(f"Correcting points for image: {image_id}, word: {word_id}")
            word_info['points'] = corrected_points

# 수정된 JSON 파일 저장
corrected_json_path = '/data/ephemeral/data/chinese_receipt/ufo/train_corrected.json'
with open(corrected_json_path, 'w', encoding='utf-8') as f:
    json.dump(data, f, ensure_ascii=False, indent=4)

print(f"Corrected JSON saved to: {corrected_json_path}")


Correcting points for image: extractor.zh.in_house.appen_000692_page0001.jpg, word: 0002
Correcting points for image: extractor.zh.in_house.appen_000692_page0001.jpg, word: 0021
Correcting points for image: extractor.zh.in_house.appen_000692_page0001.jpg, word: 0032
Correcting points for image: extractor.zh.in_house.appen_000692_page0001.jpg, word: 0033
Correcting points for image: extractor.zh.in_house.appen_000835_page0001.jpg, word: 0002
Correcting points for image: extractor.zh.in_house.appen_000835_page0001.jpg, word: 0003
Correcting points for image: extractor.zh.in_house.appen_000835_page0001.jpg, word: 0009
Correcting points for image: extractor.zh.in_house.appen_000835_page0001.jpg, word: 0011
Correcting points for image: extractor.zh.in_house.appen_000835_page0001.jpg, word: 0018
Correcting points for image: extractor.zh.in_house.appen_000835_page0001.jpg, word: 0019
Correcting points for image: extractor.zh.in_house.appen_000835_page0001.jpg, word: 0020
Correcting points for

### 일본어

In [None]:
import json
import numpy as np
import os

# JSON 파일 경로
json_path = '/data/ephemeral/data/japanese_receipt/ufo/train.json'

# JSON 파일 읽기
with open(json_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

def correct_points(points):
    # points를 NumPy 배열로 변환
    points = np.array(points)

    # 좌상단 점 찾기
    top_left_idx = np.argmin(points[:, 1])  # y 좌표가 가장 작은 인덱스
    top_left = points[top_left_idx]

    # 나머지 점들의 인덱스
    remaining_indices = [i for i in range(4) if i != top_left_idx]

    # 나머지 점들을 오른쪽으로 정렬
    top_right_idx = remaining_indices[0] if points[remaining_indices[0], 0] > top_left[0] else remaining_indices[1]
    bottom_right_idx = remaining_indices[1] if top_right_idx == remaining_indices[0] else remaining_indices[0]

    # 좌하단 점 찾기
    bottom_left_idx = [i for i in remaining_indices if i not in (top_right_idx, bottom_right_idx)][0]

    # 올바른 순서로 재배치
    corrected_points = [top_left.tolist(), points[top_right_idx].tolist(), points[bottom_right_idx].tolist(), points[bottom_left_idx].tolist()]
    
    return corrected_points

# 레이블 순서 확인 및 수정
for image_id, image_info in data.get('images', {}).items():
    for word_id, word_info in image_info.get('words', {}).items():
        original_points = word_info['points']
        corrected_points = correct_points(original_points)
        
        # 원래 점과 수정된 점을 비교하여 변경사항이 있는지 확인
        if not np.array_equal(np.array(original_points), np.array(corrected_points)):
            print(f"Correcting points for image: {image_id}, word: {word_id}")
            word_info['points'] = corrected_points

# 수정된 JSON 파일 저장
corrected_json_path = '/data/ephemeral/data/japanese_receipt/ufo/train_corrected.json'
with open(corrected_json_path, 'w', encoding='utf-8') as f:
    json.dump(data, f, ensure_ascii=False, indent=4)

print(f"Corrected JSON saved to: {corrected_json_path}")


### 태국어

In [None]:
import json
import numpy as np
import os

# JSON 파일 경로
json_path = '/data/ephemeral/data/thai_receipt/ufo/train.json'

# JSON 파일 읽기
with open(json_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

def correct_points(points):
    # points를 NumPy 배열로 변환
    points = np.array(points)

    # 좌상단 점 찾기
    top_left_idx = np.argmin(points[:, 1])  # y 좌표가 가장 작은 인덱스
    top_left = points[top_left_idx]

    # 나머지 점들의 인덱스
    remaining_indices = [i for i in range(4) if i != top_left_idx]

    # 나머지 점들을 오른쪽으로 정렬
    top_right_idx = remaining_indices[0] if points[remaining_indices[0], 0] > top_left[0] else remaining_indices[1]
    bottom_right_idx = remaining_indices[1] if top_right_idx == remaining_indices[0] else remaining_indices[0]

    # 좌하단 점 찾기
    bottom_left_idx = [i for i in remaining_indices if i not in (top_right_idx, bottom_right_idx)][0]

    # 올바른 순서로 재배치
    corrected_points = [top_left.tolist(), points[top_right_idx].tolist(), points[bottom_right_idx].tolist(), points[bottom_left_idx].tolist()]
    
    return corrected_points

# 레이블 순서 확인 및 수정
for image_id, image_info in data.get('images', {}).items():
    for word_id, word_info in image_info.get('words', {}).items():
        original_points = word_info['points']
        corrected_points = correct_points(original_points)
        
        # 원래 점과 수정된 점을 비교하여 변경사항이 있는지 확인
        if not np.array_equal(np.array(original_points), np.array(corrected_points)):
            print(f"Correcting points for image: {image_id}, word: {word_id}")
            word_info['points'] = corrected_points

# 수정된 JSON 파일 저장
corrected_json_path = '/data/ephemeral/data/thai_receipt/ufo/train_corrected.json'
with open(corrected_json_path, 'w', encoding='utf-8') as f:
    json.dump(data, f, ensure_ascii=False, indent=4)

print(f"Corrected JSON saved to: {corrected_json_path}")


### 베트남어

In [None]:
import json
import numpy as np
import os

# JSON 파일 경로
json_path = '/data/ephemeral/data/vietnamese_receipt/ufo/train.json'

# JSON 파일 읽기
with open(json_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

def correct_points(points):
    # points를 NumPy 배열로 변환
    points = np.array(points)

    # 좌상단 점 찾기
    top_left_idx = np.argmin(points[:, 1])  # y 좌표가 가장 작은 인덱스
    top_left = points[top_left_idx]

    # 나머지 점들의 인덱스
    remaining_indices = [i for i in range(4) if i != top_left_idx]

    # 나머지 점들을 오른쪽으로 정렬
    top_right_idx = remaining_indices[0] if points[remaining_indices[0], 0] > top_left[0] else remaining_indices[1]
    bottom_right_idx = remaining_indices[1] if top_right_idx == remaining_indices[0] else remaining_indices[0]

    # 좌하단 점 찾기
    bottom_left_idx = [i for i in remaining_indices if i not in (top_right_idx, bottom_right_idx)][0]

    # 올바른 순서로 재배치
    corrected_points = [top_left.tolist(), points[top_right_idx].tolist(), points[bottom_right_idx].tolist(), points[bottom_left_idx].tolist()]
    
    return corrected_points

# 레이블 순서 확인 및 수정
for image_id, image_info in data.get('images', {}).items():
    for word_id, word_info in image_info.get('words', {}).items():
        original_points = word_info['points']
        corrected_points = correct_points(original_points)
        
        # 원래 점과 수정된 점을 비교하여 변경사항이 있는지 확인
        if not np.array_equal(np.array(original_points), np.array(corrected_points)):
            print(f"Correcting points for image: {image_id}, word: {word_id}")
            word_info['points'] = corrected_points

# 수정된 JSON 파일 저장
corrected_json_path = '/data/ephemeral/data/vietnamese_receipt/ufo/train_corrected.json'
with open(corrected_json_path, 'w', encoding='utf-8') as f:
    json.dump(data, f, ensure_ascii=False, indent=4)

print(f"Corrected JSON saved to: {corrected_json_path}")
