In [1]:
import json
import pandas as pd

In [2]:
# Annotation files of the two datasets (each stored under its `ufo/` directory)
# and the destination for the cleaned Upstage annotations.
icd_path = '/opt/ml/input/data/ICDAR17_Korean/ufo/train.json'
drv_path = '/opt/ml/input/data/Upstage/ufo/train.json'
save_path = '/opt/ml/input/data/Upstage/ufo/new_train.json'

# The annotations contain Korean text, so decode explicitly as UTF-8 instead of
# relying on the platform/locale default encoding of open().
with open(icd_path, 'r', encoding='utf-8') as f:
    icd = json.load(f)
with open(drv_path, 'r', encoding='utf-8') as f:
    drv = json.load(f)

### 최상위 레벨의 keys 비교

In [3]:
# Show the top-level keys of each annotation file (ICDAR first, then Upstage).
for dataset in (icd, drv):
    print(dataset.keys())

dict_keys(['images'])
dict_keys(['images', 'version', 'tags'])


In [4]:
# Inspect the two top-level fields that exist only in the Upstage file.
for field in ('version', 'tags'):
    print(drv[field])

221207
[]


In [5]:
## drv's top-level `version` and `tags` can probably be dropped or ignored.
## Keep them in the original file, but remove them from the unified format.

### images 레벨의 keys 비교

In [6]:
# Collect the file extensions (text after the last '.') used by the image
# names of each dataset.
ext_icd = {name.split('.')[-1] for name in icd['images']}
ext_drv = {name.split('.')[-1] for name in drv['images']}

print(ext_icd)
print(ext_drv)

{'jpg'}
{'jpg', 'JPG', 'jpeg'}


In [7]:
## The `images` level only holds the image file names (as keys).
## No action needed.

### 이미지 1장 레벨의 keys 비교

In [8]:
# Show the per-image keys using one sample image from each dataset.
samples = (
    (icd, 'img_4380.jpg'),
    (drv, '0F885DC0-3E65-4081-9DBB-CA96BB6FD4FC.JPG'),
)
for dataset, sample_name in samples:
    print(dataset['images'][sample_name].keys())

dict_keys(['img_h', 'img_w', 'words', 'tags', 'license_tag'])
dict_keys(['paragraphs', 'words', 'chars', 'img_w', 'img_h', 'tags', 'relations', 'annotation_log', 'license_tag'])


In [9]:
# One row per Upstage image, one column per per-image field.
allimg_drv = pd.DataFrame(drv['images']).T

# For each container field that only drv has, count the images whose value
# is anything other than an empty dict.
for field in ('paragraphs', 'chars', 'relations'):
    print((allimg_drv[field] != {}).sum())

# Frequency of each distinct annotation_log content (as a tuple of items).
log_signatures = allimg_drv['annotation_log'].apply(lambda log: tuple(log.items()))
print(log_signatures.value_counts())


0
0
0
((worker, NamHyeok), (timestamp, 2022-12-07), (tool_version, ), (source, None))    961
Name: annotation_log, dtype: int64


In [10]:
## The paragraphs, chars, relations and annotation_log levels exist only in drv
## and carry no meaningful values.
## Keep them in the original file, but remove them from the unified format.

In [11]:
# Restrict both datasets to the per-image fields they have in common:
# img_h, img_w, words, tags, license_tag.
common_fields = ['img_h', 'img_w', 'words', 'tags', 'license_tag']

com_icd = pd.DataFrame(icd['images']).T[common_fields]
com_drv = pd.DataFrame(drv['images']).T[common_fields]

In [12]:
## 1. img_h, img_w
# Distinct heights and widths: ICDAR first, then Upstage.
for frame in (com_icd, com_drv):
    for dimension in ('img_h', 'img_w'):
        print(frame[dimension].unique())

## Nothing unusual.

[2448 2268 3456 1836 2328 4128 2592 3096 3144 1440 1936 1080 3120 3024
 1709 3984 1794 2340 2250 1755 1830 1452 2000 2067 3264 1742]
[1836 2268 2592 3144 3096 2448 3888 4128 3456 3024 2328 1080 1440 2340
 2277 2238 2391 3120 3372 1830 3264 1944 2992 2147]
[1440 3265 700 1212 4032 3024 4618 1098 1880 9248 2268 6936 2158 3097 4608
 3456 2243 540 720 1080 898 932 1920 2956 784 1205 455 487 364 531 960 810
 360 769 351 1504 1696 2774 1772 1012 1172 1128 1280 2160 2220 1334]
[1440 4898 618 991 3024 4032 3464 1752 1060 6936 2268 9248 2323 3456 4608
 3988 960 1920 1080 3941 1439 677 770 1437 1280 682 810 540 480 720 590
 409 1128 2261 2773 3840 2280 1242 2160 467 1334]


In [13]:
## 2. words
def _words_table(words_by_image):
    """Flatten a per-image Series of word dicts into one row per word.

    Each element of ``words_by_image`` is converted to a DataFrame, the frames
    are concatenated column-wise under their Series labels, and the result is
    transposed so that every word becomes a row.
    """
    per_image = {image: pd.DataFrame(words) for image, words in words_by_image.items()}
    return pd.concat(per_image, axis=1).T


ann_icd = _words_table(com_icd['words'])
ann_drv = _words_table(com_drv['words'])

print(ann_icd.columns)
print(ann_drv.columns)

Index(['points', 'transcription', 'language', 'illegibility', 'orientation',
       'word_tags'],
      dtype='object')
Index(['transcription', 'points', 'orientation', 'language', 'tags',
       'confidence', 'illegibility'],
      dtype='object')


In [14]:
## 2-1. words: word_tags, tags, confidence (fields that differ between datasets)

# ICDAR: number of words with a non-null word_tags value.
print(ann_icd['word_tags'].notnull().sum())
## every word_tags value in ann_icd is null (drop or ignore)

# Upstage: number of words with a non-null confidence value.
print(ann_drv['confidence'].notnull().sum())
## every confidence value in ann_drv is null (drop or ignore)

# Words whose first tag is 'excluded-region'.
# NOTE(review): only the first tag of each word is inspected, as before;
# confirm that 'excluded-region' never appears at a later position.
has_tags = ann_drv['tags'].apply(len) != 0
first_tag = ann_drv.loc[has_tags, 'tags'].apply(lambda tags: tags[0])
exc_reg_list = first_tag[first_tag == 'excluded-region'].index

# Words without a transcription.
no_trans_list = ann_drv.index[ann_drv['transcription'].isnull()]

# Print every word that belongs to exactly one of the two groups. Comparing
# the index sets (instead of walking both lists by position) cannot raise an
# IndexError when the lengths differ and does not cascade after one mismatch.
for label in exc_reg_list.symmetric_difference(no_trans_list):
    print(label)
print(len(exc_reg_list), len(no_trans_list))
## In ann_drv, 'excluded-region' tags coincide exactly with transcription == None
## (recorded run: 2273 vs 2273, no mismatch printed).
## Dropping the words whose transcription is None therefore also takes care of
## the 'excluded-region' tag.

0
0
2273 2273


In [15]:
## 2-2. words: points, transcription, language, illegibility, orientation (shared fields)

print('-'*30, '<words _ points>', sep='\n', end='\n\n')
## 2-2-1. words _ points
# ICDAR: number of non-list values, then the distribution of points per word.
print((ann_icd['points'].apply(type) != list).sum())
print(ann_icd['points'].apply(len).value_counts())
## every points value is a list
## no empty values; every word has exactly 4 points

print("-------------------------------------------------------------")
# Upstage: same checks, plus the total number of words covered by the counts.
print((ann_drv['points'].apply(type) != list).sum())
drv_point_counts = ann_drv['points'].apply(len).value_counts()
print(sum(drv_point_counts))
print(ann_drv['points'].apply(len).value_counts())
## every points value is a list
## NOTE(review): the recorded output shows only 4-point polygons (22537 words),
## while the original note here said the point count varies (always even) --
## confirm which input file produced the recorded output.

------------------------------
<words _ points>

0
4    4078
Name: points, dtype: int64
-------------------------------------------------------------
0
22537
4    22537
Name: points, dtype: int64


In [16]:
print('-'*30, '<words _ transcription, illegibility>', sep='\n', end='\n\n')
## 2-2-2. words _ transcription, illegibility

# --- ICDAR ---
print(ann_icd['illegibility'].unique())
none_ann_list = ann_icd[ann_icd['transcription'] == '###'].index
# `== True` is kept on purpose: the column holds Python objects, not a bool dtype.
ille_ann_list = ann_icd[ann_icd['illegibility'] == True].index
# Print every word that is in exactly one of the two groups. Comparing the
# index sets (instead of walking both lists by position) cannot raise an
# IndexError when the lengths differ and does not cascade after one mismatch.
for label in none_ann_list.symmetric_difference(ille_ann_list):
    print(label)
print(len(none_ann_list), len(ille_ann_list))
## illegibility only holds True / False
## words transcribed as '###' are exactly the words with illegibility == True
## (so looking at transcription alone is probably enough)

# --- Upstage ---
print(ann_drv['illegibility'].unique())
print(ann_drv[ann_drv['illegibility'] == True]['transcription'].unique())
## illegibility only holds True / False
## illegible words include those with transcription None; per the author's
## inspection the remaining ones are barely readable (can be ignored)

------------------------------
<words _ transcription, illegibility>

[False True]
470 470
[False True]
[None '매일채움견과' 'ARK' '연' '세' '대' '학' '1885' 'EE' 'ion' 'an' 'SAVE ZONE'
 'CHIANTI' '칙' 'One' '건물' '통행에' '이륜차' ' ' '' 'TWO' 'HAN' 'mation' '편의시설'
 '니는' '물처럼' '<UNK>' '한' '칼' '전문금융' '100억원' '이내' '8' 'MISSI' '매매' '임대'
 'Carlsberg ' '치과' '서울바른후치과의원' '바른후치과' '박애별산부인과' '국민은행' 'KB' '지하' '헤'
 '망고 ' '스' '<UNK><UNK>' '사' '곱창' '공']


In [17]:
print('-'*30, '<words _ language>', sep='\n', end='\n\n')
## 2-2-3. words _ language
icd_languages = ann_icd['language']
print((icd_languages.apply(len) != 1).sum())
print(icd_languages.apply(lambda langs: langs[0]).value_counts())
## ICDAR: language is ko or en, never more than one value per word
drv_languages = ann_drv['language']
print(drv_languages.isnull().sum())
print(drv_languages[drv_languages.notnull()].apply(tuple).value_counts())
## Upstage: language is null or a combination of ko, en, others (multi-valued entries exist)


print('\n', '-'*30, '<words _ orientation>', sep='\n', end='\n\n')
## 2-2-4. words _ orientation
# ICDAR first (every word is Horizontal), then Upstage
# (Horizontal, Vertical and Irregular all occur).
for frame in (ann_icd, ann_drv):
    print(frame['orientation'].value_counts())

------------------------------
<words _ language>

0
ko    2946
en    1132
Name: language, dtype: int64
5283
(en,)               8509
(ko,)               8067
(others,)            350
(en, others)         170
(ko, en)             120
(ko, others)          36
(ko, en, others)       2
Name: language, dtype: int64


------------------------------
<words _ orientation>

Horizontal    4078
Name: orientation, dtype: int64
Horizontal    19855
Vertical        368
Irregular        41
Name: orientation, dtype: int64


In [18]:
## 3. tags, license_tag

# ICDAR: number of images with a non-null tags value.
print(com_icd['tags'].notnull().sum())
# Upstage: frequency of each tag combination, then the share of tagged images.
drv_image_tags = com_drv['tags']
print(drv_image_tags.apply(tuple).value_counts())
print((drv_image_tags.apply(len) != 0).sum() / len(com_drv))
## com_icd only holds nulls
## about 8% of com_drv images carry tags; probably not worth training on,
## but they may be useful for error analysis

print('\n', '-'*30)
# Frequency of each distinct license_tag content, ICDAR first and then Upstage.
for frame in (com_icd, com_drv):
    print(frame['license_tag'].apply(lambda tag: tuple(tag.items())).value_counts())
## license_tag carries nothing noteworthy (can probably be ignored)

0
()                         881
(handwriting,)              25
(outfocus,)                 24
(document,)                 14
(noisy,)                    13
(noisy, outfocus)            2
(handwriting, note)          1
(handwriting, outfocus)      1
Name: tags, dtype: int64
0.08324661810613944

 ------------------------------
((usability, True), (public, True), (commercial, True), (type, CC-BY-SA), (holder, None))    536
Name: license_tag, dtype: int64
((usability, True), (public, False), (commercial, True), (type, None), (holder, Upstage))    961
Name: license_tag, dtype: int64


In [19]:
# Minimal structure
# (outline of the nested keys; each deeper level adds two more '#')

# images
### filename
##### img_h
##### img_w
##### words
####### points
####### transcription
####### language
####### illegibility
####### orientation
##### tags
##### license_tag

### words가 없는 images 제거

In [20]:
# Upstage images whose `words` dict is empty (an empty Series means there are none).
com_drv.loc[com_drv['words'].apply(len) == 0, 'words']

Series([], Name: words, dtype: object)

### point가 4개가 아닌 polygon 확인

In [21]:
# Print the point count of every Upstage word whose polygon is not a 4-point box.
for image in drv['images'].values():
    for word in image['words'].values():
        n_points = len(word['points'])
        if n_points != 4:
            print(n_points)

In [22]:
# Build a cleaned copy of the Upstage annotations; `drv` itself is not modified.
#   1. temp_drv:   drv without the words whose polygon is not a 4-point box
#   2. result_drv: temp_drv without the images that have no words left
import copy

temp_drv = copy.deepcopy(drv)
for image in temp_drv['images'].values():
    image['words'] = {
        word_id: word
        for word_id, word in image['words'].items()
        if len(word['points']) == 4
    }

result_drv = copy.deepcopy(temp_drv)
result_drv['images'] = {
    name: image
    for name, image in result_drv['images'].items()
    if len(image['words']) != 0
}

# Tabular preview of the cleaned result, restricted to the shared per-image fields.
test = pd.DataFrame(result_drv['images']).T[['img_h', 'img_w', 'words', 'tags', 'license_tag']]
test

Unnamed: 0,img_h,img_w,words,tags,license_tag
0F885DC0-3E65-4081-9DBB-CA96BB6FD4FC.JPG,1440,1440,"{'0001': {'transcription': 'BUSKERS', 'points'...",[],"{'usability': True, 'public': False, 'commerci..."
0N8A5655.jpg,3265,4898,"{'0001': {'transcription': '이로운', 'points': [[...",[],"{'usability': True, 'public': False, 'commerci..."
0N8A5660.jpg,3265,4898,"{'0001': {'transcription': '이로운', 'points': [[...",[],"{'usability': True, 'public': False, 'commerci..."
01.jpg,700,618,"{'0001': {'transcription': '단순하지만', 'points': ...",[handwriting],"{'usability': True, 'public': False, 'commerci..."
02.jpg,1212,991,"{'0002': {'transcription': '2F', 'points': [[5...",[],"{'usability': True, 'public': False, 'commerci..."
...,...,...,...,...,...
1632655308620-22.jpg,960,467,"{'0001': {'transcription': None, 'points': [[1...",[],"{'usability': True, 'public': False, 'commerci..."
1632655308620-25.jpg,2220,1080,"{'0001': {'transcription': '읽으면', 'points': [[...",[],"{'usability': True, 'public': False, 'commerci..."
1632655330045-21.jpg,1334,1334,"{'0001': {'transcription': '압구정공주떡', 'points':...",[],"{'usability': True, 'public': False, 'commerci..."
1632655330045-23.jpg,960,467,"{'0001': {'transcription': '페리카나', 'points': [...",[],"{'usability': True, 'public': False, 'commerci..."


### 포인트 4개인 박스만 남았는지 확인

In [23]:
# Sanity check: nothing should be printed, i.e. every remaining word in
# result_drv has exactly 4 points.
for image in result_drv['images'].values():
    for word in image['words'].values():
        n_points = len(word['points'])
        if n_points != 4:
            print(n_points)

### word가 없는 이미지 제거 확인

In [24]:
# Sanity check: nothing should be printed, i.e. no image in result_drv is
# left without words.
for name, image in result_drv['images'].items():
    if not image['words'].keys():
        print(name)

In [25]:
# Write the cleaned annotations to `save_path` (the new_train.json file next to
# the original Upstage train.json) as JSON with a 2-space indent.
with open(save_path, 'w') as out_file:
    out_file.write(json.dumps(result_drv, indent=2))