### KAIST Scene Text Database 전처리기:
1. 데이터 url:http://www.iapr-tc11.org/mediawiki/index.php/KAIST_Scene_Text_Database
1. 압축을 다 풀고 'kaist_dataset'아래에 이미지 데이터와 xml 파일을 두면
1. 각 xml 파일에서 글자하나하나 위치정보를 읽어 원본 이미지에서 글자를 별도의 이미지 파일로 저장하는 작업을 수행한다.
1. 결과물은 char_data 아래에 각 글자명으로 폴더를 만들고 그 아래에 원본파일명.글자명.idx.jpg로 저장
    - char_data/{character}/{img_filename}.{character}.{idx}.jpg
    - character가 .(dot)인 경우에는 {character}값을 dot으로 대체하여 처리
    

In [1]:
%matplotlib inline

import matplotlib.pyplot as plt
import numpy as np
from PIL import Image
import xml.etree.ElementTree

from os import listdir, path, makedirs
from os.path import isfile, isdir, join, basename

img_base_path = 'kaist_dataset'

def plot_img_inline(img):
    plt.figure()
    plt.imshow(np.asarray(img))
    plt.show()

char_count = dict()

def get_unique_img_name(target_path, index_name, char_name):
    idx = 0
    image_filename = '%s.jpg' %('_'.join([index_name, str(idx)]))
    full_path = join(target_path, image_filename)
    
    while path.exists(full_path):
        idx = idx + 1
        image_filename = '%s.jpg' %('_'.join([index_name, str(idx)]))
        full_path = join(target_path, image_filename)
    return full_path

def print_img(img, base_path, index_name, char_name):
    target_path = join(base_path, char_name)
    if not path.exists(target_path):
        makedirs(target_path)

    # 이미지 데이터들이 여러 폴더에 걸쳐있어서 같은 파일명과 같은 글자를 갖는 경우가 많음
    # 뒤에 idx값을 추가해 서로 다른 데이터가 다른 파일명을 갖도록 함.
    full_path = get_unique_img_name(target_path, index_name, char_name)
    img.save(full_path)
    
    if char_name not in char_count:
        char_count[char_name] = 1
    else:
        char_count[char_name] = char_count[char_name] + 1
        
    return full_path, char_name
    
def crop_character_images(base_path, index_name, output_path='char_data'):
    image_filename = '.'.join([index_name, 'jpg'])
    xml_filename = '.'.join([index_name, 'xml'])
    
    # xml 파일에 확장자를 빠트린 경우가 많음. xml 파일이 없는 경우에는 xml파일명을 확장자를 제외한 것으로 설정
    if path.exists(join(base_path, xml_filename)) is False:
        xml_filename = index_name
    im = Image.open(join(base_path, image_filename))
    
#     plot_img_inline(im)
    e = xml.etree.ElementTree.parse(join(base_path, xml_filename)).getroot()

    char_result = []
    for image in e:
        for words in image:
            if words.tag == 'words':
                for word in words:
                    for char in word:
                        if len(char.attrib['char']) != 1:
                            continue
                            
                        # 글자가 .인경우 dot으로 대체
                        if char.attrib['char'] == '.':
                            char.attrib['char'] = 'dot'
                            
                        # box 영역으로 crop
                        box = (int(char.attrib['x']), int(char.attrib['y']), int(char.attrib['x']) + int(char.attrib['width']), int(char.attrib['y']) + int(char.attrib['height']))
                        region = im.crop(box)
#                         plot_img_inline(region)

                        # 파일로 저장
                        (full_path, char_name) = print_img(region, output_path, index_name, char.attrib['char'])
                        char_result.append({
                            'path': full_path,
                            'char': char_name,
                            'x1': int(char.attrib['x']),
                            'y1': int(char.attrib['y']),
                            'x2': int(char.attrib['x']) + int(char.attrib['width']),
                            'y2': int(char.attrib['y']) + int(char.attrib['height']),
                            'width': int(char.attrib['width']),
                            'height': int(char.attrib['height']),
                            'seg_areas': int(char.attrib['width']) * int(char.attrib['height']),
                            'origin_path': join(base_path, image_filename)
                        })
    return char_result

from os import listdir, path, makedirs
from os.path import isfile, isdir, join, basename

img_base_path = 'kaist_dataset'

fail_filenames = []

char_data_list = []
def get_recursive_dirlist(base_path):
    global char_data_list
    
    # dirname을 얻어옴
    sub_dirnames = [f for f in listdir(base_path) if isdir(join(base_path, f))]
    for dirname in sub_dirnames:
        # 각 dirname별로  recursive 호출
        next_base_path = join(base_path, dirname)
        get_recursive_dirlist(next_base_path)
        
        # 현재 dir에 jpg파일이 있는지 확인
        # KAIST데이터셋은 xxx.jpg, xxx.xml, xxx.bmp 형식으로 있기 때문에, 
        # jpg파일 목록을 가져와서 이미지 데이터 목록을 확보
        data_filenames = [path.splitext(f)[0] for f in listdir(next_base_path) if isfile(join(next_base_path, f)) and f.lower().endswith('.jpg')]        
        for x in data_filenames:
            try:
                char_result = crop_character_images(next_base_path, x)
                char_data_list = char_data_list + char_result
            except:
                # 데이터 형식이 잘못된 경우가 많아 exception이 발생한 경우는 그냥 버림. 대신 뭐가 문제였는지만 출력
                print('###EXCEPTION', next_base_path, x)
                fail_filenames.append((next_base_path, x))
    
get_recursive_dirlist(img_base_path)

char_data_list

###EXCEPTION kaist_dataset/English/Digital_Camera/(E.S)F-others 4
###EXCEPTION kaist_dataset/Korean/Digital_Camera/E-night 3
###EXCEPTION kaist_dataset/Korean/Digital_Camera/G-others 080116-0079
###EXCEPTION kaist_dataset/Korean/Digital_Camera/indoor1 080119-0001
###EXCEPTION kaist_dataset/Korean/Digital_Camera/outdoor1 99
###EXCEPTION kaist_dataset/Korean/Digital_Camera/outdoor2 080119-0031
###EXCEPTION kaist_dataset/Korean/Digital_Camera/outdoor2 2007-12-08-day-122
###EXCEPTION kaist_dataset/Mixed/Digital_Camera/(C.S)C-outdoor1 063
###EXCEPTION kaist_dataset/Mixed/Digital_Camera/(C.S)C-outdoor1 080119-0003


[{'char': 'T',
  'height': 105,
  'origin_path': 'kaist_dataset/English/Digital_Camera/(E.S)A-shadow/DSC02317.jpg',
  'path': 'char_data/T/DSC02317_2.jpg',
  'seg_areas': 6615,
  'width': 63,
  'x1': 101,
  'x2': 164,
  'y1': 225,
  'y2': 330},
 {'char': 'R',
  'height': 94,
  'origin_path': 'kaist_dataset/English/Digital_Camera/(E.S)A-shadow/DSC02317.jpg',
  'path': 'char_data/R/DSC02317_2.jpg',
  'seg_areas': 5546,
  'width': 59,
  'x1': 165,
  'x2': 224,
  'y1': 231,
  'y2': 325},
 {'char': 'E',
  'height': 109,
  'origin_path': 'kaist_dataset/English/Digital_Camera/(E.S)A-shadow/DSC02317.jpg',
  'path': 'char_data/E/DSC02317_2.jpg',
  'seg_areas': 6322,
  'width': 58,
  'x1': 218,
  'x2': 276,
  'y1': 228,
  'y2': 337},
 {'char': 'N',
  'height': 97,
  'origin_path': 'kaist_dataset/English/Digital_Camera/(E.S)A-shadow/DSC02317.jpg',
  'path': 'char_data/N/DSC02317_2.jpg',
  'seg_areas': 6208,
  'width': 64,
  'x1': 271,
  'x2': 335,
  'y1': 227,
  'y2': 324},
 {'char': 'D',
  'heig

In [18]:
import pandas as pd

dataDF = pd.DataFrame(char_data_list)
dataDF['isTrain'] = True

charList = sorted(dataDF['char'].unique())



dataDF['label'] = dataDF['char'].apply(lambda char: charList.index(char) + 1)

print(sorted(dataDF['label'].unique()))

dataDF

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 22

Unnamed: 0,char,height,origin_path,path,seg_areas,width,x1,x2,y1,y2,isTrain,label
0,T,105,kaist_dataset/English/Digital_Camera/(E.S)A-sh...,char_data/T/DSC02317_2.jpg,6615,63,101,164,225,330,True,43
1,R,94,kaist_dataset/English/Digital_Camera/(E.S)A-sh...,char_data/R/DSC02317_2.jpg,5546,59,165,224,231,325,True,41
2,E,109,kaist_dataset/English/Digital_Camera/(E.S)A-sh...,char_data/E/DSC02317_2.jpg,6322,58,218,276,228,337,True,28
3,N,97,kaist_dataset/English/Digital_Camera/(E.S)A-sh...,char_data/N/DSC02317_2.jpg,6208,64,271,335,227,324,True,37
4,D,89,kaist_dataset/English/Digital_Camera/(E.S)A-sh...,char_data/D/DSC02317_2.jpg,7031,79,328,407,223,312,True,27
5,I,93,kaist_dataset/English/Digital_Camera/(E.S)A-sh...,char_data/I/DSC02317_2.jpg,3255,35,405,440,224,317,True,32
6,C,105,kaist_dataset/English/Digital_Camera/(E.S)A-sh...,char_data/C/DSC02317_2.jpg,7875,75,439,514,202,307,True,26
7,A,113,kaist_dataset/English/Digital_Camera/(E.S)A-sh...,char_data/A/DSC02317_2.jpg,9492,84,518,602,205,318,True,24
8,M,85,kaist_dataset/English/Digital_Camera/(E.S)A-sh...,char_data/M/DSC02423_2.jpg,6375,75,0,75,149,234,True,36
9,O,78,kaist_dataset/English/Digital_Camera/(E.S)A-sh...,char_data/O/DSC02423_4.jpg,6318,81,82,163,148,226,True,38


In [15]:
dataDF['origin_path'].unique()[18]

'kaist_dataset/English/Digital_Camera/(E.S)B-light/080116-0059.jpg'

In [19]:
for i, k in enumerate(dataDF['origin_path'].unique()):
    if i != 18:
        continue
    data = dataDF.loc[dataDF['origin_path'] == k]
    print(k, len(data))
    for ix, obj in enumerate(data.iterrows()):
        print(ix, float(obj[1]['x1']) - 1, obj[1]['label'])
        
    break

kaist_dataset/English/Digital_Camera/(E.S)B-light/080116-0059.jpg 299
0 136.0 25
1 309.0 38
2 536.0 27
3 716.0 48
4 926.0 42
5 1062.0 31
6 1241.0 38
7 1399.0 39
8 444.0 36
9 580.0 24
10 682.0 27
11 776.0 28
12 892.0 46
13 1030.0 32
14 1058.0 43
15 1137.0 31
16 440.0 39
17 552.0 24
18 694.0 42
19 813.0 42
20 931.0 1
21 983.0 38
22 1101.0 37
23 441.0 606
24 468.0 339
25 562.0 597
26 585.0 417
27 603.0 850
28 535.0 211
29 491.0 370
30 517.0 250
31 673.0 572
32 693.0 476
33 713.0 211
34 630.0 484
35 651.0 465
36 736.0 564
37 757.0 275
38 780.0 891
39 804.0 241
40 826.0 571
41 847.0 633
42 872.0 828
43 892.0 778
44 912.0 848
45 931.0 114
46 952.0 507
47 972.0 667
48 991.0 319
49 1008.0 608
50 1030.0 461
51 1052.0 629
52 1075.0 460
53 1093.0 552
54 1111.0 629
55 518.0 231
56 543.0 390
57 563.0 262
58 586.0 473
59 606.0 628
60 732.0 872
61 754.0 675
62 779.0 807
63 798.0 803
64 818.0 339
65 843.0 589
66 864.0 636
67 885.0 471
68 906.0 459
69 928.0 645
70 961.0 632
71 944.0 259
72 987.0 221
73

In [22]:
import pandas as pd

dataDF = pd.DataFrame(char_data_list)
dataDF['isTrain'] = True




charCount = dataDF['char'].value_counts()
targetChars = charCount.loc[charCount > 100]

datasetDFList = []
for char in targetChars.index:
    charDF = dataDF.loc[dataDF['char'] == char]
    
    train_idx = int(len(charDF) * 0.8)
    
    datasetDFList.append(charDF)
    charDF.iloc[:train_idx].loc[:, 'isTrain'] = True
    charDF.iloc[train_idx:].loc[:, 'isTrain'] = False
    
#     trainCharDF = charDF[:train_idx]
#     testCharDF = charDF[train_idx:]
#     trainCharDF['isTrain'] = True
#     testCharDF['isTrain'] = False
#     datasetDFList.append(trainCharDF)
#     datasetDFList.append(testCharDF)
    
#     print(char, len(charDF), len(charDF[:train_idx]), len(charDF[train_idx:]))

datasetDF = pd.concat(datasetDFList)
datasetDF

charList = sorted(datasetDF['char'].unique())

datasetDF['label'] = datasetDF['char'].apply(lambda char: charList.index(char) + 1)

datasetDF.to_csv('kaist_rcnn.csv', index=False)

trainTestDFList = []

for char in targetChars.index:
    charSampleDF = pd.concat([
        datasetDF.loc[(datasetDF['char'] == char) & (datasetDF['isTrain'] == True)].sample(n=800, replace=True),
        datasetDF.loc[(datasetDF['char'] == char) & (datasetDF['isTrain'] == False)].sample(n=200, replace=True)
    ])
    trainTestDFList.append(charSampleDF)
    
trainTestDF = pd.concat(trainTestDFList)

# shuffle data rows
trainTestDF = trainTestDF.sample(frac=1).reset_index(drop=True)

trainTestDF.to_csv('kaist_dataset.csv')
trainTestDF.loc[trainTestDF['isTrain'] == True].to_csv('kaist_dataset_train.csv', header=False, index=False)
trainTestDF.loc[trainTestDF['isTrain'] == False].to_csv('kaist_dataset_test.csv', header=False, index=False)

trainTestDF

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


Unnamed: 0,char,height,origin_path,path,seg_areas,width,x1,x2,y1,y2,isTrain,label
0,E,121,kaist_dataset/English/Digital_Camera/(E.S)D-in...,char_data/E/DSC03911_4.jpg,8349,69,200,269,322,443,True,16
1,t,31,kaist_dataset/Mixed/Digital_Camera/(C.S)C-outd...,char_data/t/DSC04069_11.jpg,527,17,430,447,376,407,True,44
2,점,148,kaist_dataset/Mixed/Digital_Camera/(C.S)A-shad...,char_data/점/DSC03207_2.jpg,16872,114,142,256,39,187,True,80
3,기,101,kaist_dataset/Korean/Digital_Camera/outdoor7/P...,char_data/기/P1010021_2.jpg,9898,98,524,622,144,245,True,53
4,시,56,kaist_dataset/Korean/Digital_Camera/outdoor3/D...,char_data/시/DSC02546_16.jpg,2856,51,298,349,20,76,True,70
5,트,116,kaist_dataset/Korean/Digital_Camera/outdoor3/D...,char_data/트/DSC02953_2.jpg,15080,130,434,564,156,272,True,83
6,M,70,kaist_dataset/English/Digital_Camera/(E.S)C-ou...,char_data/M/크기변환_P1010028_2.jpg,4690,67,285,352,163,233,True,23
7,7,32,kaist_dataset/Mixed/Digital_Camera/(C.S)E-nigh...,char_data/7/DSC03840_4.jpg,416,13,520,533,208,240,False,9
8,R,42,kaist_dataset/Korean/Digital_Camera/A-shadow/2...,char_data/R/2007-12-09-day-exterior-031_7.jpg,1092,26,428,454,606,648,True,27
9,화,85,kaist_dataset/Mixed/Digital_Camera/(C.S)C-outd...,char_data/화/DSC04092_2.jpg,7055,83,189,272,176,261,True,87


In [11]:
len(trainTestDF.loc[trainTestDF['isTrain'] == False])

17400

In [25]:
np.log(1)

0.0

# Kaist dataset preprocessing for Faster-RCNN
1. 