### KAIST Scene Text Database 전처리기:
1. 데이터 url:http://www.iapr-tc11.org/mediawiki/index.php/KAIST_Scene_Text_Database
1. 압축을 다 풀고 'kaist_dataset'아래에 이미지 데이터와 xml 파일을 두면
1. 각 xml 파일에서 글자하나하나 위치정보를 읽어 원본 이미지에서 글자를 별도의 이미지 파일로 저장하는 작업을 수행한다.
1. 결과물은 char_data 아래에 각 글자명으로 폴더를 만들고 그 아래에 원본파일명.글자명.idx.jpg로 저장
    - char_data/{character}/{img_filename}.{character}.{idx}.jpg
    - character가 .(dot)인 경우에는 {character}값을 dot으로 대체하여 처리
    

In [62]:
%matplotlib inline

import matplotlib.pyplot as plt
import numpy as np
from PIL import Image
import xml.etree.ElementTree

from os import listdir, path, makedirs
from os.path import isfile, isdir, join, basename

img_base_path = 'kaist_dataset'

def plot_img_inline(img):
    plt.figure()
    plt.imshow(np.asarray(img))
    plt.show()

char_count = dict()

def get_unique_img_name(target_path, index_name, char_name):
    idx = 0
    image_filename = '%s.jpg' %('_'.join([index_name, str(idx)]))
    full_path = join(target_path, image_filename)
    
    while path.exists(full_path):
        idx = idx + 1
        image_filename = '%s.jpg' %('_'.join([index_name, str(idx)]))
        full_path = join(target_path, image_filename)
    return full_path

def print_img(img, base_path, index_name, char_name):
    target_path = join(base_path, char_name)
    if not path.exists(target_path):
        makedirs(target_path)

    # 이미지 데이터들이 여러 폴더에 걸쳐있어서 같은 파일명과 같은 글자를 갖는 경우가 많음
    # 뒤에 idx값을 추가해 서로 다른 데이터가 다른 파일명을 갖도록 함.
    full_path = get_unique_img_name(target_path, index_name, char_name)
    img.save(full_path)
    
    if char_name not in char_count:
        char_count[char_name] = 1
    else:
        char_count[char_name] = char_count[char_name] + 1
        
    return full_path, char_name
    
def crop_character_images(base_path, index_name, output_path='char_data'):
    image_filename = index_name
    xml_filename = '.'.join([path.splitext(index_name)[0], 'xml'])
    
    # xml 파일에 확장자를 빠트린 경우가 많음. xml 파일이 없는 경우에는 xml파일명을 확장자를 제외한 것으로 설정
    if path.exists(join(base_path, xml_filename)) is False:
        xml_filename = index_name
    im = Image.open(join(base_path, image_filename))
    
#     plot_img_inline(im)
    e = xml.etree.ElementTree.parse(join(base_path, xml_filename)).getroot()

    char_result = []
    for image in e:
        for words in image:
            if words.tag == 'words':
                for word in words:
                    for char in word:
                        if len(char.attrib['char']) != 1:
                            continue
                            
                        # 글자가 .인경우 dot으로 대체
                        if char.attrib['char'] == '.':
                            char.attrib['char'] = 'dot'
                            
                        # box 영역으로 crop
                        box = (int(char.attrib['x']), int(char.attrib['y']), int(char.attrib['x']) + int(char.attrib['width']), int(char.attrib['y']) + int(char.attrib['height']))
                        region = im.crop(box)
#                         plot_img_inline(region)

                        width, height = im.size

                        # 파일로 저장
                        (full_path, char_name) = print_img(region, output_path, index_name, char.attrib['char'])
                        char_result.append({
                            'src_width': width,
                            'src_height': height,
                            'path': full_path,
                            'char': char_name,
                            'x1': int(char.attrib['x']),
                            'y1': int(char.attrib['y']),
                            'x2': int(char.attrib['x']) + int(char.attrib['width']),
                            'y2': int(char.attrib['y']) + int(char.attrib['height']),
                            'width': int(char.attrib['width']),
                            'height': int(char.attrib['height']),
                            'seg_areas': int(char.attrib['width']) * int(char.attrib['height']),
                            'origin_path': join(base_path, image_filename)
                        })
    return char_result

from os import listdir, path, makedirs
from os.path import isfile, isdir, join, basename

img_base_path = 'kaist_dataset'

fail_filenames = []

char_data_list = []
def get_recursive_dirlist(base_path):
    global char_data_list
    
    # dirname을 얻어옴
    sub_dirnames = [f for f in listdir(base_path) if isdir(join(base_path, f))]
    for dirname in sub_dirnames:
        # 각 dirname별로  recursive 호출
        next_base_path = join(base_path, dirname)
        get_recursive_dirlist(next_base_path)
        
        # 현재 dir에 jpg파일이 있는지 확인
        # KAIST데이터셋은 xxx.jpg, xxx.xml, xxx.bmp 형식으로 있기 때문에, 
        # jpg파일 목록을 가져와서 이미지 데이터 목록을 확보
        data_filenames = [f for f in listdir(next_base_path) if isfile(join(next_base_path, f)) and f.lower().endswith('.jpg')]        
        for x in data_filenames:
            try:
                char_result = crop_character_images(next_base_path, x)
                char_data_list = char_data_list + char_result
            except:
                # 데이터 형식이 잘못된 경우가 많아 exception이 발생한 경우는 그냥 버림. 대신 뭐가 문제였는지만 출력
                print('###EXCEPTION', next_base_path, x)
                fail_filenames.append((next_base_path, x))
    
get_recursive_dirlist(img_base_path)

char_data_list

###EXCEPTION kaist_dataset/English/Digital_Camera/(E.S)F-others 4.jpg
###EXCEPTION kaist_dataset/Korean/Digital_Camera/E-night 3.jpg
###EXCEPTION kaist_dataset/Korean/Digital_Camera/G-others 080116-0079.jpg
###EXCEPTION kaist_dataset/Korean/Digital_Camera/G-others 1.JPG
###EXCEPTION kaist_dataset/Korean/Digital_Camera/G-others 2.jpg
###EXCEPTION kaist_dataset/Korean/Digital_Camera/G-others 27.JPG
###EXCEPTION kaist_dataset/Korean/Digital_Camera/G-others 33.JPG
###EXCEPTION kaist_dataset/Korean/Digital_Camera/G-others 35.JPG
###EXCEPTION kaist_dataset/Korean/Digital_Camera/G-others 38.JPG
###EXCEPTION kaist_dataset/Korean/Digital_Camera/G-others 41.JPG
###EXCEPTION kaist_dataset/Korean/Digital_Camera/indoor1 080119-0001.jpg
###EXCEPTION kaist_dataset/Korean/Digital_Camera/outdoor1 11.JPG
###EXCEPTION kaist_dataset/Korean/Digital_Camera/outdoor1 12.JPG
###EXCEPTION kaist_dataset/Korean/Digital_Camera/outdoor1 14.JPG
###EXCEPTION kaist_dataset/Korean/Digital_Camera/outdoor1 16.JPG
###EXCE

[{'char': 'T',
  'height': 105,
  'origin_path': 'kaist_dataset/English/Digital_Camera/(E.S)A-shadow/DSC02317.JPG',
  'path': 'char_data/T/DSC02317.JPG_2.jpg',
  'seg_areas': 6615,
  'src_height': 480,
  'src_width': 640,
  'width': 63,
  'x1': 101,
  'x2': 164,
  'y1': 225,
  'y2': 330},
 {'char': 'R',
  'height': 94,
  'origin_path': 'kaist_dataset/English/Digital_Camera/(E.S)A-shadow/DSC02317.JPG',
  'path': 'char_data/R/DSC02317.JPG_2.jpg',
  'seg_areas': 5546,
  'src_height': 480,
  'src_width': 640,
  'width': 59,
  'x1': 165,
  'x2': 224,
  'y1': 231,
  'y2': 325},
 {'char': 'E',
  'height': 109,
  'origin_path': 'kaist_dataset/English/Digital_Camera/(E.S)A-shadow/DSC02317.JPG',
  'path': 'char_data/E/DSC02317.JPG_2.jpg',
  'seg_areas': 6322,
  'src_height': 480,
  'src_width': 640,
  'width': 58,
  'x1': 218,
  'x2': 276,
  'y1': 228,
  'y2': 337},
 {'char': 'N',
  'height': 97,
  'origin_path': 'kaist_dataset/English/Digital_Camera/(E.S)A-shadow/DSC02317.JPG',
  'path': 'char_

In [78]:
import pandas as pd

dataDF = pd.DataFrame(char_data_list)
dataDF['isTrain'] = True

charCount = dataDF['char'].value_counts()
targetChars = charCount.loc[charCount > 100]

datasetDFList = []
for char in targetChars.index:
    charDF = dataDF.loc[dataDF['char'] == char]
    
    train_idx = int(len(charDF) * 0.8)
    
    datasetDFList.append(charDF)
    charDF.iloc[:train_idx].loc[:, 'isTrain'] = True
    charDF.iloc[train_idx:].loc[:, 'isTrain'] = False
    
#     trainCharDF = charDF[:train_idx]
#     testCharDF = charDF[train_idx:]
#     trainCharDF['isTrain'] = True
#     testCharDF['isTrain'] = False
#     datasetDFList.append(trainCharDF)
#     datasetDFList.append(testCharDF)
    
#     print(char, len(charDF), len(charDF[:train_idx]), len(charDF[train_idx:]))

datasetDF = pd.concat(datasetDFList)
datasetDF

charList = sorted(datasetDF['char'].unique())

datasetDF['label'] = datasetDF['char'].apply(lambda char: charList.index(char) + 1)

datasetDF.loc[(datasetDF['x1'] == 0), 'x1'] = 1
datasetDF.loc[(datasetDF['y1'] == 0), 'y1'] = 1
datasetDF.loc[(datasetDF['x2'] >= datasetDF['src_width']), 'x2'] = datasetDF.loc[(datasetDF['x2'] >= datasetDF['src_width']), 'src_width'] - 1
datasetDF.loc[(datasetDF['y2'] >= datasetDF['src_height']), 'y2'] = datasetDF.loc[(datasetDF['y2'] >= datasetDF['src_height']), 'src_height'] - 1

datasetDF = datasetDF.loc[(datasetDF['x1'] < datasetDF['x2']) & (datasetDF['y1'] < datasetDF['y2'])]

datasetDF.to_csv('kaist_rcnn.csv', index=False)

trainTestDFList = []

for char in targetChars.index:
    charSampleDF = pd.concat([
        datasetDF.loc[(datasetDF['char'] == char) & (datasetDF['isTrain'] == True)].sample(n=800, replace=True),
        datasetDF.loc[(datasetDF['char'] == char) & (datasetDF['isTrain'] == False)].sample(n=200, replace=True)
    ])
    trainTestDFList.append(charSampleDF)
    
trainTestDF = pd.concat(trainTestDFList)

# shuffle data rows
trainTestDF = trainTestDF.sample(frac=1).reset_index(drop=True)

trainTestDF.to_csv('kaist_dataset.csv')
trainTestDF.loc[trainTestDF['isTrain'] == True].to_csv('kaist_dataset_train.csv', header=False, index=False)
trainTestDF.loc[trainTestDF['isTrain'] == False].to_csv('kaist_dataset_test.csv', header=False, index=False)

trainTestDF

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


Unnamed: 0,char,height,origin_path,path,seg_areas,src_height,src_width,width,x1,x2,y1,y2,isTrain,label
0,y,82,kaist_dataset/English/Digital_Camera/(E.S)C-ou...,char_data/y/DSC04437.JPG_11.jpg,4100,480,640,50,258,308,118,200,True,47
1,O,42,kaist_dataset/English/Digital_Camera/(E.S)F-ot...,char_data/O/2007-12-09-day-exterior-047.jpg_10...,1134,1200,1600,27,663,690,1070,1112,True,25
2,라,39,kaist_dataset/Mixed/Digital_Camera/(C.S)C-outd...,char_data/라/DSC02350.JPG_2.jpg,1521,480,640,39,520,559,351,390,True,58
3,-,18,kaist_dataset/Mixed/Digital_Camera/(C.S)C-outd...,char_data/-/081.JPG_2.jpg,324,480,640,18,274,292,265,283,True,1
4,T,40,kaist_dataset/Mixed/Digital_Camera/(C.S)C-outd...,char_data/T/DSC02938.JPG_2.jpg,1240,480,640,31,381,412,240,280,True,29
5,부,32,kaist_dataset/English/Digital_Camera/(E.S)B-li...,char_data/부/080116-0059.jpg_7.jpg,608,1200,1600,19,565,584,996,1028,True,62
6,D,100,kaist_dataset/Mixed/Digital_Camera/(C.S)A-shad...,char_data/D/DSC02609.JPG_12.jpg,7800,480,640,78,275,353,191,291,True,15
7,d,49,kaist_dataset/English/Digital_Camera/(E.S)D-in...,char_data/d/080119-0009.jpg_13.jpg,980,1200,1600,20,657,677,1028,1077,True,33
8,전,30,kaist_dataset/Korean/Digital_Camera/outdoor1/3...,char_data/전/39.jpg_2.jpg,750,197,320,25,186,211,39,69,True,79
9,N,24,kaist_dataset/English/Digital_Camera/(E.S)C-ou...,char_data/N/DSC02500.JPG_12.jpg,480,480,640,20,1,20,82,106,True,24


In [46]:
datasetDF.apply(lambda x: x['x1'] > 0 and x['x2'] < x['src_width'] and x['y1'] > 0 and x['y2'] < x['src_height'] and x['x1'] < x['x2'] and x['y1'] < x['y2'] , axis=1).unique()

array([ True, False], dtype=bool)

In [47]:
datasetDF.loc[(datasetDF['x1'] > 0) & (datasetDF['x2'] < datasetDF['src_width']) & (datasetDF['y1'] > 0) & (datasetDF['y2'] < datasetDF['src_height']) & (datasetDF['x1'] < datasetDF['x2']) & (datasetDF['y1'] < datasetDF['y2'])]

Unnamed: 0,char,height,origin_path,path,seg_areas,src_height,src_width,width,x1,x2,y1,y2,isTrain,label
766,8,24,kaist_dataset/English/Digital_Camera/(E.S)C-ou...,char_data/8/042.JPG_1.jpg,456,480,640,19,485,504,321,345,True,10
812,8,27,kaist_dataset/English/Digital_Camera/(E.S)C-ou...,char_data/8/056.JPG_1.jpg,432,480,640,16,513,529,249,276,True,10
902,8,32,kaist_dataset/English/Digital_Camera/(E.S)C-ou...,char_data/8/077.JPG_1.jpg,544,480,640,17,276,293,248,280,True,10
1455,8,37,kaist_dataset/English/Digital_Camera/(E.S)C-ou...,char_data/8/DSC02364.JPG_3.jpg,518,480,640,14,325,339,350,387,True,10
1460,8,36,kaist_dataset/English/Digital_Camera/(E.S)C-ou...,char_data/8/DSC02364.JPG_4.jpg,756,480,640,21,486,507,353,389,True,10
1463,8,34,kaist_dataset/English/Digital_Camera/(E.S)C-ou...,char_data/8/DSC02364.JPG_5.jpg,544,480,640,16,549,565,353,387,True,10
1554,8,44,kaist_dataset/English/Digital_Camera/(E.S)C-ou...,char_data/8/DSC02415.JPG_1.jpg,1144,480,640,26,393,419,250,294,True,10
1648,8,42,kaist_dataset/English/Digital_Camera/(E.S)C-ou...,char_data/8/DSC02450.JPG_3.jpg,1386,480,640,33,109,142,222,264,True,10
1651,8,43,kaist_dataset/English/Digital_Camera/(E.S)C-ou...,char_data/8/DSC02450.JPG_4.jpg,1462,480,640,34,265,299,222,265,True,10
1653,8,41,kaist_dataset/English/Digital_Camera/(E.S)C-ou...,char_data/8/DSC02450.JPG_5.jpg,1189,480,640,29,353,382,221,262,True,10


In [76]:
# datasetDF.loc[(datasetDF['x1'] > 0) & (datasetDF['x2'] < datasetDF['src_width']) & (datasetDF['y1'] > 0) & (datasetDF['y2'] < datasetDF['src_height']) & (datasetDF['x1'] < datasetDF['x2']) & (datasetDF['y1'] < datasetDF['y2'])]

datasetDF.loc[(datasetDF['x1'] == 0), 'x1'] = 1
datasetDF.loc[(datasetDF['y1'] == 0), 'y1'] = 1
datasetDF.loc[(datasetDF['x2'] >= datasetDF['src_width']), 'x2'] = datasetDF.loc[(datasetDF['x2'] >= datasetDF['src_width']), 'src_width'] - 1
datasetDF.loc[(datasetDF['y2'] >= datasetDF['src_height']), 'y2'] = datasetDF.loc[(datasetDF['y2'] >= datasetDF['src_height']), 'src_height'] - 1

# datasetDF.loc[(datasetDF['x1'] < datasetDF['x2']) & (datasetDF['y1'] < datasetDF['y2'])]

In [77]:
# datasetDF.loc[(datasetDF['x1'] > 0) == False, 'x1'] = 1

datasetDF.loc[(datasetDF['x2'] >= datasetDF['src_width'])]
# datasetDF.loc[(datasetDF['y1'] == 0)]

Unnamed: 0,char,height,origin_path,path,seg_areas,src_height,src_width,width,x1,x2,y1,y2,isTrain,label


In [79]:
datasetDF

Unnamed: 0,char,height,origin_path,path,seg_areas,src_height,src_width,width,x1,x2,y1,y2,isTrain,label
766,8,24,kaist_dataset/English/Digital_Camera/(E.S)C-ou...,char_data/8/042.JPG_2.jpg,456,480,640,19,485,504,321,345,True,10
812,8,27,kaist_dataset/English/Digital_Camera/(E.S)C-ou...,char_data/8/056.JPG_2.jpg,432,480,640,16,513,529,249,276,True,10
902,8,32,kaist_dataset/English/Digital_Camera/(E.S)C-ou...,char_data/8/077.JPG_2.jpg,544,480,640,17,276,293,248,280,True,10
1455,8,37,kaist_dataset/English/Digital_Camera/(E.S)C-ou...,char_data/8/DSC02364.JPG_6.jpg,518,480,640,14,325,339,350,387,True,10
1460,8,36,kaist_dataset/English/Digital_Camera/(E.S)C-ou...,char_data/8/DSC02364.JPG_7.jpg,756,480,640,21,486,507,353,389,True,10
1463,8,34,kaist_dataset/English/Digital_Camera/(E.S)C-ou...,char_data/8/DSC02364.JPG_8.jpg,544,480,640,16,549,565,353,387,True,10
1554,8,44,kaist_dataset/English/Digital_Camera/(E.S)C-ou...,char_data/8/DSC02415.JPG_2.jpg,1144,480,640,26,393,419,250,294,True,10
1648,8,42,kaist_dataset/English/Digital_Camera/(E.S)C-ou...,char_data/8/DSC02450.JPG_6.jpg,1386,480,640,33,109,142,222,264,True,10
1651,8,43,kaist_dataset/English/Digital_Camera/(E.S)C-ou...,char_data/8/DSC02450.JPG_7.jpg,1462,480,640,34,265,299,222,265,True,10
1653,8,41,kaist_dataset/English/Digital_Camera/(E.S)C-ou...,char_data/8/DSC02450.JPG_8.jpg,1189,480,640,29,353,382,221,262,True,10


In [25]:
np.log(1)

0.0

# Kaist dataset preprocessing for Faster-RCNN
1. 