In [2]:
'''
    2017.09.25
    @asher

    Function: Read original phd08 data and convert to csv files.

    Usage: phd08 dataset 폴더와 동일 레벨에서 실행 -> phd08_output 폴더에 글자별 csv 파일 생성
    (phd08 dataset은 bigdata Google Drive 참고)

    Columns:
        FO(1): Font type (B:바다, D:돋움, G:고딕, H1:한양해서, H2:헤드라인, M:명조, N:나무, S:샘물, Y:엽서)
        FS(1): Font size (0:12, 1:13: 2:14)
        CP(1): The number of copies (0:0, 1:1, 2:2)
        RE(1): Resolution (0:200, 1:240, 2:280)
        TH(1): Threshold (0:140, 1:180, 2:220)
        SL(1): Slope(Rotate) (0:-3deg, 1:0deg, 2:3deg)
        HE(1): Height(pixels)
        WD(1): Width(pixels)
        Korean Character(1) (가, 각, 간, ...)
        Image data(The number of columns = Rows X Cols)
        Label(1): (0:가, 1:각, 2:간 ...)

    Dependency: python3(anaconda), scipy, pillow
'''

import os
import shutil
import csv
import numpy as np
from scipy.misc import imresize
import unicodedata

# parameters
SOURCE_DIR = "/Users/jongoon/Documents/phd08"
TARGET_DIR = "phd08_output_all"
FONT_LIST = ['B', 'D', 'G', 'H1', 'H2', 'M', 'N', 'S', 'Y']
NEW_IMAGE_SIZE = (56, 56)

# if the target directory (for output) already exists, remove it
try:
    shutil.rmtree(TARGET_DIR)
except OSError:
    pass

# make a new target directory
os.mkdir(TARGET_DIR)

valid_files = []

# check the number of valid files from the file list
file_list = [unicodedata.normalize('NFC', f) for f in os.listdir(SOURCE_DIR)]
for file_name in file_list:
    is_in = False
    for idx in range(len('가나다라마바사아자차카타파하')):
        char = '가나다라마바사아자차카타파하'[idx]
        if char in file_name:
            is_in = True
    if not is_in:
        continue
    file_path = os.path.join(SOURCE_DIR, file_name)

    # is file?
    if not os.path.isfile(file_path):
        continue

    # file size = 0?
    if os.path.getsize(file_path) == 0:
        continue

    valid_files.append(file_name)

print("{} valid files from {} files\n".format(len(valid_files),
                                              len(os.listdir(SOURCE_DIR))))

for num, file_name in enumerate(valid_files):
    read_file_path = os.path.join(SOURCE_DIR, file_name)
    write_file_path = os.path.join(TARGET_DIR, file_name[:-4] + '.csv')

    with open(write_file_path, "w") as wf:
        with open(read_file_path, "r") as rf:

            while True:
                line = rf.readline()

                # check if line is EOF
                if not line:
                    break

                # parse image description
                data = line.strip().split("_")

                # parse rows, cols
                rows, cols = rf.readline().strip().split()
                data.append(rows)
                data.append(cols)
                data.append(file_name[:-4])

                # print(data)

                img = []

                # get binary image data as a list
                for i in range(0, int(rows), 1):
                    img.append(list(rf.readline().strip()))

                # list -> numpy array
                img_array = np.asarray(img, dtype=np.uint8) * 256

                # resize the image
                img_array_resized = imresize(img_array,
                                             size=NEW_IMAGE_SIZE,
                                             interp='bilinear',
                                             mode=None)

                # line between each image data
                rf.readline()

                data += img_array_resized.ravel().tolist()
                data.append(num)

                # print(data)

                # only write the data with the font you want to use
                if data[0] in FONT_LIST:
                    csv_wr = csv.writer(wf, dialect='excel')
                    csv_wr.writerow(data)

    print("file #({}) {} created.".format(num + 1, file_name[:-4] + '.csv'))
print("done.")

14 valid files from 2350 files

file #(1) 가.csv created.
file #(2) 나.csv created.
file #(3) 다.csv created.
file #(4) 라.csv created.
file #(5) 마.csv created.
file #(6) 바.csv created.
file #(7) 사.csv created.
file #(8) 아.csv created.
file #(9) 자.csv created.
file #(10) 차.csv created.
file #(11) 카.csv created.
file #(12) 타.csv created.
file #(13) 파.csv created.
file #(14) 하.csv created.
done.


In [70]:

valid_files = []


file_list = [unicodedata.normalize('NFC', f) for f in os.listdir(SOURCE_DIR)]

# check the number of valid files from the file list
for file_name in file_list:
# for file_name in os.listdir(SOURCE_DIR):
    file_name = str(file_name)
    is_in = False
    print('가' in file_name, file_name)
    
    break
    for char in ['가', '나', '다']:
#     for idx in range(len('가나다라마바사아자차카타파하')):
#         char = '가나다라마바사아자차카타파하'[idx]
        print(char, file_name, char in file_name)
        if char in file_name:
            is_in = True
    break
    if not is_in:
        continue
    file_path = os.path.join(SOURCE_DIR, file_name)

    # is file?
    if not os.path.isfile(file_path):
        continue

    # file size = 0?
    if os.path.getsize(file_path) == 0:
        continue

    valid_files.append(file_name)
valid_files

True 가.txt


[]

In [69]:
'가.txt' in file_list

False

In [65]:
print('가' in '가.txt', file_name, '가' == file_name.split('.')[0], '{%s}' % file_name.split('.')[0], '{%s}' % '가', file_name.find('.txt'), len(file_name.split(',')[0]), len('가'))

True 가.txt False {가} {가} 2 6 1
