In [7]:
'''
    2017.09.25
    @asher

    Function: Read original phd08 data and convert to csv files.

    Usage: Run at the same level as the phd08 dataset folder -> one csv file per character is created in the phd08_output folder
    (for the phd08 dataset, see the bigdata Google Drive)

    Columns:
        FO(1): Font type (B:바다, D:돋움, G:고딕, H1:한양해서, H2:헤드라인, M:명조, N:나무, S:샘물, Y:엽서)
        FS(1): Font size (0:12, 1:13, 2:14)
        CP(1): The number of copies (0:0, 1:1, 2:2)
        RE(1): Resolution (0:200, 1:240, 2:280)
        TH(1): Threshold (0:140, 1:180, 2:220)
        SL(1): Slope(Rotate) (0:-3deg, 1:0deg, 2:3deg)
        HE(1): Height(pixels)
        WD(1): Width(pixels)
        Korean Character(1) (가, 각, 간, ...)
        Image data (the number of columns = rows x cols of the resized image, i.e. NEW_IMAGE_SIZE)
        Label(1): (0:가, 1:각, 2:간 ...)

    Dependency: python3(anaconda), scipy, pillow
'''

import os
import shutil
import csv
import numpy as np
from scipy.misc import imresize

# parameters
# NOTE(review): SOURCE_DIR is an absolute, machine-specific path; adjust it to
# wherever the phd08 dataset lives before running.
SOURCE_DIR = "/Users/ali.jeon/Documents/OCR/phd08"
# Output directory, created relative to the current working directory.
TARGET_DIR = "phd08_output"
# Only samples whose font code (first field of the sample header) is listed
# here are written to the csv files.
FONT_LIST = ['B', 'D', 'G', 'H1', 'H2', 'M', 'N', 'S', 'Y']
# Every image is resized to this shape before being flattened into a row.
NEW_IMAGE_SIZE = (56, 56)

# Start from a clean output directory: remove a previous one if present.
# Only "does not exist" is ignored; any other failure (e.g. permissions)
# is raised here instead of surfacing later as a confusing os.mkdir error.
try:
    shutil.rmtree(TARGET_DIR)
except FileNotFoundError:
    pass

# make a new target directory
os.mkdir(TARGET_DIR)

# Collect the names of all non-empty regular files in SOURCE_DIR.
# os.listdir() returns entries in arbitrary order, and the position of a file
# in valid_files is used as its class label further down, so the list is
# sorted to make the label assignment reproducible across runs and machines.
source_entries = os.listdir(SOURCE_DIR)

valid_files = []

for file_name in sorted(source_entries):
    file_path = os.path.join(SOURCE_DIR, file_name)

    # skip directories / anything that is not a regular file, and empty files
    if os.path.isfile(file_path) and os.path.getsize(file_path) != 0:
        valid_files.append(file_name)

print("{} valid files from {} files\n".format(len(valid_files),
                                              len(source_entries)))

# Convert every valid source file into one csv file with one row per image
# sample. Each row is: header fields, rows, cols, character name, the
# flattened resized image, and finally the label (index of the source file).
for num, file_name in enumerate(valid_files):
    # Character name = file name without its extension.
    char_name = os.path.splitext(file_name)[0]
    csv_name = char_name + '.csv'
    read_file_path = os.path.join(SOURCE_DIR, file_name)
    write_file_path = os.path.join(TARGET_DIR, csv_name)

    # newline="" is required by the csv module (otherwise extra blank rows
    # appear on platforms that translate line endings); utf-8 keeps the
    # Hangul character column independent of the locale's default encoding.
    with open(write_file_path, "w", newline="", encoding="utf-8") as wf, \
            open(read_file_path, "r") as rf:
        # one writer per output file instead of one per row
        csv_wr = csv.writer(wf, dialect='excel')

        while True:
            line = rf.readline()

            # readline() returns an empty string only at EOF
            if not line:
                break

            # tolerate stray blank lines (e.g. trailing ones at end of file)
            if not line.strip():
                continue

            # parse image description (fields are separated by "_")
            data = line.strip().split("_")

            # parse rows, cols of the original image
            rows, cols = rf.readline().strip().split()
            data.extend([rows, cols, char_name])

            # get binary image data: one text line per image row,
            # one character per pixel
            img = [list(rf.readline().strip()) for _ in range(int(rows))]

            # line between each image data
            rf.readline()

            # only write the data with the font you want to use; checked
            # before resizing so unwanted samples cost no image processing
            if data[0] not in FONT_LIST:
                continue

            # list -> numpy array
            img_array = np.asarray(img, dtype=np.uint8)

            # resize the image
            # NOTE(review): scipy.misc.imresize is no longer available in
            # newer SciPy releases; this needs a SciPy version that ships it.
            img_array_resized = imresize(img_array,
                                         size=NEW_IMAGE_SIZE,
                                         interp='bilinear',
                                         mode=None)

            data += img_array_resized.ravel().tolist()
            data.append(num)

            csv_wr.writerow(data)

    print("file #({}) {} created.".format(num + 1, csv_name))
print("done.")

2350 valid files from 2350 files

file #(1) 가.csv created.
file #(2) 각.csv created.
file #(3) 간.csv created.
file #(4) 갇.csv created.
file #(5) 갈.csv created.
file #(6) 갉.csv created.
file #(7) 갊.csv created.
file #(8) 감.csv created.
file #(9) 갑.csv created.
file #(10) 값.csv created.
file #(11) 갓.csv created.
file #(12) 갔.csv created.
file #(13) 강.csv created.
file #(14) 갖.csv created.
file #(15) 갗.csv created.
file #(16) 같.csv created.
file #(17) 갚.csv created.
file #(18) 갛.csv created.
file #(19) 개.csv created.
file #(20) 객.csv created.
file #(21) 갠.csv created.
file #(22) 갤.csv created.
file #(23) 갬.csv created.
file #(24) 갭.csv created.
file #(25) 갯.csv created.
file #(26) 갰.csv created.
file #(27) 갱.csv created.
file #(28) 갸.csv created.
file #(29) 갹.csv created.
file #(30) 갼.csv created.
file #(31) 걀.csv created.
file #(32) 걋.csv created.
file #(33) 걍.csv created.
file #(34) 걔.csv created.
file #(35) 걘.csv created.

file #(288) 낌.csv created.
file #(289) 낍.csv created.
file #(290) 낏.csv created.
file #(291) 낑.csv created.
file #(292) 나.csv created.
file #(293) 낙.csv created.
file #(294) 낚.csv created.
file #(295) 난.csv created.
file #(296) 낟.csv created.
file #(297) 날.csv created.
file #(298) 낡.csv created.
file #(299) 낢.csv created.
file #(300) 남.csv created.
file #(301) 납.csv created.
file #(302) 낫.csv created.
file #(303) 났.csv created.
file #(304) 낭.csv created.
file #(305) 낮.csv created.
file #(306) 낯.csv created.
file #(307) 낱.csv created.
file #(308) 낳.csv created.
file #(309) 내.csv created.
file #(310) 낵.csv created.
file #(311) 낸.csv created.
file #(312) 낼.csv created.
file #(313) 냄.csv created.
file #(314) 냅.csv created.
file #(315) 냇.csv created.
file #(316) 냈.csv created.
file #(317) 냉.csv created.
file #(318) 냐.csv created.
file #(319) 냑.csv created.
file #(320) 냔.csv created.
file #(321) 냘.csv created.
file #(322) 냠.c

file #(572) 땍.csv created.
file #(573) 땐.csv created.
file #(574) 땔.csv created.
file #(575) 땜.csv created.
file #(576) 땝.csv created.
file #(577) 땟.csv created.
file #(578) 땠.csv created.
file #(579) 땡.csv created.
file #(580) 떠.csv created.
file #(581) 떡.csv created.
file #(582) 떤.csv created.
file #(583) 떨.csv created.
file #(584) 떪.csv created.
file #(585) 떫.csv created.
file #(586) 떰.csv created.
file #(587) 떱.csv created.
file #(588) 떳.csv created.
file #(589) 떴.csv created.
file #(590) 떵.csv created.
file #(591) 떻.csv created.
file #(592) 떼.csv created.
file #(593) 떽.csv created.
file #(594) 뗀.csv created.
file #(595) 뗄.csv created.
file #(596) 뗌.csv created.
file #(597) 뗍.csv created.
file #(598) 뗏.csv created.
file #(599) 뗐.csv created.
file #(600) 뗑.csv created.
file #(601) 뗘.csv created.
file #(602) 뗬.csv created.
file #(603) 또.csv created.
file #(604) 똑.csv created.
file #(605) 똔.csv created.
file #(606) 똘.cs

file #(856) 묫.csv created.
file #(857) 무.csv created.
file #(858) 묵.csv created.
file #(859) 묶.csv created.
file #(860) 문.csv created.
file #(861) 묻.csv created.
file #(862) 물.csv created.
file #(863) 묽.csv created.
file #(864) 묾.csv created.
file #(865) 뭄.csv created.
file #(866) 뭅.csv created.
file #(867) 뭇.csv created.
file #(868) 뭉.csv created.
file #(869) 뭍.csv created.
file #(870) 뭏.csv created.
file #(871) 뭐.csv created.
file #(872) 뭔.csv created.
file #(873) 뭘.csv created.
file #(874) 뭡.csv created.
file #(875) 뭣.csv created.
file #(876) 뭬.csv created.
file #(877) 뮈.csv created.
file #(878) 뮌.csv created.
file #(879) 뮐.csv created.
file #(880) 뮤.csv created.
file #(881) 뮨.csv created.
file #(882) 뮬.csv created.
file #(883) 뮴.csv created.
file #(884) 뮷.csv created.
file #(885) 므.csv created.
file #(886) 믄.csv created.
file #(887) 믈.csv created.
file #(888) 믐.csv created.
file #(889) 믓.csv created.
file #(890) 미.csv c

file #(1136) 섀.csv created.
file #(1137) 섄.csv created.
file #(1138) 섈.csv created.
file #(1139) 섐.csv created.
file #(1140) 섕.csv created.
file #(1141) 서.csv created.
file #(1142) 석.csv created.
file #(1143) 섞.csv created.
file #(1144) 섟.csv created.
file #(1145) 선.csv created.
file #(1146) 섣.csv created.
file #(1147) 설.csv created.
file #(1148) 섦.csv created.
file #(1149) 섧.csv created.
file #(1150) 섬.csv created.
file #(1151) 섭.csv created.
file #(1152) 섯.csv created.
file #(1153) 섰.csv created.
file #(1154) 성.csv created.
file #(1155) 섶.csv created.
file #(1156) 세.csv created.
file #(1157) 섹.csv created.
file #(1158) 센.csv created.
file #(1159) 셀.csv created.
file #(1160) 셈.csv created.
file #(1161) 셉.csv created.
file #(1162) 셋.csv created.
file #(1163) 셌.csv created.
file #(1164) 셍.csv created.
file #(1165) 셔.csv created.
file #(1166) 셕.csv created.
file #(1167) 션.csv created.
file #(1168) 셜.csv created.
file #(1169) ᄉ

file #(1411) 에.csv created.
file #(1412) 엑.csv created.
file #(1413) 엔.csv created.
file #(1414) 엘.csv created.
file #(1415) 엠.csv created.
file #(1416) 엡.csv created.
file #(1417) 엣.csv created.
file #(1418) 엥.csv created.
file #(1419) 여.csv created.
file #(1420) 역.csv created.
file #(1421) 엮.csv created.
file #(1422) 연.csv created.
file #(1423) 열.csv created.
file #(1424) 엶.csv created.
file #(1425) 엷.csv created.
file #(1426) 염.csv created.
file #(1427) 엽.csv created.
file #(1428) 엾.csv created.
file #(1429) 엿.csv created.
file #(1430) 였.csv created.
file #(1431) 영.csv created.
file #(1432) 옅.csv created.
file #(1433) 옆.csv created.
file #(1434) 옇.csv created.
file #(1435) 예.csv created.
file #(1436) 옌.csv created.
file #(1437) 옐.csv created.
file #(1438) 옘.csv created.
file #(1439) 옙.csv created.
file #(1440) 옛.csv created.
file #(1441) 옜.csv created.
file #(1442) 오.csv created.
file #(1443) 옥.csv created.
file #(1444) ᄋ

file #(1686) 진.csv created.
file #(1687) 짇.csv created.
file #(1688) 질.csv created.
file #(1689) 짊.csv created.
file #(1690) 짐.csv created.
file #(1691) 집.csv created.
file #(1692) 짓.csv created.
file #(1693) 징.csv created.
file #(1694) 짖.csv created.
file #(1695) 짙.csv created.
file #(1696) 짚.csv created.
file #(1697) 짜.csv created.
file #(1698) 짝.csv created.
file #(1699) 짠.csv created.
file #(1700) 짢.csv created.
file #(1701) 짤.csv created.
file #(1702) 짧.csv created.
file #(1703) 짬.csv created.
file #(1704) 짭.csv created.
file #(1705) 짯.csv created.
file #(1706) 짰.csv created.
file #(1707) 짱.csv created.
file #(1708) 째.csv created.
file #(1709) 짹.csv created.
file #(1710) 짼.csv created.
file #(1711) 쨀.csv created.
file #(1712) 쨈.csv created.
file #(1713) 쨉.csv created.
file #(1714) 쨋.csv created.
file #(1715) 쨌.csv created.
file #(1716) 쨍.csv created.
file #(1717) 쨔.csv created.
file #(1718) 쨘.csv created.
file #(1719) 

file #(1961) 쿰.csv created.
file #(1962) 쿱.csv created.
file #(1963) 쿳.csv created.
file #(1964) 쿵.csv created.
file #(1965) 쿼.csv created.
file #(1966) 퀀.csv created.
file #(1967) 퀄.csv created.
file #(1968) 퀑.csv created.
file #(1969) 퀘.csv created.
file #(1970) 퀭.csv created.
file #(1971) 퀴.csv created.
file #(1972) 퀵.csv created.
file #(1973) 퀸.csv created.
file #(1974) 퀼.csv created.
file #(1975) 큄.csv created.
file #(1976) 큅.csv created.
file #(1977) 큇.csv created.
file #(1978) 큉.csv created.
file #(1979) 큐.csv created.
file #(1980) 큔.csv created.
file #(1981) 큘.csv created.
file #(1982) 큠.csv created.
file #(1983) 크.csv created.
file #(1984) 큭.csv created.
file #(1985) 큰.csv created.
file #(1986) 클.csv created.
file #(1987) 큼.csv created.
file #(1988) 큽.csv created.
file #(1989) 킁.csv created.
file #(1990) 키.csv created.
file #(1991) 킥.csv created.
file #(1992) 킨.csv created.
file #(1993) 킬.csv created.
file #(1994) 킴

file #(2236) 헛.csv created.
file #(2237) 헝.csv created.
file #(2238) 헤.csv created.
file #(2239) 헥.csv created.
file #(2240) 헨.csv created.
file #(2241) 헬.csv created.
file #(2242) 헴.csv created.
file #(2243) 헵.csv created.
file #(2244) 헷.csv created.
file #(2245) 헹.csv created.
file #(2246) 혀.csv created.
file #(2247) 혁.csv created.
file #(2248) 현.csv created.
file #(2249) 혈.csv created.
file #(2250) 혐.csv created.
file #(2251) 협.csv created.
file #(2252) 혓.csv created.
file #(2253) 혔.csv created.
file #(2254) 형.csv created.
file #(2255) 혜.csv created.
file #(2256) 혠.csv created.
file #(2257) 혤.csv created.
file #(2258) 혭.csv created.
file #(2259) 호.csv created.
file #(2260) 혹.csv created.
file #(2261) 혼.csv created.
file #(2262) 홀.csv created.
file #(2263) 홅.csv created.
file #(2264) 홈.csv created.
file #(2265) 홉.csv created.
file #(2266) 홋.csv created.
file #(2267) 홍.csv created.
file #(2268) 홑.csv created.
file #(2269) ᄒ

In [6]:
data

['H2',
 '2',
 '2',
 '2',
 '2',
 '2',
 '51',
 '51',
 '가',
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1