In [1]:
import csv
import re
import pandas as pd
from bs4 import BeautifulSoup
from glob import glob

Dictionary entry pages (base URL; the numeric entry ID is presumably appended at the end): https://krdict.korean.go.kr/eng/dicSearch/SearchView?nation=eng&ParaWordNo=

In [2]:
# directory holding the saved HTML pages; used below as the prefix of the glob pattern
path = "./pages/"

In [3]:
# collect the path of every saved HTML page under `path` (a set keeps each path once)
entries = set(glob(path + "*.html"))

# display how many pages were found
len(entries)

51961

In [4]:
# Parse every saved page and write one tab-separated row per
# (spelling, pronunciation) pair: first the word from the page's <h2>, then
# each form listed under the page's "Application" label.

# Filenames look like "<path><entry_id>.html"; derive the slice bounds from the
# real prefix/suffix instead of hard-coding their lengths.
html_suffix = ".html"

# rather than using the string method .split(), re.split() allows for splitting on multiple characters
# re pattern = split on commas or left parentheses; NOTE use of "or" symbol (|) and escape character "\"
# (raw string, because "\(" is an invalid escape sequence in a normal string literal)
application_sep = re.compile(r",|\(")

# store info on words skipped
missing_data = []

# newline="" lets the csv module control line endings (avoids "\r\r\n" on Windows);
# the with-block guarantees the file is closed even if a page fails to parse
with open("pronunciation_all.csv", "w", encoding="utf-8", newline="") as out:
    csv_out = csv.writer(out, delimiter="\t")
    csv_out.writerow(["entry_id", "word_id", "spelling", "pronunciation"])

    for entry in sorted(entries):

        # grab page ID from filename, then parse the file
        entry_id = entry[len(path):-len(html_suffix)]

        # utf-8-sig drops a leading byte-order mark if the saved page has one;
        # BeautifulSoup reads the whole handle in its constructor, so the file
        # can be closed right after
        with open(entry, "r", encoding="utf-8-sig") as file:
            soup = BeautifulSoup(file, "html.parser")

        # get data: the word itself plus an optional superscript (stored as `sense`)
        word = soup.h2.font.string
        if soup.h2.sup is not None:
            sense = soup.h2.sup.string
        else:
            sense = ""
        word_id = word + sense

        # defaults used when the page has no matching section
        pronunciation = "NONE"
        application = []

        # grab the area of interest, then search for the appropriate sections
        heading = soup.select(".keyboardLayerPosition.gray_heading.word_explain_tit.accessArea")[0]

        pron_tag = heading.find(string="Pronunciation")
        appl_tag = heading.find(string="Application")

        # if the sections exist on the page, grab the values
        if pron_tag is not None:
            pronunciation = pron_tag.parent.parent.font.text.strip().strip("[]")

        if appl_tag is not None:
            # each piece becomes [spelling, pronunciation] when it contains exactly one "["
            application = [
                x.strip().strip("])").split("[")
                for x in application_sep.split(appl_tag.parent.parent.font.text)
            ]

        # write to file
        csv_out.writerow([entry_id, word_id, word, pronunciation])

        for instance in application:
            if len(instance) == 2:
                csv_out.writerow([entry_id, word_id, instance[0], instance[1]])
            else:
                # did not split into exactly spelling + pronunciation: record and skip
                missing_data.append([entry_id, instance[0]])

In [5]:
# number of "Application" forms skipped because they did not split into exactly a spelling and a pronunciation
len(missing_data)

2787

In [6]:
# peek at the first 100 skipped forms, stored as [entry_id, form text] pairs
missing_data[:100]

[['13961', '시드니'],
 ['14085', '실려'],
 ['14085', '실리니'],
 ['14091', '열려'],
 ['14091', '열리니'],
 ['14379', '데려와'],
 ['14379', '데려오니'],
 ['14379', '데려오너라'],
 ['14380', '데우어'],
 ['14380', '데워'],
 ['14380', '데우니'],
 ['14392', '도망치니'],
 ['14433', '때려'],
 ['14433', '때리니'],
 ['14437', '뛰노니'],
 ['14456', '떠'],
 ['14456', '뜨니'],
 ['14495', '버려지니'],
 ['14497', '버텨'],
 ['14497', '버티니'],
 ['14504', '보내와'],
 ['14504', '보내오니'],
 ['14508', '보살펴'],
 ['14508', '보살피니'],
 ['14545', '가빠지니'],
 ['14579', '갈겨'],
 ['14579', '갈기니'],
 ['14626', '마니'],
 ['14628', '말려'],
 ['14628', '말리니'],
 ['14630', '말려'],
 ['14630', '말리니'],
 ['14649', '꾸려'],
 ['14649', '꾸리니'],
 ['14675', '가셔'],
 ['14675', '가시니'],
 ['14692', '갈라놔'],
 ['14701', '갈라지니'],
 ['14708', '갈려'],
 ['14708', '갈리니'],
 ['14709', '갈려'],
 ['14709', '갈리니'],
 ['14710', '갈려'],
 ['14710', '갈리니'],
 ['14711', '갈려'],
 ['14711', '갈리니'],
 ['14879', '안겨'],
 ['14879', '안기니'],
 ['14884', '안겨'],
 ['14884', '안기니'],
 ['14928', '둘러'],
 ['14928', '두르니'],
 ['14957', '두려워하여'],
 ['