In [1]:
from bs4 import BeautifulSoup
import urllib.request
import time
import re
import math
import random
import pickle

In [2]:
# BeautifulSoup Documentation:
# https://www.crummy.com/software/BeautifulSoup/bs4/doc/
#
# visit site to find out basic information like url and number of entries in database

In [3]:
# blockCount = 100 means 100 entries per page so
# 51,961 results means 520 pages of content

In [4]:
# https://krdict.korean.go.kr/eng/dicSearchDetail/searchDetailWordsResult?nation=eng&nationCode=6&searchFlag=Y&sort=W&currentPage=1&ParaWordNo=&syllablePosition=&actCategoryList=&all_gubun=ALL&gubun=W&gubun=P&gubun=E&all_wordNativeCode=ALL&wordNativeCode=1&wordNativeCode=2&wordNativeCode=3&wordNativeCode=0&all_sp_code=ALL&sp_code=1&sp_code=2&sp_code=3&sp_code=4&sp_code=5&sp_code=6&sp_code=7&sp_code=8&sp_code=9&sp_code=10&sp_code=11&sp_code=12&sp_code=13&sp_code=14&sp_code=27&all_imcnt=ALL&imcnt=1&imcnt=2&imcnt=3&imcnt=0&all_multimedia=ALL&multimedia=P&multimedia=I&multimedia=V&multimedia=A&multimedia=S&multimedia=N&searchSyllableStart=&searchSyllableEnd=&searchOp=AND&searchTarget=word&searchOrglanguage=-1&wordCondition=wordAll&query=&blockCount=100

# "Detail search results (51,961 results)"

In [5]:
# figure out how site works:
# links for dictionary entries are actually calls to a JavaScript function that
# fills out and submits an HTML form with the database ID

In [6]:
# javascript:checkSubmit('27733','Y');

# //내용 보기  (Korean: "view content")
# function checkSubmit(word_no,disp_yn){
# 	if(disp_yn == "N"){
# 		alert("Your access to the requested page is forbidden.");
# 		location.href = "/eng/mainAction";
# 	}else{
# 		$("input[name=ParaWordNo]").val(word_no);
# 		$("form[name=searchDetailWordsForm]").attr("action", "/eng/dicSearch/SearchView");
# 	    $("form[name=searchDetailWordsForm]").submit();
# 	}
# }

In [7]:
# define and compile regular expression for finding links with database IDs
# NOTE: re.VERBOSE allows the use of comments within the raw string pattern

In [8]:
# Regular expression for the href of a dictionary-entry link, which looks like
#     javascript:checkSubmit('27733','Y');
# It is compiled with re.VERBOSE, so whitespace and "#" comments inside the
# pattern text are ignored by the regex engine.
# The ID group requires at least one digit so that an empty argument
# (checkSubmit('',...)) is never captured as an ID.
raw_pattern = r"""javascript:checkSubmit\(' # characters which mark beginning of database ID
                 ([0-9]+) # database ID, enclosed in parentheses for grouping; matches 1 or more digits
                 ' # character that marks end of database ID
                 """
pattern = re.compile(raw_pattern, re.VERBOSE)

In [9]:
# define a function for creating a list of database IDs from an html document parsed by BeautifulSoup

In [10]:
def get_entry_ids(soup, entry_pattern=None):
    """Collect the database IDs of the dictionary entries linked from a page.

    Parameters
    ----------
    soup
        Parsed html document; only ``soup.find_all("a")`` is used, and each
        returned element must support ``.get("href")``.
    entry_pattern : compiled regular expression, optional
        Pattern matched against the start of every href; its first group is
        taken as the ID. Defaults to the module-level ``pattern``.

    Returns
    -------
    list of str
        The unique IDs, in the order in which they first appear in the page.
    """
    if entry_pattern is None:
        entry_pattern = pattern

    # a = anchor tag. these are used for links within pages
    # href = "hypertext reference"; this attribute contains the url, "the address", at which the resource exists
    #
    # dict keys are unique and keep insertion order, so this removes duplicates
    # without scrambling the page order the way list(set(...)) would
    found = {}
    for link in soup.find_all("a"):
        href = link.get("href")
        # anchors without an href attribute yield None, which the regex
        # functions reject with a TypeError
        if not href:
            continue
        entry = entry_pattern.match(href)
        if entry:
            found[entry.group(1)] = None

    return list(found)

In [11]:
# program for gathering ids from all pages

In [12]:
# Walk through every page of the search results and accumulate the entry IDs.
entry_ids = []
num_entries = 51961      # "Detail search results (51,961 results)"
entries_per_page = 100   # sent to the site as the blockCount query parameter
pause_begin = 3          # shortest pause between requests, in seconds
pause_end = 15           # longest pause between requests, in seconds
request_timeout = 60     # seconds; without a timeout a stalled connection blocks forever

num_pages = math.ceil(num_entries / entries_per_page)

# only the page number changes between requests, so build the rest of the url once
url_head = "https://krdict.korean.go.kr/eng/dicSearchDetail/searchDetailWordsResult?nation=eng&nationCode=6&searchFlag=Y&sort=W&currentPage="
url_tail = ("&ParaWordNo=&syllablePosition=&actCategoryList=&all_gubun=ALL&gubun=W&gubun=P&gubun=E&all_wordNativeCode=ALL&wordNativeCode=1&wordNativeCode=2&wordNativeCode=3&wordNativeCode=0&all_sp_code=ALL&sp_code=1&sp_code=2&sp_code=3&sp_code=4&sp_code=5&sp_code=6&sp_code=7&sp_code=8&sp_code=9&sp_code=10&sp_code=11&sp_code=12&sp_code=13&sp_code=14&sp_code=27&all_imcnt=ALL&imcnt=1&imcnt=2&imcnt=3&imcnt=0&all_multimedia=ALL&multimedia=P&multimedia=I&multimedia=V&multimedia=A&multimedia=S&multimedia=N&searchSyllableStart=&searchSyllableEnd=&searchOp=AND&searchTarget=word&searchOrglanguage=-1&wordCondition=wordAll&query=&blockCount="
            + str(entries_per_page))

for page in range(1, num_pages + 1):
    url = url_head + str(page) + url_tail

    # the with-statement closes the connection as soon as the page is parsed
    with urllib.request.urlopen(url, timeout=request_timeout) as response:
        soup = BeautifulSoup(response, "html.parser")
    temp = get_entry_ids(soup)

    # if an error occurs, check this page
    # (the final page is reported too whenever it is only partially filled)
    if len(temp) != entries_per_page:
        print(f"Number of IDs from page {page} is {len(temp)}.")

    entry_ids += temp

    # random pause between requests; nothing follows the last page, so skip it there
    if page < num_pages:
        time.sleep(random.randint(pause_begin, pause_end))

    # for periodic confirmation that the program hasn't stalled
    if page % 50 == 0:
        print(f"Page number {page} completed.")

Page number 50 completed.
Page number 100 completed.
Page number 150 completed.
Page number 200 completed.
Page number 250 completed.
Page number 300 completed.
Page number 350 completed.
Page number 400 completed.
Page number 450 completed.
Page number 500 completed.
Number of IDs from page 520 is 61.


In [13]:
# perform a couple of quick checks that we got the data we expected

In [14]:
# total number of IDs collected; compare against num_entries
len(entry_ids)

51961

In [15]:
# number of distinct IDs; equals len(entry_ids) only when no ID was collected twice
len(set(entry_ids))

51961

In [16]:
# write data to files for later use

In [17]:
# serialize the list of IDs with pickle;
# the with-statement closes the file on exit, so no explicit close() is needed
with open("entry_ids.data", "wb") as filehandle:
    pickle.dump(entry_ids, filehandle)

In [18]:
# plain-text copy: one ID per line, with no newline after the last one.
# join() puts separators only between items, so the layout is right for any
# list length (the result does not depend on num_entries), and the
# with-statement closes the file even if the write fails.
with open("entry_ids.txt", "w") as f:
    f.write("\n".join(entry_ids))