In [1]:
from bs4 import BeautifulSoup
import urllib.request
import time

In [2]:
# Dictionary entry to fetch: the value sent as the ParaWordNo query parameter.
# Change this number to look up a different entry.
PARA_WORD_NO = 27733

# Full search-view URL for that entry. Everything except ParaWordNo is the
# fixed set of search filters used by this notebook.
url = ("https://krdict.korean.go.kr/eng/dicSearch/SearchView?nation=eng&nationCode=6&searchFlag=Y&sort=W&currentPage=1&ParaWordNo="
       + str(PARA_WORD_NO)
       + "&syllablePosition=&actCategoryList=&all_gubun=ALL&gubun=W&gubun=P&gubun=E&all_wordNativeCode=ALL&wordNativeCode=1&wordNativeCode=2&wordNativeCode=3&wordNativeCode=0&all_sp_code=ALL&sp_code=1&sp_code=2&sp_code=3&sp_code=4&sp_code=5&sp_code=6&sp_code=7&sp_code=8&sp_code=9&sp_code=10&sp_code=11&sp_code=12&sp_code=13&sp_code=14&sp_code=27&all_imcnt=ALL&imcnt=1&imcnt=2&imcnt=3&imcnt=0&all_multimedia=ALL&multimedia=P&multimedia=I&multimedia=V&multimedia=A&multimedia=S&multimedia=N&searchSyllableStart=&searchSyllableEnd=&searchOp=AND&searchTarget=word&searchOrglanguage=-1&wordCondition=wordAll&query=&blockCount=100")

In [3]:
# Download the entry page and parse it. The context manager closes the HTTP
# connection as soon as BeautifulSoup has read the body, and the timeout keeps
# the notebook from hanging indefinitely if the server does not answer.
with urllib.request.urlopen(url, timeout=30) as response:
    soup = BeautifulSoup(response, "html.parser")

In [4]:
# grab basic info for entry
word = soup.h2.font.string
sense = soup.h2.sup.string
# headword plus its sense number, e.g. "가" + "1"
word_id = word + sense

# The level comes from the alt text of the image in the heading.
# str.strip("level") treats its argument as a SET of characters (l, e, v) to
# peel off both ends and leaves the separating space behind, so remove the
# literal trailing word "level" and the whitespace before it instead.
level = soup.h2.img.get("alt").strip()
if level.endswith("level"):
    level = level[: -len("level")].rstrip()

# pronunciation is shown wrapped in square brackets; drop them
pronunciation = soup.p.font.text.strip().strip("[]")

# The onclick handler has the form  javascript:fnSoundPlay('<url>');
# lstrip()/rstrip() with those strings would strip character sets and could
# eat into the URL itself, so cut out exactly what sits between the call's
# opening "('" and its closing "')". Yields "" if the handler has another form.
onclick = soup.p.font.img.get("onclick")
recording_url = onclick.partition("fnSoundPlay('")[2].rpartition("')")[0]

# the first two text fragments of the <h3> are read as the Korean and the
# English part-of-speech labels, in that order
ss = soup.h3.stripped_strings
KPOS = next(ss)
EPOS = next(ss)

In [5]:
# Maps a definition's 0-based position on the page to the text of the
# "Reference" note attached to it. Created unconditionally so that printing it
# and calling ref_dict.get(...) later also work for entries without any note.
ref_dict = {}

# find only "Reference" notes
all_refs = soup.find_all(class_="word_detail_view printArea accessArea manyLang6")

# only do this if there is a "Reference" note
if all_refs:
    # find all definitions, plus "Reference" notes.
    # the order they appear in in the list is the order they appeared in the page.
    # therefore, a "Reference" note applies to the definition that precedes it in
    # the list
    all_defs_refs = soup.select(".word_explain_list.printArea.accessArea,.word_detail_view.printArea.accessArea.manyLang6")

    # find_all() and select() both return the Tag objects of the parsed tree,
    # so object identity tells the notes apart from the definitions.
    ref_ids = {id(ref) for ref in all_refs}

    # Index of the most recent definition seen so far. Counting only the
    # definitions (not the notes in between) keeps the key correct when an
    # entry carries more than one note.
    def_index = -1
    for item in all_defs_refs:
        if id(item) in ref_ids:
            # a note with no definition before it has nothing to attach to
            if def_index >= 0:
                ref_dict[def_index] = item.ul.li.strong.get_text(strip=True)
        else:
            def_index += 1

print(ref_dict)

{1: '일부 명사 뒤에 붙여 쓴다.'}


In [6]:
# each <ol class="senseLayer"> is handled as one definition of the entry
definitions = soup.find_all("ol", class_="senseLayer")

# one row per definition:
# [eng_trans, kor_def, eng_def, phrases, sentences, dialogue, ref_note]
combined_defs = []

for def_index, sense_block in enumerate(definitions):
    eng_trans = sense_block.li.p.get_text(" ", strip=True)
    kor_def = sense_block.li.span.get_text(strip=True)
    eng_def = sense_block.select(".sub_p1.manyLang6.multiSenseDef.defFont6")[0].get_text(strip=True)

    # "Reference" note recorded for this definition number, or an empty
    # string when there is none
    ref_note = ref_dict.get(def_index, "")

    # sort the example items into three buckets by their CSS class
    phrases, sentences, dialogue = [], [], []
    for example in sense_block.ul.select("li"):
        css_classes = example.get("class")
        if css_classes == ["violet"]:
            # only dialogue text is stripped of surrounding whitespace
            dialogue.append(example.get_text().strip())
        elif css_classes == ["orange"]:
            sentences.append(example.get_text())
        else:
            phrases.append(example.get_text())

    combined_defs.append([eng_trans, kor_def, eng_def, phrases, sentences, dialogue, ref_note])

In [7]:
# Show the scraped header fields, one per line, in a fixed order.
header_lines = [
    f"word: {word}",
    f"sense: {sense}",
    f"word id: {word_id}",
    f"level: {level}",
    f"pronunciation: {pronunciation}",
    f"recording url: {recording_url}",
    f"KPOS: {KPOS}",
    f"EPOS: {EPOS}",
]
print("\n".join(header_lines))

# Then list every definition row, numbered from 1.
for position, entry in enumerate(combined_defs, start=1):
    print(f"definition {position}: {entry}")

word: 가
sense: 1
word id: 가1
level: Advanced 
pronunciation: 가ː
recording url: https://dicmedia.korean.go.kr/multimedia/multimedia_files/convert/20160913/20000/17000/300622/SND000309976.mp3
KPOS: 명사
EPOS: Noun
definition 1: ['1. edge; verge', '어떤 장소나 물건의 둘레나 끝부분.', 'The perimeter or outer limits of a place or a thing.', ['가를 꾸미다.', '가를 장식하다.', '가에 걸치다.', '가에 달다.', '가에 달라붙다.', '가에 세우다.', '가에 앉다.'], ['공원의 중앙에는 잔디밭이 있고 가에는 울타리가 둘러쳐져 있었다.', '민준이는 금방이라도 일어날 듯이 의자 가에 엉덩이만 살짝 걸치고 앉았다.'], ['가: 차는 어디에 주차했어요?\n나: 저기 운동장 가에 세워 뒀어요.'], '']
definition 2: ['2. by; fringe', '‘주변’의 뜻을 나타내는 말.', 'The surrounding area of a place.', ['강가.', '길가.', '냇가.', '문가.', '시냇가.', '우물가.', '창가.', '창문가.', '호숫가.'], [], [], '일부 명사 뒤에 붙여 쓴다.']
