In [1]:
import numpy as np
import pandas as pd
from keras.models import load_model
from jamotools import Vectorizationer, rules
from unicodedata import normalize

In [2]:
# validate the model on various phonological phenomena

In [3]:
# first load the model
# NOTE: load_model() loads a fully compiled model identical to the one saved using .save()

In [4]:
# The path is relative, so the .h5 file must be in the notebook's working directory.
# `model` is used later by validate_model() via model.predict().
model = load_model("pronunciation_prediction.h5")

In [5]:
# load the reference DataFrame to ensure that the model wasn't trained on the entries it's being given for evaluation

In [6]:
# sep="\t": the file is read as tab-separated even though it carries a .csv extension.
# NOTE(review): the "Unnamed: 0" column in the preview looks like a row index that was
# written out with the file; passing index_col=0 would drop it — confirm before changing.
ref = pd.read_csv("reference_all.csv", sep="\t")
ref.head()

Unnamed: 0,entry_id,word_id,spelling,pronunciation
0,13943,역사학,역사학,역싸학
1,13943,역사학,역사학이,역싸하기
2,13943,역사학,역사학도,역싸학또
3,13943,역사학,역사학만,역싸항만
4,13957,시대적2,시대적,시대적


In [7]:
# only the spelling column is needed; store it as a set, since sets (like dicts) are hash tables underneath and make membership checks fast

In [8]:
# every spelling present in the reference data, used for membership checks in validate_model()
training_set = set(ref["spelling"])

In [9]:
# instantiate the vectorizer for decoding predictions

In [10]:
# Shared vectorizer: vec.vectorize() encodes spellings in validate_model(), and
# vec.symbol_map is inverted below to decode predicted ids back into symbols.
# NOTE(review): max_length=None presumably turns off the vectorizer's own padding
# (validate_model() pads manually with np.pad) — confirm against the jamotools docs.
vec = Vectorizationer(rule=rules.RULE_1, max_length=None)

In [11]:
# define a function for unvectorizing predictions

In [12]:
# inverse of vec.symbol_map: integer id -> symbol
decoder = {code: symbol for symbol, code in vec.symbol_map.items()}

def unvectorize_norm_pad(vector):
    """Turn a sequence of integer ids back into an NFC-normalized string.

    Ids equal to 0 are dropped (they are treated as padding); every other id is
    looked up in `decoder`, the symbols are concatenated in order, and the
    result is composed with Unicode NFC normalization.
    """
    symbols = "".join(decoder[code] for code in vector if code != 0)
    return normalize("NFC", symbols)

In [13]:
# define a function for comparing items

In [14]:
# Width every vectorized spelling is zero-padded to before prediction.
# NOTE(review): presumably the input length the model was trained with — confirm.
longest_string = 23

def validate_model(phenomena):
    """Print the model's prediction for each test pair and the overall accuracy.

    Parameters
    ----------
    phenomena : sequence of [spelling, pronunciation] pairs
        Index 0 is the spelling fed to the model, index 1 the expected
        pronunciation that the decoded prediction is compared against.

    Behaviour
    ---------
    * Spellings found in the module-level `training_set` are reported and
      skipped; they do not count towards the accuracy.
    * Every other spelling is vectorized with `vec`, right-padded with zeros to
      `longest_string`, passed to `model.predict`, decoded with
      `unvectorize_norm_pad`, and compared to the expected pronunciation.
    * A result line is printed per evaluated item, followed by the accuracy
      over the evaluated items. When nothing was evaluated (empty input, or
      every item skipped) "n/a" is printed instead of dividing by zero.

    Returns
    -------
    None
    """
    print("  {:15} {:15} {:15}".format("Spelling", "Pronunciation", "Predicted Pronunciation"))

    correct = 0
    skipped = 0
    total = len(phenomena)

    for instance in phenomena:
        spelling = instance[0]
        pronunciation = instance[1]

        # never score the model on something it may have seen during training
        if spelling in training_set:
            print()
            print(f"{spelling} was in the training set. skipping.")
            print()
            skipped += 1
            continue

        spell_vec = vec.vectorize(spelling)

        # right-pad with zeros up to the fixed width, then add a leading batch axis of size 1
        zeros_to_pad = longest_string - len(spell_vec)
        spell_padded = np.pad(spell_vec, (0, zeros_to_pad), "constant")
        spell_padded = spell_padded.reshape(1, spell_padded.shape[0])

        # first (only) batch item; argmax along axis 1 yields one integer id per row,
        # which unvectorize_norm_pad maps back to symbols
        prediction = model.predict(spell_padded)[0].argmax(axis=1)
        predicted_pronunciation = unvectorize_norm_pad(prediction)

        image = "☒"
        if pronunciation == predicted_pronunciation:
            correct += 1
            image = "☑"

        # NOTE(review): ad-hoc column alignment; looks like it compensates for
        # double-width Hangul glyphs — confirm before changing
        justify = 16 + len(predicted_pronunciation) - len(pronunciation)*2
        print("{} {:10} {:>5} {}".format(image, spelling, pronunciation, predicted_pronunciation.rjust(justify)))

    evaluated = total - skipped

    print()
    if evaluated == 0:
        # empty input or everything skipped: there is no accuracy to report
        print("Accuracy: n/a (no items were evaluated)")
    else:
        print("Accuracy: {:.2f}%".format(100 * correct / evaluated))

In [15]:
#

In [16]:
# test the phenomena

In [17]:
# resyllabification

"When a syllable-final consonant is followed without pause by a vowel in the following syllable, that consonant is carried over to the following syllable to function as its initial consonant in pronunciation. The following syllable may be a part of a suffix or another word. This linking of syllable-final consonant to following syllable in pronunciation is...resyllabification." Cho et al. (2009). Integrated Korean

In [18]:
# NOTE: additional phenomena tested are indicated via line comments

In [19]:
# Test items for validate_model(): each entry is [spelling, expected pronunciation].
resyllabification = [
    ["한글은", "한그른"],
    ["읽어요", "일거요"],
    ["책을", "채글"],
    ["알았어요", "아라써요"], 
    ["질문이", "질무니"],      
    ["있어요", "이써요"],
    ["읽어", "일거"],
    ["들으세요", "드르세요"],
    ["맞았어요", "마자써요"],
    ["앉으세요", "안즈세요"],
    ["천만에요", "천마네요"],
    ["책이", "채기"],
    ["없어요", "업서요"],  # NOTE(review): standard pronunciation may be 업써요 (ㅅ tensed after ㅂ) — confirm the intended gold label
    ["백화점에", "배콰저메"], # aspiration and ㅎ weakening
    ["갔어요", "가써요"],
    ["옷을", "오슬"],
    ["받았어요", "바다써요"],
    ["꽃은", "꼬츤"],
    ["잎이", "이피"],
    ["같아요", "가타요"],
    ["갔어", "가써"],
    ["낮에", "나제"],
    ["빛이", "비치"],
    ["부엌에", "부어케"],
    ["낚아요", "나까요"],
    ["꽃이", "꼬치"]
]

In [20]:
validate_model(resyllabification)

  Spelling        Pronunciation   Predicted Pronunciation
☒ 한글은          한그른           한글ᅳᅳᆫ
☑ 읽어요          일거요           일거요
☑ 책을            채글             채글
☒ 알았어요        아라써요         아라어요
☑ 질문이          질무니           질무니
☒ 있어요          이써요           읻ᅥ요

읽어 was in the training set. skipping.

☑ 들으세요        드르세요         드르세요
☒ 맞았어요        마자써요         마자꺼ᄋ
☑ 앉으세요        안즈세요         안즈세요
☒ 천만에요        천마네요         천다ᄂᄋ요

책이 was in the training set. skipping.

☒ 없어요          업서요           업:ᅥᄋ요
☒ 백화점에        배콰저메         배콰점ᅦ
☒ 갔어요          가써요           가더요
☒ 옷을            오슬             온ᅳᆯ
☒ 받았어요        바다써요         바다어요
☑ 꽃은            꼬츤             꼬츤

잎이 was in the training set. skipping.

☑ 같아요          가타요           가타요
☒ 갔어            가써             가서
☒ 낮에            나제             나ᄌᄌ

빛이 was in the training set. skipping.

☒ 부엌에          부어케           부어커
☑ 낚아요          나까요           나까요

꽃이 was in the training set. skipping.


Accuracy: 38.10%


In [21]:
# syllable-final closure (unrelease)

"At the end of a word or before a consonant, all Korean consonants are pronounced with closure of the speech organs involved, that is, without releasing air. As a result, sound changes occur in consonants in word-final or pre-consonantal position. For example, 꽃은 'as for flowers' is pronounced without any change in ㅊ because the word 꽃 'flower' is immediately followed by the vowel-initial particle 은 'as for'. However, 꽃 'flower' and 꽃도 'flower also' are pronounced 꼳 and 꼳또 respectively. The change of ㅊ to ㄷ here happens because the speech organs (the tongue and the hard palate) responsible for the articulation of the word-final and pre-consonantal ㅊ are not released." 

Lips: ㅂ, ㅍ ==> ㅂ

Gum ridge and hard palate: ㄷ, ㅌ, ㅅ, ㅆ, ㅈ, ㅊ ==> ㄷ

Soft palate: ㄱ, ㅋ, ㄲ ==> ㄱ

Cho et al. (2009). Integrated Korean

In [22]:
# Test items for validate_model(): each entry is [spelling, expected pronunciation].
unrelease = [
    ["꽃도", "꼳또"],
    ["잎과", "입꽈"],
    ["옷도", "옫또"],
    ["갔지", "갇찌"],
    ["낮과", "낟꽈"],
    ["빛조차", "빋쪼차"],
    ["부엌바닥", "부억빠닥"],
    ["낚다가", "낙따가"],
    ["꽃씨", "꼳씨"]
]

In [23]:
validate_model(unrelease)

  Spelling        Pronunciation   Predicted Pronunciation

꽃도 was in the training set. skipping.

☑ 잎과            입꽈             입꽈

옷도 was in the training set. skipping.

☑ 갔지            갇찌             갇찌
☑ 낮과            낟꽈             낟꽈
☒ 빛조차          빋쪼차           빋쪼자
☑ 부엌바닥        부억빠닥         부억빠닥
☑ 낚다가          낙따가           낙따가

꽃씨 was in the training set. skipping.


Accuracy: 83.33%


In [24]:
# nasal assimilation

"All plosive and fricative consonants become the corresponding nasal consonants before a nasal consonant (ㅁ, ㄴ). Notice that even ㅎ is included in the change."

ㅂ, ㅍ ==> ㅁ

ㄷ, ㅌ, ㅅ, ㅆ, ㅈ, ㅊ, ㅎ ==> ㄴ

ㄱ, ㅋ, ㄲ ==> ㅇ

Cho et al. (2009). Integrated Korean

In [25]:
# Test items for validate_model(): each entry is [spelling, expected pronunciation].
# Trailing line comments name an additional phenomenon the item also exercises.
nasal_assimilation = [
    ["입만", "임만"],
    ["앞문", "암문"],
    ["받는다", "반는다"],
    ["끝나다", "끈나다"],
    ["몇년", "면년"],
    ["있는데", "인는데"],
    ["일학년", "일항년"],
    ["낳는다", "난는다"],
    ["모르겠습니다", "모르겓씀니다"], # unrelease
    ["한국말로", "한궁말로"],
    ["합니까", "함니까"],
    ["끝내겠습니다", "끈내겓씀니다"]  # unrelease
]

In [26]:
validate_model(nasal_assimilation)

  Spelling        Pronunciation   Predicted Pronunciation

입만 was in the training set. skipping.


앞문 was in the training set. skipping.

☒ 받는다          반는다           반든다

끝나다 was in the training set. skipping.

☑ 몇년            면년             면년
☑ 있는데          인는데           인는데
☑ 일학년          일항년           일항년
☑ 낳는다          난는다           난는다
☒ 모르겠습니다     모르겓씀니다     모르겓쓰니다
☒ 한국말로        한궁말로         한:궁망로
☑ 합니까          함니까           함니까
☒ 끝내겠습니다     끈내겓씀니다     끔내걷씀니다

Accuracy: 55.56%
