In [1]:
import numpy as np
import pandas as pd
from keras.models import load_model
from jamotools import Vectorizationer, rules
from unicodedata import normalize

In [2]:
# validate the model on various phonological phenomena

In [3]:
# first load the model
# NOTE: load_model() loads a fully compiled model identical to the one saved using .save()

In [4]:
# Load the saved Keras model from the .h5 file (path is relative to the working directory);
# `model` is the object validate_model() calls .predict() on.
model = load_model("pronunciation_prediction.h5")

In [5]:
# load the reference DataFrame to ensure that the model wasn't trained on the entries it's being given for evaluation

In [6]:
# The reference file is tab-separated; preview the first rows to check the columns
# (the `spelling` column is the one used below).
ref = pd.read_csv("reference_all.csv", sep="\t")
ref.head()

Unnamed: 0,entry_id,word_id,spelling,pronunciation
0,13943,역사학,역사학,역싸학
1,13943,역사학,역사학이,역싸하기
2,13943,역사학,역사학도,역싸학또
3,13943,역사학,역사학만,역싸항만
4,13957,시대적2,시대적,시대적


In [7]:
# only the spelling column is needed; store it as a set, since sets (like dicts) are implemented as hash tables underneath and give fast membership checks;
# additionally, using a set will remove duplicates from the spelling column created when alternative pronunciations were
# added (e.g., spellings with multiple pronunciations delimited by forward slashes and split during preprocessing)

In [8]:
# Rows whose spelling already appeared in an earlier row (first occurrence is not flagged).
ref.loc[ref["spelling"].duplicated()].head()

Unnamed: 0,entry_id,word_id,spelling,pronunciation
5,13958,시대적1,시대적,시대적
65,13998,신기하다2,신기하다,신기하다
66,13998,신기하다2,신기한,신기한
67,13998,신기하다2,신기하여,신기하여
68,13998,신기하다2,신기해,신기해


In [9]:
# Show every reference row that shares the spelling "시대적".
ref.loc[ref["spelling"] == "시대적"]

Unnamed: 0,entry_id,word_id,spelling,pronunciation
4,13957,시대적2,시대적,시대적
5,13958,시대적1,시대적,시대적


In [10]:
# Unique spellings from the reference data; validate_model() skips any test item found here.
training_set = set(ref["spelling"])

In [11]:
# instantiate the vectorizer for decoding predictions

In [12]:
# Vectorizer used both to encode spellings for the model (vec.vectorize) and, through its
# symbol_map, to decode predictions back to text.
vec = Vectorizationer(rule=rules.RULE_1, max_length=None)

In [13]:
# define a function for unvectorizing predictions

In [14]:
# Inverse of the vectorizer's symbol map: numeric code -> symbol.
decoder = dict((code, symbol) for symbol, code in vec.symbol_map.items())


def unvectorize_norm_pad(vector):
    """Turn a sequence of numeric codes back into an NFC-normalized string.

    Codes equal to 0 are treated as padding and dropped; every other code is
    looked up in ``decoder`` (a missing code raises ``KeyError``). The decoded
    symbols are concatenated and composed with Unicode NFC normalization.
    """
    decoded = "".join(decoder[code] for code in vector if code != 0)
    return normalize("NFC", decoded)

In [15]:
# define a function for comparing items

In [16]:
# Length every vectorized spelling is zero-padded to before prediction
# (presumably the model's fixed input length -- confirm against the training notebook).
longest_string = 23


def validate_model(phenomena, training_set=training_set):
    """Print per-word predictions and summary accuracies for a list of test items.

    Parameters
    ----------
    phenomena : iterable of [spelling, pronunciation, ipa]
        Test items; index 0 is the spelling fed to the model, index 1 the
        expected pronunciation, index 2 an IPA transcription (display only).
    training_set : container of str
        Spellings to exclude from evaluation; any item whose spelling is in
        this container is reported and skipped.

    Notes
    -----
    Items whose vectorized spelling is longer than ``longest_string`` cannot
    be padded to the model input and are reported and skipped as well.
    Accuracy lines are only computed when their denominator is non-zero.
    Nothing is returned; all results are printed.
    """
    print("   {:18} {:15} {:15} {:20}".format("IPA", "Spelling", "Pronunciation", "Predicted Pronunciation"))

    perfect_correct = 0
    syllables_correct = 0
    words_evaluated = 0
    syllables_considered = 0

    for instance in phenomena:
        spelling = instance[0]
        pronunciation = instance[1]
        ipa = instance[2]

        if spelling in training_set:
            print()
            print(f"{spelling} was in the training set. skipping.")
            print()
            continue

        spell_vec = vec.vectorize(spelling)
        zeros_to_pad = longest_string - len(spell_vec)

        # np.pad rejects negative widths with an unhelpful error; report the
        # over-long item and move on instead of aborting the whole run.
        if zeros_to_pad < 0:
            print()
            print(f"{spelling} is longer than the model input "
                  f"({len(spell_vec)} > {longest_string}). skipping.")
            print()
            continue

        # Right-pad with zeros and add a leading batch axis of size 1.
        spell_padded = np.pad(spell_vec, (0, zeros_to_pad), "constant")
        spell_padded = spell_padded.reshape(1, spell_padded.shape[0])

        # Take the highest-scoring symbol at each position of the single prediction.
        prediction = model.predict(spell_padded)[0].argmax(axis=1)
        predicted_pronunciation = unvectorize_norm_pad(prediction)

        words_evaluated += 1
        num_syllables = len(pronunciation)
        syllables_considered += num_syllables

        if pronunciation == predicted_pronunciation:
            perfect_correct += 1
            syllables_correct += num_syllables
            image = "☑"
        else:
            image = "☒"
            # NOTE: zip stops at the shorter string, so syllables beyond the
            #       shorter of the two strings are never counted as correct
            syllables_correct += sum(
                1 for expected, predicted in zip(pronunciation, predicted_pronunciation)
                if expected == predicted
            )

        # Width heuristic for lining up the last column (presumably compensating
        # for Hangul rendering wider than Latin characters -- confirm).
        justification = 16 + len(predicted_pronunciation) - len(pronunciation)*2
        print("{} {:20} {:10} {:>5} {}".format(image, ipa, spelling, pronunciation,
                                               predicted_pronunciation.rjust(justification)))

    print()
    if words_evaluated == 0:
        # Every item was skipped: there is nothing to average over.
        print("No words were evaluated (all items were skipped); accuracy is undefined.")
        return

    print("Perfect Accuracy:       {:10.2f}%".format(100 * perfect_correct/words_evaluated))
    if syllables_considered:
        print("Syllable Accuracy:      {:10.2f}%".format(100 * syllables_correct/syllables_considered))
    else:
        print("Syllable Accuracy:             n/a")
    # TODO: segment-level accuracy is not implemented; this line is a placeholder.
    print("Segment (Letter) Accuracy:   XX.XX%")

In [17]:
#

In [18]:
# test the phenomena

In [19]:
# resyllabification

"When a syllable-final consonant is followed without pause by a vowel in the following syllable, that consonant is carried over to the following syllable to function as its initial consonant in pronunciation. The following syllable may be a part of a suffix or another word. This linking of syllable-final consonant to following syllable in pronunciation is...resyllabification." Cho et al. (2009). Integrated Korean

In [20]:
# NOTE: additional phenomena tested are indicated via line comments

In [21]:
# Test items for resyllabification. Each entry is [spelling, expected pronunciation, IPA],
# the order in which validate_model() reads them (index 0, 1, 2).
resyllabification = [
    ["한글은", "한그른", "hangɯrɯn"],
    ["읽어요", "일거요", "ilgʌyo"],
    ["책을", "채글", "tʃʰɛgɯl"],
    ["알았어요", "아라써요", "aras̕ʌyo"], 
    ["질문이", "질무니", "tʃilmuɲi"],      
    ["있어요", "이써요", "is̕ʌyo"],
    ["읽어", "일거", "ilgʌ"],
    ["들으세요", "드르세요", "tɯrɯsʰeyo"],
    ["맞았어요", "마자써요", "madʒas̕ʌyo"],
    ["앉으세요", "안즈세요", "andʒɯsʰeyo"],
    ["천만에요", "천마네요", "tʃʰʌnmaneyo"],
    ["책이", "채기", "tʃʰɛgi"],
    ["없어요", "업써요", "ʌp̚s̕ʌyo"],
    ["백화점에", "배콰저메", "pɛkʰwadʒʌme"], # aspiration and ㅎ weakening
    ["갔어요", "가써요", "kas̕ʌyo"],
    ["옷을", "오슬", "osʰɯl"],
    ["받았어요", "바다써요", "padas̕ʌyo"],
    ["꽃은", "꼬츤", "k̕otʃʰɯn"],
    ["잎이", "이피", "ipʰi"],
    ["같아요", "가타요", "katʰayo"],
    ["갔어", "가써", "kas̕ʌ"],
    ["낮에", "나제", "nadʒe"],
    ["빛이", "비치", "pitʃʰi"],
    ["부엌에", "부어케", "puʌkʰe"],
    ["낚아요", "나까요", "nak̕ayo"],
    ["꽃이", "꼬치", "k̕otʃʰi"]
]

In [22]:
# Run the resyllabification items through the model; prints one line per item plus summary accuracy.
validate_model(resyllabification)

   IPA                Spelling        Pronunciation   Predicted Pronunciation
☒ hangɯrɯn             한글은          한그른           한글ᅳᅳᆫ
☑ ilgʌyo               읽어요          일거요           일거요
☑ tʃʰɛgɯl              책을            채글             채글
☒ aras̕ʌyo             알았어요        아라써요         아라어요
☑ tʃilmuɲi             질문이          질무니           질무니
☒ is̕ʌyo               있어요          이써요           읻ᅥ요

읽어 was in the training set. skipping.

☑ tɯrɯsʰeyo            들으세요        드르세요         드르세요
☒ madʒas̕ʌyo           맞았어요        마자써요         마자꺼ᄋ
☑ andʒɯsʰeyo           앉으세요        안즈세요         안즈세요
☒ tʃʰʌnmaneyo          천만에요        천마네요         천다ᄂᄋ요

책이 was in the training set. skipping.

☒ ʌp̚s̕ʌyo             없어요          업써요           업:ᅥᄋ요
☒ pɛkʰwadʒʌme          백화점에        배콰저메         배콰점ᅦ
☒ kas̕ʌyo              갔어요          가써요           가더요
☒ osʰɯl                옷을            오슬             온ᅳᆯ
☒ padas̕ʌyo            받았어요        바다써요         바다어요
☑ k̕otʃʰɯn             꽃은      

In [23]:
# syllable-final closure (unrelease)

"At the end of a word or before a consonant, all Korean consonants are pronounced with closure of the speech organs involved, that is, without releasing air. As a result, sound changes occur in consonants in word-final or pre-consonantal position. For example, 꽃은 'as for flowers' is pronounced without any change in ㅊ because the word 꽃 'flower' is immediately followed by the vowel-initial particle 은 'as for'. However, 꽃 'flower' and 꽃도 'flower also' are pronounced 꼳 and 꼳또 respectively. The change of ㅊ to ㄷ here happens because the speech organs (the tongue and the hard palate) responsible for the articulation of the word-final and pre-consonantal ㅊ are not released." 

Lips: ㅂ, ㅍ ==> ㅂ

Gum ridge and hard palate: ㄷ, ㅌ, ㅅ, ㅆ, ㅈ, ㅊ ==> ㄷ

Soft palate: ㄱ, ㅋ, ㄲ ==> ㄱ

Cho et al. (2009). Integrated Korean

In [24]:
# Test items for syllable-final closure. Each entry is [spelling, expected pronunciation, IPA],
# the order in which validate_model() reads them (index 0, 1, 2).
unrelease = [
    ["꽃", "꼳", "k̕ot̚"],
    ["꽃도", "꼳또", "k̕ot̚t̕o"],              # tensification
    ["잎", "입", "ip̚"],
    ["잎과", "입꽈", "ip̚k̕wa"],              # tensification
    ["옷", "옫", "ot̚"],
    ["옷도", "옫또", "ot̚t̕o"],               # tensification
    ["갔다", "갇따", "kat̚t̕a"],              # tensification
    ["갔지", "갇찌", "kat̚tʃ̕i"],             # tensification
    ["밤낮", "밤낟", "pamnat̚"],
    ["낮과", "낟꽈", "nat̚k̕wa"],             # tensification
    ["빛", "빋", "pit̚"],
    ["빛조차", "빋쪼차", "pit̚tʃ̕otʃʰa"],      # tensification
    ["부엌", "부억", "puʌk̚"],
    ["부엌바닥", "부억빠닥", "puʌk̚ p̕adak̚"],  # tensification
    ["낚시", "낙씨", "nak̚ɕ̕i"],
    ["낚다가", "낙따가", "nak̚t̕aga"],        # tensification
    ["꽃씨", "꼳씨", "k̕ot̚ɕ̕i"]              # tensification
]

In [25]:
# Run the syllable-final closure items through the model; prints one line per item plus summary accuracy.
validate_model(unrelease)

   IPA                Spelling        Pronunciation   Predicted Pronunciation

꽃 was in the training set. skipping.


꽃도 was in the training set. skipping.


잎 was in the training set. skipping.

☑ ip̚k̕wa              잎과            입꽈             입꽈

옷 was in the training set. skipping.


옷도 was in the training set. skipping.

☑ kat̚t̕a              갔다            갇따             갇따
☑ kat̚tʃ̕i             갔지            갇찌             갇찌

밤낮 was in the training set. skipping.

☑ nat̚k̕wa             낮과            낟꽈             낟꽈

빛 was in the training set. skipping.

☒ pit̚tʃ̕otʃʰa         빛조차          빋쪼차           빋쪼자

부엌 was in the training set. skipping.

☑ puʌk̚ p̕adak̚        부엌바닥        부억빠닥         부억빠닥

낚시 was in the training set. skipping.

☑ nak̚t̕aga            낚다가          낙따가           낙따가

꽃씨 was in the training set. skipping.


Perfect Accuracy:            85.71%
Syllable Accuracy:           94.44%
Segment (Letter) Accuracy:   XX.XX%


In [26]:
# nasal assimilation

"All plosive and fricative consonants become the corresponding nasal consonants before a nasal consonant (ㅁ, ㄴ). Notice that even ㅎ is included in the change."

ㅂ, ㅍ ==> ㅁ

ㄷ, ㅌ, ㅅ, ㅆ, ㅈ, ㅊ, ㅎ ==> ㄴ

ㄱ, ㅋ, ㄲ ==> ㅇ

Cho et al. (2009). Integrated Korean

In [27]:
# Test items for nasal assimilation. Each entry is [spelling, expected pronunciation, IPA],
# the order in which validate_model() reads them (index 0, 1, 2).
nasal_assimilation = [
    ["입만", "임만", "imman"],
    ["앞문", "암문", "ammun"],
    ["없나요", "엄나요", "ʌmnayo"],
    ["받는다", "반는다", "pannɯnda"],
    ["끝나다", "끈나다", "k̕ɯnnada"],
    ["몇년", "면년", "myʌn ɲyʌn"],
    ["있는데", "인는데", "innɯnde"],
    ["일학년", "일항년", "il haŋɲyʌn"],
    ["낳는다", "난는다", "nannɯnda"],
    ["모르겠습니다", "모르겓씀니다", "morɯget̚s̕ɯmɲida"],  # unrelease, tensification
    ["한국말로", "한궁말로", "hanguŋmallo"],
    ["합니까", "함니까", "hamɲik̕a"],
    ["끝내겠습니다", "끈내겓씀니다", "k̕ɯnnɛget̚s̕ɯmɲida"]  # unrelease, tensification
]

In [28]:
# Run the nasal assimilation items through the model; prints one line per item plus summary accuracy.
validate_model(nasal_assimilation)

   IPA                Spelling        Pronunciation   Predicted Pronunciation

입만 was in the training set. skipping.


앞문 was in the training set. skipping.

☒ ʌmnayo               없나요          엄나요           엄:ᅡ아
☒ pannɯnda             받는다          반는다           반든다

끝나다 was in the training set. skipping.

☑ myʌn ɲyʌn            몇년            면년             면년
☑ innɯnde              있는데          인는데           인는데
☑ il haŋɲyʌn           일학년          일항년           일항년
☑ nannɯnda             낳는다          난는다           난는다
☒ morɯget̚s̕ɯmɲida     모르겠습니다     모르겓씀니다     모르겓쓰니다
☒ hanguŋmallo          한국말로        한궁말로         한:궁망로
☑ hamɲik̕a             합니까          함니까           함니까
☒ k̕ɯnnɛget̚s̕ɯmɲida   끝내겠습니다     끈내겓씀니다     끔내걷씀니다

Perfect Accuracy:            50.00%
Syllable Accuracy:           75.00%
Segment (Letter) Accuracy:   XX.XX%


In [29]:
# ㄴ to ㄹ assimilation

"When ㄹ and ㄴ come together, the ㄴ sound is usually replaced by the ㄹ sound, as in 칠 년: '칠련'. When ㄹ is followed by the vowel 'i' ('이') or the semivowel 'y' (e.g., '야') in some compound words, another ㄹ is inserted between them, as in 물약: '물략'." Cho et al. (2009). Integrated Korean



In [30]:
# Test items for ㄴ-to-ㄹ assimilation. Each entry is [spelling, expected pronunciation, IPA],
# the order in which validate_model() reads them (index 0, 1, 2).
rieul_assimilation = [
    # IPA corrected: ㅊ is transcribed tʃʰ elsewhere in this notebook, and the
    # word boundary is a single space as in the parallel entry for 팔년.
    ["칠년", "칠련", "tʃʰil lyʌn"],
    ["물약", "물략", "mullyak̚"],      # unrelease
    ["진리", "질리", "tʃilli"],
    ["신라", "실라", "ɕilla"],
    ["전라도", "절라도", "tʃʌllado"],
    ["달님", "달림", "tallim"],
    ["팔년", "팔련", "pʰal lyʌn"],
    ["서울역", "서울력", "sʰʌullyʌk̚"], # unrelease
    ["길이름", "길리름", "killirɯm"]
]

In [31]:
# Run the ㄴ-to-ㄹ assimilation items through the model; prints one line per item plus summary accuracy.
validate_model(rieul_assimilation)

   IPA                Spelling        Pronunciation   Predicted Pronunciation
☑ tʃil  lyʌn           칠년            칠련             칠련

물약 was in the training set. skipping.


진리 was in the training set. skipping.


신라 was in the training set. skipping.


전라도 was in the training set. skipping.


달님 was in the training set. skipping.

☑ pʰal lyʌn            팔년            팔련             팔련

서울역 was in the training set. skipping.

☒ killirɯm             길이름          길리름           기리름

Perfect Accuracy:            66.67%
Syllable Accuracy:           85.71%
Segment (Letter) Accuracy:   XX.XX%


In [32]:
# tensification

"When a plain plosive consonant (ㅂ, ㄷ, ㅈ, ㄱ) or the fricative consonant ㅅ is preceded by a plosive or fricative consonant, it is reinforced to become a corresponding tense consonant, as in 몇번 \[멷뻔\] (careful speech) or \[며뻔\] (casual speech), 학생 \[학쌩\] and 없다 \[업따\].... Tensification also occurs in compound nouns." Cho et al. (2009). Integrated Korean