In [1]:
import numpy as np
from keras.models import load_model
from jamotools import Vectorizationer, rules
from unicodedata import normalize

In [2]:
# validate the model on various phonological phenomena

In [3]:
# first load the model
# NOTE: load_model() loads a fully compiled model identical to the one saved using .save()

In [4]:
model = load_model("pronunciation_prediction.h5")

In [5]:
# instantiate the vectorizer for decoding predictions

In [6]:
vec = Vectorizationer(rule=rules.RULE_1, max_length=None)

In [7]:
# define a function for unvectorizing predictions

In [8]:
decoder = {v: k for k, v in vec.symbol_map.items()}

def unvectorize_norm_pad(vector):
    temp_list = [decoder[num] for num in vector if num != 0]
    temp_string = "".join(temp_list)
    return normalize("NFC", temp_string)

In [9]:
# define a function for comparing items

In [10]:
longest_string = 23

def validate_model(phenomena):
    
    print("{:15} {:15} {:15}".format("Spelling", "Pronunciation", "Predicted Pronunciation"))
    
    correct = 0
    total = len(phenomena)
    
    for instance in phenomena:
        spelling = instance[0]
        pronunciation = instance[1]
        
        spell_vec = vec.vectorize(spelling)
        
        zeros_to_pad = longest_string - len(spell_vec)
        
        spell_padded = np.pad(spell_vec, (0, zeros_to_pad), "constant")

        spell_padded = spell_padded.reshape(1, spell_padded.shape[0])
        
        prediction = model.predict(spell_padded)[0].argmax(axis=1)
        
        predicted_pronunciation = unvectorize_norm_pad(prediction)
        
        if pronunciation == predicted_pronunciation:
            correct += 1
        
        justify = 16 + len(predicted_pronunciation) - len(pronunciation)*2
        print("{:10} {:>5} {}".format(spelling, pronunciation, predicted_pronunciation.rjust(justify)))
        
    print()
    print("Accuracy: {:.2f}%".format(100 * correct/total))

"When a syllable-final consonant is followed without pause by a vowel in the following syllable, that consonant is carried over to the following syllable to function as its initial consonant in pronunciation. The following syllable may be a part of a suffix or another word. This linking of syllable-final consonant to following syllable in pronunciation is...resyllabification." Cho et al. (2009). Integrated Korean

In [11]:
resyllabification = [
    ["책을", "채글"],
    ["질문이", "질무니"],
    ["있어요", "이써요"],
    ["앉으세요", "안즈세요"],
    ["책이", "채기"],
    ["없어요", "업서요"],
    ["백화점에", "백화저메"],
    ["갔어요", "가써요"],
    ["알았어요", "아라써요"],
    ["읽어", "일거"],
    ["맞았어요", "마자써요"],
    ["천만에요", "천만네요"],
    ["옷을", "오슬"],
    ["받았어요", "바다써요"]
]

In [12]:
validate_model(resyllabification)

Spelling        Pronunciation   Predicted Pronunciation
책을            채글             채글
질문이          질무니           질무니
있어요          이써요           읻ᅥ요
앉으세요        안즈세요         안즈세요
책이            채기             채기
없어요          업서요           업:ᅥᄋ요
백화점에        백화저메         배콰점ᅦ
갔어요          가써요           가더요
알았어요        아라써요         아라어요
읽어            일거             일거
맞았어요        마자써요         마자꺼ᄋ
천만에요        천만네요         천다ᄂᄋ요
옷을            오슬             온ᅳᆯ
받았어요        바다써요         바다어요

Accuracy: 35.71%


-------------------------------------------------------------------------------------------------------------------------------

"At the end of a word or before a consonant, all Korean consonants are pronounced with closure of the speech organs involved, that is, without releasing air. As a result, sound changes occur in consonants in word-final or pre-consonantal position. For example, 꽃은 'as for flowers' is pronounced without any change in ㅊ because the word 꽃 'flower' is immediately followed by the vowel-initial particle 은 'as for'. However, 꽃 'flower' and 꽃도 'flower also' are pronounced 꼳 and 꼳또 respectively. The change of ㅊ to ㄷ here happens because the speech organs (the tongue and the hard palate) responsible for the articulation of the word-final and pre-consonantal ㅊ are not released." 

Lips: ㅂ, ㅍ --> ㅂ

Gum ridge and hard palate: ㄷ, ㅌ, ㅅ, ㅆ, ㅈ, ㅊ --> ㄷ

Soft palate: ㄱ, ㅋ, ㄲ --> ㄱ

Cho et al. (2009). Integrated Korean

In [13]:
unrelease = [
    ["잎과", "입꽈"],
    ["옷도", "옫또"],
    ["갔지", "갇찌"],
    ["낮과", "낟꽈"],
    ["빛조차", "빋쪼차"],
    ["부엌바닥", "부억빠닥"],
    ["낚다가", "낙따가"],
    ["꽃씨", "꼳씨"]
]

In [14]:
validate_model(unrelease)

Spelling        Pronunciation   Predicted Pronunciation
잎과            입꽈             입꽈
옷도            옫또             옫또
갔지            갇찌             갇찌
낮과            낟꽈             낟꽈
빛조차          빋쪼차           빋쪼자
부엌바닥        부억빠닥         부억빠닥
낚다가          낙따가           낙따가
꽃씨            꼳씨             꼳씨

Accuracy: 87.50%
