In [1]:
import pandas as pd
import numpy as np
from jamotools import split_syllables, Vectorizationer, join_jamos, rules
from unicodedata import normalize

In [2]:
# Load the tab-separated spelling/pronunciation table, then show its row count and first rows.
data = pd.read_csv("korean_phonetic_spelling.csv", sep="\t")
print(len(data))
data.head()

10120


Unnamed: 0,spelling,pronunciation
0,가게,가ː게
1,가격,가격
2,가구,가구
3,가까이,가까이
4,가깝다,가깝따


In [3]:
# test jamotools

In [4]:
store = data.iloc[0]["spelling"]
store

'가게'

In [5]:
split_syllables(store)

'ㄱㅏㄱㅔ'

In [6]:
# test with string with punctuation

In [7]:
store_pron = data.iloc[0]["pronunciation"]
store_pron

'가ː게'

In [8]:
split_syllables(store_pron)

'ㄱㅏːㄱㅔ'

In [9]:
# add these representations to the DataFrame

In [10]:
data["split_spelling"] = data.spelling.apply(split_syllables)

In [11]:
data.head()

Unnamed: 0,spelling,pronunciation,split_spelling
0,가게,가ː게,ㄱㅏㄱㅔ
1,가격,가격,ㄱㅏㄱㅕㄱ
2,가구,가구,ㄱㅏㄱㅜ
3,가까이,가까이,ㄱㅏㄲㅏㅇㅣ
4,가깝다,가깝따,ㄱㅏㄲㅏㅂㄷㅏ


In [12]:
data["split_pronunciation"] = data.pronunciation.apply(split_syllables)

In [13]:
data.head()

Unnamed: 0,spelling,pronunciation,split_spelling,split_pronunciation
0,가게,가ː게,ㄱㅏㄱㅔ,ㄱㅏːㄱㅔ
1,가격,가격,ㄱㅏㄱㅕㄱ,ㄱㅏㄱㅕㄱ
2,가구,가구,ㄱㅏㄱㅜ,ㄱㅏㄱㅜ
3,가까이,가까이,ㄱㅏㄲㅏㅇㅣ,ㄱㅏㄲㅏㅇㅣ
4,가깝다,가깝따,ㄱㅏㄲㅏㅂㄷㅏ,ㄱㅏㄲㅏㅂㄸㅏ


In [14]:
# find the longest sequence for each column

In [15]:
data.split_spelling.str.len().max()

16

In [16]:
data.split_pronunciation.str.len().max()

17

In [17]:
data[data.split_spelling.str.len() == 16]

Unnamed: 0,spelling,pronunciation,split_spelling,split_pronunciation
3004,불만족스럽다,불만족쓰럽따,ㅂㅜㄹㅁㅏㄴㅈㅗㄱㅅㅡㄹㅓㅂㄷㅏ,ㅂㅜㄹㅁㅏㄴㅈㅗㄱㅆㅡㄹㅓㅂㄸㅏ
6087,들락날락하다,들랑날라카다,ㄷㅡㄹㄹㅏㄱㄴㅏㄹㄹㅏㄱㅎㅏㄷㅏ,ㄷㅡㄹㄹㅏㅇㄴㅏㄹㄹㅏㅋㅏㄷㅏ
6227,먹음직스럽다,머금직쓰럽따,ㅁㅓㄱㅇㅡㅁㅈㅣㄱㅅㅡㄹㅓㅂㄷㅏ,ㅁㅓㄱㅡㅁㅈㅣㄱㅆㅡㄹㅓㅂㄸㅏ
6402,믿음직스럽다,미듬직쓰럽따,ㅁㅣㄷㅇㅡㅁㅈㅣㄱㅅㅡㄹㅓㅂㄷㅏ,ㅁㅣㄷㅡㅁㅈㅣㄱㅆㅡㄹㅓㅂㄸㅏ
7659,울퉁불퉁하다,울퉁불퉁하다,ㅇㅜㄹㅌㅜㅇㅂㅜㄹㅌㅜㅇㅎㅏㄷㅏ,ㅇㅜㄹㅌㅜㅇㅂㅜㄹㅌㅜㅇㅎㅏㄷㅏ
8170,정정당당하다,정ː정당당하다,ㅈㅓㅇㅈㅓㅇㄷㅏㅇㄷㅏㅇㅎㅏㄷㅏ,ㅈㅓㅇːㅈㅓㅇㄷㅏㅇㄷㅏㅇㅎㅏㄷㅏ


In [18]:
data[data.split_pronunciation.str.len() == 17]

Unnamed: 0,spelling,pronunciation,split_spelling,split_pronunciation
8170,정정당당하다,정ː정당당하다,ㅈㅓㅇㅈㅓㅇㄷㅏㅇㄷㅏㅇㅎㅏㄷㅏ,ㅈㅓㅇːㅈㅓㅇㄷㅏㅇㄷㅏㅇㅎㅏㄷㅏ


In [19]:
# test Vectorizationer

In [20]:
vec = Vectorizationer(rule=rules.RULE_1, max_length=None)

In [21]:
vec.vectorize(store)

array([ 2, 21,  2, 26], dtype=uint8)

In [22]:
vec.vectorize(store_pron)

array([ 2, 21,  1,  2, 26], dtype=uint8)

In [23]:
# check how punctuation and whitespace are handled

In [24]:
vec.vectorize(' .,/()"*:-%')

array([105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115], dtype=uint8)

In [25]:
vec.vectorize("ː")

array([1], dtype=uint8)

In [26]:
# vectorizer appears to work, so perhaps I can use this instead of split_syllables()

In [27]:
data["vec_spelling"] = data.spelling.apply(vec.vectorize)

In [28]:
data["vec_pronunciation"] = data.pronunciation.apply(vec.vectorize)

In [29]:
data.head()

Unnamed: 0,spelling,pronunciation,split_spelling,split_pronunciation,vec_spelling,vec_pronunciation
0,가게,가ː게,ㄱㅏㄱㅔ,ㄱㅏːㄱㅔ,"[2, 21, 2, 26]","[2, 21, 1, 2, 26]"
1,가격,가격,ㄱㅏㄱㅕㄱ,ㄱㅏㄱㅕㄱ,"[2, 21, 2, 27, 42]","[2, 21, 2, 27, 42]"
2,가구,가구,ㄱㅏㄱㅜ,ㄱㅏㄱㅜ,"[2, 21, 2, 34]","[2, 21, 2, 34]"
3,가까이,가까이,ㄱㅏㄲㅏㅇㅣ,ㄱㅏㄲㅏㅇㅣ,"[2, 21, 3, 21, 13, 41]","[2, 21, 3, 21, 13, 41]"
4,가깝다,가깝따,ㄱㅏㄲㅏㅂㄷㅏ,ㄱㅏㄲㅏㅂㄸㅏ,"[2, 21, 3, 21, 58, 5, 21]","[2, 21, 3, 21, 58, 6, 21]"


In [30]:
# there appears to be no built-in way to convert a vector back to text. however, split_syllables() just produces a
# string that join_jamos() can reassemble, so I can probably reverse the symbol_map contained in a Vectorizationer
# object and use it to decode vectors

In [31]:
store

'가게'

In [32]:
split = split_syllables(store)
split

'ㄱㅏㄱㅔ'

In [33]:
type(split)

str

In [34]:
join_jamos(split)

'가게'

In [35]:
dir(vec)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_max_length',
 '_pad_index',
 '_prefix_padding_size',
 '_rule',
 '_symbol_map',
 '_symbols',
 'max_length',
 'pad_index',
 'symbol_map',
 'symbols',
 'vectorize']

In [36]:
vec.symbol_map

{'<PAD>': 0,
 '<UNK>': 1,
 'ᄀ': 2,
 'ᄁ': 3,
 'ᄂ': 4,
 'ᄃ': 5,
 'ᄄ': 6,
 'ᄅ': 7,
 'ᄆ': 8,
 'ᄇ': 9,
 'ᄈ': 10,
 'ᄉ': 11,
 'ᄊ': 12,
 'ᄋ': 13,
 'ᄌ': 14,
 'ᄍ': 15,
 'ᄎ': 16,
 'ᄏ': 17,
 'ᄐ': 18,
 'ᄑ': 19,
 'ᄒ': 20,
 'ᅡ': 21,
 'ᅢ': 22,
 'ᅣ': 23,
 'ᅤ': 24,
 'ᅥ': 25,
 'ᅦ': 26,
 'ᅧ': 27,
 'ᅨ': 28,
 'ᅩ': 29,
 'ᅪ': 30,
 'ᅫ': 31,
 'ᅬ': 32,
 'ᅭ': 33,
 'ᅮ': 34,
 'ᅯ': 35,
 'ᅰ': 36,
 'ᅱ': 37,
 'ᅲ': 38,
 'ᅳ': 39,
 'ᅴ': 40,
 'ᅵ': 41,
 'ᆨ': 42,
 'ᆩ': 43,
 'ᆪ': 44,
 'ᆫ': 45,
 'ᆬ': 46,
 'ᆭ': 47,
 'ᆮ': 48,
 'ᆯ': 49,
 'ᆰ': 50,
 'ᆱ': 51,
 'ᆲ': 52,
 'ᆳ': 53,
 'ᆴ': 54,
 'ᆵ': 55,
 'ᆶ': 56,
 'ᆷ': 57,
 'ᆸ': 58,
 'ᆹ': 59,
 'ᆺ': 60,
 'ᆻ': 61,
 'ᆼ': 62,
 'ᆽ': 63,
 'ᆾ': 64,
 'ᆿ': 65,
 'ᇀ': 66,
 'ᇁ': 67,
 'ᇂ': 68,
 'a': 69,
 'b': 70,
 'c': 71,
 'd': 72,
 'e': 73,
 'f': 74,
 'g': 75,
 'h': 76,
 'i': 77,
 'j': 78,
 'k': 79,
 'l': 80,
 'm': 81,
 'n': 82,
 'o': 83,
 'p': 84,
 'q': 85,
 'r': 86,
 's': 87,
 't': 88,
 'u': 89,
 'v': 90,
 'w': 91,
 'x': 92,
 'y': 93,
 'z': 94,
 '0': 95,
 '1': 96,
 '2': 97,
 '3': 98,
 '4': 99,
 '

In [37]:
# Invert the vectorizer's symbol -> index map into an index -> symbol lookup for decoding vectors.
decoder = {v: k for k, v in vec.symbol_map.items()}

In [38]:
decoder

{0: '<PAD>',
 1: '<UNK>',
 2: 'ᄀ',
 3: 'ᄁ',
 4: 'ᄂ',
 5: 'ᄃ',
 6: 'ᄄ',
 7: 'ᄅ',
 8: 'ᄆ',
 9: 'ᄇ',
 10: 'ᄈ',
 11: 'ᄉ',
 12: 'ᄊ',
 13: 'ᄋ',
 14: 'ᄌ',
 15: 'ᄍ',
 16: 'ᄎ',
 17: 'ᄏ',
 18: 'ᄐ',
 19: 'ᄑ',
 20: 'ᄒ',
 21: 'ᅡ',
 22: 'ᅢ',
 23: 'ᅣ',
 24: 'ᅤ',
 25: 'ᅥ',
 26: 'ᅦ',
 27: 'ᅧ',
 28: 'ᅨ',
 29: 'ᅩ',
 30: 'ᅪ',
 31: 'ᅫ',
 32: 'ᅬ',
 33: 'ᅭ',
 34: 'ᅮ',
 35: 'ᅯ',
 36: 'ᅰ',
 37: 'ᅱ',
 38: 'ᅲ',
 39: 'ᅳ',
 40: 'ᅴ',
 41: 'ᅵ',
 42: 'ᆨ',
 43: 'ᆩ',
 44: 'ᆪ',
 45: 'ᆫ',
 46: 'ᆬ',
 47: 'ᆭ',
 48: 'ᆮ',
 49: 'ᆯ',
 50: 'ᆰ',
 51: 'ᆱ',
 52: 'ᆲ',
 53: 'ᆳ',
 54: 'ᆴ',
 55: 'ᆵ',
 56: 'ᆶ',
 57: 'ᆷ',
 58: 'ᆸ',
 59: 'ᆹ',
 60: 'ᆺ',
 61: 'ᆻ',
 62: 'ᆼ',
 63: 'ᆽ',
 64: 'ᆾ',
 65: 'ᆿ',
 66: 'ᇀ',
 67: 'ᇁ',
 68: 'ᇂ',
 69: 'a',
 70: 'b',
 71: 'c',
 72: 'd',
 73: 'e',
 74: 'f',
 75: 'g',
 76: 'h',
 77: 'i',
 78: 'j',
 79: 'k',
 80: 'l',
 81: 'm',
 82: 'n',
 83: 'o',
 84: 'p',
 85: 'q',
 86: 'r',
 87: 's',
 88: 't',
 89: 'u',
 90: 'v',
 91: 'w',
 92: 'x',
 93: 'y',
 94: 'z',
 95: '0',
 96: '1',
 97: '2',
 98: '3',
 99: '4',
 1

In [39]:
def unvectorize(vector):
    """Map each index in ``vector`` to its symbol via ``decoder`` and concatenate the symbols.

    The string is returned exactly as assembled from the decoder's symbols;
    no Unicode normalization is applied.
    """
    return "".join(decoder[index] for index in vector)

In [40]:
test_vec = vec.vectorize(store)
test_vec

array([ 2, 21,  2, 26], dtype=uint8)

In [41]:
unvectorize(test_vec)

'가게'

In [42]:
# test on the DataFrame

In [43]:
data["unvec_spelling"] = data.vec_spelling.apply(unvectorize)

In [44]:
data["unvec_pronunciation"] = data.vec_pronunciation.apply(unvectorize)

In [45]:
data.head()

Unnamed: 0,spelling,pronunciation,split_spelling,split_pronunciation,vec_spelling,vec_pronunciation,unvec_spelling,unvec_pronunciation
0,가게,가ː게,ㄱㅏㄱㅔ,ㄱㅏːㄱㅔ,"[2, 21, 2, 26]","[2, 21, 1, 2, 26]",가게,가<UNK>게
1,가격,가격,ㄱㅏㄱㅕㄱ,ㄱㅏㄱㅕㄱ,"[2, 21, 2, 27, 42]","[2, 21, 2, 27, 42]",가격,가격
2,가구,가구,ㄱㅏㄱㅜ,ㄱㅏㄱㅜ,"[2, 21, 2, 34]","[2, 21, 2, 34]",가구,가구
3,가까이,가까이,ㄱㅏㄲㅏㅇㅣ,ㄱㅏㄲㅏㅇㅣ,"[2, 21, 3, 21, 13, 41]","[2, 21, 3, 21, 13, 41]",가까이,가까이
4,가깝다,가깝따,ㄱㅏㄲㅏㅂㄷㅏ,ㄱㅏㄲㅏㅂㄸㅏ,"[2, 21, 3, 21, 58, 5, 21]","[2, 21, 3, 21, 58, 6, 21]",가깝다,가깝따


In [46]:
len(data[data.spelling != data.unvec_spelling])

10120

In [47]:
# columns don't appear to match for any of the rows despite visual equality

In [48]:
data.iloc[0].spelling

'가게'

In [49]:
data.iloc[0].unvec_spelling

'가게'

In [50]:
data.iloc[0].spelling == data.iloc[0].unvec_spelling

False

In [51]:
# the two strings look identical but evidently use different unicode representations (code point sequences)

In [52]:
"가게" == "가게"

False

In [53]:
# confirm this via ord() which returns an integer representing the unicode code point

In [54]:
[ord(x) for x in data.iloc[0].spelling]

[44032, 44172]

In [55]:
[ord(x) for x in data.iloc[0].unvec_spelling]

[4352, 4449, 4352, 4454]

In [56]:
# unicodedata is a Python standard-library module. its normalize() function handles situations like this by
# converting a string to a chosen Unicode normal form.
# "NFD", normal form D, is also known as canonical decomposition and translates each character into its decomposed form.
# "NFC", normal form C, first applies the canonical decomposition, then composes pre-composed characters again

In [57]:
normalize("NFD", data.iloc[0].spelling) == normalize("NFD", data.iloc[0].unvec_spelling)

True

In [58]:
# unvec_spelling originally contained strings in "NFD"

In [59]:
[ord(x) for x in normalize("NFD", data.iloc[0].spelling)]

[4352, 4449, 4352, 4454]

In [60]:
# spelling originally contained strings in "NFC"

In [61]:
[ord(x) for x in normalize("NFC", data.iloc[0].spelling)]

[44032, 44172]

In [62]:
# alter the function to produce values in line with the original data

In [63]:
def unvectorize_norm(vector, symbols=None):
    """Decode a vector of symbol indices into an NFC-normalized string.

    Parameters
    ----------
    vector : iterable of int
        Symbol indices, e.g. the output of ``vec.vectorize``. May be zero-padded.
    symbols : mapping, optional
        Index -> symbol lookup. Defaults to the notebook-level ``decoder``.

    Returns
    -------
    str
        The decoded text, composed to NFC so it compares equal to the
        precomposed Hangul syllables in the source data.

    Raises
    ------
    KeyError
        If an index is missing from the lookup.
    """
    lookup = decoder if symbols is None else symbols
    pieces = [lookup[index] for index in vector]
    # Padding entries carry no text; dropping them keeps padded vectors from
    # decoding to literal "<PAD>" markers.
    text = "".join(piece for piece in pieces if piece != "<PAD>")
    return normalize("NFC", text)

In [64]:
unvectorize_norm(test_vec)

'가게'

In [65]:
[ord(x) for x in normalize("NFC", unvectorize_norm(test_vec))]

[44032, 44172]

In [66]:
# re-test unvectorization, this time with unvectorize_norm

In [67]:
data = data.drop(["unvec_spelling", "unvec_pronunciation"], axis=1)
data.head()

Unnamed: 0,spelling,pronunciation,split_spelling,split_pronunciation,vec_spelling,vec_pronunciation
0,가게,가ː게,ㄱㅏㄱㅔ,ㄱㅏːㄱㅔ,"[2, 21, 2, 26]","[2, 21, 1, 2, 26]"
1,가격,가격,ㄱㅏㄱㅕㄱ,ㄱㅏㄱㅕㄱ,"[2, 21, 2, 27, 42]","[2, 21, 2, 27, 42]"
2,가구,가구,ㄱㅏㄱㅜ,ㄱㅏㄱㅜ,"[2, 21, 2, 34]","[2, 21, 2, 34]"
3,가까이,가까이,ㄱㅏㄲㅏㅇㅣ,ㄱㅏㄲㅏㅇㅣ,"[2, 21, 3, 21, 13, 41]","[2, 21, 3, 21, 13, 41]"
4,가깝다,가깝따,ㄱㅏㄲㅏㅂㄷㅏ,ㄱㅏㄲㅏㅂㄸㅏ,"[2, 21, 3, 21, 58, 5, 21]","[2, 21, 3, 21, 58, 6, 21]"


In [68]:
data["unvec_spelling"] = data.vec_spelling.apply(unvectorize_norm)

In [69]:
data["unvec_pronunciation"] = data.vec_pronunciation.apply(unvectorize_norm)

In [70]:
len(data[data.spelling != data.unvec_spelling])

0

In [71]:
data

Unnamed: 0,spelling,pronunciation,split_spelling,split_pronunciation,vec_spelling,vec_pronunciation,unvec_spelling,unvec_pronunciation
0,가게,가ː게,ㄱㅏㄱㅔ,ㄱㅏːㄱㅔ,"[2, 21, 2, 26]","[2, 21, 1, 2, 26]",가게,가<UNK>게
1,가격,가격,ㄱㅏㄱㅕㄱ,ㄱㅏㄱㅕㄱ,"[2, 21, 2, 27, 42]","[2, 21, 2, 27, 42]",가격,가격
2,가구,가구,ㄱㅏㄱㅜ,ㄱㅏㄱㅜ,"[2, 21, 2, 34]","[2, 21, 2, 34]",가구,가구
3,가까이,가까이,ㄱㅏㄲㅏㅇㅣ,ㄱㅏㄲㅏㅇㅣ,"[2, 21, 3, 21, 13, 41]","[2, 21, 3, 21, 13, 41]",가까이,가까이
4,가깝다,가깝따,ㄱㅏㄲㅏㅂㄷㅏ,ㄱㅏㄲㅏㅂㄸㅏ,"[2, 21, 3, 21, 58, 5, 21]","[2, 21, 3, 21, 58, 6, 21]",가깝다,가깝따
...,...,...,...,...,...,...,...,...
10115,횡단,휑단,ㅎㅚㅇㄷㅏㄴ,ㅎㅞㅇㄷㅏㄴ,"[20, 32, 62, 5, 21, 45]","[20, 36, 62, 5, 21, 45]",횡단,휑단
10116,횡설수설,횡설수설,ㅎㅚㅇㅅㅓㄹㅅㅜㅅㅓㄹ,ㅎㅚㅇㅅㅓㄹㅅㅜㅅㅓㄹ,"[20, 32, 62, 11, 25, 49, 11, 34, 11, 25, 49]","[20, 32, 62, 11, 25, 49, 11, 34, 11, 25, 49]",횡설수설,횡설수설
10117,횡설수설,휑설수설,ㅎㅚㅇㅅㅓㄹㅅㅜㅅㅓㄹ,ㅎㅞㅇㅅㅓㄹㅅㅜㅅㅓㄹ,"[20, 32, 62, 11, 25, 49, 11, 34, 11, 25, 49]","[20, 36, 62, 11, 25, 49, 11, 34, 11, 25, 49]",횡설수설,휑설수설
10118,후퇴,후ː퇴,ㅎㅜㅌㅚ,ㅎㅜːㅌㅚ,"[20, 34, 18, 32]","[20, 34, 1, 18, 32]",후퇴,후<UNK>퇴


In [72]:
# <UNK> appears to be the only issue

In [73]:
len(data[data.pronunciation != data.unvec_pronunciation])

2842

In [74]:
len(data[data.unvec_pronunciation.str.contains("<UNK>")])

2842

In [75]:
data[data.unvec_pronunciation.str.contains("<UNK>")].head()

Unnamed: 0,spelling,pronunciation,split_spelling,split_pronunciation,vec_spelling,vec_pronunciation,unvec_spelling,unvec_pronunciation
0,가게,가ː게,ㄱㅏㄱㅔ,ㄱㅏːㄱㅔ,"[2, 21, 2, 26]","[2, 21, 1, 2, 26]",가게,가<UNK>게
28,간식,간ː식,ㄱㅏㄴㅅㅣㄱ,ㄱㅏㄴːㅅㅣㄱ,"[2, 21, 45, 11, 41, 42]","[2, 21, 45, 1, 11, 41, 42]",간식,간<UNK>식
36,감,감ː,ㄱㅏㅁ,ㄱㅏㅁː,"[2, 21, 57]","[2, 21, 57, 1]",감,감<UNK>
37,감기,감ː기,ㄱㅏㅁㄱㅣ,ㄱㅏㅁːㄱㅣ,"[2, 21, 57, 2, 41]","[2, 21, 57, 1, 2, 41]",감기,감<UNK>기
38,감기약,감ː기약,ㄱㅏㅁㄱㅣㅇㅑㄱ,ㄱㅏㅁːㄱㅣㅇㅑㄱ,"[2, 21, 57, 2, 41, 13, 23, 42]","[2, 21, 57, 1, 2, 41, 13, 23, 42]",감기약,감<UNK>기약


In [76]:
# this punctuation isn't a colon but two small triangles pointing at each other, forming an hourglass shape.
# (see below for a visual; the code point is U+02D0)
# the IPA (International Phonetic Alphabet) uses this symbol to mark length (how long a sound is held)

https://en.wikipedia.org/wiki/Length_(phonetics)

In [77]:
hex(ord("ː"))

'0x2d0'

In [78]:
%%html 

<h1>ː</h1>

In [79]:
# convert this to a common colon so Vectorizationer() can handle it properly

In [80]:
# Replace the IPA length mark "ː" (U+02D0), which the vectorizer maps to <UNK>, with an ASCII colon.
# regex=False forces a literal substring replacement regardless of the pandas version's default.
data["pronunciation"] = data["pronunciation"].str.replace("ː", ":", regex=False)
data["split_pronunciation"] = data["split_pronunciation"].str.replace("ː", ":", regex=False)

In [81]:
# vectorize the pronunciation column again

In [82]:
data["vec_pronunciation"] = data.pronunciation.apply(vec.vectorize)

In [83]:
# unvectorize pronunciation and compare it to the pronunciation column

In [84]:
data["unvec_pronunciation"] = data.vec_pronunciation.apply(unvectorize_norm)

In [85]:
len(data[data.pronunciation != data.unvec_pronunciation])

0

In [86]:
data.head()

Unnamed: 0,spelling,pronunciation,split_spelling,split_pronunciation,vec_spelling,vec_pronunciation,unvec_spelling,unvec_pronunciation
0,가게,가:게,ㄱㅏㄱㅔ,ㄱㅏ:ㄱㅔ,"[2, 21, 2, 26]","[2, 21, 113, 2, 26]",가게,가:게
1,가격,가격,ㄱㅏㄱㅕㄱ,ㄱㅏㄱㅕㄱ,"[2, 21, 2, 27, 42]","[2, 21, 2, 27, 42]",가격,가격
2,가구,가구,ㄱㅏㄱㅜ,ㄱㅏㄱㅜ,"[2, 21, 2, 34]","[2, 21, 2, 34]",가구,가구
3,가까이,가까이,ㄱㅏㄲㅏㅇㅣ,ㄱㅏㄲㅏㅇㅣ,"[2, 21, 3, 21, 13, 41]","[2, 21, 3, 21, 13, 41]",가까이,가까이
4,가깝다,가깝따,ㄱㅏㄲㅏㅂㄷㅏ,ㄱㅏㄲㅏㅂㄸㅏ,"[2, 21, 3, 21, 58, 5, 21]","[2, 21, 3, 21, 58, 6, 21]",가깝다,가깝따


In [87]:
# this method appears to be successful for converting back and forth between Korean and the vectors

In [88]:
# NOTE: 113 is the colon in the reversed symbol_map dictionary

In [89]:
decoder[113]

':'

In [90]:
# to make sure the dimensions of each row are the same, pad the end of each vector with zeros until it is as long as
# the longest vector. this was calculated earlier but here again for convenience

In [91]:
data.split_spelling.str.len().max()

16

In [92]:
data.split_pronunciation.str.len().max()

17

In [93]:
# so pad each vector in vec_spelling with zeros until it is 16 elements long, and do the same for vec_pronunciation
# except up to 17 elements

In [94]:
def pad_with_zeros(vector, num_zeros):
    """Right-pad ``vector`` with zeros so that its length becomes ``num_zeros``.

    Despite its name, ``num_zeros`` is the target length of the result, not the
    number of zeros appended: ``num_zeros - len(vector)`` zeros go on the end.
    """
    pad_after = num_zeros - len(vector)
    # (0, pad_after): nothing in front of the data, pad_after zeros behind it
    return np.pad(vector, pad_width=(0, pad_after), mode="constant", constant_values=0)

In [95]:
# test

In [96]:
pad_with_zeros(data.iloc[0].vec_spelling, 16)

array([ 2, 21,  2, 26,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
      dtype=uint8)

In [97]:
len(pad_with_zeros(data.iloc[0].vec_spelling, 16))

16

In [98]:
# apply
# NOTE: args must be a tuple, so a single argument needs a trailing comma

In [99]:
# Pad every spelling vector to the length of the longest one, measured from the
# vectors themselves rather than hard-coded, so the cell keeps working if the data changes.
data["vec_spelling_pad"] = data.vec_spelling.apply(
    pad_with_zeros, args=(data.vec_spelling.map(len).max(),)
)

In [100]:
# Pad every pronunciation vector to the length of the longest one, measured from the
# vectors themselves rather than hard-coded, so the cell keeps working if the data changes.
data["vec_pronunciation_pad"] = data.vec_pronunciation.apply(
    pad_with_zeros, args=(data.vec_pronunciation.map(len).max(),)
)

In [101]:
data.head()

Unnamed: 0,spelling,pronunciation,split_spelling,split_pronunciation,vec_spelling,vec_pronunciation,unvec_spelling,unvec_pronunciation,vec_spelling_pad,vec_pronunciation_pad
0,가게,가:게,ㄱㅏㄱㅔ,ㄱㅏ:ㄱㅔ,"[2, 21, 2, 26]","[2, 21, 113, 2, 26]",가게,가:게,"[2, 21, 2, 26, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[2, 21, 113, 2, 26, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
1,가격,가격,ㄱㅏㄱㅕㄱ,ㄱㅏㄱㅕㄱ,"[2, 21, 2, 27, 42]","[2, 21, 2, 27, 42]",가격,가격,"[2, 21, 2, 27, 42, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[2, 21, 2, 27, 42, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,가구,가구,ㄱㅏㄱㅜ,ㄱㅏㄱㅜ,"[2, 21, 2, 34]","[2, 21, 2, 34]",가구,가구,"[2, 21, 2, 34, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[2, 21, 2, 34, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0..."
3,가까이,가까이,ㄱㅏㄲㅏㅇㅣ,ㄱㅏㄲㅏㅇㅣ,"[2, 21, 3, 21, 13, 41]","[2, 21, 3, 21, 13, 41]",가까이,가까이,"[2, 21, 3, 21, 13, 41, 0, 0, 0, 0, 0, 0, 0, 0,...","[2, 21, 3, 21, 13, 41, 0, 0, 0, 0, 0, 0, 0, 0,..."
4,가깝다,가깝따,ㄱㅏㄲㅏㅂㄷㅏ,ㄱㅏㄲㅏㅂㄸㅏ,"[2, 21, 3, 21, 58, 5, 21]","[2, 21, 3, 21, 58, 6, 21]",가깝다,가깝따,"[2, 21, 3, 21, 58, 5, 21, 0, 0, 0, 0, 0, 0, 0,...","[2, 21, 3, 21, 58, 6, 21, 0, 0, 0, 0, 0, 0, 0,..."


In [102]:
# convert these padded vectors to DataFrames

In [103]:
vec_spelling_df = pd.DataFrame(data.vec_spelling_pad.to_list())
vec_spelling_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,2,21,2,26,0,0,0,0,0,0,0,0,0,0,0,0
1,2,21,2,27,42,0,0,0,0,0,0,0,0,0,0,0
2,2,21,2,34,0,0,0,0,0,0,0,0,0,0,0,0
3,2,21,3,21,13,41,0,0,0,0,0,0,0,0,0,0
4,2,21,3,21,58,5,21,0,0,0,0,0,0,0,0,0


In [104]:
vec_pronunciation_df = pd.DataFrame(data.vec_pronunciation_pad.to_list())
vec_pronunciation_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,2,21,113,2,26,0,0,0,0,0,0,0,0,0,0,0,0
1,2,21,2,27,42,0,0,0,0,0,0,0,0,0,0,0,0
2,2,21,2,34,0,0,0,0,0,0,0,0,0,0,0,0,0
3,2,21,3,21,13,41,0,0,0,0,0,0,0,0,0,0,0
4,2,21,3,21,58,6,21,0,0,0,0,0,0,0,0,0,0


In [105]:
# quick sanity check of the process: 8170 was the index of the longest pronunciation (checked above), so confirm that
# row 8170 of the padded DataFrame decodes back to that pronunciation

In [106]:
data[data.split_pronunciation.str.len() == 17]

Unnamed: 0,spelling,pronunciation,split_spelling,split_pronunciation,vec_spelling,vec_pronunciation,unvec_spelling,unvec_pronunciation,vec_spelling_pad,vec_pronunciation_pad
8170,정정당당하다,정:정당당하다,ㅈㅓㅇㅈㅓㅇㄷㅏㅇㄷㅏㅇㅎㅏㄷㅏ,ㅈㅓㅇ:ㅈㅓㅇㄷㅏㅇㄷㅏㅇㅎㅏㄷㅏ,"[14, 25, 62, 14, 25, 62, 5, 21, 62, 5, 21, 62,...","[14, 25, 62, 113, 14, 25, 62, 5, 21, 62, 5, 21...",정정당당하다,정:정당당하다,"[14, 25, 62, 14, 25, 62, 5, 21, 62, 5, 21, 62,...","[14, 25, 62, 113, 14, 25, 62, 5, 21, 62, 5, 21..."


In [107]:
vec_pronunciation_df.iloc[8170]

0      14
1      25
2      62
3     113
4      14
5      25
6      62
7       5
8      21
9      62
10      5
11     21
12     62
13     20
14     21
15      5
16     21
Name: 8170, dtype: int64

In [108]:
unvectorize_norm(vec_pronunciation_df.iloc[8170].to_list())

'정:정당당하다'

In [109]:
# write to file

In [110]:
vec_spelling_df.to_csv("vec_spelling.csv", index=False, sep="\t")

In [111]:
vec_pronunciation_df.to_csv("vec_pronunciation.csv", index=False, sep="\t")

In [112]:
# confirm content written correctly

In [113]:
confirm = pd.read_csv("vec_spelling.csv", sep="\t")
confirm.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,2,21,2,26,0,0,0,0,0,0,0,0,0,0,0,0
1,2,21,2,27,42,0,0,0,0,0,0,0,0,0,0,0
2,2,21,2,34,0,0,0,0,0,0,0,0,0,0,0,0
3,2,21,3,21,13,41,0,0,0,0,0,0,0,0,0,0
4,2,21,3,21,58,5,21,0,0,0,0,0,0,0,0,0


In [114]:
len(confirm)

10120

In [115]:
confirm = pd.read_csv("vec_pronunciation.csv", sep="\t")
confirm.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,2,21,113,2,26,0,0,0,0,0,0,0,0,0,0,0,0
1,2,21,2,27,42,0,0,0,0,0,0,0,0,0,0,0,0
2,2,21,2,34,0,0,0,0,0,0,0,0,0,0,0,0,0
3,2,21,3,21,13,41,0,0,0,0,0,0,0,0,0,0,0
4,2,21,3,21,58,6,21,0,0,0,0,0,0,0,0,0,0


In [116]:
len(confirm)

10120