Data are found in ```curl --remote-name-all https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-2607{/cs.zip,/es.zip,/fr.zip,/ga.zip,/hr.zip,/hu.zip,/lv.zip,/pl.zip,/ro.zip,/sk.zip,/tr.zip,/vi.zip,/stripping_diacritics.zip}```

Data also contain some mixed and borrowed words. Some put diacritics on them to make those phonetic, but not always. Test data also contain some other languages (Mandarin and English) perhaps to make sure the system is smart enough to not restore diacritics for these

In [None]:
#!curl --remote-name-all https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-2607{/vi.zip}
#!unzip vi.zip
#!cd vi && xz -v -d *

In [1]:
import unicodedata
import re
from collections import defaultdict
import pickle
import numpy as np

def remove_accents(input_str): #this removes some special characters which we dont like
    input_str = re.sub(r'đ', 'd', input_str)
    nfkd_form = unicodedata.normalize('NFKD', input_str)
    ascii_form = nfkd_form.encode('ascii','ignore')
    text = ascii_form.decode("utf-8")
    return str(text)

In [2]:
charToDiacritized = defaultdict(lambda: defaultdict(list))
charToDiacritized["a"][0] = ["a", "à", "á", "ả", "ã", "ạ"]
charToDiacritized["a"][1] = ["ă", "ằ", "ắ", "ẳ", "ẵ", "ặ"]
charToDiacritized["a"][2] = ["â", "ầ", "ấ", "ẩ", "ẫ", "ậ"]
charToDiacritized["e"][0] = ["e", "è", "é", "ẻ", "ẽ", "ẹ"]
charToDiacritized["e"][1] = ["ê", "ề", "ế", "ể", "ễ", "ệ"]
charToDiacritized["i"][0] = ["i", "ì", "í", "ỉ", "ĩ", "ị"]
charToDiacritized["o"][0] = ["o", "ò", "ó", "ỏ", "õ", "ọ"]
charToDiacritized["o"][1] = ["ô", "ồ", "ố", "ổ", "ỗ", "ộ"]
charToDiacritized["o"][2] = ["ơ", "ờ", "ớ", "ở", "ỡ", "ợ"]
charToDiacritized["u"][0] = ["u", "ù", "ú", "ủ", "ũ", "ụ"]
charToDiacritized["u"][1] = ["ư", "ừ", "ứ", "ử", "ữ", "ự"]
charToDiacritized["y"][0] = ["y", "ỳ", "ý", "ỷ", "ỹ", "ỵ"]
charToDiacritized["d"][0] = ["đ"]
diacritizedToFeature = defaultdict(tuple)
for root in charToDiacritized:
    for diacr_type in charToDiacritized[root]:
        for idx, char in enumerate(charToDiacritized[root][diacr_type]):
            diacritizedToFeature[char] = (root, diacr_type, idx % 6)
            diacritizedToFeature[char.upper()] = (root.upper(), diacr_type, idx % 6)


In [3]:
def featurize_vi_diacritics(input_str: str):
    #train (char_root, diacritic_type, tone )
    # by diacritic type, i mean like non-tonal...
    # a: 0 base(a), 1 aw, 2 aa
    # e: 0 base (e), 1 ee
    # o: 0 base (o), 1 oo, 2 ow
    # u: 0 base (u), 1 uw
    # d: 0 base (d), 1 dd
    # tonal
    #0 = nothing, 1 = grave, 2 = acute, 3 = hook, 4 = tilde, 5 = dot
    input_list = []
    for i in input_str:
        if i in diacritizedToFeature: #special diacritized character
            input_list.append(diacritizedToFeature[i][1]*10 + diacritizedToFeature[i][2]) #i dont think it is necessaary to index diacritics...
        else:
            input_list.append(0)
    return input_list


def remove_vi_diacritics(input_str: str):
    input_list = []
    for i in input_str:
        if i in diacritizedToFeature:
            input_list.append(diacritizedToFeature[i][0])
        else:
            input_list.append(i)
    return input_list

In [4]:
allX = []
allY = []
vocabs = {"<S>", "</S>"}
vocabsY = set()
for lines in open("target_train.txt").readlines():
    sentX = ["<S>"] + remove_vi_diacritics(lines.strip()) + ["</S>"]
    sentY = [0] + featurize_vi_diacritics(lines.strip()) + [0]
    allX.append(sentX)
    allY.append(sentY)
    vocabsY.update(set(i for i in sentY))


In [5]:
from sklearn.model_selection import train_test_split
trainX, testX, trainY, testY = train_test_split(allX, allY, test_size=0.2)

In [6]:
vocabs.update(set(i for xSent in trainX for i in xSent) )

In [7]:
print(trainX[0])
print(trainY[0])

['<S>', 'c', 'o', ' ', 't', 'h', 'e', ' ', 'n', 'o', 'i', ' ', 'n', 'h', 'u', ' ', 'v', 'a', 'y', ' ', 't', 'r', 'a', 'n', 'h', ' ', 'c', 'h', 'a', 'p', ' ', 'l', 'a', 'n', 'h', ' ', 't', 'h', 'o', ' ', 'g', 'i', 'u', 'a', ' ', 'v', 'i', 'e', 't', ' ', 'n', 'a', 'm', ' ', 'v', 'a', ' ', 't', 'r', 'u', 'n', 'g', ' ', 'q', 'u', 'o', 'c', ' ', 'd', 'a', ' ', 'a', 'm', ' ', 'i', ' ', 'd', 'i', 'e', 'n', ' ', 'r', 'a', ' ', 't', 'u', ' ', 'n', 'a', 'm', ' ', '1', '9', '7', '4', ' ', '.', '</S>']
[0, 0, 2, 0, 0, 0, 13, 0, 0, 2, 0, 0, 0, 0, 10, 0, 0, 25, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 22, 0, 0, 0, 4, 0, 0, 0, 0, 0, 13, 0, 0, 0, 14, 0, 0, 0, 0, 15, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12, 0, 0, 0, 4, 0, 20, 0, 0, 3, 0, 0, 0, 14, 0, 0, 0, 0, 0, 0, 11, 0, 0, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [8]:
len(trainX)

655934

In [9]:
def chunkify(x, sentWindow = 60):
  return [x[i:i+60] for i in range(0, len(x), sentWindow)]

In [10]:
trainX_chunked = []
for i in trainX:
  trainX_chunked += chunkify(i)


In [11]:
trainY_chunked = []
for i in trainY:
  trainY_chunked += chunkify(i)


In [12]:
vocabs = set(filter(lambda x: x.isascii() , vocabs)) #dataset has Mandarin characters, i don't think we need to. Most of them the system should learn that it is unchanged anyways...
vocabs = list(vocabs)
vocabs.append("<PAD>")
vocabs.append("<UNK>")

In [13]:
char2index = {w: i for i,w in enumerate(vocabs)}
X_train_index = [[char2index[char] if char.isascii() else char2index["<UNK>"] for char in sent] for sent in trainX_chunked]
vocabsY = list(vocabsY)
tag2index = {v:i for i,v in enumerate(vocabsY)}
Y_train_index = [[tag2index[char] for char in sent] for sent in trainY_chunked]

In [14]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
CHAR_VOCABS = len(vocabs)
CHAR_EMBEDDING = 50
CHAR_MAX_LEN = 60  # longest sentence

X_padded = pad_sequences(
    maxlen=CHAR_MAX_LEN, sequences=X_train_index, padding="post", value=char2index["<PAD>"])

In [15]:
Y_padded = pad_sequences(
    maxlen=CHAR_MAX_LEN, sequences=Y_train_index, padding="post", value=0)

In [33]:
Y_padded = np.array(Y_padded)

In [17]:
from tensorflow.keras.utils import to_categorical
Y_padded = [to_categorical(i, num_classes=len(vocabsY)) for i in Y_padded]

In [18]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, TimeDistributed

character_model = Sequential()
character_model.add(Embedding(input_dim = CHAR_VOCABS, output_dim = CHAR_EMBEDDING, input_length = CHAR_MAX_LEN) )
character_model.add(Bidirectional(LSTM(units = 50, return_sequences = True, recurrent_dropout = 0.1)) )
#character_model.add(LSTM(100))
character_model.add(TimeDistributed(Dense(18, activation='softmax')))

character_model.compile("adam", loss = "categorical_crossentropy", metrics = ["accuracy"])
character_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 60, 50)            3400      
                                                                 
 bidirectional (Bidirectiona  (None, 60, 100)          40400     
 l)                                                              
                                                                 
 time_distributed (TimeDistr  (None, 60, 18)           1818      
 ibuted)                                                         
                                                                 
Total params: 45,618
Trainable params: 45,618
Non-trainable params: 0
_________________________________________________________________


In [19]:
#Y_padded3D = Y_padded.reshape(*Y_padded.shape, 1)

In [35]:
import os
import tensorflow as tf
checkpoint_path = "character_model.ckpt"
checkpoint_dir = os.path.dirname(checkpoint_path)

# Create a callback that saves the model's weights
cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,
                                                 save_weights_only=True,
                                                 verbose=1)
character_model.fit(X_padded, Y_padded, batch_size = 10000, epochs=10, validation_split=0.2, verbose = True, callbacks=[cp_callback])

Epoch 1/10
Epoch 1: saving model to character_model.ckpt
Epoch 2/10
Epoch 2: saving model to character_model.ckpt
Epoch 3/10
Epoch 3: saving model to character_model.ckpt
Epoch 4/10
Epoch 4: saving model to character_model.ckpt
Epoch 5/10
Epoch 5: saving model to character_model.ckpt
Epoch 6/10
Epoch 6: saving model to character_model.ckpt
Epoch 7/10
Epoch 7: saving model to character_model.ckpt
Epoch 8/10
Epoch 8: saving model to character_model.ckpt
Epoch 9/10
Epoch 9: saving model to character_model.ckpt
Epoch 10/10
Epoch 10: saving model to character_model.ckpt


<keras.callbacks.History at 0x7fa0a4db8790>

Prediction Time

In [66]:
X_test_index = [[char2index[char] if char.isascii() else char2index["<UNK>"] for char in sent] for sent in testX]
testX_chunked = []
for i in X_test_index:
  testX_chunked += chunkify(i)
X_test_padded = pad_sequences(
    maxlen=CHAR_MAX_LEN, sequences=testX_chunked, padding="post", value=char2index["<PAD>"])

In [68]:
softmax_X_test = character_model.predict(X_test_padded)



In [84]:
#greedy decoding
def greedy_decoder(data):
  return [np.argmax(s, 1) for s in data]

In [85]:
len(softmax_X_test[0][0])

18

In [91]:
decoded_Xtest = greedy_decoder(softmax_X_test)

In [126]:
allX = []
allY = []
sent = []
predicted = []
start = True
for ci, chunk in enumerate(X_test_padded):
  for ji, j in enumerate(chunk):
    if j == char2index["<S>"]:
      sent.append(X_test_padded[ci][ji])
      predicted.append(decoded_Xtest[ci][ji])
      start = True
    elif j == char2index["</S>"]:
      sent.append(X_test_padded[ci][ji])
      predicted.append(decoded_Xtest[ci][ji])
      start = False
      allX.append(sent)
      allY.append(predicted)
      sent = []
      predicted = []
    elif not start:
      continue
    elif start:
      sent.append(X_test_padded[ci][ji])
      predicted.append(decoded_Xtest[ci][ji])
    


In [131]:
index2char = {i: v for i,v in char2index.items() }

In [142]:
non_diacritized_num = 0
non_diacirtized_accurate = 0
non_diacritized_wrong = defaultdict(lambda: defaultdict(int))
diacritized_num = defaultdict(int)
diacritized_accurate = defaultdict(int)
diacritized_wrong = defaultdict(lambda: defaultdict(int))

In [144]:
for isent, sent in enumerate(testX):
  for ilet, let in enumerate(sent):
    actual = testY[isent][ilet]
    predicted = allY[isent][ilet]
    if actual == 0: #no accents
      non_diacritized_num += 1
      if predicted == 0:
        non_diacirtized_accurate += 1
      else:
        non_diacritized_wrong[let][predicted] += 1 #should not be diacritized but not sure why
    else:
      base = let
      diacr = (actual%100) //10
      tone = actual%6
      actual_diacr = charToDiacritized[base][diacr][tone]
      diacritized_num[ actual_diacr ] += 1
      if actual == predicted:
        diacritized_accurate[ actual_diacr ] += 1
      else:
        pred_base = let
        pred_diacr = (predicted%100) //10
        pred_tone = predicted%6
        predicted_diacr = charToDiacritized[pred_base][pred_diacr][pred_tone]
        diacritized_wrong[ actual_diacr ] [predicted_diacr]  += 1


      




Pickle results for analysis

In [146]:
non_diacirtized_accurate/non_diacritized_num

0.9829220370196247

In [150]:
for i in diacritized_num:
  print(i, diacritized_accurate[i]/diacritized_num[i])

ể 0.0
à 0.7945589803108564
ặ 0.0
ã 0.4474039931748177
ú 0.06709903593339177
ấ 0.0
ễ 0.0
ỡ 0.0
ứ 0.0
á 0.6301920736546908
ữ 0.00045205504222194093
ệ 0.003779146117255669
ị 0.3855119322555812
ò 0.00016073940124573035
ừ 0.0
ì 0.33807798849350096
ự 0.3367738469802778
ó 0.5572591530285136
ă 0.07033414560870581
ẩ 0.0
ồ 0.13610768960056308
ê 0.0
ổ 0.0026729374877523028
ạ 0.4026048807238826
ô 0.02241390506288015
ọ 0.00014334862385321102
ầ 0.0
í 0.48020854423972464
ỗ 0.0
ẵ 0.0
ẳ 0.0
ẻ 0.0
ủ 0.8589781085150402
ả 0.004609351432880844
ư 0.0
â 0.0
ỏ 0.0
ụ 0.0005835918542863286
ớ 0.0
ộ 0.08777378756996344
ũ 0.04026622296173045
ử 0.0
ẫ 0.0
ỉ 0.0
ù 0.24868534913133195
ờ 0.0
ề 0.0
ơ 0.0
ố 0.08418297051529347
ậ 0.0
ợ 0.0
é 0.00014894250819183795
ỹ 0.15812395309882746
ý 0.6736834105935101
ỷ 0.01793570219966159
ẹ 0.0
ở 0.0
ẽ 0.0
ĩ 0.0
ế 0.0
ỳ 0.0
è 0.0
õ 0.0
ỵ 0.0009775171065493646
ằ 0.0
ắ 0.0


In [153]:
diacritized_wrong["ử"]

defaultdict(int,
            {'ự': 19466,
             'u': 13279,
             'ú': 6536,
             'ụ': 624,
             'ủ': 2937,
             'ù': 2113,
             'ũ': 2395,
             'ữ': 116})

In [159]:
#non_diacritized_num = 0
#non_diacirtized_accurate = 0
#non_diacritized_wrong = defaultdict(lambda: defaultdict(int))
#diacritized_num = defaultdict(int)
#diacritized_accurate = defaultdict(int)
#diacritized_wrong = defaultdict(lambda: defaultdict(int))
pickle.dump(non_diacritized_num, open("non_diacritized_num.pkl", "wb"))
pickle.dump(non_diacirtized_accurate, open("non_diacirtized_accurate.pkl", "wb"))
pickle.dump(dict(non_diacritized_wrong), open("non_diacritized_wrong.pkl", "wb"))

pickle.dump(dict(diacritized_num), open("diacritized_num.pkl", "wb"))
pickle.dump(dict(diacritized_accurate), open("diacirtized_accurate.pkl", "wb"))
pickle.dump(dict(diacritized_wrong), open("diacritized_wrong.pkl", "wb"))


In [160]:
pickle.dump(dict(diacritized_num), open("diacritized_num.pkl", "wb"))
pickle.dump(dict(diacritized_accurate), open("diacirtized_accurate.pkl", "wb"))
pickle.dump(dict(diacritized_wrong), open("diacritized_wrong.pkl", "wb"))
