In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import torch.nn as nn
import tqdm
import os
import nltk
import numpy as np
import csv
import pandas as pd
import copy

from google.colab import files

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Idea 1. Chinese transliteration: take English words and try to generate a Chinese transliteration that matches both the meaning and the sound.



In [None]:
print(torch.cuda.is_available())
if torch.cuda.is_available():
  device = torch.device("cuda")
else:
  device = torch.device("cpu")
print("Using device:", device)

True
Using device: cuda


# I. Data Preprocessing
For Idea 1.

We need to be able to manipulate and process the pinyin and the phonetic representations of English.

In [None]:
%pip install eng_to_ipa
%pip install pinyinsplit

import eng_to_ipa as eTi
from pinyinsplit import PinyinSplit

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


## Names

The Names files are mostly already ready, they just have to be tweaked a little bit.

In [None]:
test_names_df = pd.read_csv("drive/MyDrive/NLP_Final_Datasets/raw_test_names.csv")
test_names_df.head()

Unnamed: 0,SRC,CHAR,PINYIN_STR,PINYIN_CHAR
0,s e d g e s,塞 奇 斯,sai qi si,s a i q i s i
1,a n d o l s e k,安 多 尔 塞 克,an duo er sai ke,a n d u o e r s a i k e
2,m a c t r a d e r,麦 克 特 雷 德,mai ke te lei de,m a i k e t e l e i d e
3,s c h e f f e y,谢 菲,xie fei,x i e f e i
4,f o l k,福 克,fu ke,f u k e


In [None]:
train_names_df = pd.read_csv("drive/MyDrive/NLP_Final_Datasets/raw_train_names.csv")
train_names_df.head()

Unnamed: 0,SRC,CHAR,PINYIN_STR,PINYIN_CHAR
0,k a r t m a n,卡 特 曼,ka te man,k a t e m a n
1,p e r s s e,珀 斯,po si,p o s i
2,m a c r a i t h,麦 克 雷 思,mai ke lei si,m a i k e l e i s i
3,h u d a k,赫 达 克,he da ke,h e d a k e
4,k r a u s,克 劳 斯,ke lao si,k e l a o s i


Even though these files are already split into train and test data, we want to randomize them. So, first we're going to recombine them.

In [None]:
names_list = test_names_df["SRC"].tolist() + train_names_df["SRC"].tolist()
names_pinyin = test_names_df["PINYIN_STR"].tolist() + train_names_df["PINYIN_STR"].tolist()

print("Number of name datapoints: ", len(names_list))
print("Number of pinyin datapoints: ", len(names_pinyin))

Number of name datapoints:  52448
Number of pinyin datapoints:  52448


There are still a few things we have to alter about both of the lists we just created.

In [None]:
processed_names = [name.replace(" ", "") for name in names_list]
processed_names_pinyin = [list(pinyin.split(" ")) for pinyin in names_pinyin]

print("The first 5 elements of the processed names:\n", processed_names[:5])
print("The first 5 elements of the processed pinyin:\n", processed_names_pinyin[:5])
print("The last 5 elements of the processed names:\n", processed_names[-5:])
print("The last 5 elements of the processed pinyin:\n", processed_names_pinyin[-5:])

The first 5 elements of the processed names:
 ['sedges', 'andolsek', 'mactrader', 'scheffey', 'folk']
The first 5 elements of the processed pinyin:
 [['sai', 'qi', 'si'], ['an', 'duo', 'er', 'sai', 'ke'], ['mai', 'ke', 'te', 'lei', 'de'], ['xie', 'fei'], ['fu', 'ke']]
The last 5 elements of the processed names:
 ['abernethy', 'cicero', 'frain', 'blaydon', 'guillon']
The last 5 elements of the processed pinyin:
 [['a', 'bo', 'nei', 'xi'], ['xi', 'sai', 'luo'], ['fu', 'lei', 'en'], ['bu', 'lai', 'deng'], ['ji', 'long']]


## Normal Words

The Raw Words file has a lot of things that need to be altered. First, the pinyin has to be turned into a format that is easier to work with. Then, the English words have to be converted to a phonetic representation, since English spelling is not phonetic.

In [None]:
words_df = pd.read_csv("drive/MyDrive/NLP_Final_Datasets/raw_words.csv")
words_df.head()

Unnamed: 0,SRC,PINYIN_STR,CHAR
0,Amen,āmen,阿们
1,ammonia,āmóníyà,阿摩尼亚
2,amoeba,āmǐbā,阿米巴
3,amoxicillin,āmòxīlín,阿莫西林
4,ampere,ānpéi,安培


First we need to split the nonspaced pinyin with tones into pure english alphabet pinyin, spaced by character.

In [None]:
words_list = words_df["SRC"].tolist()
words_pinyin = words_df["PINYIN_STR"].tolist()

# Maps every tone-marked pinyin vowel to its toneless form. All four tones are
# listed for each vowel, including the first-tone "ǖ"; the "ü" variants keep
# the umlaut and only lose the tone mark.
tonal_to_flat = {"ā" : "a", "á" : "a", "ǎ" : "a", "à" : "a",
                 "ē" : "e", "é" : "e", "ě" : "e", "è" : "e",
                 "ī" : "i", "í" : "i", "ǐ" : "i", "ì" : "i",
                 "ō" : "o", "ó" : "o", "ǒ" : "o", "ò" : "o",
                 "ū" : "u", "ú" : "u", "ǔ" : "u", "ù" : "u",
                 "ǖ" : "ü", "ǘ" : "ü", "ǚ" : "ü", "ǜ" : "ü"}

def tonedToSpaced(pinyin_list, replacement_dict):
  """Convert unspaced, tone-marked pinyin strings into lists of toneless syllables.

  Args:
    pinyin_list: iterable of pinyin strings, e.g. "āmóníyà".
    replacement_dict: maps each tone-marked substring to its toneless replacement.

  Returns:
    A list with one entry per input string. Each entry is the first
    segmentation returned by PinyinSplit.split, e.g. ['a', 'mo', 'ni', 'ya'].

  Raises:
    ValueError: if the splitter returns no segmentation for a string.
  """
  pys = PinyinSplit()

  spaced_pinyin_list = list()
  for pinyin in pinyin_list:
    # Normalise case and drop apostrophes / spaces before stripping tones.
    new_pinyin = pinyin.lower().replace("'", "").replace(" ", "")
    for toned, flat in replacement_dict.items():
      new_pinyin = new_pinyin.replace(toned, flat)

    spaced_pinyin_possibilites = pys.split(new_pinyin)
    if not spaced_pinyin_possibilites:
      # Fail with the offending input instead of a bare IndexError on [0].
      raise ValueError(
          "Could not split pinyin {!r} (normalised to {!r}) into syllables".format(pinyin, new_pinyin))

    # Several segmentations may be possible; the first one is used.
    spaced_pinyin_list.append(spaced_pinyin_possibilites[0])

  return spaced_pinyin_list

To make sure nothing was lost in the translation above.

In [None]:
processed_words_pinyin = tonedToSpaced(words_pinyin, tonal_to_flat)
print("Number of pinyin datapoints before conversion: ", len(words_pinyin))
print("Number of pinyin datapoints after conversion: ", len(processed_words_pinyin))
print("The first 5 elements of the processed pinyin:\n", processed_words_pinyin[:5])

Number of pinyin datapoints before conversion:  212
Number of pinyin datapoints after conversion:  212
The first 5 elements of the processed pinyin:
 [['a', 'men'], ['a', 'mo', 'ni', 'ya'], ['a', 'mi', 'ba'], ['a', 'mo', 'xi', 'lin'], ['an', 'pei']]


Now that we have the possible outputs, we need the correct inputs. We need to translate english into phonetic sounds.

In [None]:
self_assigned = {"ampoule" : "ˈæmpul", "codeine" : "ˈkoʊdin", "didgeridoo": "ˌdɪʤəriˈdu",
                 "ketamine" : "ˈkɛtəˌmin", "mankini" : "mænˈkini", "mozzarella" : "mɑtsərˈɛlə",
                 "rumba" : "ˈrumbɔ", "sirloin" : "ˈsərlɔɪn"}

def wordToPhonetic(word_list, extra_dict):
  """Convert English words to phonetic (IPA) strings.

  Args:
    word_list: iterable of English words or phrases.
    extra_dict: manual phonetic overrides, keyed by word, used when the
      converter's output ends with "*" (treated here as its "not found" marker).

  Returns:
    A list of phonetic strings, one per input word, in the same order.
    Spaces in the converter output are replaced by "ˌ" and commas are removed.

  Raises:
    ValueError: if a word could not be converted and has no override.
  """
  phonetic_list = list()

  for word in word_list:
    phonetic = eTi.convert(word)
    phonetic = phonetic.replace(" ", "ˌ").replace(",", "")
    # endswith() instead of phonetic[-1] so an empty conversion does not raise IndexError.
    if phonetic.endswith("*"):
      override = extra_dict.get(word)
      if override is None:
        # Dataset entries may be capitalised; retry with the lower-cased word.
        override = extra_dict.get(word.lower())
      if override is None:
        # Previously None was appended silently, which only failed later on.
        raise ValueError(
            "No phonetic conversion or manual override available for {!r}".format(word))
      phonetic = override

    phonetic_list.append(phonetic)

  return phonetic_list

To make sure nothing was lost in this process, we check again.

In [None]:
phonetic_words = wordToPhonetic(words_list, self_assigned)
print("Number of english word datapoints before conversion: ", len(words_list))
print("Number of english word datapoints after conversion: ", len(phonetic_words))
print("The first 5 elements of the phonetic words:\n", phonetic_words[:5])

Number of english word datapoints before conversion:  212
Number of english word datapoints after conversion:  212
The first 5 elements of the phonetic words:
 ['ɑˈmɛn', 'əˈmoʊnjə', 'əˈmibə', 'əˈmɔksəˌsɪlɪn', 'ˈæmˌpər']


## Combined Names and Words

Since there are so many fewer data points for the words, we're going to try to add some of the names to the words dataset. If our phonetic translator can turn the names into phonetics, we're going to add it to the word dataset, as well.

We'll create a separate dataset for the combined words and names so that we can compare it with the words-only dataset.

In [None]:
def nameToPhon(names_list, names_pinyin_list, phonetics_list, words_pinyin_list):
  """Extend the word dataset with every name that can be converted to phonetics.

  Args:
    names_list: English names (plain strings).
    names_pinyin_list: pinyin syllable lists, aligned index-by-index with names_list.
    phonetics_list: phonetic strings of the word dataset (not modified).
    words_pinyin_list: pinyin syllable lists of the word dataset (not modified).

  Returns:
    (combined_phonetics, combined_pinyin_list): copies of the word data followed
    by the phonetics / pinyin of each name whose conversion is non-empty and
    does not end with "*" (treated here as the converter's "not found" marker).

  Raises:
    ValueError: if names_list and names_pinyin_list differ in length.
  """
  if len(names_list) != len(names_pinyin_list):
    raise ValueError(
        "names_list and names_pinyin_list must be aligned: got {} names and {} pinyin entries".format(
            len(names_list), len(names_pinyin_list)))

  # Copy so the caller's word-only lists stay untouched.
  combined_phonetics = copy.deepcopy(phonetics_list)
  combined_pinyin_list = copy.deepcopy(words_pinyin_list)

  for name, name_pinyin in zip(names_list, names_pinyin_list):
    phonetics = eTi.convert(name)
    # An empty result is skipped as well; indexing phonetics[-1] would raise on it.
    if phonetics and not phonetics.endswith("*"):
      combined_phonetics.append(phonetics)
      combined_pinyin_list.append(name_pinyin)

  return combined_phonetics, combined_pinyin_list

In [None]:
combined_phonetics, combined_pinyin = nameToPhon(processed_names, processed_names_pinyin, phonetic_words, processed_words_pinyin)
print("Number of combined phonetic datapoints: ", len(combined_phonetics))
print("Number of combined pinyin datapoints: ", len(combined_pinyin))

Number of combined phonetic datapoints:  25318
Number of combined pinyin datapoints:  25318


In [None]:
print("The first 5 elements of the combined phonetic words:\n", combined_phonetics[:5])
print("The first 5 elements of the combined word pinyin:\n", combined_pinyin[:5])
print("The last 5 elements of the combined phonetic words:\n", combined_phonetics[-5:])
print("The last 5 elements of the combined word pinyin:\n", combined_pinyin[-5:])

The first 5 elements of the combined phonetic words:
 ['ɑˈmɛn', 'əˈmoʊnjə', 'əˈmibə', 'əˈmɔksəˌsɪlɪn', 'ˈæmˌpər']
The first 5 elements of the combined word pinyin:
 [['a', 'men'], ['a', 'mo', 'ni', 'ya'], ['a', 'mi', 'ba'], ['a', 'mo', 'xi', 'lin'], ['an', 'pei']]
The last 5 elements of the combined phonetic words:
 ['ˈsænsbɛri', 'ˈæbərˌnɛθi', 'ˈsɪsərˌoʊ', 'freɪn', 'ˈbleɪdən']
The last 5 elements of the combined word pinyin:
 [['sang', 'si', 'bo', 'li'], ['a', 'bo', 'nei', 'xi'], ['xi', 'sai', 'luo'], ['fu', 'lei', 'en'], ['bu', 'lai', 'deng']]


In [None]:
print(type(combined_phonetics), type(combined_pinyin))

<class 'list'> <class 'list'>


In [None]:
combined_phonetics = [[i] for i in combined_phonetics]

In [None]:
import numpy as np
def read_csv_file(path):
    """Read a comma-separated UTF-8 file into a list of rows.

    Quoting is disabled, so quote characters are kept as literal data.

    Args:
        path: path of the file to read.

    Returns:
        A list with one list of strings per line of the file.
    """
    # newline='' lets the csv module do its own newline handling (see csv docs).
    with open(path, 'r', encoding = 'utf-8', newline='') as f:
        # quotechar=None rather than '': an empty quotechar is rejected by
        # newer Python versions, and with QUOTE_NONE no quote character is needed.
        reader = csv.reader(f, delimiter=',', quotechar=None, quoting=csv.QUOTE_NONE)
        data = list(reader)

    return data

def list_to_csv(list, csv_file):
  """Write each row of `list` to `csv_file` as one comma-joined UTF-8 line.

  No quoting or escaping is applied; every row must be an iterable of strings.
  """
  with open(csv_file, 'w', encoding = 'utf-8') as out_file:
    for row in list:
      out_file.write(','.join(row) + '\n')

In [None]:
list_to_csv(combined_phonetics, 'drive/MyDrive/NLP_Final_Datasets/combined_phonetics.csv')
list_to_csv(combined_pinyin, 'drive/MyDrive/NLP_Final_Datasets/combined_pinyin.csv')

NameError: ignored

In [None]:
combined_phonetics = read_csv_file('drive/MyDrive/NLP_Final_Datasets/combined_phonetics.csv')
combined_pinyin = read_csv_file('drive/MyDrive/NLP_Final_Datasets/combined_pinyin.csv')

In [None]:
combined_phonetics = [i[0] for i in combined_phonetics]

Just to make sure the data we have is acceptable.

In [None]:
print("The first 5 elements of the combined phonetic words:\n", combined_phonetics[:5])
print("The first 5 elements of the combined word pinyin:\n", combined_pinyin[:5])
print("The last 5 elements of the combined phonetic words:\n", combined_phonetics[-5:])
print("The last 5 elements of the combined word pinyin:\n", combined_pinyin[-5:])

The first 5 elements of the combined phonetic words:
 ['ɑˈmɛn', 'əˈmoʊnjə', 'əˈmibə', 'əˈmɔksəˌsɪlɪn', 'ˈæmˌpər']
The first 5 elements of the combined word pinyin:
 [['a', 'men'], ['a', 'mo', 'ni', 'ya'], ['a', 'mi', 'ba'], ['a', 'mo', 'xi', 'lin'], ['an', 'pei']]
The last 5 elements of the combined phonetic words:
 ['ˈsænsbɛri', 'ˈæbərˌnɛθi', 'ˈsɪsərˌoʊ', 'freɪn', 'ˈbleɪdən']
The last 5 elements of the combined word pinyin:
 [['sang', 'si', 'bo', 'li'], ['a', 'bo', 'nei', 'xi'], ['xi', 'sai', 'luo'], ['fu', 'lei', 'en'], ['bu', 'lai', 'deng']]


We currently have 3 datasets to work with:
1. Just English names and the corresponding Chinese Pinyin transliteration
2. Just phonetic representations of English words and the corresponding Chinese Pinyin transliteration
3. A combination of names and words, both in phonetic form and the corresponding Chinese Pinyin transliteration

In [None]:
#@title
print("SUMMARY OF DATASETS")
print("")

# Just names
print("--------------- Just Name Data ---------------")
print("Number of name datapoints: ", len(processed_names))
print("Number of pinyin datapoints: ", len(processed_names_pinyin))
print("The first 5 elements of the processed names:\n", processed_names[:5])
print("The first 5 elements of the processed pinyin:\n", processed_names_pinyin[:5])
print("The last 5 elements of the processed names:\n", processed_names[-5:])
print("The last 5 elements of the processed pinyin:\n", processed_names_pinyin[-5:])
print("")

# Just words
print("--------------- Just Word Data ---------------")
print("Number of word datapoints: ", len(phonetic_words))
print("Number of pinyin datapoints: ", len(processed_words_pinyin))
print("The first 5 elements of the phonetic words:\n", phonetic_words[:5])
print("The first 5 elements of the processed pinyin:\n", processed_words_pinyin[:5])
print("The last 5 elements of the processed names:\n", phonetic_words[-5:])
print("The last 5 elements of the processed pinyin:\n", processed_words_pinyin[-5:])
print("")

# Names and words
print("-------------- Combined Word Data --------------")
print("Number of combined phonetic datapoints: ", len(combined_phonetics))
print("Number of combined pinyin datapoints: ", len(combined_pinyin))
print("The first 5 elements of the combined phonetic words:\n", combined_phonetics[:5])
print("The first 5 elements of the combined phonetic pinyin:\n", combined_pinyin[:5])
print("The last 5 elements of the combined phonetic names:\n", combined_phonetics[-5:])
print("The last 5 elements of the combined phonetic pinyin:\n", combined_pinyin[-5:])

SUMMARY OF DATASETS

--------------- Just Name Data ---------------
Number of name datapoints:  52448
Number of pinyin datapoints:  52448
The first 5 elements of the processed names:
 ['sedges', 'andolsek', 'mactrader', 'scheffey', 'folk']
The first 5 elements of the processed pinyin:
 [['sai', 'qi', 'si'], ['an', 'duo', 'er', 'sai', 'ke'], ['mai', 'ke', 'te', 'lei', 'de'], ['xie', 'fei'], ['fu', 'ke']]
The last 5 elements of the processed names:
 ['abernethy', 'cicero', 'frain', 'blaydon', 'guillon']
The last 5 elements of the processed pinyin:
 [['a', 'bo', 'nei', 'xi'], ['xi', 'sai', 'luo'], ['fu', 'lei', 'en'], ['bu', 'lai', 'deng'], ['ji', 'long']]

--------------- Just Word Data ---------------
Number of word datapoints:  212
Number of pinyin datapoints:  212
The first 5 elements of the phonetic words:
 ['ɑˈmɛn', 'əˈmoʊnjə', 'əˈmibə', 'əˈmɔksəˌsɪlɪn', 'ˈæmˌpər']
The first 5 elements of the processed pinyin:
 [['a', 'men'], ['a', 'mo', 'ni', 'ya'], ['a', 'mi', 'ba'], ['a', 'mo', '

# II. Creating Vocabulary and Datasets



## Vocabulary

We have data split into usable formats. To input them into embedding layers and to get output labels, we need to assign indices to unique characters and unique pinyins of characters.

In [None]:
# Reserved tokens and their fixed ids; every Vocabulary starts with these four.
pad_word = "<pad>"
bos_word = "<start>"
eos_word = "<end>"
unk_word = "<unk>"
pad_id = 0
bos_id = 1
eos_id = 2
unk_id = 3

class Vocabulary:
  """Bidirectional mapping between sequence elements and integer ids.

  Elements can be anything hashable (single characters, pinyin syllables, ...).
  Ids 0-3 are reserved for the pad / start / end / unknown tokens.
  """

  def __init__(self):
    self.element_to_id = {pad_word: pad_id, bos_word: bos_id, eos_word:eos_id, unk_word: unk_id}
    # Occurrence counts of elements seen through addElementsFromSequence.
    self.element_count = {}
    self.id_to_element = {pad_id: pad_word, bos_id: bos_word, eos_id: eos_word, unk_id: unk_word}
    self.num_elements = 4

  def getIdsFromSequence(self, sequence):
    """Return [bos_id, ids of the elements..., eos_id]; unknown elements map to unk_id."""
    output_seq = [bos_id]
    output_seq.extend(self.element_to_id.get(element, unk_id) for element in sequence)
    output_seq.append(eos_id)
    return output_seq

  def tokenizedSequence(self, sequence):
    """Return the sequence as the model sees it: start/end added, unknowns as "<unk>"."""
    seq_ids = self.getIdsFromSequence(sequence)
    return [self.id_to_element[element_id] for element_id in seq_ids]

  def decodeSequenceFromIds(self, sequence_ids, delimiter):
    """Join the elements for `sequence_ids` with `delimiter`, skipping start/end/pad ids.

    Raises:
      KeyError: if an id is not part of this vocabulary.
    """
    elements = list()
    for element_id in sequence_ids:
      if element_id in [bos_id, eos_id, pad_id]:
        # Skip these words
        continue
      elements.append(self.id_to_element[element_id])

    return delimiter.join(elements)

  def addElementsFromSequence(self, sequence):
    """Register every element of `sequence`, assigning new ids and updating counts."""
    for element in sequence:
      if element not in self.element_to_id:
        # add this word to the vocabulary
        self.element_to_id[element] = self.num_elements
        self.id_to_element[self.num_elements] = element
        self.num_elements += 1
      # update the word count; .get() because the reserved tokens already have
      # an id but no count entry, so a plain "+= 1" would raise KeyError on them
      self.element_count[element] = self.element_count.get(element, 0) + 1

We now have a vocabulary class we can use for any sort of sequence, be it sequences of characters or sequences of strings. We'll be using this class to assign ids to each English letter, phonetic letter, or pinyin.

In [None]:
name_vocab = Vocabulary()
for name in processed_names:
  name_vocab.addElementsFromSequence(name)

name_pinyin_vocab = Vocabulary()
for pinyin in processed_names_pinyin:
  name_pinyin_vocab.addElementsFromSequence(pinyin)

print("---------------------- Name Data ----------------------")
print("Number of letters in name dictionary: ", name_vocab.num_elements)
print("Number of pinyins in the name pinyin dictionary: ", name_pinyin_vocab.num_elements)

phonetic_vocab = Vocabulary()
for phonetic in phonetic_words:
  phonetic_vocab.addElementsFromSequence(phonetic)

word_pinyin_vocab = Vocabulary()
for pinyin in processed_words_pinyin:
  word_pinyin_vocab.addElementsFromSequence(pinyin)

print("\n-------------------- Phonetic Data --------------------")
print("Number of letters in phonetic dictionary: ", phonetic_vocab.num_elements)
print("Number of pinyins in the phonetic pinyin dictionary: ", word_pinyin_vocab.num_elements)

combined_vocab = Vocabulary()
for phonetic in combined_phonetics:
  combined_vocab.addElementsFromSequence(phonetic)

combined_pinyin_vocab = Vocabulary()
for pinyin in combined_pinyin:
  combined_pinyin_vocab.addElementsFromSequence(pinyin)

print("\n-------------------- Combined Data --------------------")
print("Number of letters in the combined phonetic dictionary: ", combined_vocab.num_elements)
print("Number of pinyins in the combined pinyin dictionary: ", combined_pinyin_vocab.num_elements)

---------------------- Name Data ----------------------
Number of letters in name dictionary:  30
Number of pinyins in the name pinyin dictionary:  272

-------------------- Phonetic Data --------------------
Number of letters in phonetic dictionary:  43
Number of pinyins in the phonetic pinyin dictionary:  169

-------------------- Combined Data --------------------
Number of letters in the combined phonetic dictionary:  44
Number of pinyins in the combined pinyin dictionary:  282


## Datasets

Now that we have the vocabularies for each of the datasets, we can create the datasets themselves.

In [None]:
class engToChin_dataset(Dataset):
  """Dataset of (source sequence, target pinyin list) pairs with pre-computed ids.

  Args:
    inputs: source sequences (e.g. names or phonetic strings).
    outputs: target sequences (lists of pinyin syllables), aligned with inputs.
    input_vocab: vocabulary used to encode the sources.
    output_vocab: vocabulary used to encode the targets.
    device: stored on the instance; not used by the dataset itself.

  Raises:
    ValueError: if inputs and outputs are sized and differ in length.
  """

  def __init__(self, inputs, outputs, input_vocab, output_vocab, device):
    # zip() stops at the shorter argument, which would silently drop pairs.
    if hasattr(inputs, "__len__") and hasattr(outputs, "__len__") and len(inputs) != len(outputs):
      raise ValueError(
          "inputs and outputs must have the same length: got {} and {}".format(len(inputs), len(outputs)))

    self.inputs = inputs
    self.outputs = outputs
    self.input_vocab = input_vocab
    self.output_vocab = output_vocab
    self.device = device

    self.pairs = list(zip(inputs, outputs))
    # Encode once up front so __getitem__ is a plain lookup.
    self.tokenized_pairs = [
        (self.input_vocab.getIdsFromSequence(src), self.output_vocab.getIdsFromSequence(tgt))
        for src, tgt in self.pairs]

  def __len__(self):
    return len(self.tokenized_pairs)

  def __getitem__(self, index):
    """Return {"pair_ids": (src_ids, tgt_ids), "pairs": (src, tgt)} for `index`."""
    if torch.is_tensor(index):
      index = index.tolist()

    return {"pair_ids" : self.tokenized_pairs[index], "pairs" : self.pairs[index]}


def collate_fn(data):
  """Turn a list of dataset items into one padded batch, longest source first.

  Args:
    data: list of dicts {"pair_ids":(src_ids, tgt_ids), "pairs":(src_str, tgt_list)}.

  Returns:
    dictionary containing:
      pair_ids: (tuple of source id tensors, tuple of target id tensors)
      pairs: (tuple of source strings, tuple of target lists)
      pair_tensors: (source tensor of shape (src_padded_length, batch_size), target tensor of shape (tgt_padded_length, batch_size)),
        both moved to the module-level `device`
  """
  batch = [(torch.LongTensor(item["pair_ids"][0]),
            torch.LongTensor(item["pair_ids"][1]),
            item["pairs"][0],
            item["pairs"][1]) for item in data]
  # Order the whole batch by source length, longest first (stable sort).
  batch.sort(key=lambda x: len(x[0]), reverse=True)
  src_ids, tgt_ids, src_str, tgt_list = zip(*batch)

  # pad_sequence defaults to batch_first=False, hence (padded_length, batch_size).
  src_seqs = pad_sequence(src_ids, padding_value=pad_id)
  tgt_seqs = pad_sequence(tgt_ids, padding_value=pad_id)

  return {"pair_ids" : (src_ids, tgt_ids), "pairs" : (src_str, tgt_list), "pair_tensors" : (src_seqs.to(device), tgt_seqs.to(device))}

In [None]:
word_dataset = engToChin_dataset(phonetic_words, processed_words_pinyin, phonetic_vocab, word_pinyin_vocab, device)

batch_size = 4

word_data_loader = DataLoader(dataset=word_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

Testing the first batch of data to make sure everything is working as intended.


In [None]:
def print_list(l, K=None):
  """Print the elements of `l` one per line, then a blank line.

  If K is given, printing stops once K elements have been shown.
  """
  shown = 0
  for item in l:
    if shown == K:
      break
    print(item)
    shown += 1
  print()

first_batch = next(iter(word_data_loader))
print("Testing first training batch of size ", len(first_batch["pairs"][0]))
print("Src:")
print_list(first_batch["pairs"][0])
print("Tokenized src ids:")
print_list(first_batch["pair_ids"][0])
print("Padded src ids as tensor:")
print(first_batch["pair_tensors"][0])

print("Tgt:")
print_list(first_batch["pairs"][1])
print("Tokenized tgt ids:")
print_list(first_batch["pair_ids"][1])
print("Padded tgt ids as tensor:")
print(first_batch["pair_tensors"][1])

Testing first training batch of size  4
Src:
ˈstrɔˌbɛri
hɪˈstɛriə
bəˈzukə
ˈmitər

Tokenized src ids:
tensor([ 1,  5, 17, 25, 23, 15, 18, 14,  7, 23, 13,  2])
tensor([ 1, 40, 19,  5, 17, 25,  7, 23, 13,  9,  2])
tensor([ 1, 14,  9,  5, 32, 26, 16,  9,  2])
tensor([ 1,  5,  6, 13, 25,  9, 23,  2])

Padded src ids as tensor:
tensor([[ 1,  1,  1,  1],
        [ 5, 40, 14,  5],
        [17, 19,  9,  6],
        [25,  5,  5, 13],
        [23, 17, 32, 25],
        [15, 25, 26,  9],
        [18,  7, 16, 23],
        [14, 23,  9,  2],
        [ 7, 13,  2,  0],
        [23,  9,  0,  0],
        [13,  2,  0,  0],
        [ 2,  0,  0,  0]], device='cuda:0')
Tgt:
['shi', 'duo', 'pi', 'li']
['xie', 'si', 'di', 'li']
['ba', 'zu', 'ka']
['mi']

Tokenized tgt ids:
tensor([ 1, 56, 98, 24, 58,  2])
tensor([  1, 114,  22,  53,  58,   2])
tensor([ 1, 10, 40, 41,  2])
tensor([1, 9, 2])

Padded tgt ids as tensor:
tensor([[  1,   1,   1,   1],
        [ 56, 114,  10,   9],
        [ 98,  22,  40,   2],
      

We've just seen how the code works with the words-only dataset. Now we can do the same thing with both of our other datasets.

# III. Training the Models

## Create and Train the Model

Now that all the data is in a usable format, the models can be trained.

In [None]:
class Seq2seqTransliteration(nn.Module):
  """Sequence-to-sequence transliteration model.

  A bidirectional GRU encodes the source elements; a unidirectional GRU with
  general (multiplicative) attention over the encoder outputs decodes pinyin.

  NOTE(review): the attention softmax runs over every source position,
  including padded ones; no padding mask is applied.
  """

  def __init__(self, input_vocab, output_vocab, is_phonetic, emb_dim=300, hidden_dim=300, num_layers=2, dropout=0.1):
    """
    Args:
      input_vocab: vocabulary of the source side (provides num_elements).
      output_vocab: vocabulary of the target side (provides num_elements).
      is_phonetic: stored flag; callers read it to decide whether raw text must
        be converted to phonetics before encoding.
      emb_dim: embedding size of both embedding tables.
      hidden_dim: hidden size of both GRUs.
      num_layers: number of stacked GRU layers (encoder and decoder).
      dropout: dropout applied between GRU layers.
    """
    super().__init__()

    self.input_vocab = input_vocab
    self.output_vocab = output_vocab
    self.num_input_elements = input_vocab.num_elements
    self.num_output_elements = output_vocab.num_elements
    self.is_phonetic = is_phonetic
    self.emb_dim = emb_dim
    self.hidden_dim = hidden_dim
    self.num_layers = num_layers

    self.input_embedding = nn.Embedding(self.num_input_elements, self.emb_dim)
    self.output_embedding = nn.Embedding(self.num_output_elements, self.emb_dim)
    self.encoder_gru = nn.GRU(self.emb_dim, self.hidden_dim, num_layers=self.num_layers, dropout=dropout, bidirectional=True)
    self.decoder_gru = nn.GRU(self.emb_dim, self.hidden_dim, num_layers=self.num_layers, dropout=dropout, bidirectional=False)
    self.linear_attention = nn.Linear(self.hidden_dim, self.hidden_dim)
    self.softmax_attention = nn.Softmax(dim=1)
    self.linear_classifier = nn.Linear(self.hidden_dim, self.num_output_elements)

  def encode(self, src):
    """
    Args:
      src: element indices tensor for the source input with shape (max_src_sequence_length, batch_size)

    Returns:
      A tuple with the following:
        encoder_output: The output of the GRU with shape (max_src_sequence_length, batch_size, hidden_size),
          forward and backward directions summed
        encoder_hidden: The final hidden state of the biGRU with shape (num_layers, batch_size, hidden_size),
          forward and backward directions summed per layer
    """

    embeddings = self.input_embedding(src)
    output, h_n = self.encoder_gru(embeddings)

    # The last dimension of `output` is [forward | backward]; add the two halves.
    encoder_output = output[:, :, :output.shape[2] // 2] + output[:, :, output.shape[2] // 2:]
    # h_n is ordered (layer 0 fwd, layer 0 bwd, layer 1 fwd, layer 1 bwd, ...),
    # so even rows are forward states and odd rows backward states. Splitting
    # h_n in half instead would add states of different layers together.
    # NOTE: checkpoints trained with the old half-split summation should be retrained.
    encoder_hidden = h_n[0::2, :, :] + h_n[1::2, :, :]

    return encoder_output, encoder_hidden

  def decode(self, decoder_input, last_hidden, encoder_output):
    """
    Args:
      decoder_input: The element indices tensor for the current decoder input with shape (1, batch_size)
      last_hidden: The tensor representing the last hidden state with shape (num_layers, batch_size, hidden_size)
      encoder_output: The output of the encoder with shape (max_src_sequence_length, batch_size, hidden_size)

    Returns:
      A tuple with the following:
        logits: unnormalized scores for the next element prediction with shape (batch_size, output_vocab_size)
        decoder_hidden: The hidden state output of the GRU with shape (num_layers, batch_size, hidden_size)
        attention_weights: attention_weights with the shape (batch_size, max_src_sequence_length)
    """

    embeddings = self.output_embedding(decoder_input)
    decoder_output, decoder_hidden = self.decoder_gru(embeddings, last_hidden)

    # Using General Attention: score(t, s) = decoder_output_t . (W * encoder_output_s)
    w_h = self.linear_attention(encoder_output)
    s_w_h = torch.sum(decoder_output * w_h, dim=2)
    attention_score = torch.transpose(s_w_h, 0, 1)
    attention_weights = self.softmax_attention(attention_score)

    # Weighted sum of encoder outputs: (batch, 1, src) x (batch, src, hidden).
    unsqueezed_weights = attention_weights.unsqueeze(1)
    context = unsqueezed_weights.bmm(torch.transpose(encoder_output, 0, 1))
    linear_input = torch.transpose(context, 0, 1) + decoder_output

    logits = self.linear_classifier(linear_input)
    # Drop the length-1 time dimension.
    logits = logits[0, :, :]

    return logits, decoder_hidden, attention_weights

  def calculateLoss(self, src, tgt):
    """
    Args:
      src: The element indices tensor for the input sequence with shape (max_src_sequence_length, batch_size)
      tgt: The element indices tensor for the output sequence with shape (max_tgt_sequence_length, batch_size)

    Returns:
      loss: scalar float tensor representing cross-entropy loss on the current batch / number of target tokens
    """

    loss_function = nn.CrossEntropyLoss(ignore_index=pad_id, reduction="mean")
    # Probability of feeding the gold token (instead of the prediction) at each step.
    teacher_forcing_ratio = 0.9

    encoder_output, current_hidden = self.encode(src)
    # The first target row holds the start tokens.
    decoder_input = torch.unsqueeze(tgt[0, :], dim=0)

    logits_list = list()
    for i in range(1, tgt.shape[0]):
      logits, decoder_hidden, _ = self.decode(decoder_input, current_hidden, encoder_output)
      logits_list.append(logits)

      current_hidden = decoder_hidden
      if np.random.rand() < teacher_forcing_ratio:
        decoder_input = torch.unsqueeze(tgt[i, :], 0)
      else:
        predicted_id = torch.argmax(logits, dim=1)
        decoder_input = predicted_id.unsqueeze(0)

    # Steps are stacked time-major, matching the row-major flatten of tgt[1:, :].
    logits_concat = torch.cat(logits_list, dim=0)
    loss = loss_function(logits_concat, torch.flatten(tgt[1:, :]))
    return loss

We have our model class. We also need to be able to train it.

In [None]:
def train(model, data_loader, num_epochs, model_file, learning_rate=0.0005):
  """Train the model for given number of epochs and save the trained model in 
  the final model_file.

  Args:
    model: model exposing calculateLoss(source, target) and named_parameters().
    data_loader: yields batches with a "pair_tensors" entry (source, target).
    num_epochs: number of passes over data_loader.
    model_file: path the final state_dict is saved to (once, after the last epoch).
    learning_rate: encoder learning rate; all other parameters use
      learning_rate * 5.
  """
  # `import tqdm` alone may not load the tqdm.notebook submodule; import it
  # explicitly so tqdm.notebook.* below does not depend on another package
  # having imported it first.
  import tqdm.notebook

  decoder_learning_ratio = 5.0

  encoder_parameter_names = ['input_embedding', 'encoder_gru']

  # Split parameters into encoder / everything else in a single pass.
  encoder_params = []
  decoder_params = []
  for param_name, param in model.named_parameters():
    if any(key in param_name for key in encoder_parameter_names):
      encoder_params.append(param)
    else:
      decoder_params.append(param)

  optimizer = torch.optim.AdamW([{'params': encoder_params},
              {'params': decoder_params, 'lr': learning_rate * decoder_learning_ratio}], lr=learning_rate)

  clip = 50.0
  for epoch in tqdm.notebook.trange(num_epochs, desc="training", unit="epoch"):
    with tqdm.notebook.tqdm(
      data_loader,
      desc="epoch {}".format(epoch + 1),
      unit="batch",
      total=len(data_loader)) as batch_iterator:
      model.train()
      total_loss = 0.0
      for i, batch_data in enumerate(batch_iterator, start=1):
        source, target = batch_data["pair_tensors"]
        optimizer.zero_grad()
        loss = model.calculateLoss(source, target)
        total_loss += loss.item()
        loss.backward()
        # Gradient clipping before taking the step
        _ = nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        batch_iterator.set_postfix(mean_loss=total_loss / i, current_loss=loss.item())

  torch.save(model.state_dict(), model_file)

In [None]:
num_epochs = 10
batch_size = 50

word_dataset = engToChin_dataset(phonetic_words, processed_words_pinyin, phonetic_vocab, word_pinyin_vocab, device)
word_data_loader = DataLoader(dataset=word_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

phonetic_model = Seq2seqTransliteration(phonetic_vocab, word_pinyin_vocab, is_phonetic=True).to(device)
train(phonetic_model, word_data_loader, num_epochs, "phonetic_model.pt")
files.download("phonetic_model.pt")

training:   0%|          | 0/10 [00:00<?, ?epoch/s]

epoch 1:   0%|          | 0/5 [00:00<?, ?batch/s]

epoch 2:   0%|          | 0/5 [00:00<?, ?batch/s]

epoch 3:   0%|          | 0/5 [00:00<?, ?batch/s]

epoch 4:   0%|          | 0/5 [00:00<?, ?batch/s]

epoch 5:   0%|          | 0/5 [00:00<?, ?batch/s]

epoch 6:   0%|          | 0/5 [00:00<?, ?batch/s]

epoch 7:   0%|          | 0/5 [00:00<?, ?batch/s]

epoch 8:   0%|          | 0/5 [00:00<?, ?batch/s]

epoch 9:   0%|          | 0/5 [00:00<?, ?batch/s]

epoch 10:   0%|          | 0/5 [00:00<?, ?batch/s]

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
phonetic_model = Seq2seqTransliteration(phonetic_vocab, word_pinyin_vocab, is_phonetic=True).to(device)
phonetic_model.load_state_dict(torch.load("drive/MyDrive/NLP_Final_Datasets/phonetic_model.pt", map_location=device))

<All keys matched successfully>

In [None]:
num_epochs = 10
batch_size = 50

combined_dataset = engToChin_dataset(combined_phonetics, combined_pinyin, combined_vocab, combined_pinyin_vocab, device)
combined_data_loader = DataLoader(dataset=combined_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

combined_model = Seq2seqTransliteration(combined_vocab, combined_pinyin_vocab, is_phonetic=True).to(device)
train(combined_model, combined_data_loader, num_epochs, "combined_model.pt")
files.download("combined_model.pt")

training:   0%|          | 0/10 [00:00<?, ?epoch/s]

epoch 1:   0%|          | 0/507 [00:00<?, ?batch/s]

epoch 2:   0%|          | 0/507 [00:00<?, ?batch/s]

epoch 3:   0%|          | 0/507 [00:00<?, ?batch/s]

epoch 4:   0%|          | 0/507 [00:00<?, ?batch/s]

epoch 5:   0%|          | 0/507 [00:00<?, ?batch/s]

epoch 6:   0%|          | 0/507 [00:00<?, ?batch/s]

epoch 7:   0%|          | 0/507 [00:00<?, ?batch/s]

epoch 8:   0%|          | 0/507 [00:00<?, ?batch/s]

epoch 9:   0%|          | 0/507 [00:00<?, ?batch/s]

epoch 10:   0%|          | 0/507 [00:00<?, ?batch/s]

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
combined_model = Seq2seqTransliteration(combined_vocab, combined_pinyin_vocab, is_phonetic=True).to(device)
combined_model.load_state_dict(torch.load("drive/MyDrive/NLP_Final_Datasets/combined_model.pt", map_location=device))

<All keys matched successfully>

In [None]:
num_epochs = 10
batch_size = 50

name_dataset = engToChin_dataset(processed_names, processed_names_pinyin, name_vocab, name_pinyin_vocab, device)
name_data_loader = DataLoader(dataset=name_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

name_model = Seq2seqTransliteration(name_vocab, name_pinyin_vocab, is_phonetic=False).to(device)
train(name_model, name_data_loader, num_epochs, "name_model.pt")
files.download("name_model.pt")

training:   0%|          | 0/10 [00:00<?, ?epoch/s]

epoch 1:   0%|          | 0/1049 [00:00<?, ?batch/s]

epoch 2:   0%|          | 0/1049 [00:00<?, ?batch/s]

epoch 3:   0%|          | 0/1049 [00:00<?, ?batch/s]

epoch 4:   0%|          | 0/1049 [00:00<?, ?batch/s]

epoch 5:   0%|          | 0/1049 [00:00<?, ?batch/s]

epoch 6:   0%|          | 0/1049 [00:00<?, ?batch/s]

epoch 7:   0%|          | 0/1049 [00:00<?, ?batch/s]

epoch 8:   0%|          | 0/1049 [00:00<?, ?batch/s]

epoch 9:   0%|          | 0/1049 [00:00<?, ?batch/s]

epoch 10:   0%|          | 0/1049 [00:00<?, ?batch/s]

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
name_model = Seq2seqTransliteration(name_vocab, name_pinyin_vocab, is_phonetic=False).to(device)
name_model.load_state_dict(torch.load("drive/MyDrive/NLP_Final_Datasets/name_model.pt", map_location=device))

<All keys matched successfully>

## Predicting Transliterations

We'll be using greedy decoding to check the transliterations the models produce for a given input.

In [None]:
def predict_greedy(model, input_vocab, output_vocab, sequence, max_length=10):
  """Greedily decode a pinyin transliteration for an English input string.

  At every step the single highest-scoring output token is taken and fed back
  into the decoder, until EOS is produced or `max_length` tokens were emitted.

  Relies on the notebook-level globals `bos_id`, `eos_id`, `device` and, for
  phonetic models, `eTi` (eng_to_ipa).

  Args:
    model: seq2seq model exposing `is_phonetic`, `encode` and `decode`.
    input_vocab: vocabulary providing `getIdsFromSequence`.
    output_vocab: vocabulary providing `decodeSequenceFromIds`.
    sequence: English text to transliterate.
    max_length: maximum number of output tokens to generate.

  Returns:
    The decoded output (tokens joined with " "), or None when the input is
    blank or, for phonetic models, when any word has no IPA representation.
  """
  model.eval()

  sequence = sequence.lower().strip()
  if not sequence:
    # Nothing to transliterate; also avoids indexing into an empty string.
    return None
  if model.is_phonetic:
    sequence = eTi.convert(sequence)
    # eng_to_ipa re-emits every unknown word with a trailing "*", so check each
    # word instead of only the last character of the whole string.
    if any(word.endswith("*") for word in sequence.split()):
      return None

  element_ids = input_vocab.getIdsFromSequence(sequence)
  # Transpose so the token axis comes first, as in the original call.
  source = torch.transpose(torch.tensor([element_ids]), 0, 1)

  response_ids = list()
  # Inference only: skip building the autograd graph.
  with torch.no_grad():
    encoder_output, encoder_hidden = model.encode(source.to(device))
    decoder_input = (torch.ones(1, 1).int() * bos_id).to(device)
    current_hidden = encoder_hidden

    for _step in range(max_length):
      logits, decoder_hidden, _ = model.decode(decoder_input, current_hidden, encoder_output)
      predicted_id = torch.argmax(logits, dim=1)
      response_ids.append(predicted_id.item())
      if predicted_id.item() == eos_id:
        break

      # Feed the prediction back in as the next decoder input.
      decoder_input = predicted_id.unsqueeze(0)
      current_hidden = decoder_hidden

  return output_vocab.decodeSequenceFromIds(response_ids, delimiter=" ")

In [None]:
def predict_beam(model, input_vocab, output_vocab, sequence, k=3, max_length=10):
    """Decode up to `k` pinyin transliterations for an English input with beam search.

    Hypotheses are ranked by cumulative log-likelihood; finished hypotheses are
    re-scored with length normalisation (log-likelihood / length**alpha).

    Relies on the notebook-level globals `bos_id`, `eos_id`, `device` and, for
    phonetic models, `eTi` (eng_to_ipa).

    Args:
        model: seq2seq model exposing `is_phonetic`, `encode` and `decode`.
        input_vocab: vocabulary providing `getIdsFromSequence`.
        output_vocab: vocabulary providing `decodeSequenceFromIds`.
        sequence: English text to transliterate.
        k: beam width and maximum number of results returned.
        max_length: maximum number of decoding steps.

    Returns:
        A list of decoded outputs, best first, or None when the input is blank
        or, for phonetic models, when any word has no IPA representation (the
        same convention `predict_greedy` uses).
    """
    alpha = 0.7  # length-normalisation exponent
    model.eval()

    text = sequence.lower().strip()
    if not text:
        return None
    if model.is_phonetic:
        # Phonetic models are fed IPA, exactly as predict_greedy does.
        text = eTi.convert(text)
        # eng_to_ipa re-emits every unknown word with a trailing "*".
        if any(word.endswith("*") for word in text.split()):
            return None

    def normalised(log_likelihood, tokens):
        # Tokens generated so far, excluding the leading BOS; never below 1 so
        # the division is always defined.
        generated = max(len(tokens) - 1, 1)
        return log_likelihood / (generated ** alpha)

    word_ids = input_vocab.getIdsFromSequence(text)
    source = torch.transpose(torch.tensor([word_ids]), 0, 1)

    finished_seqs = list()
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        encoder_output, encoder_hidden = model.encode(source.to(device))

        # Each hypothesis is (cumulative log-likelihood as float, token ids
        # including BOS, decoder hidden state). Plain floats keep sorting well
        # defined; tuples holding tensors are not safely comparable.
        beam = [(0.0, [bos_id], encoder_hidden)]

        for _step in range(max_length):
            new_beam = list()
            for beam_likelihood, tokens, current_hidden in beam:
                if tokens[-1] == eos_id:
                    finished_seqs.append((normalised(beam_likelihood, tokens), tokens))
                    continue

                decoder_input = torch.tensor([[tokens[-1]]]).to(device)
                logits, decoder_hidden, _ = model.decode(decoder_input, current_hidden, encoder_output)
                # log_softmax is the numerically stable form of log(softmax(x)).
                log_probs = torch.log_softmax(logits, dim=1)
                top_values, top_ids = torch.topk(log_probs, min(k, log_probs.shape[1]))

                for log_likelihood, token_id in zip(top_values.squeeze(0).tolist(), top_ids.squeeze(0).tolist()):
                    new_beam.append((beam_likelihood + log_likelihood, tokens + [token_id], decoder_hidden))

            new_beam.sort(key=lambda hypothesis: hypothesis[0], reverse=True)
            beam = new_beam[:k]
            if not beam:
                # Every surviving hypothesis has already finished.
                break

    # Hypotheses that emitted EOS on the very last step were never visited by
    # the loop above, so collect them here.
    for beam_likelihood, tokens, _ in beam:
        if tokens[-1] == eos_id:
            finished_seqs.append((normalised(beam_likelihood, tokens), tokens))

    if not finished_seqs:
        # Nothing reached EOS within max_length: fall back to the best
        # unfinished hypotheses instead of returning an empty list.
        finished_seqs = [(normalised(beam_likelihood, tokens), tokens) for beam_likelihood, tokens, _ in beam]

    finished_seqs.sort(key=lambda item: item[0], reverse=True)
    finished_seqs = finished_seqs[:k]
    results = [output_vocab.decodeSequenceFromIds(tokens, delimiter=" ") for score, tokens in finished_seqs]
    return results

In [None]:
def getTransliterations(model, mode="greedy"):
    """Interactively transliterate words typed by the user until 'q' or 'quit'.

    Args:
        model: transliteration model exposing `input_vocab` and `output_vocab`.
        mode: "beam" to decode with `predict_beam` (the top hypothesis is
            shown); any other value uses `predict_greedy`.

    Returns:
        A list of (input_sequence, generation) tuples, one per answered input.
    """
    if mode == "beam":
        predict_f = predict_beam
        response_label = 'Beam Response:'
    else:
        predict_f = predict_greedy
        response_label = 'Greedy Response:'
    chat_log = list()
    while True:
        # Get input sentence
        input_sequence = input('Input > ')
        # Check if it is quit case
        if input_sequence == 'q' or input_sequence == 'quit': break
        # Blank input has nothing to transliterate; ask again.
        if not input_sequence.strip():
            continue

        generation = predict_f(model, model.input_vocab, model.output_vocab, input_sequence)
        if generation is None:
          print("Your input doesn't have a phonetic representation. Try another word.")
          continue

        if mode == "beam":
            if not generation:
                # No hypothesis was returned, so there is nothing to index.
                print("Beam search produced no transliteration. Try another word.")
                continue
            # Keep only the best-scoring hypothesis.
            generation = generation[0]
        # Label the output with the decoder that actually produced it.
        print(response_label, generation)
        print()
        chat_log.append((input_sequence, generation))
    return chat_log

# IV. Evaluating the Models

Now that the models have been trained, we can see how well they work.

In [None]:
# Interactive session with the name model using greedy decoding (the default mode).
name_chat = getTransliterations(name_model)

Input > ian ferguson
Greedy Response: yang fu ge sen

Input > ian
Greedy Response: ai an

Input > ferguson
Greedy Response: fei gu sen

Input > quit


In [None]:
# Interactive session with the name model using beam search.
name_chat_beam = getTransliterations(name_model, mode="beam")

Input > national
Greedy Response: na shen ao na

Input > technology
Greedy Response: te ke nuo tuo yi

Input > president
Greedy Response: pu lei xi deng

Input > quit


In [None]:
# Interactive session with the phonetic model using greedy decoding (the default mode).
phonetic_chat = getTransliterations(phonetic_model)

Input > quit


In [None]:
# Interactive session with the combined model using greedy decoding (the default mode).
combined_chat = getTransliterations(combined_model)

Input > coma
Greedy Response: ke ma

Input > quit


In [None]:
# Interactive session with the combined model using beam search.
combined_chat_beam = getTransliterations(combined_model, mode="beam")

Input > national
Greedy Response: nai di er nuo

Input > technology
Greedy Response: tan pu luo ge

Input > president
Greedy Response: pu lei sen te

Input > additional
Greedy Response: dai de nai er

Input > government
Greedy Response: ge wen te

Input > quit


# V. Turning Pinyin into Meaningful Characters

The models give us useful pinyin. Now we want to turn that pinyin into meaningful characters that correspond to the meaning of the inputs.

In [None]:
%pip install fasttext
%pip install pinyin

import fasttext
import fasttext.util
import gensim
import requests
import pinyin as pinyin_converter

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


## Embedding Comparison

To find the closest semantic match between English and Chinese, we need some sort of pre-aligned word vectors. We'll be using the pre-trained fasttext word embeddings. This is a context-less embedding system. Since we're not working with contexts, fasttext seemed like an appropriate embedding scheme.

In [None]:
# Load the Chinese and English "align" fastText vectors from Drive (word2vec text format).
chin_ft = gensim.models.KeyedVectors.load_word2vec_format("drive/MyDrive/NLP_Final_Datasets/wiki.zh.align.vec")
eng_ft = gensim.models.KeyedVectors.load_word2vec_format("drive/MyDrive/NLP_Final_Datasets/wiki.en.align.vec")

In [None]:
# Sanity check of the cross-lingual alignment: a word and its translation
# should give a larger dot product than an unrelated pair.
chin_love_embed = np.array(chin_ft["爱"])
eng_love_embed = np.array(eng_ft["love"])
chin_protest_embed = np.array(chin_ft["抗议"])
eng_protest_embed = np.array(eng_ft["protest"])
# Order printed: love/love, love/protest, protest/protest, protest/love.
print(chin_love_embed @ eng_love_embed.T)
print(chin_love_embed @ eng_protest_embed.T)
print(chin_protest_embed @ eng_protest_embed.T)
print(chin_protest_embed @ eng_love_embed.T)

0.38272935
0.14973044
0.44158804
-0.0013554336


We have embeddings for both Chinese characters and English words. Using these, we can pick the characters whose embeddings have the highest dot product (most similar meaning) with the English input, restricted to characters that match the pinyin we already generated. First we have to turn that pinyin back into candidate characters to be evaluated.

In [None]:
# Google Input Tools endpoint; the query string asks for 10 candidates (num=10)
# from the pinyin input method. The pinyin text is appended directly after "text=".
URL = "https://www.google.com/inputtools/request?ime=pinyin&ie=utf-8&oe=utf-8&app=translate&num=10&text="

# Example syllable used to demonstrate the lookup.
pinyin = "shen"

response = requests.get(url=URL+pinyin)
# The candidate list is read from position [1][0][1] of the JSON payload.
characters_list = response.json()[1][0][1]
print(characters_list)

def verifyPinyin(pinyin, characters_list):
  """Filter candidate characters down to those that really read as `pinyin`.

  Each candidate is converted back to tone-less pinyin with `pinyin_converter`
  and kept only when that reading equals the requested syllable.

  Args:
    pinyin: the tone-less pinyin syllable the candidates should match.
    characters_list: candidate strings to check.

  Returns:
    A new list with the matching candidates, in their original order.
  """
  return [
      candidate
      for candidate in characters_list
      if pinyin_converter.get(candidate, format="strip", delimiter="") == pinyin
  ]

# Show which of the fetched candidates survive the pinyin check.
print(verifyPinyin(pinyin, characters_list))

['神', '深', '沈', '身', '申', '甚', '肾', '什', '审', '伸']
['神', '深', '沈', '身', '申', '甚', '肾', '审', '伸']


In [None]:
def chinClosestMatch(chin_vec, eng_vec, eng_word, pinyin):
  """Pick the Chinese characters for `pinyin` that are semantically closest to `eng_word`.

  A candidate string is scored by the dot product between the English word
  embedding and the sum of its character embeddings. Because that score is a
  sum of independent per-character dot products, the best string is obtained
  by choosing the best character for every syllable on its own. This needs
  work linear in the number of candidates rather than enumerating every
  combination, which grows exponentially with the number of syllables.

  Relies on the notebook-level `URL`, `requests` and `verifyPinyin`.

  Args:
    chin_vec: mapping from Chinese token to embedding; raises KeyError for
      unknown tokens.
    eng_vec: mapping from English word to embedding.
    eng_word: English word whose meaning should be matched.
    pinyin: space-separated pinyin syllables.

  Returns:
    One chosen candidate per syllable, concatenated. A syllable with no
    candidate that has an embedding is kept as its pinyin text, so the result
    always has one piece per syllable.

  Raises:
    KeyError: if `eng_word` has no embedding in `eng_vec`.
    requests.RequestException: if a candidate lookup fails or times out.
  """
  eng_word_embed = torch.tensor(eng_vec[eng_word])

  chosen_pieces = list()
  # split() without arguments ignores repeated/leading/trailing spaces, so no
  # request is ever made for an empty syllable.
  for pinyin_str in pinyin.split():
    # A timeout keeps one stalled request from hanging a whole evaluation run.
    response = requests.get(url=URL+pinyin_str, timeout=10)
    response.raise_for_status()
    possible_characters = response.json()[1][0][1]
    actual_characters = verifyPinyin(pinyin_str, possible_characters)

    best_character = None
    best_score = None
    for character in actual_characters:
      try:
        character_embed = torch.tensor(chin_vec[character])
      except KeyError:
        # No embedding for this candidate; it cannot be scored.
        continue
      score = torch.dot(character_embed, eng_word_embed).item()
      # Strict ">" keeps the first candidate on ties.
      if best_score is None or score > best_score:
        best_character = character
        best_score = score

    chosen_pieces.append(pinyin_str if best_character is None else best_character)

  return "".join(chosen_pieces)

def getSequencesEmbeds(listicus, chin_vec):
  """Enumerate every character combination and the sum of its embeddings.

  Args:
    listicus: list of candidate lists, one inner list per position.
    chin_vec: mapping from character to embedding; raises KeyError for unknown
      characters. Its `vector_size` attribute, when present, gives the
      embedding dimension (300 is assumed otherwise).

  Returns:
    (possible_strings, possible_embeds): parallel lists where
    possible_embeds[i] is a (1, dim) tensor holding the summed embeddings of
    the characters in possible_strings[i]. Candidates without an embedding are
    skipped; if a position has no usable candidate both lists come back empty.
    For an empty `listicus` the result is ([""], [zeros(1, dim)]).
  """
  embed_dim = getattr(chin_vec, "vector_size", 300)
  possible_strings = [""]
  # Kept as a list (not a bare tensor) so the return type is the same whether
  # or not the loop below runs.
  possible_embeds = [torch.zeros(1, embed_dim)]
  for listicus_child in listicus:
    new_possible_strings = list()
    new_possible_embeds = list()
    for character in listicus_child:
      try:
        character_embed = torch.tensor(chin_vec[character]).unsqueeze(0)
      except KeyError:
        continue

      # Extend every prefix built so far with this character.
      for prefix, prefix_embed in zip(possible_strings, possible_embeds):
        new_possible_strings.append(prefix + character)
        new_possible_embeds.append(prefix_embed + character_embed)
    possible_strings = new_possible_strings
    possible_embeds = new_possible_embeds

  return possible_strings, possible_embeds

A quick test that our methods are working correctly:

In [None]:
# Smoke test: three positions with two candidates each should give 2*2*2 = 8 combinations.
example_list = [["爱", "矮"], ["被", "杯"], ["次", "词"]]
strings, embeds = getSequencesEmbeds(example_list, chin_ft)
print(strings)
print(embeds[0].shape)
# Concatenating along dim 0 stacks the per-combination embeddings into one tensor.
print(torch.cat(embeds, dim=0).shape)

# End-to-end check: characters for the pinyin "ai qing" closest in meaning to "love".
characters = chinClosestMatch(chin_ft, eng_ft, "love", "ai qing")
print(characters)

['爱被次', '矮被次', '爱杯次', '矮杯次', '爱被词', '矮被词', '爱杯词', '矮杯词']
torch.Size([1, 300])
torch.Size([8, 300])
愛情


## Getting Output for Evaluation

We can do something similar to what we had above and include the most accurate characters this time.

In [None]:
def getTransliterationsWithChar(model, mode="greedy"):
    """Interactively transliterate words and choose the best-matching characters.

    For every input the model's pinyin is printed, followed by the characters
    `chinClosestMatch` selects for it using the notebook-level `chin_ft` and
    `eng_ft` vectors. Stops on 'q' or 'quit'.

    Args:
        model: transliteration model exposing `input_vocab` and `output_vocab`.
        mode: "beam" to decode with `predict_beam` (the top hypothesis is
            used); any other value uses `predict_greedy`.

    Returns:
        A list of (input_sequence, generation) tuples, one per answered input.
    """
    if mode == "beam":
        predict_f = predict_beam
        response_label = "Beam Response:"
    else:
        predict_f = predict_greedy
        response_label = "Greedy Response:"
    chat_log = list()
    while True:
        # Get input sentence
        input_sequence = input('Input > ')
        # Check if it is quit case
        if input_sequence == 'q' or input_sequence == 'quit': break
        # Blank input has nothing to transliterate; ask again.
        if not input_sequence.strip():
            continue

        generation = predict_f(model, model.input_vocab, model.output_vocab, input_sequence)
        if generation is None:
          print("Your input doesn't have a phonetic representation. Try another word.")
          continue

        if mode == "beam":
            if not generation:
                # No hypothesis was returned, so there is nothing to index.
                print("Beam search produced no transliteration. Try another word.")
                continue
            # Keep only the best-scoring hypothesis.
            generation = generation[0]

        # The predictors lowercase their input, so look up the same normalised
        # form in the English vectors.
        lookup_word = input_sequence.lower().strip()
        try:
            char_generation = chinClosestMatch(chin_ft, eng_ft, lookup_word, generation)
        except KeyError:
            # The word (e.g. a multi-word phrase) has no English embedding; keep
            # the session alive and still report the pinyin.
            char_generation = None

        # Label the output with the decoder that actually produced it.
        print(response_label, generation)
        if char_generation is None:
            print("No word embedding found for this input, so no characters could be chosen.")
        else:
            print("Most Accurate Characters:", char_generation)
        print()
        chat_log.append((input_sequence, generation))
    return chat_log

In [None]:
# Interactive session with the combined model, including character selection.
# NOTE: this rebinds `combined_chat`, replacing the log returned by the earlier getTransliterations session.
combined_chat = getTransliterationsWithChar(combined_model)

Input > computer
Greedy Response: kang bo te
Most Accurate Characters: 康博特

Input > payment
Greedy Response: pei men te
Most Accurate Characters: 赔們特

Input > product
Greedy Response: pu luo de ke te
Most Accurate Characters: 扑络的可鋱

Input > business
Greedy Response: bi si ni si
Most Accurate Characters: 必私呢私

Input > quit


In [None]:
def test_TransliterationsWithChar(model, test_data, mode="greedy"):
    """Batch version of the interactive loop: transliterate every test word.

    Uses the notebook-level `chin_ft` and `eng_ft` vectors for the character
    selection step.

    Args:
        model: transliteration model exposing `input_vocab` and `output_vocab`.
        test_data: iterable of rows whose first field is the English word
            (as returned by `read_csv_file`).
        mode: "beam" to decode with `predict_beam` (the top hypothesis is
            used); any other value uses `predict_greedy`.

    Returns:
        A list of [word, pinyin, characters] rows. Words that cannot be
        processed are reported on stdout and left out, so a single bad row
        no longer aborts the whole batch.
    """
    if mode == "beam":
        predict_f = predict_beam
    else:
        predict_f = predict_greedy
    chat_log = list()
    for word in test_data:
        # Blank CSV lines are parsed as empty rows; there is nothing to process.
        if not word or not word[0].strip():
            continue
        english_word = word[0]

        generation = predict_f(model, model.input_vocab, model.output_vocab, english_word)
        if generation is None:
          # Name the word so the skipped entry can be identified in a batch run.
          print("Skipping " + repr(english_word) + ":", "Your input doesn't have a phonetic representation. Try another word.")
          continue

        if mode == "beam":
            if not generation:
                print("Skipping " + repr(english_word) + ":", "beam search produced no transliteration.")
                continue
            # Keep only the best-scoring hypothesis.
            generation = generation[0]

        try:
            char_generation = chinClosestMatch(chin_ft, eng_ft, english_word, generation)
        except KeyError:
            # The word has no English embedding, so characters cannot be ranked.
            print("Skipping " + repr(english_word) + ":", "no word embedding found.")
            continue

        chat_log.append([english_word, generation, char_generation])
    return chat_log

In [None]:
import numpy as np
def read_csv_file(path):
    """Read a comma-separated UTF-8 file into a list of rows.

    Quoting is disabled, so quote characters are treated as ordinary text and
    every comma separates two fields.

    Args:
        path: path of the CSV file to read.

    Returns:
        A list with one list of string fields per line (an empty list for a
        blank line).
    """
    # newline='' lets the csv module handle line endings itself, as its
    # documentation recommends.
    with open(path, 'r', encoding = 'utf-8', newline='') as f:
        # quotechar=None is the supported way to say "no quote character";
        # an empty string is rejected with TypeError on Python 3.11+.
        reader = csv.reader(f, delimiter=',', quotechar=None, quoting=csv.QUOTE_NONE)
        return list(reader)

def list_to_csv(list, csv_file):
  """Write rows to a UTF-8 text file, one comma-joined line per row.

  No quoting or escaping is applied, matching the QUOTE_NONE dialect that
  `read_csv_file` reads.

  Args:
    list: iterable of rows; each row is an iterable of fields.
    csv_file: path of the file to (over)write.
  """
  with open(csv_file, 'w', encoding = 'utf-8') as myFile:
      for line in list:
          # str() lets rows contain non-string fields (numbers, None) without
          # str.join raising TypeError.
          new_line = ','.join(str(field) for field in line)
          myFile.write(f'{new_line}\n')

In [None]:
# Load the evaluation word lists from Drive, one CSV file per list.
# Each list is a list of rows as returned by read_csv_file.
test_words_1 = read_csv_file("drive/MyDrive/NLP_Final_Datasets/test_words_list_1.csv")
test_words_2 = read_csv_file("drive/MyDrive/NLP_Final_Datasets/test_words_list_2.csv")
test_words_3 = read_csv_file("drive/MyDrive/NLP_Final_Datasets/test_words_list_3.csv")
test_words_4 = read_csv_file("drive/MyDrive/NLP_Final_Datasets/test_words_list_4.csv")
test_words_5 = read_csv_file("drive/MyDrive/NLP_Final_Datasets/test_words_list_5.csv")
test_words_6 = read_csv_file("drive/MyDrive/NLP_Final_Datasets/test_words_list_6.csv")
test_words_7 = read_csv_file("drive/MyDrive/NLP_Final_Datasets/test_words_list_7.csv")
test_words_8 = read_csv_file("drive/MyDrive/NLP_Final_Datasets/test_words_list_8.csv")
test_words_9 = read_csv_file("drive/MyDrive/NLP_Final_Datasets/test_words_list_9.csv")
test_words_10 = read_csv_file("drive/MyDrive/NLP_Final_Datasets/test_words_list_10.csv")
# NOTE(review): the 10_1 / 10_2 / 10_3 files look like list 10 split into smaller parts -- confirm.
test_words_10_1 = read_csv_file("drive/MyDrive/NLP_Final_Datasets/test_words_list_10_1.csv")
test_words_10_2 = read_csv_file("drive/MyDrive/NLP_Final_Datasets/test_words_list_10_2.csv")
test_words_10_3 = read_csv_file("drive/MyDrive/NLP_Final_Datasets/test_words_list_10_3.csv")
test_words_11 = read_csv_file("drive/MyDrive/NLP_Final_Datasets/test_words_list_11.csv")

In [None]:
# Concatenate lists 1-11 into one evaluation set; the 10_1 / 10_2 / 10_3 lists are not added here.
# Every test_words_N name must already be defined (run the loading cell first), otherwise this raises NameError.
test_words = test_words_1 + test_words_2 + test_words_3 + test_words_4 + test_words_5 + test_words_6 + test_words_7 + test_words_8 + test_words_9 + test_words_10 + test_words_11

NameError: ignored

In [None]:
# Rebuild the phonetic model and restore its weights from "phonetic_model.pt"
# in the current working directory (a relative path, not the Drive folder).
phonetic_model = Seq2seqTransliteration(phonetic_vocab, word_pinyin_vocab, is_phonetic=True).to(device)
phonetic_model.load_state_dict(torch.load("phonetic_model.pt", map_location=device))

<All keys matched successfully>

In [None]:
# Rebuild the combined model and restore its weights from "combined_model.pt"
# in the current working directory (the earlier loading cell reads the copy on Drive).
combined_model = Seq2seqTransliteration(combined_vocab, combined_pinyin_vocab, is_phonetic=True).to(device)
combined_model.load_state_dict(torch.load("combined_model.pt", map_location=device))

<All keys matched successfully>

In [None]:
# Rebuild the name model and restore its weights from "name_model.pt"
# in the current working directory (the earlier loading cell reads the copy on Drive).
name_model = Seq2seqTransliteration(name_vocab, name_pinyin_vocab, is_phonetic=False).to(device)
name_model.load_state_dict(torch.load("name_model.pt", map_location=device))

<All keys matched successfully>

In [None]:
# Evaluate the phonetic model on the full concatenated word list.
results_phonetic_model = test_TransliterationsWithChar(phonetic_model, test_words)
# Number of rows produced; words the function skipped are not included.
print(len(results_phonetic_model))
print(test_words[:5])
print(results_phonetic_model[:5])
# Saved to the working directory (not Drive) as ", "-separated rows of [word, pinyin, characters].
np.savetxt("results_phonetic_model_trans2char.csv", results_phonetic_model, delimiter=", ", fmt="% s", encoding = 'utf-8')

124
[['that'], ['only'], ['when'], ['next'], ['used']]
[['that', 'tu te', '兔特'], ['only', 'ni long', '你隆'], ['when', 'ni long', '你隆'], ['next', 'ni ke', '你可'], ['used', 'you li', '由理']]


In [None]:
# Combined model on test list 1: transliterate, preview the first five rows, save the rows to Drive.
results1_combined_model = test_TransliterationsWithChar(combined_model, test_words_1)
print("results_combined_model: ", results1_combined_model[:5])
np.savetxt("drive/MyDrive/NLP_Final_Datasets/results1_combined_model_trans2char.csv", results1_combined_model, delimiter=", ", fmt="% s", encoding = 'utf-8')

results_combined_model:  [['that', 'ta te', '它特'], ['only', 'ang li', '昂例'], ['when', 'wen', '問'], ['next', 'ke ke si te', '可可四特'], ['used', 'ai zi', '矮自']]


In [None]:
# Combined model on test list 2: transliterate, preview the first five rows, save the rows to Drive.
results2_combined_model = test_TransliterationsWithChar(combined_model, test_words_2)
print("results2_combined_model: ", results2_combined_model[:5])
np.savetxt("drive/MyDrive/NLP_Final_Datasets/results2_combined_model_trans2char.csv", results2_combined_model, delimiter=", ", fmt="% s", encoding = 'utf-8')

results2_combined_model:  [['want', 'wan te', '玩鋱'], ['long', 'lang', '廊'], ['code', 'ke de', '刻的'], ['even', 'ai wen', '挨問'], ['sign', 'sai en', '嗮摁']]


In [None]:
# Combined model on test list 3: transliterate, preview the first five rows, save the rows to Drive.
results3_combined_model = test_TransliterationsWithChar(combined_model, test_words_3)
print("results3_combined_model: ", results3_combined_model[:5])
np.savetxt("drive/MyDrive/NLP_Final_Datasets/results3_combined_model_trans2char.csv", results3_combined_model, delimiter=", ", fmt="% s", encoding = 'utf-8')

results3_combined_model:  [['video', 'wei di ao', '微第傲'], ['where', 'wei er', '位而'], ['books', 'bu ke si', '簿刻私'], ['links', 'lin ke si', '麟可四'], ['years', 'ai er si', '唉而死']]


In [None]:
# Combined model on test list 4: transliterate, preview the first five rows, save the rows to Drive.
results4_combined_model = test_TransliterationsWithChar(combined_model, test_words_4)
print("results4_combined_model: ", results4_combined_model[:5])
np.savetxt("drive/MyDrive/NLP_Final_Datasets/results4_combined_model_trans2char.csv", results4_combined_model, delimiter=", ", fmt="% s", encoding = 'utf-8')

results4_combined_model:  [['store', 'si tuo er', '私拓而'], ['terms', 'te mu si', '特慕私'], ['local', 'luo ke er', '落客而'], ['those', 'suo si', '所私'], ['using', 'you xin', '由芯']]


In [None]:
# Combined model on test list 5: transliterate, preview the first five rows, save the rows to Drive.
results5_combined_model = test_TransliterationsWithChar(combined_model, test_words_5)
print("results5_combined_model: ", results5_combined_model[:5])
np.savetxt("drive/MyDrive/NLP_Final_Datasets/results5_combined_model_trans2char.csv", results5_combined_model, delimiter=", ", fmt="% s", encoding = 'utf-8')

results5_combined_model:  [['south', 'suo si', '所斯'], ['pages', 'pei qi si', '培其私'], ['found', 'fang de', '仿的'], ['photo', 'fu tuo', '福拖'], ['cards', 'ka zi', '咖梓']]


In [None]:
# Combined model on test list 6: transliterate, preview the first five rows, save the rows to Drive.
results6_combined_model = test_TransliterationsWithChar(combined_model, test_words_6)
print("results6_combined_model: ", results6_combined_model[:5])
np.savetxt("drive/MyDrive/NLP_Final_Datasets/results6_combined_model_trans2char.csv", results6_combined_model, delimiter=", ", fmt="% s", encoding = 'utf-8')

results6_combined_model:  [['people', 'pi pu er', '毗普而'], ['health', 'he er si', '和而司'], ['should', 'shu de', '熟的'], ['system', 'xi si te han', '系司特涵'], ['comment', 'kang men te', '扛悶鋱']]


In [None]:
# Combined model on test list 7: transliterate, preview the first five rows, save the rows to Drive.
results7_combined_model = test_TransliterationsWithChar(combined_model, test_words_7)
print("results7_combined_model: ", results7_combined_model[:5])
np.savetxt("drive/MyDrive/NLP_Final_Datasets/results7_combined_model_trans2char.csv", results7_combined_model, delimiter=", ", fmt="% s",encoding = 'utf-8')

results7_combined_model:  [['release', 'li li si', '力力死'], ['request', 'li ke wei si te', '理可未似特'], ['picture', 'pi ke qiu', '劈可秋'], ['meeting', 'mi ting', '秘庭'], ['similar', 'xi mu le', '喜母了']]


In [None]:
# Combined model on test list 8: transliterate, preview the first five rows, save the rows to Drive.
results8_combined_model = test_TransliterationsWithChar(combined_model, test_words_8)
print("results8_combined_model: ", results8_combined_model[:5])
np.savetxt("drive/MyDrive/NLP_Final_Datasets/results8_combined_model_trans2char.csv", results8_combined_model, delimiter=", ", fmt="% s", encoding = 'utf-8')

results8_combined_model:  [['business', 'bi si ni si', '必私呢私'], ['services', 'se wei si', '嗇未私'], ['products', 'pu luo de ke si', '普络的可私'], ['software', 'suo fu wei er', '所付微而'], ['research', 'li ze qi', '理則其']]


In [None]:
# Combined model on test list 9: transliterate, preview the first five rows, save the rows to Drive.
results9_combined_model = test_TransliterationsWithChar(combined_model, test_words_9)
print("results9_combined_model: ", results9_combined_model[:5])
np.savetxt("drive/MyDrive/NLP_Final_Datasets/results9_combined_model_trans2char.csv", results9_combined_model, delimiter=", ", fmt="% s", encoding = 'utf-8')

results9_combined_model:  [['download', 'tang wo er', '唐我兒'], ['equipment', 'yi ke men te', '亦可們鋱'], ['important', 'ying bo dun', '迎波盾'], ['something', 'sa mu xin', '仨幕心'], ['committee', 'kao mi di', '靠秘第']]


In [None]:
# Combined model on test list 10: transliterate, preview the first five rows, save the rows to Drive.
# NOTE(review): no output was captured for this cell; the following cells run the 10_1 / 10_2 / 10_3 lists separately.
results10_combined_model = test_TransliterationsWithChar(combined_model, test_words_10)
print("results10_combined_model: ", results10_combined_model[:5])
np.savetxt("drive/MyDrive/NLP_Final_Datasets/results10_combined_model_trans2char.csv", results10_combined_model, delimiter=", ", fmt="% s", encoding = 'utf-8')

In [None]:
# Combined model on test list 10_1: transliterate, preview the first five rows, save the rows to Drive.
results10_1_combined_model = test_TransliterationsWithChar(combined_model, test_words_10_1)
print("results10_1_combined_model: ", results10_1_combined_model[:5])
np.savetxt("drive/MyDrive/NLP_Final_Datasets/results10_1_combined_model_trans2char.csv", results10_1_combined_model, delimiter=", ", fmt="% s", encoding = 'utf-8')

results10_1_combined_model:  [['government', 'ge wen te man', '各文特曼'], ['department', 'de pa men te', '的帕們特'], ['categories', 'ka ta ge si', '咖它各四'], ['conditions', 'kong di si shen', '控低死甚']]


In [None]:
# Combined model on test list 10_2: transliterate, preview the first five rows, save the rows to Drive.
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt, so the CSV may not have been written.
results10_2_combined_model = test_TransliterationsWithChar(combined_model, test_words_10_2)
print("results10_2_combined_model: ", results10_2_combined_model[:5])
np.savetxt("drive/MyDrive/NLP_Final_Datasets/results10_2_combined_model_trans2char.csv", results10_2_combined_model, delimiter=", ", fmt="% s", encoding = 'utf-8')

KeyboardInterrupt: ignored

In [None]:
# Debugging aid for list 10_2: run words one at a time to find the one that stalls the batch.
# The commented-out loop covers every word after the first; only the first word is run below.
# for word in test_words_10_2[1:]:
#   print(test_TransliterationsWithChar(combined_model, [word]))

print(test_TransliterationsWithChar(combined_model, [test_words_10_2[0]]))

In [None]:
# Combined model on list 11, one word per call so each result prints as soon as it is ready.
# The [:-1] slice leaves out the last row of the list.
for word in test_words_11[:-1]:
  print(test_TransliterationsWithChar(combined_model, [word]))

[['application', 'a pu la ken han', '阿蒲辣墾含']]
[['performance', 'po fu lan si', '破付揽私']]
[['professional', 'pu luo fei er she', '普骆非而涉']]


In [None]:
# Combined model on test list 10_3: transliterate, preview the first five rows, save the rows to Drive.
results10_3_combined_model = test_TransliterationsWithChar(combined_model, test_words_10_3)
print("results10_3_combined_model: ", results10_3_combined_model[:5])
np.savetxt("drive/MyDrive/NLP_Final_Datasets/results10_3_combined_model_trans2char.csv", results10_3_combined_model, delimiter=", ", fmt="% s", encoding = 'utf-8')

results10_3_combined_model:  [['information', 'yin fu mei sen', '因付沒森'], ['development', 'di wei er te lan', '低微而特揽'], ['description', 'di ke si lin pu', '滴柯似林谱'], ['accessories', 'a ke se si li si', 'a可色私礼私']]


In [None]:
# Combined model on test list 11: transliterate, preview the first five rows, save the rows to Drive.
# NOTE(review): no output was captured for this cell.
results11_combined_model = test_TransliterationsWithChar(combined_model, test_words_11)
print("results11_combined_model: ", results11_combined_model[:5])
np.savetxt("drive/MyDrive/NLP_Final_Datasets/results11_combined_model_trans2char.csv", results11_combined_model, delimiter=", ", fmt="% s", encoding = 'utf-8')

In [None]:
# Name model on test list 1: transliterate, preview the first five rows, save the rows to Drive.
results1_name_model = test_TransliterationsWithChar(name_model, test_words_1)
print("results1_name_model: ", results1_name_model[:5])
np.savetxt("drive/MyDrive/NLP_Final_Datasets/results1_name_model_trans2char.csv", results1_name_model, delimiter=", ", fmt="% s", encoding = 'utf-8')

results1_name_model:  [['that', 'sa te', '薩特'], ['only', 'tang li', '唐例'], ['when', 'hui en', '會恩'], ['next', 'nei ke si te', '內可四特'], ['used', 'you sai de', '由塞的']]


In [None]:
# Name model on test list 2: transliterate, preview the first five rows, save the rows to Drive.
results2_name_model = test_TransliterationsWithChar(name_model, test_words_2)
print("results2_name_model: ", results2_name_model[:5])
np.savetxt("drive/MyDrive/NLP_Final_Datasets/results2_name_model_trans2char.csv", results2_name_model, delimiter=", ", fmt="% s", encoding = 'utf-8')


results2_name_model:  [['want', 'wan te', '玩鋱'], ['long', 'lang', '廊'], ['code', 'ke de', '刻的'], ['even', 'ai wen', '挨問'], ['sign', 'xi en', '西摁']]


In [None]:
# Name model on test list 3: transliterate, preview the first five rows, save the rows to Drive.
results3_name_model = test_TransliterationsWithChar(name_model, test_words_3)
print("results3_name_model: ", results3_name_model[:5])
np.savetxt("drive/MyDrive/NLP_Final_Datasets/results3_name_model_trans2char.csv", results3_name_model, delimiter=", ", fmt="% s", encoding = 'utf-8')


results3_name_model:  [['video', 'wei de ao', '微的傲'], ['where', 'hui er', '會而'], ['books', 'bu ke si', '簿刻私'], ['links', 'lin ke si', '麟可四'], ['years', 'yi er si', '已而死']]


In [None]:
# Name model on test list 4: transliterate, preview the first five rows, save the rows to Drive.
results4_name_model = test_TransliterationsWithChar(name_model, test_words_4)
print("results4_name_model: ", results4_name_model[:5])
np.savetxt("drive/MyDrive/NLP_Final_Datasets/results4_name_model_trans2char.csv", results4_name_model, delimiter=", ", fmt="% s", encoding = 'utf-8')


results4_name_model:  [['store', 'si tuo er', '私拓而'], ['terms', 'te mu si', '特慕私'], ['local', 'luo kao er', '落靠而'], ['those', 'suo si', '所私'], ['using', 'you xin', '由芯']]


In [None]:
# Name model on test list 5: transliterate, preview the first five rows, save the rows to Drive.
results5_name_model = test_TransliterationsWithChar(name_model, test_words_5)
print("results5_name_model: ", results5_name_model[:5])
np.savetxt("drive/MyDrive/NLP_Final_Datasets/results5_name_model_trans2char.csv", results5_name_model, delimiter=", ", fmt="% s", encoding = 'utf-8')


results5_name_model:  [['south', 'sa si', '薩斯'], ['pages', 'pei qi si', '培其私'], ['found', 'fang de', '仿的'], ['photo', 'bo tuo', '博拖'], ['cards', 'ka zi', '咖梓']]


In [None]:
# Name model on test list 6: transliterate, preview the first five rows, save the rows to Drive.
results6_name_model = test_TransliterationsWithChar(name_model, test_words_6)
print("results6_name_model: ", results6_name_model[:5])
np.savetxt("drive/MyDrive/NLP_Final_Datasets/results6_name_model_trans2char.csv", results6_name_model, delimiter=", ", fmt="% s", encoding = 'utf-8')

results6_name_model:  [['people', 'pu pu er', '普普而'], ['health', 'xi er si', '吸而司'], ['should', 'shu er de', '熟而的'], ['system', 'xi si te mu', '系司特木'], ['comment', 'ke men te', '客悶鋱']]


In [None]:
# NOTE(review): this cell repeats the previous one exactly (name model on test list 6);
# it recomputes the same rows and overwrites the same CSV.
results6_name_model = test_TransliterationsWithChar(name_model, test_words_6)
print("results6_name_model: ", results6_name_model[:5])
np.savetxt("drive/MyDrive/NLP_Final_Datasets/results6_name_model_trans2char.csv", results6_name_model, delimiter=", ", fmt="% s", encoding = 'utf-8')

results6_name_model:  [['people', 'pu pu er', '普普而'], ['health', 'xi er si', '吸而司'], ['should', 'shu er de', '熟而的'], ['system', 'xi si te mu', '系司特木'], ['comment', 'ke men te', '客悶鋱']]


In [None]:
# Name model on test list 7: transliterate, preview the first five rows, save the rows to Drive.
results7_name_model = test_TransliterationsWithChar(name_model, test_words_7)
print("results7_name_model: ", results7_name_model[:5])
np.savetxt("drive/MyDrive/NLP_Final_Datasets/results7_name_model_trans2char.csv", results7_name_model, delimiter=", ", fmt="% s", encoding = 'utf-8')

results7_name_model:  [['release', 'li li si', '力力死'], ['request', 'li kai si te', '理開似特'], ['picture', 'pi qiu er', '劈秋而'], ['meeting', 'mi ting', '秘庭'], ['similar', 'xi mi le', '喜迷了']]


In [None]:
# Name model on test list 8: transliterate, preview the first five rows, save the rows to Drive.
results8_name_model = test_TransliterationsWithChar(name_model, test_words_8)
print("results8_name_model: ", results8_name_model[:5])
np.savetxt("drive/MyDrive/NLP_Final_Datasets/results8_name_model_trans2char.csv", results8_name_model, delimiter=", ", fmt="% s", encoding = 'utf-8')

results8_name_model:  [['business', 'bu xi ni si', '不洗呢私'], ['services', 'se wei xi si', '嗇未西私'], ['products', 'pu luo du ke te', '普络都可鋱'], ['software', 'suo fu te wa er', '所付特挖而'], ['research', 'li qi', '理其']]


In [None]:
# Name model on test list 9: transliterate, preview the first five rows, save the rows to Drive.
results9_name_model = test_TransliterationsWithChar(name_model, test_words_9)
print("results9_name_model: ", results9_name_model[:5])
np.savetxt("drive/MyDrive/NLP_Final_Datasets/results9_name_model_trans2char.csv", results9_name_model, delimiter=", ", fmt="% s", encoding = 'utf-8')

results9_name_model:  [['download', 'tang luo de', '唐落的'], ['equipment', 'yi kui pu men', '亦奎谱們'], ['important', 'yin bo dun', '因波盾'], ['something', 'sa mi xin', '仨咪心'], ['committee', 'ke mi di wa', '柯秘第佤']]


In [None]:
# Name model on test list 10: transliterate, preview the first five rows, save the rows to Drive.
# NOTE(review): no output was captured for this cell; the following cells run the 10_1 / 10_2 / 10_3 lists separately.
results10_name_model = test_TransliterationsWithChar(name_model, test_words_10)
print("results10_name_model: ", results10_name_model[:5])
np.savetxt("drive/MyDrive/NLP_Final_Datasets/results10_name_model_trans2char.csv", results10_name_model, delimiter=", ", fmt="% s", encoding = 'utf-8')

In [None]:
# Name model on test list 10_1: transliterate, preview the first five rows, save the rows to Drive.
results10_1_name_model = test_TransliterationsWithChar(name_model, test_words_10_1)
print("results10_1_name_model: ", results10_1_name_model[:5])
np.savetxt("drive/MyDrive/NLP_Final_Datasets/results10_1_name_model_trans2char.csv", results10_1_name_model, delimiter=", ", fmt="% s", encoding = 'utf-8')

results10_1_name_model:  [['government', 'ge wen men te', '各文們特'], ['department', 'de pa te men te', '的帕特們特'], ['categories', 'ka te ge li si', '咖特各例四'], ['conditions', 'kang di shen si', '抗低甚死']]


In [None]:
# Name model on test list 10_2: transliterate, preview the first five rows, save the rows to Drive.
results10_2_name_model = test_TransliterationsWithChar(name_model, test_words_10_2)
print("results10_2_name_model: ", results10_2_name_model[:5])
np.savetxt("drive/MyDrive/NLP_Final_Datasets/results10_2_name_model_trans2char.csv", results10_2_name_model, delimiter=", ", fmt="% s", encoding = 'utf-8')

results10_2_name_model:  [['experience', 'ai ke si lin si', '愛可思霖思'], ['activities', 'a ke di wei si', '啊可地未私'], ['additional', 'a di you nei tuo er', '阿抵有內拖而'], ['california', 'ka li fu ni ya', '喀利福尼崖']]


In [None]:
# Name model on test list 10_3: transliterate, preview the first five rows, save the rows to Drive.
results10_3_name_model = test_TransliterationsWithChar(name_model, test_words_10_3)
print("results10_3_name_model: ", results10_3_name_model[:5])
np.savetxt("drive/MyDrive/NLP_Final_Datasets/results10_3_name_model_trans2char.csv", results10_3_name_model, delimiter=", ", fmt="% s", encoding = 'utf-8')

results10_3_name_model:  [['information', 'ying fu ma xi ma', '应付嗎希嗎'], ['development', 'de fu luo men men te', '的副落們們特'], ['description', 'de si ke li ting', '的似柯理挺'], ['accessories', 'a ke suo li si', 'a可所礼私']]


In [None]:
# The batch run for list 11 is disabled; results are printed per word instead and nothing is saved.
# results11_name_model = test_TransliterationsWithChar(name_model, test_words_11)
# print("results11_name_model: ", results11_name_model[:5])
# np.savetxt("drive/MyDrive/NLP_Final_Datasets/results11_name_model_trans2char.csv", results11_name_model, delimiter=", ", fmt="% s", encoding = 'utf-8')

# One word per call so each result prints as soon as it is ready.
for word in test_words_11:
  print(test_TransliterationsWithChar(name_model, [word]))

[['application', 'a pu li ka si ka', '阿蒲例咖私咖']]
[['performance', 'po fu man', '破付慢']]
[['professional', 'pu luo fei na er', '普骆非哪而']]
