# Word translation for Chinese - English with Singular Value Decomposition

Experiments with the "fastText_multilingual" code repo by Babylon Health, using a ZH - EN task with a dictionary provided by Facebook research as training and test data.

In [1]:
import numpy as np
from fasttext import FastVector

# from https://stackoverflow.com/questions/21030391/how-to-normalize-array-numpy
def normalized(a, axis=-1, order=2):
    """Scale the vectors of `a` that lie along `axis` to unit `order`-norm.

    Vectors whose norm is zero are divided by 1 instead, so they come back
    unchanged rather than triggering a division by zero.
    """
    norms = np.linalg.norm(a, ord=order, axis=axis)
    # a 1-D input yields a scalar norm; make it an array so it can be masked
    norms = np.atleast_1d(norms)
    zero_norm = norms == 0
    norms[zero_norm] = 1
    # re-insert the reduced axis so the division broadcasts against `a`
    divisor = np.expand_dims(norms, axis)
    return a / divisor

def make_training_matrices(source_dictionary, target_dictionary, bilingual_dictionary):
    """
    Build the paired training matrices for a bilingual dictionary.

    source_dictionary and target_dictionary are the FastVector objects
    of the source/target languages; bilingual_dictionary is a list of
    (source_word, target_word) tuples. A pair is dropped when either
    word is missing from its dictionary. Returns the tuple
    (source_matrix, target_matrix) of numpy arrays whose rows line up
    pair by pair.
    """
    usable_pairs = [
        (src_word, tgt_word)
        for (src_word, tgt_word) in bilingual_dictionary
        if src_word in source_dictionary and tgt_word in target_dictionary
    ]
    source_rows = [source_dictionary[src_word] for src_word, _ in usable_pairs]
    target_rows = [target_dictionary[tgt_word] for _, tgt_word in usable_pairs]
    return np.array(source_rows), np.array(target_rows)

def learn_transformation(source_matrix, target_matrix, normalize_vectors=True):
    """
    Learn the orthogonal map that aligns source vectors to target vectors.

    Source and target matrices are numpy arrays, shape
    (dictionary_length, embedding_dimension). These contain paired
    word vectors from the bilingual dictionary: row i of source_matrix
    is the translation partner of row i of target_matrix.

    For backward compatibility each argument may also be the stacked
    pair (source_matrix, target_matrix), given as a 3-D array or as a
    tuple of the two matrices. That is what a caller passes when it
    binds the tuple returned by make_training_matrices to both names.
    In that case the source half is taken from the first argument and
    the target half from the second.

    If normalize_vectors is true, every word vector is scaled to unit
    length before the transformation is learned.

    Returns the (embedding_dimension, embedding_dimension) matrix
    U @ Vh taken from the SVD of source_matrix.T @ target_matrix.
    Raises ValueError when the two matrices are not 2-D arrays of the
    same shape after the stacked form has been resolved.
    """
    source_matrix = np.asarray(source_matrix)
    target_matrix = np.asarray(target_matrix)

    # optionally normalize the training vectors (row-wise, along the last axis)
    if normalize_vectors:
        source_matrix = normalized(source_matrix)
        target_matrix = normalized(target_matrix)
    print('source_matrix.shape target_matrix.shape', source_matrix.shape, target_matrix.shape)

    # Legacy stacked input: pick the matching half out of each stack.
    if source_matrix.ndim == 3:
        source_matrix = source_matrix[0]
    if target_matrix.ndim == 3:
        target_matrix = target_matrix[1]

    if source_matrix.ndim != 2 or source_matrix.shape != target_matrix.shape:
        raise ValueError(
            'source and target matrices must be 2-D arrays of identical shape, '
            'got %s and %s' % (source_matrix.shape, target_matrix.shape))

    # perform the SVD of the (embedding_dimension x embedding_dimension)
    # cross-covariance between the paired source and target vectors
    product = np.matmul(source_matrix.transpose(), target_matrix)
    # np.linalg.svd returns the third factor already transposed (Vh)
    U, _, Vh = np.linalg.svd(product)

    # return orthogonal transformation which aligns source language to the target
    return np.matmul(U, Vh)

The word vectors are the first 50k entries of the pretrained fastText embeddings. First, we download the pretrained embeddings. Switch to your favorite terminal window and execute the following:
<br>
> curl -Lo wiki.zh.vec https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.zh.vec
<br>
> curl -Lo wiki.en.vec https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.en.vec
<br>

After downloading the word vectors, perform the following commands:
<br>
> head -50000 wiki.zh.vec > wiki.zh.50k.vec
<br>
> head -50000 wiki.en.vec > wiki.en.50k.vec

<br>
This speeds up computation and puts less data in your RAM.

In [2]:
'''
fr_dictionary = FastVector(vector_file='wiki.fr.vec')
ru_dictionary = FastVector(vector_file='wiki.ru.vec')

fr_vector = fr_dictionary["chat"]
ru_vector = ru_dictionary["кот"]
print(FastVector.cosine_similarity(fr_vector, ru_vector))
'''
# The string literal above only holds a disabled French/Russian example;
# it is never executed.

# Load the Chinese and English word vectors from the 50k .vec files.
# These two names are reused by the later cells of the notebook.
zh_dictionary = FastVector(vector_file='../Georgina/master-thesis/data/wiki.zh.50k.vec')
en_dictionary = FastVector(vector_file='../Georgina/master-thesis/data/wiki.en.50k.vec')

# Cosine similarity of "middle" and "中" before any transformation has been
# applied to the Chinese vectors.
en_vector = en_dictionary["middle"]
zh_vector = zh_dictionary["中"]
print(FastVector.cosine_similarity(en_vector, zh_vector))

reading word vectors from ../Georgina/master-thesis/data/wiki.zh.50k.vec
reading word vectors from ../Georgina/master-thesis/data/wiki.en.50k.vec
-0.026612317462


Download the bilingual dictionary training and test data provided by Facebook Research.
Switch into bash and execute
<br>
> curl -Lo zh-en.0-5000.txt https://s3.amazonaws.com/arrival/dictionaries/zh-en.0-5000.txt 

<br>
This is a pretty standard 5k dictionary which is a good size for training our mapping.

In [3]:
import sys
# Alternative training set (LDC and CEDICT, shuffled), currently disabled:
#trainf = '../Georgina/master-thesis/data/zh/ldc-cedict-no-duplicates-shuffled-train-single-words.txt'
# Active training dictionary: the file fetched by the curl command above.
trainf = 'zh-en.0-5000.txt'
#Open the training data (bilingual dictionary)
def read_dict(dict_file):
    """Read a bilingual dictionary file into a list of word tuples.

    Each non-blank line is split on whitespace and returned as a tuple of
    its tokens, e.g. ('年', 'year'). Blank lines are skipped because an
    empty tuple cannot be unpacked as a (source, target) pair later on.

    The file is decoded as UTF-8 explicitly, since it contains Chinese
    text and the platform default encoding is not guaranteed to be UTF-8.
    The handle is closed as soon as the file has been read.
    """
    with open(dict_file, encoding='utf-8') as handle:
        return [tuple(line.split()) for line in handle if line.strip()]
# Load the training dictionary: one tuple of whitespace-separated tokens per line
train_data = read_dict(trainf)
# Disabled: split the pairs into separate source-word and target-word tuples
#source_words, target_words = zip(*train_data)
# Show the first ten entries and the number of entries as a sanity check
print(train_data[:10])
print(len(train_data)) # NOTE(review): the recorded output for this file is 8891; the former "32012 lines" remark presumably referred to the LDC/CEDICT set - confirm

[('年', 'year'), ('月', 'moon'), ('月', 'months'), ('月', 'month'), ('日', 'day'), ('和', 'and'), ('村', 'village'), ('人', 'man'), ('人', 'people'), ('%', '%')]
8891


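Build the training matrices from the ZH - EN dictionary, then learn the transformation and apply it to the Chinese vectors.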

In [4]:
print("Applying the transformation.")
# form the training matrices
# (disabled variant from the original example, which used other dictionaries)
#source_matrix, target_matrix = make_training_matrices(
#    en_dictionary, zh_dictionary, bilingual_dictionary)

#I'm doing a ZH - EN translation task here
# NOTE(review): this is a chained assignment, not tuple unpacking. Both names
# are bound to the same (source, target) tuple returned by
# make_training_matrices, which is why the shapes printed below are
# (2, 8719, 300). learn_transformation picks element [0] out of its first
# argument and element [1] out of its second when it is handed this stacked
# form, so the call works. Changing this line to
# `source_matrix, target_matrix = ...` requires learn_transformation to accept
# plain 2-D matrices.
source_matrix = target_matrix = make_training_matrices(zh_dictionary, en_dictionary, train_data)
#print('source_matrix shape: ', type(source_matrix), np.ndim(source_matrix))
#print('target_matrix shape: ', type(target_matrix), np.ndim(target_matrix))
#print('source_dictionary: ', zh_dictionary.shape)
#print('target dictionary: ', en_dictionary.shape)

# learn and apply the transformation
# (Chinese is the source side, so the transform is applied to zh_dictionary)
transform = learn_transformation(source_matrix, target_matrix)
zh_dictionary.apply_transform(transform)

print("Transformation was applied.")

Applying the transformation.
source_matrix.shape target_matrix.shape (2, 8719, 300) (2, 8719, 300)
Transformation was applied.


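Finally, we re-evaluate the similarity of translation pairs after the alignment (the original example used "chat" and "кот"; here we use ZH - EN pairs such as "country" and "国"):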

In [5]:
'''
en_vector = en_dictionary["chat"]
zh_vector = ru_dictionary["кот"]
print(FastVector.cosine_similarity(fr_vector, ru_vector))
'''
# Cosine similarity for three (English, Chinese) probes, printed in this order:
# "country"/"国" and "middle"/"中" are the pairs being checked, while
# "middle"/"啥" pairs "middle" with a random unrelated word.
probe_pairs = (
    ("country", "国"),
    ("middle", "中"),
    ("middle", "啥"),
)
for english_word, chinese_word in probe_pairs:
    en_vector = en_dictionary[english_word]
    zh_vector = zh_dictionary[chinese_word]
    print(FastVector.cosine_similarity(en_vector, zh_vector))

0.415512686471
0.415020491828
0.369425876542


In [6]:
# Try to translate the word "learn": look up its English vector and print what
# translate_nearest_neighbour returns for it on the transformed Chinese
# dictionary (presumably the Chinese word with the closest vector - confirm
# against FastVector).
en_vector = en_dictionary["learn"]
print(zh_dictionary.translate_nearest_neighbour(en_vector))

學習


ZH - EN translation seems to work for this word. Note that the dictionary contains traditional characters (as used in Taiwan and Hong Kong). The first two words are junk translations. Now, let's export this matrix and evaluate it externally.

In [7]:
# Export the Chinese dictionary (already transformed in an earlier cell) to
# 'transformed-matrix-facebookDict.txt' so it can be evaluated externally.
zh_dictionary.export('transformed-matrix-facebookDict.txt')
print('Done.')

Done.


Now let's get our test dictionary for ZH - EN from fastText. For that we switch back into bash. <br>
> curl -Lo zh-en.5000-6500.txt https://s3.amazonaws.com/arrival/dictionaries/zh-en.5000-6500.txt

In [8]:
# Test dictionary fetched with the curl command above
testf = 'zh-en.5000-6500.txt'
# One tuple of whitespace-separated tokens per line, same format as train_data
test_data = read_dict(testf)
# Disabled: split pairs into source and target words.
# NOTE(review): the disabled line refers to train_data, not test_data.
#source_words, target_words = zip(*train_data)
# Show the first ten entries and the number of entries as a sanity check
print(test_data[:10])
print(len(test_data)) 

[('反正', 'anyway'), ('繼承人', 'heir'), ('繼承人', 'heirs'), ('斤', 'catty'), ('導航', 'navigate'), ('導航', 'navigator'), ('導航', 'navigation'), ('導航', 'navigating'), ('鵰', 'eagle'), ('退化', 'degeneration')]
2483


After reading it in, let's start evaluating. Simply loop over all words in our test dictionary, compute the nearest neighbor and count how many have been translated correctly.<br>
This implementation is quite slow; I guess it could be sped up with a faster nearest-neighbor search (think FAISS).

In [10]:
print('Starting evaluation')
#n = len(test_data) #vocabulary size
n = 0  # number of scored pairs, i.e. pairs whose English word has a vector
prec1_cnt = 0  # how many of the scored pairs we got right so far
# Only the first 100 test pairs are scored because the nearest-neighbour
# lookup is slow.
# NOTE(review): the lookup goes from the English vector to its nearest
# neighbour in zh_dictionary and is compared with the Chinese word, although
# the printed label reads "ZH - EN".
for c, e in test_data[:100]:  # c is the Chinese word, e is the English word
    if e not in en_dictionary:
        # no word vector for this English word: leave it out of the denominator
        continue
    n += 1
    en_vector = en_dictionary[e]
    translation = zh_dictionary.translate_nearest_neighbour(en_vector)
    if translation == c:  # hooray, correct translation!
        prec1_cnt += 1
# Guard the ratio: if none of the English test words has a vector, n is 0 and
# a plain division would raise ZeroDivisionError; report NaN instead.
print('prec@1 ZH - EN: ', prec1_cnt / n if n else float('nan'))
print('Size of test vocabulary: ', n)

Starting evaluation
prec@1 ZH - EN:  0.2857142857142857
Size of test vocabulary:  91


Done! For speed reasons, we test here only with the first 100 entries, but you can of course compute the performance on the whole test set if you have time. It's pretty good for such a simple method. Keep in mind that we only used a vocabulary size of 50k in our word embeddings, so some words in our test set were not found in the embedding vocabulary (only 91 of the first 100 entries were scored).