# Replicating the fastText alignment code on a ZH - EN bilingual lexicon induction task

Experiments with the fastText code, using a ZH - EN task with a combination of LDC and CEDICT ZH - EN dictionaries as training data.

In [31]:
import numpy as np
from fasttext import FastVector

# from https://stackoverflow.com/questions/21030391/how-to-normalize-array-numpy
def normalized(a, axis=-1, order=2):
    """Scale `a` to unit norm along `axis`.

    The `order`-norm is taken along `axis`. Slices whose norm is exactly zero
    are divided by 1 instead, so they are returned unchanged rather than
    producing a division by zero.
    """
    norms = np.atleast_1d(np.linalg.norm(a, ord=order, axis=axis))
    zero_norm = norms == 0
    # Replace zero norms by 1 so the division below leaves those slices as-is.
    norms[zero_norm] = 1
    divisor = np.expand_dims(norms, axis)
    return a / divisor

def make_training_matrices(source_dictionary, target_dictionary, bilingual_dictionary):
    """
    Build paired training matrices from a bilingual dictionary.

    source_dictionary / target_dictionary are word -> vector lookups (the
    FastVector objects of the source/target languages); bilingual_dictionary
    is a list of translation pairs [(source_word, target_word), ...].
    Pairs are kept only when both words are present in their dictionary.
    Returns (source_matrix, target_matrix) as numpy arrays whose i-th rows
    belong to the same translation pair.
    """
    # Collect the vector pair of every translation both vocabularies know.
    vector_pairs = [
        (source_dictionary[src_word], target_dictionary[tgt_word])
        for src_word, tgt_word in bilingual_dictionary
        if src_word in source_dictionary and tgt_word in target_dictionary
    ]

    source_rows = [src_vec for src_vec, _ in vector_pairs]
    target_rows = [tgt_vec for _, tgt_vec in vector_pairs]
    return np.array(source_rows), np.array(target_rows)

def learn_transformation(source_matrix, target_matrix, normalize_vectors=True):
    """
    Learn the orthogonal matrix that aligns source vectors to target vectors.

    Source and target matrices are numpy arrays, shape
    (dictionary_length, embedding_dimension). These contain paired
    word vectors from the bilingual dictionary.

    For backward compatibility each argument may also be a stacked
    (source, target) pair of shape (2, dictionary_length,
    embedding_dimension), i.e. the un-unpacked return value of
    make_training_matrices. In that case the source half is taken from
    `source_matrix` and the target half from `target_matrix`.

    If `normalize_vectors` is true, every word vector is scaled to unit
    length before the alignment is computed.

    Returns the (embedding_dimension, embedding_dimension) product U @ Vh of
    the SVD of source^T @ target.

    Raises ValueError if, after unstacking, either input is not 2-D.
    """
    source_matrix = np.asarray(source_matrix)
    target_matrix = np.asarray(target_matrix)

    # optionally normalize the training vectors (row-wise, along the last axis)
    if normalize_vectors:
        source_matrix = normalized(source_matrix)
        target_matrix = normalized(target_matrix)
    print('source_matrix.shape target_matrix.shape', source_matrix.shape, target_matrix.shape)

    # The previous version always indexed source_matrix[0] / target_matrix[1],
    # which is only right for the stacked (source, target) input and picks
    # single row vectors when given real 2-D matrices. Unstack only when the
    # input actually is a stack.
    if source_matrix.ndim == 3:
        source_matrix = source_matrix[0]
    if target_matrix.ndim == 3:
        target_matrix = target_matrix[1]
    if source_matrix.ndim != 2 or target_matrix.ndim != 2:
        raise ValueError(
            'expected 2-D (dictionary_length, embedding_dimension) matrices, got shapes '
            '%s and %s' % (source_matrix.shape, target_matrix.shape))

    # perform the SVD of the cross-covariance between the two languages
    product = np.matmul(source_matrix.transpose(), target_matrix)
    # np.linalg.svd returns the third factor already transposed (Vh)
    U, _, Vh = np.linalg.svd(product)

    # return orthogonal transformation which aligns source language to the target
    return np.matmul(U, Vh)

The word vectors are the top 50k entries of the pretrained fastText embeddings for each language (`wiki.zh.50k.vec` and `wiki.en.50k.vec`).

In [3]:
# Disabled FR - RU example, kept as a string literal so that it is not executed.
'''
fr_dictionary = FastVector(vector_file='wiki.fr.vec')
ru_dictionary = FastVector(vector_file='wiki.ru.vec')

fr_vector = fr_dictionary["chat"]
ru_vector = ru_dictionary["кот"]
print(FastVector.cosine_similarity(fr_vector, ru_vector))
'''
# Load the Chinese and English word vectors used in the rest of the notebook.
zh_dictionary = FastVector(vector_file='wiki.zh.50k.vec')
en_dictionary = FastVector(vector_file='wiki.en.50k.vec')

# Cosine similarity of one translation pair ("middle" / "中") before any
# transformation has been applied to zh_dictionary.
en_vector = en_dictionary["middle"]
zh_vector = zh_dictionary["中"]
print(FastVector.cosine_similarity(en_vector, zh_vector))

reading word vectors from ../Georgina/master-thesis/data/wiki.zh.50k.vec
reading word vectors from ../Georgina/master-thesis/data/wiki.en.50k.vec
-0.026612317462


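The approach in the next cell, which uses identically spelled words found in both vocabularies as a pseudo-dictionary, does not work for ZH - EN: Chinese is not written in the Latin alphabet, so the only overlapping strings are Latin-script tokens (see the output below) rather than real translation pairs. The cell is therefore disabled.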

In [4]:
# Disabled: builds a pseudo-dictionary from the strings that occur in both
# vocabularies. The code is wrapped in a string literal so that it is not executed.
'''
en_words = set(en_dictionary.word2id.keys())
zh_words = set(zh_dictionary.word2id.keys())
overlap = list(en_words & zh_words)
bilingual_dictionary = [(entry, entry) for entry in overlap]
print(bilingual_dictionary[:10]) 
'''

[('ala', 'ala'), ('ivan', 'ivan'), ('holden', 'holden'), ('agriculture', 'agriculture'), ('koda', 'koda'), ('isn', 'isn'), ('anderson', 'anderson'), ('trans', 'trans'), ('korean', 'korean'), ('see', 'see')]


In [17]:
import sys
# Training dictionary: the shuffled LDC + CEDICT data set. Going by the file
# name it is de-duplicated and restricted to single-word entries - TODO confirm.
trainf = 'ldc-cedict-no-duplicates-shuffled-train-single-words.txt'
def read_dict(dict_file):
    """Read the training data (bilingual dictionary) from `dict_file`.

    Every non-blank line is split on whitespace and returned as a tuple,
    e.g. a line holding a source word and its translation becomes
    (source_word, target_word).

    The file is decoded explicitly as UTF-8 (assumed encoding of the
    dictionary; the platform default is not guaranteed to decode Chinese
    text) and is closed as soon as it has been read. Blank lines are
    skipped, because they would otherwise yield empty tuples that cannot be
    unpacked into a (source, target) pair later on.
    """
    with open(dict_file, encoding='utf-8') as handle:
        entries = (line.split() for line in handle)
        return [tuple(fields) for fields in entries if fields]
# Read the bilingual dictionary as a list of tuples, one per line of the file.
train_data = read_dict(trainf)
# Unused: would split the pairs into separate source and target word tuples.
#source_words, target_words = zip(*train_data)
# Show the first ten entries and the number of entries that were read.
print(train_data[:10])
print(len(train_data)) # 32012 entries in the recorded run

[('乌拉尔', 'Ural'), ('卡尔顿', 'Carleton'), ('孜孜矻矻', 'diligently'), ('攻击性', 'aggressiveness'), ('加勒比', 'Caribbean'), ('蕴藏', 'contain'), ('流线型', 'sleek'), ('蒙茸', 'jumbled'), ('看上', 'favor'), ('爪子', 'paw')]
32012


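Build the training matrices from the ZH - EN dictionary loaded above, then learn the transformation and apply it to the Chinese vectors.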

In [32]:
print("Applying the transformation.")
# form the training matrices
# (disabled EN -> ZH variant that relied on `bilingual_dictionary`)
#source_matrix, target_matrix = make_training_matrices(
#    en_dictionary, zh_dictionary, bilingual_dictionary)

# ZH is the source language and EN the target language in this task.
# NOTE(review): the chained assignment does NOT unpack the result. Both names
# are bound to the same (source, target) tuple returned by
# make_training_matrices, which is why the shapes printed during the call
# below are (2, 7906, 300). learn_transformation takes element 0 of its first
# argument and element 1 of its second, so ZH vectors are still paired with
# EN vectors.
source_matrix = target_matrix = make_training_matrices(zh_dictionary, en_dictionary, train_data)
# Disabled debugging output:
#print('source_matrix shape: ', type(source_matrix), np.ndim(source_matrix))
#print('target_matrix shape: ', type(target_matrix), np.ndim(target_matrix))
#print('source_dictionary: ', zh_dictionary.shape)
#print('target dictionary: ', en_dictionary.shape)

# learn and apply the transformation
transform = learn_transformation(source_matrix, target_matrix)
# NOTE(review): the return value is not used, so apply_transform presumably
# modifies zh_dictionary in place. If so, re-running this cell without
# reloading the vectors applies the transformation a second time - confirm.
zh_dictionary.apply_transform(transform)

print("Transformation was applied.")

Applying the transformation.
source_matrix.shape target_matrix.shape (2, 7906, 300) (2, 7906, 300)
Transformation was applied.


Finally, we re-evaluate the similarity of some words.

In [49]:
# Disabled leftover of the FR - RU example, kept as a string literal so that it
# is not executed (it refers to fr_vector / ru_vector / ru_dictionary, which
# are only assigned inside disabled code in this notebook).
'''
en_vector = en_dictionary["chat"]
zh_vector = ru_dictionary["кот"]
print(FastVector.cosine_similarity(fr_vector, ru_vector))
'''
# Cosine similarity of the translation pair "country" / "国".
en_vector = en_dictionary["country"]
zh_vector = zh_dictionary["国"]
print(FastVector.cosine_similarity(en_vector, zh_vector))

# Cosine similarity of the translation pair "middle" / "中"
# (the same pair that was measured right after loading the vectors).
en_vector = en_dictionary["middle"]
zh_vector = zh_dictionary["中"]
print(FastVector.cosine_similarity(en_vector, zh_vector))

# Control: cosine similarity of "middle" and an unrelated word ("啥"), to
# compare against the two translation pairs above.
en_vector = en_dictionary["middle"]
zh_vector = zh_dictionary["啥"]
print(FastVector.cosine_similarity(en_vector, zh_vector))

0.457366518916
0.430324895354
0.378405948261


In [48]:
# Try to translate the English word "country" into Chinese.
# The lookup starts from the EN vector and searches zh_dictionary; presumably
# translate_nearest_neighbour returns the zh_dictionary word whose vector is
# closest to the given one - TODO confirm against the FastVector class.
en_vector = en_dictionary["country"]
print(zh_dictionary.translate_nearest_neighbour(en_vector))

国家


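The translation seems to work for this word: the nearest Chinese neighbour of the English vector for "country" is 国家 ("country"). Note that this lookup goes from English to Chinese.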