# Using Pre-trained Word Embeddings

In [1]:
from mxnet import gluon
from mxnet import nd
from mxnet.gluon import text
from collections import Counter

## Creating Vocabulary with Word Embeddings

In [2]:
text.embedding.get_file_names('fasttext')[:5]

['crawl-300d-2M.vec',
 'wiki.aa.vec',
 'wiki.ab.vec',
 'wiki.ace.vec',
 'wiki.ady.vec']

In [3]:
# Toy corpus: three short sentences separated by newlines.
data = " hello world \n hello nice world \n hi world \n"
# Tally the tokens found in the corpus string; the resulting counter is what
# the vocabulary is built from in the next cell.
counter = text.utils.count_tokens_from_str(data)

In [4]:
# Index the tokens that were counted in the toy corpus.
vocab = text.vocab.Vocabulary(counter)
# Create the fastText embedding backed by the 'wiki.simple.vec' file
# (presumably fetched/cached on first use -- TODO confirm).
fasttext_simple = text.embedding.create('fasttext', file_name='wiki.simple.vec')
# Attach the pre-trained vectors so they can be read through vocab.embedding.
vocab.set_embedding(fasttext_simple)

  'skipped.' % (line_num, token, elems))


In [5]:
len(vocab)

5

In [6]:
vocab.embedding['beautiful'].shape

(300,)

In [7]:
vocab.embedding['beautiful'][:5]


[ 0.  0.  0.  0.  0.]
<NDArray 5 @cpu(0)>

In [8]:
vocab.embedding['hello', 'world'].shape

(2, 300)

In [9]:
vocab.embedding['hello', 'world'][:, :5]


[[ 0.39567     0.21454    -0.035389   -0.24299    -0.095645  ]
 [ 0.10444    -0.10858     0.27212     0.13299    -0.33164999]]
<NDArray 2x5 @cpu(0)>

In [10]:
vocab['hello', 'world']

[2, 1]

In [11]:
# The shape of the index-to-vector table gives the two sizes the layer needs.
input_dim, output_dim = vocab.embedding.idx_to_vec.shape
layer = gluon.nn.Embedding(input_dim, output_dim)
layer.initialize()
# Replace the freshly initialized weights with the pre-trained vectors.
layer.weight.set_data(vocab.embedding.idx_to_vec)
# 2 and 1 are the indices that vocab['hello', 'world'] returned above, so the
# first five components shown here should match the embedding lookup output.
layer(nd.array([2, 1]))[:, :5]


[[ 0.39567     0.21454    -0.035389   -0.24299    -0.095645  ]
 [ 0.10444    -0.10858     0.27212     0.13299    -0.33164999]]
<NDArray 2x5 @cpu(0)>

In [12]:
text.embedding.get_file_names('glove')[:5]

['glove.42B.300d.txt',
 'glove.6B.50d.txt',
 'glove.6B.100d.txt',
 'glove.6B.200d.txt',
 'glove.6B.300d.txt']

In [13]:
glove_6b50d = text.embedding.create('glove', file_name='glove.6B.50d.txt')

In [14]:
# NOTE: this rebinds `vocab`; the toy fastText vocabulary built earlier is
# replaced from here on.
# Counter over the embedding's token list yields a counter containing every
# GloVe token, so the vocabulary is built from the embedding's own tokens.
vocab = text.vocab.Vocabulary(Counter(glove_6b50d.idx_to_token))
vocab.set_embedding(glove_6b50d)

In [15]:
len(vocab.idx_to_token)

400001

In [16]:
# Round trip: token -> index, then index -> token.
print(vocab['beautiful'])
# 71421 is hard-coded; it is the index the line above prints for this vocabulary.
print(vocab.idx_to_token[71421])

71421
beautiful


## Applications of Word Embeddings

In [17]:
from mxnet import nd
def cos_sim(x, y):
    """Return the cosine similarity between two vectors.

    Computes ``dot(x, y) / (norm(x) * norm(y))`` using the ``nd`` operators.

    Args:
        x: First vector.
        y: Second vector; must be compatible with ``x`` for ``nd.dot``.

    Returns:
        The similarity as returned by the ``nd`` division (the notebook
        output shows a one-element NDArray).

    Note:
        No guard is applied against a zero norm, so an all-zero input
        divides by zero.
    """
    inner_product = nd.dot(x, y)
    norm_product = nd.norm(x) * nd.norm(y)
    return inner_product / norm_product

In [18]:
# Sanity check for cos_sim with hand-picked vectors.
x = nd.array([1, 2])
# y is x scaled by 10 (same direction); z is x negated (opposite direction).
y = nd.array([10, 20])
z = nd.array([-1, -2])

# Expected: 1 for the parallel pair, -1 for the anti-parallel pair.
print(cos_sim(x, y))
print(cos_sim(x, z))


[ 1.]
<NDArray 1 @cpu(0)>

[-1.]
<NDArray 1 @cpu(0)>


### Word Similarity

In [19]:
def norm_vecs_by_row(x):
    """Scale every row of a 2-D array to unit Euclidean length.

    Args:
        x: Array whose rows are the vectors to normalize (summed over axis 1).

    Returns:
        An array of the same shape in which each row has been divided by its
        own L2 norm.

    Note:
        A row of all zeros has norm 0, so its entries are divided by zero
        (presumably yielding NaN -- verify against the array library in use).
    """
    squared_row_sums = nd.sum(x * x, axis=1)
    # Column vector of norms so the division broadcasts across each row.
    row_norms = nd.sqrt(squared_row_sums).reshape((-1, 1))
    return x / row_norms

def get_knn(vocab, k, word):
    """Return the ``k`` vocabulary tokens most similar to ``word``.

    Every vocabulary vector is normalized to unit length and dotted with the
    embedding of ``word``; because the query vector is the same for all rows,
    ranking by this score is the same as ranking by cosine similarity.

    Args:
        vocab: Vocabulary with an attached embedding (must provide
            ``embedding``, ``to_tokens``, ``unknown_token`` and ``len``).
        k: Number of neighbours to return.
        word: Query token.

    Returns:
        A list of at most ``k`` tokens, best match first. The unknown token
        and ``word`` itself are never included.
    """
    word_vec = vocab.embedding[word].reshape((-1, 1))
    vocab_vecs = norm_vecs_by_row(vocab.embedding.idx_to_vec)
    dot_prod = nd.dot(vocab_vecs, word_vec)
    # Fetch two spare candidates to make up for the unknown token and the
    # query word, but never ask for more entries than the vocabulary holds.
    num_candidates = min(k + 2, len(vocab))
    indices = nd.topk(dot_prod.reshape((len(vocab), )), k=num_candidates,
                      ret_typ='indices')
    indices = [int(i.asscalar()) for i in indices]
    # Filter by token identity instead of assuming the unknown token and the
    # query word occupy the first two ranks; a positional drop would discard
    # real neighbours whenever that assumption does not hold.
    excluded = {vocab.unknown_token, word}
    neighbors = [token for token in vocab.to_tokens(indices)
                 if token not in excluded]
    return neighbors[:k]

In [20]:
get_knn(vocab, 5, 'baby')

['babies', 'boy', 'girl', 'newborn', 'pregnant']

In [21]:
cos_sim(vocab.embedding['baby'], vocab.embedding['babies'])


[ 0.83871305]
<NDArray 1 @cpu(0)>

In [22]:
get_knn(vocab, 5, 'computers')

['computer', 'phones', 'pcs', 'machines', 'devices']

In [23]:
get_knn(vocab, 5, 'run')

['running', 'runs', 'went', 'start', 'ran']

In [24]:
get_knn(vocab, 5, 'beautiful')

['lovely', 'gorgeous', 'wonderful', 'charming', 'beauty']

### Word Analogy

In [25]:
def get_top_k_by_analogy(vocab, k, word1, word2, word3):
    """Answer the analogy "word1 is to word2 as word3 is to ?".

    The target vector is ``vec(word2) - vec(word1) + vec(word3)``. Every
    vocabulary vector is normalized to unit length and dotted with the
    target, and the highest-scoring tokens are returned.

    Args:
        vocab: Vocabulary with an attached embedding.
        k: Number of candidate answers to return.
        word1: First token of the reference pair.
        word2: Second token of the reference pair.
        word3: Token to complete the analogy for.

    Returns:
        A list of ``k`` tokens, best match first. The input words are not
        filtered out of the result.
    """
    embedded = vocab.embedding[word1, word2, word3]
    target = embedded[1] - embedded[0] + embedded[2]
    target = target.reshape((-1, 1))
    unit_vocab_vecs = norm_vecs_by_row(vocab.embedding.idx_to_vec)
    scores = nd.dot(unit_vocab_vecs, target).reshape((len(vocab), ))
    # One spare candidate is requested so that k remain after one is dropped.
    ranked = nd.topk(scores, k=k+1, ret_typ='indices')
    ranked = [int(entry.asscalar()) for entry in ranked]

    # Drop the unknown token when it is ranked first; otherwise drop the spare.
    best_is_unknown = vocab.to_tokens(ranked[0]) == vocab.unknown_token
    kept = ranked[1:] if best_is_unknown else ranked[:-1]
    return vocab.to_tokens(kept)

In [26]:
get_top_k_by_analogy(vocab, 1, 'man', 'woman', 'son')

['daughter']

In [27]:
def cos_sim_word_analogy(vocab, word1, word2, word3, word4):
    """Score how well ``word4`` completes "word1 : word2 :: word3 : ?".

    Args:
        vocab: Vocabulary with an attached embedding.
        word1: First token of the reference pair.
        word2: Second token of the reference pair.
        word3: Token the analogy is applied to.
        word4: Proposed answer.

    Returns:
        The cosine similarity between ``vec(word2) - vec(word1) + vec(word3)``
        and ``vec(word4)``, as computed by ``cos_sim``.
    """
    embedded = vocab.embedding[[word1, word2, word3, word4]]
    predicted = embedded[1] - embedded[0] + embedded[2]
    return cos_sim(predicted, embedded[3])

cos_sim_word_analogy(vocab, 'man', 'woman', 'son', 'daughter')


[ 0.96583396]
<NDArray 1 @cpu(0)>

In [28]:
get_top_k_by_analogy(vocab, 1, 'beijing', 'china', 'tokyo')

['japan']

In [29]:
get_top_k_by_analogy(vocab, 1, 'bad', 'worst', 'big')

['biggest']

In [30]:
get_top_k_by_analogy(vocab, 1, 'do', 'did', 'go')

['went']