This repository has been archived by the owner on Dec 16, 2022. It is now read-only.

Commit

Reading glove embeddings: strip() --> rstrip() (#1056)
* changed strip() to rstrip() for correct reading of GloVe files

* add test case for rstrip

* Add unicode space to vocab
mandarjoshi90 authored and matt-gardner committed Apr 12, 2018
1 parent 0a918aa commit 4c02d92
Showing 2 changed files with 6 additions and 1 deletion.
allennlp/modules/token_embedders/embedding.py (1 addition, 1 deletion)

@@ -231,7 +231,7 @@ def _read_pretrained_word2vec_format_embedding_file(embeddings_filename: str, #
     logger.info("Reading embeddings from file")
     with gzip.open(cached_path(embeddings_filename), 'rb') as embeddings_file:
         for line in embeddings_file:
-            fields = line.decode('utf-8').strip().split(' ')
+            fields = line.decode('utf-8').rstrip().split(' ')
             if len(fields) - 1 != embedding_dim:
                 # Sometimes there are funny unicode parsing problems that lead to different
                 # fields lengths (e.g., a word with a unicode space character that splits
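A minimal sketch (illustration only, not part of the commit) of why the strip() --> rstrip() change matters: GloVe-style files can contain tokens that are themselves whitespace, such as a non-breaking space, and strip() deletes that token along with the trailing newline, so the field count no longer matches the embedding dimension; rstrip() only trims the line ending. The 3-dimensional vectors mirror the test below.

    # Illustration only; assumes a 3-dimensional embedding as in the test file below.
    line = b"\xc2\xa0 0.1 0.2 0.3\n"   # the token is a single non-breaking space (U+00A0)

    old_fields = line.decode('utf-8').strip().split(' ')
    # strip() treats U+00A0 as whitespace and removes it, so the token is lost:
    # ['0.1', '0.2', '0.3'] -> len(old_fields) - 1 == 2, failing the dimension check

    new_fields = line.decode('utf-8').rstrip().split(' ')
    # rstrip() only trims the trailing newline, so the token survives:
    # ['\xa0', '0.1', '0.2', '0.3'] -> len(new_fields) - 1 == 3, as expected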
tests/modules/token_embedders/embedding_test.py (5 additions, 0 deletions)

@@ -51,16 +51,21 @@ def test_embedding_layer_actually_initializes_word_vectors_correctly(self):
         vocab = Vocabulary()
         vocab.add_token_to_namespace("word")
         vocab.add_token_to_namespace("word2")
+        unicode_space = "\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"
+        vocab.add_token_to_namespace(unicode_space)
         embeddings_filename = self.TEST_DIR + "embeddings.gz"
         with gzip.open(embeddings_filename, 'wb') as embeddings_file:
             embeddings_file.write("word 1.0 2.3 -1.0\n".encode('utf-8'))
+            embeddings_file.write(f"{unicode_space} 3.4 3.3 5.0\n".encode('utf-8'))
         params = Params({
                 'pretrained_file': embeddings_filename,
                 'embedding_dim': 3,
                 })
         embedding_layer = Embedding.from_params(vocab, params)
         word_vector = embedding_layer.weight.data[vocab.get_token_index("word")]
         assert numpy.allclose(word_vector.numpy(), numpy.array([1.0, 2.3, -1.0]))
+        word_vector = embedding_layer.weight.data[vocab.get_token_index(unicode_space)]
+        assert numpy.allclose(word_vector.numpy(), numpy.array([3.4, 3.3, 5.0]))
         word_vector = embedding_layer.weight.data[vocab.get_token_index("word2")]
         assert not numpy.allclose(word_vector.numpy(), numpy.array([1.0, 2.3, -1.0]))

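As a follow-up usage sketch (assumptions: a standalone script, a hypothetical local path "embeddings.gz", and the same 3-dimensional toy vectors as the test), the fixed rstrip()-based parsing accepts both the ordinary token and the non-breaking-space token when the gzipped file is read back:

    import gzip

    path = "embeddings.gz"  # hypothetical path; the test uses self.TEST_DIR + "embeddings.gz"
    unicode_space = "\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"

    # Write a tiny gzipped embeddings file, mirroring the test setup above.
    with gzip.open(path, 'wb') as embeddings_file:
        embeddings_file.write("word 1.0 2.3 -1.0\n".encode('utf-8'))
        embeddings_file.write(f"{unicode_space} 3.4 3.3 5.0\n".encode('utf-8'))

    # Read it back the way the fixed reader does and check the field count.
    embedding_dim = 3
    with gzip.open(path, 'rb') as embeddings_file:
        for line in embeddings_file:
            fields = line.decode('utf-8').rstrip().split(' ')
            assert len(fields) - 1 == embedding_dim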
