This repository has been archived by the owner on Dec 16, 2022. It is now read-only.

Commit

Reading glove embeddings: strip() --> rstrip() (#1056)
* changed strip() to rstrip() for correct reading of GloVe files

* add test case for rstrip

* Add unicode space to vocab
mandarjoshi90 authored and matt-gardner committed Apr 12, 2018
1 parent 0a918aa commit 4c02d92
Showing 2 changed files with 6 additions and 1 deletion.
allennlp/modules/token_embedders/embedding.py (1 addition, 1 deletion)

@@ -231,7 +231,7 @@ def _read_pretrained_word2vec_format_embedding_file(embeddings_filename: str, #
     logger.info("Reading embeddings from file")
     with gzip.open(cached_path(embeddings_filename), 'rb') as embeddings_file:
         for line in embeddings_file:
-            fields = line.decode('utf-8').strip().split(' ')
+            fields = line.decode('utf-8').rstrip().split(' ')
             if len(fields) - 1 != embedding_dim:
                 # Sometimes there are funny unicode parsing problems that lead to different
                 # fields lengths (e.g., a word with a unicode space character that splits
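A minimal sketch (illustration only, not part of the commit) of why the strip() --> rstrip() change matters: GloVe-style files can contain tokens that are themselves whitespace, such as a non-breaking space, and strip() deletes that token along with the trailing newline, so the field count no longer matches the embedding dimension; rstrip() only trims the line ending. The 3-dimensional vectors mirror the test below.

    # Illustration only; assumes a 3-dimensional embedding as in the test file below.
    line = b"\xc2\xa0 0.1 0.2 0.3\n"   # the token is a single non-breaking space (U+00A0)

    old_fields = line.decode('utf-8').strip().split(' ')
    # strip() treats U+00A0 as whitespace and removes it, so the token is lost:
    # ['0.1', '0.2', '0.3'] -> len(old_fields) - 1 == 2, failing the dimension check

    new_fields = line.decode('utf-8').rstrip().split(' ')
    # rstrip() only trims the trailing newline, so the token survives:
    # ['\xa0', '0.1', '0.2', '0.3'] -> len(new_fields) - 1 == 3, as expected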
tests/modules/token_embedders/embedding_test.py (5 additions, 0 deletions)

@@ -51,16 +51,21 @@ def test_embedding_layer_actually_initializes_word_vectors_correctly(self):
         vocab = Vocabulary()
         vocab.add_token_to_namespace("word")
         vocab.add_token_to_namespace("word2")
+        unicode_space = "\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"
+        vocab.add_token_to_namespace(unicode_space)
         embeddings_filename = self.TEST_DIR + "embeddings.gz"
         with gzip.open(embeddings_filename, 'wb') as embeddings_file:
             embeddings_file.write("word 1.0 2.3 -1.0\n".encode('utf-8'))
+            embeddings_file.write(f"{unicode_space} 3.4 3.3 5.0\n".encode('utf-8'))
         params = Params({
                 'pretrained_file': embeddings_filename,
                 'embedding_dim': 3,
                 })
         embedding_layer = Embedding.from_params(vocab, params)
         word_vector = embedding_layer.weight.data[vocab.get_token_index("word")]
         assert numpy.allclose(word_vector.numpy(), numpy.array([1.0, 2.3, -1.0]))
+        word_vector = embedding_layer.weight.data[vocab.get_token_index(unicode_space)]
+        assert numpy.allclose(word_vector.numpy(), numpy.array([3.4, 3.3, 5.0]))
         word_vector = embedding_layer.weight.data[vocab.get_token_index("word2")]
         assert not numpy.allclose(word_vector.numpy(), numpy.array([1.0, 2.3, -1.0]))

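As a follow-up usage sketch (assumptions: a standalone script, a hypothetical local path "embeddings.gz", and the same 3-dimensional toy vectors as the test), the fixed rstrip()-based parsing accepts both the ordinary token and the non-breaking-space token when the gzipped file is read back:

    import gzip

    path = "embeddings.gz"  # hypothetical path; the test uses self.TEST_DIR + "embeddings.gz"
    unicode_space = "\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"

    # Write a tiny gzipped embeddings file, mirroring the test setup above.
    with gzip.open(path, 'wb') as embeddings_file:
        embeddings_file.write("word 1.0 2.3 -1.0\n".encode('utf-8'))
        embeddings_file.write(f"{unicode_space} 3.4 3.3 5.0\n".encode('utf-8'))

    # Read it back the way the fixed reader does and check the field count.
    embedding_dim = 3
    with gzip.open(path, 'rb') as embeddings_file:
        for line in embeddings_file:
            fields = line.decode('utf-8').rstrip().split(' ')
            assert len(fields) - 1 == embedding_dim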
