bigdata-ustc · tswsxk · Nov 15, 2021 · Nov 13, 2021 · Nov 13, 2021 · tswsxk
diff --git a/CHANGE.txt b/CHANGE.txt
@@ -1,3 +1,12 @@
+v0.0.7:
+    1. add BERT and pretrained model (luna_bert)
+    2. speed up the process in sif
+    3. handle out-of-vocabulary (OOV) tokens in word2vec
+    4. add English tutorials
+    5. add api docs and prettify tutorials
+    6. fix the np.error in gensim_vec.W2V.infer_vector
+    7. fix parameters being lost in tokenization
+
 v0.0.6:
     1. dev: add half-pretrained rnn model
     2. important!!!: rename TextTokenizer to PureTextTokenizer, and add a new tokenizer named TextTokenizer (the two have similar but not the same behaviours).

diff --git a/EduNLP/Vector/gensim_vec.py b/EduNLP/Vector/gensim_vec.py
@@ -62,7 +62,8 @@ def __call__(self, *words):
             yield self[word]
 
     def __getitem__(self, item):
-        return self.wv[item] if item not in self.constants else np.zeros((self.vector_size,))
+        index = self.key_to_index(item)
+        return self.wv[item] if index not in self.constants.values() else np.zeros((self.vector_size,))
 
     def infer_vector(self, items, agg="mean", *args, **kwargs) -> np.ndarray:
         token_vectors = self.infer_tokens(items, *args, **kwargs)

diff --git a/tests/test_vec/test_vec.py b/tests/test_vec/test_vec.py
@@ -86,6 +86,7 @@ def test_w2v(stem_tokens, tmpdir, method, binary):
     assert w2v.vectors.shape == (len(w2v.wv.vectors) + len(w2v.constants), w2v.vector_size)
     assert w2v.key_to_index("[UNK]") == 0
     assert w2v.key_to_index("OOV") == 0
+    assert np.array_equal(w2v["OOV"], np.zeros((10,)))
 
     t2v = T2V("w2v", filepath=filepath, method=method, binary=binary)
     assert len(t2v(stem_tokens[:1])[0]) == t2v.vector_size