diff --git a/CHANGE.txt b/CHANGE.txt
index cd98345e..1a47033f 100644
--- a/CHANGE.txt
+++ b/CHANGE.txt
@@ -1,3 +1,12 @@
+v0.0.7:
+  1. add BERT and pretrained model (luna_bert)
+  2. speed up the process in sif
+  3. handling OOV in word2vec
+  4. add English tutorials
+  5. add api docs and prettify tutorials
+  6. fix the np.error in gensim_vec.W2V.infer_vector
+  7. fix the parameters lost in tokenization
+
 v0.0.6:
   1. dev: add half-pretrained rnn model
   2. important!!!: rename TextTokenizer to PureTextTokenizer, and add a new tokenizer named TextTokenizer (the two have similar but not the same behaviours).
diff --git a/EduNLP/Vector/gensim_vec.py b/EduNLP/Vector/gensim_vec.py
index 3ade74a5..ece25442 100644
--- a/EduNLP/Vector/gensim_vec.py
+++ b/EduNLP/Vector/gensim_vec.py
@@ -62,7 +62,8 @@ def __call__(self, *words):
             yield self[word]
 
     def __getitem__(self, item):
-        return self.wv[item] if item not in self.constants else np.zeros((self.vector_size,))
+        index = self.key_to_index(item)
+        return self.wv[item] if index not in self.constants.values() else np.zeros((self.vector_size,))
 
     def infer_vector(self, items, agg="mean", *args, **kwargs) -> np.ndarray:
         token_vectors = self.infer_tokens(items, *args, **kwargs)
diff --git a/tests/test_vec/test_vec.py b/tests/test_vec/test_vec.py
index bc7194d9..d6ae1a44 100644
--- a/tests/test_vec/test_vec.py
+++ b/tests/test_vec/test_vec.py
@@ -86,6 +86,7 @@ def test_w2v(stem_tokens, tmpdir, method, binary):
     assert w2v.vectors.shape == (len(w2v.wv.vectors) + len(w2v.constants), w2v.vector_size)
     assert w2v.key_to_index("[UNK]") == 0
     assert w2v.key_to_index("OOV") == 0
+    assert np.array_equal(w2v["OOV"], np.zeros((10,)))
     t2v = T2V("w2v", filepath=filepath, method=method, binary=binary)
     assert len(t2v(stem_tokens[:1])[0]) == t2v.vector_size
 