[#41] fix tokenize method performing the tokenization twice from PreTrainedTokenizer
aisingapore · Dec 14, 2021 · 61c762a · 61c762a
1 parent 30bfe8e
commit 61c762a
Showing 1 changed file with 3 additions and 5 deletions.
diff --git a/sgnlp/models/sentic_asgcn/tokenization.py b/sgnlp/models/sentic_asgcn/tokenization.py
@@ -38,9 +38,11 @@ def get_vocab(self):
         return dict(self.vocab)
 
     def _convert_token_to_id(self, token: str) -> int:
+        print("_convert_token_to_id")
         return self.vocab.get(token, self.vocab.get(self.unk_token))
 
     def _convert_id_to_token(self, index: int) -> str:
+        print("_convert_id_to_token")
         return self.ids_to_tokens(index, self.unk_token)
 
     @staticmethod
@@ -88,11 +90,7 @@ def _tokenize(self, text, **kwargs):
         if self.do_lower_case:
             text = text.lower()
         words = text.split()
-        unknownidx = 1
-        sequence = [self.vocab[w] if w in self.vocab else unknownidx for w in words]
-        if len(sequence) == 0:
-            sequence = [0]
-        return sequence
+        return words
 
     def save_vocabulary(
         self, save_directory: str, filename_prefix: Optional[str] = None