Skip to content

Commit

Permalink
[#41] fix tokenize method performing the tokenization twice from PreTrainedTokenizer
Browse files Browse the repository at this point in the history
  • Loading branch information
raymondng76 committed Dec 14, 2021
1 parent 30bfe8e commit 61c762a
Showing 1 changed file with 3 additions and 5 deletions.
8 changes: 3 additions & 5 deletions sgnlp/models/sentic_asgcn/tokenization.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,9 +38,11 @@ def get_vocab(self):
return dict(self.vocab)

def _convert_token_to_id(self, token: str) -> int:
    """Return the vocabulary id for ``token``.

    Tokens that are not present in ``self.vocab`` resolve to the id stored
    for ``self.unk_token`` (``None`` if that entry is missing as well).
    """
    # No debug printing here: this runs once per token, so writing to
    # stdout would flood the output of any caller.
    return self.vocab.get(token, self.vocab.get(self.unk_token))

def _convert_id_to_token(self, index: int) -> str:
    """Return the token string for the vocabulary id ``index``.

    Ids without an entry resolve to ``self.unk_token``.
    """
    id_lookup = self.ids_to_tokens
    # NOTE(review): the previous code always *called* ``self.ids_to_tokens``.
    # That raises TypeError when the attribute is an id -> token mapping,
    # which is how ``self.vocab`` is used in ``_convert_token_to_id``. The
    # attribute's definition is not visible from here, so the callable form
    # is kept working and the mapping form is handled too -- confirm which
    # one applies and simplify.
    if callable(id_lookup):
        return id_lookup(index, self.unk_token)
    return id_lookup.get(index, self.unk_token)

@staticmethod
Expand Down Expand Up @@ -88,11 +90,7 @@ def _tokenize(self, text, **kwargs):
if self.do_lower_case:
text = text.lower()
words = text.split()
unknownidx = 1
sequence = [self.vocab[w] if w in self.vocab else unknownidx for w in words]
if len(sequence) == 0:
sequence = [0]
return sequence
return words

def save_vocabulary(
self, save_directory: str, filename_prefix: Optional[str] = None
Expand Down

0 comments on commit 61c762a

Please sign in to comment.