Skip to content
Branch: master
Find file Copy path
Find file Copy path
3 contributors

Users who have contributed to this file

@matt-gardner @joelgrus @DeNeutoy
77 lines (67 sloc) 3.71 KB
from typing import List
from overrides import overrides
from import Token
from import Tokenizer
from import WordFilter, PassThroughWordFilter
from import WordSplitter, SpacyWordSplitter
from import WordStemmer, PassThroughWordStemmer
class WordTokenizer(Tokenizer):
    """
    A ``WordTokenizer`` handles the splitting of strings into words as well as any desired
    post-processing (e.g., stemming, filtering, etc.).  Note that we leave one particular piece of
    post-processing for later: the decision of whether or not to lowercase the token.  This is for
    two reasons: (1) if you want to make two different casing decisions for whatever reason, you
    won't have to run the tokenizer twice, and more importantly (2) if you want to lowercase words
    for your word embedding, but retain capitalization in a character-level representation, we need
    to retain the capitalization here.

    Parameters
    ----------
    word_splitter : ``WordSplitter``, optional
        The :class:`WordSplitter` to use for splitting text strings into word tokens.  The default
        is to use the ``SpacyWordSplitter`` with default parameters.
    word_filter : ``WordFilter``, optional
        The :class:`WordFilter` to use for, e.g., removing stopwords.  Default is to do no
        filtering.
    word_stemmer : ``WordStemmer``, optional
        The :class:`WordStemmer` to use.  Default is no stemming.
    start_tokens : ``List[str]``, optional
        If given, these tokens will be added to the beginning of every string we tokenize, in the
        order given.
    end_tokens : ``List[str]``, optional
        If given, these tokens will be added to the end of every string we tokenize, in the order
        given.
    """
    def __init__(self,
                 word_splitter: WordSplitter = None,
                 word_filter: WordFilter = PassThroughWordFilter(),
                 word_stemmer: WordStemmer = PassThroughWordStemmer(),
                 start_tokens: List[str] = None,
                 end_tokens: List[str] = None) -> None:
        self._word_splitter = word_splitter or SpacyWordSplitter()
        self._word_filter = word_filter
        self._word_stemmer = word_stemmer
        # We store the start tokens reversed because `_filter_and_stem` prepends them one at a
        # time with `insert(0)`; reversing here makes them show up in the order the caller gave.
        # A reversed *copy* is taken so the caller's list is never mutated.
        self._start_tokens = list(reversed(start_tokens or []))
        # Copied for the same reason: later changes to the caller's list must not leak in.
        self._end_tokens = list(end_tokens or [])

    def tokenize(self, text: str) -> List[Token]:
        """
        Does whatever processing is required to convert a string of text into a sequence of tokens.

        At a minimum, this uses a ``WordSplitter`` to split text into words.  It may also do
        stemming or stopword removal, depending on the parameters given to the constructor.
        """
        words = self._word_splitter.split_words(text)
        return self._filter_and_stem(words)

    def batch_tokenize(self, texts: List[str]) -> List[List[Token]]:
        """
        Tokenizes several strings at once.

        The strings are handed to the word splitter's ``batch_split_words`` in a single call, and
        each resulting word list is then filtered, stemmed, and wrapped with the start / end
        tokens exactly as in :func:`tokenize`.
        """
        batched_words = self._word_splitter.batch_split_words(texts)
        return [self._filter_and_stem(words) for words in batched_words]

    def _filter_and_stem(self, words: List[Token]) -> List[Token]:
        """
        Applies the word filter, then the word stemmer, then adds the configured start and end
        tokens around the result.

        Start tokens are built as ``Token(text, 0)`` and end tokens as ``Token(text, -1)``; the
        second argument is presumably the token's character offset (TODO confirm against
        ``Token``).
        """
        filtered_words = self._word_filter.filter_words(words)
        stemmed_words = [self._word_stemmer.stem_word(word) for word in filtered_words]
        # `self._start_tokens` is stored reversed, so repeated `insert(0)` restores caller order.
        for start_token in self._start_tokens:
            stemmed_words.insert(0, Token(start_token, 0))
        for end_token in self._end_tokens:
            stemmed_words.append(Token(end_token, -1))
        return stemmed_words
You can’t perform that action at this time.