Commit

Replace current default stopword list with spaCy's. (#2940)
diatkinson authored and matt-gardner committed Jun 14, 2019
1 parent 6a3d3a8 commit c9eb2d0
Showing 2 changed files with 6 additions and 27 deletions.
29 changes: 4 additions & 25 deletions allennlp/data/tokenizers/word_filter.py
@@ -2,6 +2,8 @@
 import re
 from overrides import overrides
 
+from spacy.lang.en.stop_words import STOP_WORDS
+
 from allennlp.common import Registrable
 from allennlp.data.tokenizers.token import Token
 from allennlp.common.file_utils import read_set_from_file
@@ -60,7 +62,7 @@ def filter_words(self, words: List[Token]) -> List[Token]:
 class StopwordFilter(WordFilter):
     """
     A ``StopwordFilter`` uses a list of stopwords to filter.
-    If no file is specified, a default list of stopwords is used.
+    If no file is specified, spaCy's default list of English stopwords is used.
     Words and stopwords are lowercased for comparison.
 
     Parameters
@@ -77,30 +79,7 @@ def __init__(self,
         if stopword_file is not None:
             self.stopwords = {token.lower() for token in read_set_from_file(stopword_file)}
         else:
-            self.stopwords = set([token.lower() for token in
-                                  ['I', 'a', 'aboard', 'about', 'above', 'accordance', 'according',
-                                   'across', 'after', 'against', 'along', 'alongside', 'also', 'am',
-                                   'amid', 'amidst', 'an', 'and', 'apart', 'are', 'around', 'as',
-                                   'aside', 'astride', 'at', 'atop', 'back', 'be', 'because', 'before',
-                                   'behind', 'below', 'beneath', 'beside', 'besides', 'between',
-                                   'beyond', 'but', 'by', 'concerning', 'do', 'down', 'due', 'during',
-                                   'either', 'except', 'exclusive', 'false', 'for', 'from', 'happen',
-                                   'he', 'her', 'hers', 'herself', 'him', 'himself', 'his', 'how',
-                                   'how many', 'how much', 'i', 'if', 'in', 'including', 'inside',
-                                   'instead', 'into', 'irrespective', 'is', 'it', 'its', 'itself',
-                                   'less', 'me', 'mine', 'minus', 'my', 'myself', 'neither', 'next',
-                                   'not', 'occur', 'of', 'off', 'on', 'onto', 'opposite', 'or', 'our',
-                                   'ours', 'ourselves', 'out', 'out of', 'outside', 'over', 'owing',
-                                   'per', 'prepatory', 'previous', 'prior', 'pursuant', 'regarding',
-                                   's', 'sans', 'she', 'subsequent', 'such', 'than', 'thanks', 'that',
-                                   'the', 'their', 'theirs', 'them', 'themselves', 'then', 'these',
-                                   'they', 'this', 'those', 'through', 'throughout', 'thru', 'till',
-                                   'to', 'together', 'top', 'toward', 'towards', 'true', 'under',
-                                   'underneath', 'unlike', 'until', 'up', 'upon', 'us', 'using',
-                                   'versus', 'via', 'was', 'we', 'were', 'what', 'when', 'where',
-                                   'which', 'who', 'why', 'will', 'with', 'within', 'without', 'you',
-                                   'your', 'yours', 'yourself', 'yourselves', ",", '.', ':', '!', ';',
-                                   "'", '"', '&', '$', '#', '@', '(', ')', '?']])
+            self.stopwords = STOP_WORDS
         for token in self._tokens_to_add:
             self.stopwords.add(token.lower())
 
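After this change, constructing a ``StopwordFilter`` with no ``stopword_file`` falls back to spaCy's English stopword set instead of the hand-rolled list above. A minimal sketch of the new behavior (assuming allennlp at this commit with spaCy installed; this snippet is illustrative and not part of the diff):

    from allennlp.data.tokenizers.token import Token
    from allennlp.data.tokenizers.word_filter import StopwordFilter

    # No stopword_file given, so the filter uses spaCy's STOP_WORDS.
    word_filter = StopwordFilter()
    tokens = [Token(t) for t in ["this", "sentence", "has", "some", "stopwords"]]
    print([t.text for t in word_filter.filter_words(tokens)])
    # ['sentence', 'stopwords'] -- "this", "has", and "some" are in spaCy's list

One caveat worth noting: ``self.stopwords = STOP_WORDS`` binds spaCy's module-level set directly rather than copying it, so any ``tokens_to_add`` entries are inserted into the shared spaCy set; ``set(STOP_WORDS)`` would keep instances independent.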
4 changes: 2 additions & 2 deletions allennlp/tests/data/tokenizers/word_tokenizer_test.py
@@ -30,7 +30,7 @@ def test_batch_tokenization(self):
     def test_stems_and_filters_stopwords_correctly(self):
         tokenizer = WordTokenizer.from_params(Params({'word_stemmer': {'type': 'porter'},
                                                       'word_filter': {'type': 'stopwords'}}))
-        sentence = "this (sentence) has 'crazy' \"punctuation\"."
-        expected_tokens = ["sentenc", "ha", "crazi", "punctuat"]
+        sentence = "this sentence has some stopwords, (but it doesn't have crazy \"punctuation\")."
+        expected_tokens = ["sentenc", "stopword", ",", "(", "crazi", '"', "punctuat", '"', ")", "."]
         tokens = [t.text for t in tokenizer.tokenize(sentence)]
         assert tokens == expected_tokens
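The test changes because the old default list treated punctuation marks (',', '(', '"', and so on) as stopwords, while spaCy's ``STOP_WORDS`` contains only words, so punctuation now survives filtering. A quick membership check illustrates the difference (a sketch, assuming spaCy is installed):

    from spacy.lang.en.stop_words import STOP_WORDS

    print("this" in STOP_WORDS)  # True: still filtered under the new default
    print("(" in STOP_WORDS)     # False: punctuation is no longer a "stopword"
    print('"' in STOP_WORDS)     # False: hence the quotes in expected_tokens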
