Commit

Replace current default stopword list with spaCy's. (#2940)
diatkinson authored and matt-gardner committed Jun 14, 2019
1 parent 6a3d3a8 commit c9eb2d0
Showing 2 changed files with 6 additions and 27 deletions.
29 changes: 4 additions & 25 deletions allennlp/data/tokenizers/word_filter.py
@@ -2,6 +2,8 @@
 import re
 from overrides import overrides
 
+from spacy.lang.en.stop_words import STOP_WORDS
+
 from allennlp.common import Registrable
 from allennlp.data.tokenizers.token import Token
 from allennlp.common.file_utils import read_set_from_file
@@ -60,7 +62,7 @@ def filter_words(self, words: List[Token]) -> List[Token]:
 class StopwordFilter(WordFilter):
     """
     A ``StopwordFilter`` uses a list of stopwords to filter.
-    If no file is specified, a default list of stopwords is used.
+    If no file is specified, spaCy's default list of English stopwords is used.
     Words and stopwords are lowercased for comparison.
 
     Parameters
@@ -77,30 +79,7 @@ def __init__(self,
         if stopword_file is not None:
             self.stopwords = {token.lower() for token in read_set_from_file(stopword_file)}
         else:
-            self.stopwords = set([token.lower() for token in
-                                  ['I', 'a', 'aboard', 'about', 'above', 'accordance', 'according',
-                                   'across', 'after', 'against', 'along', 'alongside', 'also', 'am',
-                                   'amid', 'amidst', 'an', 'and', 'apart', 'are', 'around', 'as',
-                                   'aside', 'astride', 'at', 'atop', 'back', 'be', 'because', 'before',
-                                   'behind', 'below', 'beneath', 'beside', 'besides', 'between',
-                                   'beyond', 'but', 'by', 'concerning', 'do', 'down', 'due', 'during',
-                                   'either', 'except', 'exclusive', 'false', 'for', 'from', 'happen',
-                                   'he', 'her', 'hers', 'herself', 'him', 'himself', 'his', 'how',
-                                   'how many', 'how much', 'i', 'if', 'in', 'including', 'inside',
-                                   'instead', 'into', 'irrespective', 'is', 'it', 'its', 'itself',
-                                   'less', 'me', 'mine', 'minus', 'my', 'myself', 'neither', 'next',
-                                   'not', 'occur', 'of', 'off', 'on', 'onto', 'opposite', 'or', 'our',
-                                   'ours', 'ourselves', 'out', 'out of', 'outside', 'over', 'owing',
-                                   'per', 'prepatory', 'previous', 'prior', 'pursuant', 'regarding',
-                                   's', 'sans', 'she', 'subsequent', 'such', 'than', 'thanks', 'that',
-                                   'the', 'their', 'theirs', 'them', 'themselves', 'then', 'these',
-                                   'they', 'this', 'those', 'through', 'throughout', 'thru', 'till',
-                                   'to', 'together', 'top', 'toward', 'towards', 'true', 'under',
-                                   'underneath', 'unlike', 'until', 'up', 'upon', 'us', 'using',
-                                   'versus', 'via', 'was', 'we', 'were', 'what', 'when', 'where',
-                                   'which', 'who', 'why', 'will', 'with', 'within', 'without', 'you',
-                                   'your', 'yours', 'yourself', 'yourselves', ",", '.', ':', '!', ';',
-                                   "'", '"', '&', '$', '#', '@', '(', ')', '?']])
+            self.stopwords = STOP_WORDS
         for token in self._tokens_to_add:
             self.stopwords.add(token.lower())
 
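After this change, constructing a ``StopwordFilter`` with no ``stopword_file`` falls back to spaCy's English stopword set instead of the hand-rolled list above. A minimal sketch of the new behavior (assuming allennlp at this commit with spaCy installed; this snippet is illustrative and not part of the diff):

    from allennlp.data.tokenizers.token import Token
    from allennlp.data.tokenizers.word_filter import StopwordFilter

    # No stopword_file given, so the filter uses spaCy's STOP_WORDS.
    word_filter = StopwordFilter()
    tokens = [Token(t) for t in ["this", "sentence", "has", "some", "stopwords"]]
    print([t.text for t in word_filter.filter_words(tokens)])
    # ['sentence', 'stopwords'] -- "this", "has", and "some" are in spaCy's list

One caveat worth noting: ``self.stopwords = STOP_WORDS`` binds spaCy's module-level set directly rather than copying it, so any ``tokens_to_add`` entries are inserted into the shared spaCy set; ``set(STOP_WORDS)`` would keep instances independent.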
4 changes: 2 additions & 2 deletions allennlp/tests/data/tokenizers/word_tokenizer_test.py
@@ -30,7 +30,7 @@ def test_batch_tokenization(self):
     def test_stems_and_filters_stopwords_correctly(self):
         tokenizer = WordTokenizer.from_params(Params({'word_stemmer': {'type': 'porter'},
                                                       'word_filter': {'type': 'stopwords'}}))
-        sentence = "this (sentence) has 'crazy' \"punctuation\"."
-        expected_tokens = ["sentenc", "ha", "crazi", "punctuat"]
+        sentence = "this sentence has some stopwords, (but it doesn't have crazy \"punctuation\")."
+        expected_tokens = ["sentenc", "stopword", ",", "(", "crazi", '"', "punctuat", '"', ")", "."]
         tokens = [t.text for t in tokenizer.tokenize(sentence)]
         assert tokens == expected_tokens
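The test changes because the old default list treated punctuation marks (',', '(', '"', and so on) as stopwords, while spaCy's ``STOP_WORDS`` contains only words, so punctuation now survives filtering. A quick membership check illustrates the difference (a sketch, assuming spaCy is installed):

    from spacy.lang.en.stop_words import STOP_WORDS

    print("this" in STOP_WORDS)  # True: still filtered under the new default
    print("(" in STOP_WORDS)     # False: punctuation is no longer a "stopword"
    print('"' in STOP_WORDS)     # False: hence the quotes in expected_tokens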
