apache · rzo1 · May 19, 2026 · May 21, 2026 · May 21, 2026
diff --git a/LICENSE b/LICENSE
@@ -230,6 +230,41 @@ The following license applies to the Snowball stemmers:
 	OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 	OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+The following license applies to the bundled stopword lists in
+opennlp-core/opennlp-runtime/src/main/resources/opennlp/tools/stopword.
+These lists are derived from Apache Lucene, which redistributes them from
+the Snowball project; the Bulgarian list (bg.txt) was created by Jacques
+Savoy (http://members.unine.ch/jacques.savoy/clef/index.html). They are
+distributed under the BSD license:
+
+	Copyright (c) 2001, Dr Martin Porter
+	Copyright (c) 2002, Richard Boulton
+	Copyright (c) Jacques Savoy
+	All rights reserved.
+
+	Redistribution and use in source and binary forms, with or without
+	modification, are permitted provided that the following conditions are met:
+
+	    * Redistributions of source code must retain the above copyright notice,
+	      this list of conditions and the following disclaimer.
+	    * Redistributions in binary form must reproduce the above copyright
+	      notice, this list of conditions and the following disclaimer in the
+	      documentation and/or other materials provided with the distribution.
+	    * Neither the name of the copyright holders nor the names of its contributors
+	      may be used to endorse or promote products derived from this software
+	      without specific prior written permission.
+
+	THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+	AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+	IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+	DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+	FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+	DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+	SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+	CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+	OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+	OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
 The following license applies to the Wordpiece tokenizer implementation:
 
     The MIT License (MIT)

diff --git a/NOTICE b/NOTICE
@@ -14,6 +14,19 @@ http://snowball.tartarus.org/
 
 ============================================================================
 
+The bundled stopword lists in
+opennlp-core/opennlp-runtime/src/main/resources/opennlp/tools/stopword
+are derived from Apache Lucene
+(https://github.com/apache/lucene/tree/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis),
+which in turn redistributes them from the Snowball project under the BSD license
+(https://snowballstem.org/license.html). The Bulgarian list (bg.txt) is the
+Lucene per-language Bulgarian stopwords file originally created by Jacques
+Savoy (http://members.unine.ch/jacques.savoy/clef/index.html) and also
+distributed under the BSD license. The original upstream license and
+attribution headers are preserved verbatim at the top of each bundled file.
+
+============================================================================
+
 The Wordpiece tokenizer in opennlp-tools/main/java/opennlp/tools/tokenize
 is taken from https://github.com/robrua/easy-bert licensed under
 

diff --git a/README.md b/README.md
@@ -30,7 +30,7 @@ The Apache OpenNLP library is a machine learning based toolkit for the processin
 
 This toolkit is written completely in Java and provides support for common NLP tasks, such as tokenization,
  sentence segmentation, part-of-speech tagging, named entity extraction, chunking, parsing,
-  coreference resolution, language detection and more!
+  coreference resolution, language detection, stopword filtering (with bundled lists for 11 languages) and more!
 
 These tasks are usually required to build more advanced text processing services.
 

diff --git a/opennlp-api/src/main/java/opennlp/tools/stopword/StopwordFilter.java b/opennlp-api/src/main/java/opennlp/tools/stopword/StopwordFilter.java
@@ -0,0 +1,94 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.stopword;
+
+import java.util.Set;
+
+/**
+ * A pluggable filter that decides whether a token (or a sequence of tokens)
+ * is a stopword that should be removed during downstream text processing.
+ * <p>
+ * Implementations may be backed by a static bundled list, a user-supplied
+ * file, an in-memory data structure, or any other source.
+ * Both single-token and multi-token (n-gram) membership tests are supported.
+ * <p>
+ * Because {@link #isStopword(CharSequence)} and {@link #isStopword(String...)}
+ * are overloads, a bare {@code null} literal argument is ambiguous at compile
+ * time; callers that want to pass {@code null} must cast it, e.g.
+ * {@code isStopword((CharSequence) null)}.
+ *
+ * @see opennlp.tools.util.LanguageCodeValidator
+ */
+public interface StopwordFilter {
+
+  /**
+   * Checks whether the given token is a single-token stopword.
+   * Equivalent to {@code isStopword(new String[] { token.toString() })} when
+   * {@code token} is non-{@code null}.
+   *
+   * @param token The token to test. May be {@code null}, in which case
+   *     implementations should return {@code false}.
+   * @return {@code true} if {@code token} is registered as a single-token
+   *     stopword, {@code false} otherwise.
+   */
+  boolean isStopword(CharSequence token);
+
+  /**
+   * Checks whether the given sequence of tokens is a multi-token stopword
+   * (n-gram). For a single token this is equivalent to
+   * {@link #isStopword(CharSequence)}.
+   *
+   * @param tokens The tokens to test as one entry. May be {@code null} or
+   *     empty, in which case implementations should return {@code false}.
+   * @return {@code true} if the sequence is registered as a stopword,
+   *     {@code false} otherwise.
+   */
+  boolean isStopword(String... tokens);
+
+  /**
+   * Returns a copy of {@code tokens} with stopword matches removed,
+   * preserving the input order.
+   * <p>
+   * Implementations should honor both 1-gram and n-gram entries. A
+   * recommended strategy is a greedy left-to-right window scan: at each
+   * position try the longest registered window first; if it matches, skip
+   * those tokens; otherwise advance by one and keep the current token.
+   * Implementations that do not support n-gram entries may fall back to
+   * 1-gram filtering.
+   *
+   * @param tokens The input token array. Must not be {@code null}.
+   *     Individual array elements may be {@code null} and are kept as-is.
+   * @return A new array containing the surviving tokens. Never {@code null}.
+   * @throws IllegalArgumentException if {@code tokens} is {@code null}.
+   */
+  String[] filter(String[] tokens);
+
+  /**
+   * Reports how this filter compares tokens against its entries.
+   *
+   * @return {@code true} if this filter performs case-sensitive matching;
+   *     {@code false} if matching is case-insensitive.
+   */
+  boolean isCaseSensitive();
+
+  /**
+   * Returns an unmodifiable snapshot of the registered single-token
+   * stopwords. Multi-token (n-gram) entries are not included in this view
+   * and must be tested via {@link #isStopword(String...)}.
+   * <p>
+   * This method itself does not throw. The returned {@link Set} rejects
+   * mutation: calling any of its mutator methods (such as {@code add} or
+   * {@code remove}) results in an {@link UnsupportedOperationException}.
+   *
+   * @return An unmodifiable {@link Set} of stopwords. Never {@code null}.
+   */
+  Set<String> stopwords();
+}
diff --git a/opennlp-core/opennlp-cli/src/main/java/opennlp/tools/cmdline/CLI.java b/opennlp-core/opennlp-cli/src/main/java/opennlp/tools/cmdline/CLI.java
@@ -77,6 +77,7 @@
 import opennlp.tools.cmdline.sentiment.SentimentCrossValidatorTool;
 import opennlp.tools.cmdline.sentiment.SentimentEvaluatorTool;
 import opennlp.tools.cmdline.sentiment.SentimentTrainerTool;
+import opennlp.tools.cmdline.stopword.StopwordFilterTool;
 import opennlp.tools.cmdline.tokenizer.DictionaryDetokenizerTool;
 import opennlp.tools.cmdline.tokenizer.SimpleTokenizerTool;
 import opennlp.tools.cmdline.tokenizer.TokenizerConverterTool;
@@ -130,6 +131,9 @@ public final class CLI {
     tools.add(new TokenizerConverterTool());
     tools.add(new DictionaryDetokenizerTool());
 
+    // Stopword filter
+    tools.add(new StopwordFilterTool());
+
     // Sentence detector
     tools.add(new SentenceDetectorTool());
     tools.add(new SentenceDetectorTrainerTool());

diff --git a/...nlp-core/opennlp-cli/src/main/java/opennlp/tools/cmdline/stopword/StopwordFilterTool.java b/...nlp-core/opennlp-cli/src/main/java/opennlp/tools/cmdline/stopword/StopwordFilterTool.java
@@ -0,0 +1,148 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.cmdline.stopword;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.PrintWriter;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.InvalidPathException;
+import java.nio.file.NoSuchFileException;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+
+import opennlp.tools.cmdline.BasicCmdLineTool;
+import opennlp.tools.cmdline.CLI;
+import opennlp.tools.cmdline.TerminateToolException;
+import opennlp.tools.stopword.StopwordFilter;
+import opennlp.tools.stopword.StopwordLists;
+
+/**
+ * A command line tool that filters stop words from whitespace-separated
+ * tokens read on standard input and prints the kept tokens to standard
+ * output, one input line per output line.
+ *
+ * <p>Usage: {@code opennlp StopwordFilter <lang|file>}. The single argument is
+ * either an ISO 639 language code matching one of the bundled lists, or a path
+ * to a custom stopword list file (one entry per line, {@code #} comments and
+ * blank lines ignored, loaded case-insensitively). The tokens to filter are
+ * always read from standard input. A bundled language code takes precedence;
+ * to force loading a file whose name happens to be a language code, qualify it
+ * with a path (e.g. {@code ./en}).
+ *
+ * <p>Leading whitespace on an input line does not produce an empty token, and
+ * the kept tokens of a line are written separated by single spaces. Standard
+ * input and standard output are flushed but never closed by this tool.
+ */
+public final class StopwordFilterTool extends BasicCmdLineTool {
+
+  /** Token separator; compiled once because it is applied to every input line. */
+  private static final java.util.regex.Pattern WHITESPACE =
+      java.util.regex.Pattern.compile("\\s+");
+
+  @Override
+  public String getShortDescription() {
+    return "filters stop words from tokens read on stdin";
+  }
+
+  @Override
+  public String getHelp() {
+    return "Usage: " + CLI.CMD + " " + getName() + " <lang|file>\n"
+        + "  <lang> ISO 639 code of a bundled list; supported: "
+        + StopwordLists.supportedLanguages() + "\n"
+        + "  <file> path to a custom stopword list (one entry per line; "
+        + "'#' comments and blank lines ignored)";
+  }
+
+  @Override
+  public boolean hasParams() {
+    return true;
+  }
+
+  /**
+   * Filters every line read from standard input and writes the surviving
+   * tokens to standard output. Prints the help text and returns when the
+   * argument count is not exactly one.
+   *
+   * @param args The command line arguments; expected to hold the single
+   *     {@code <lang|file>} argument.
+   * @throws TerminateToolException if the argument resolves to neither a
+   *     bundled list nor a readable file, or if reading standard input fails.
+   */
+  @Override
+  public void run(final String[] args) {
+    if (args.length != 1) {
+      System.out.println(getHelp());
+      return;
+    }
+
+    final StopwordFilter filter = resolveFilter(args[0]);
+
+    // Deliberately NOT managed by try-with-resources: closing these wrappers
+    // would close the process-wide System.in / System.out and break any code
+    // that runs after this tool in the same JVM.
+    final BufferedReader reader = new BufferedReader(
+        new InputStreamReader(System.in, StandardCharsets.UTF_8));
+    final PrintWriter writer = new PrintWriter(
+        new java.io.OutputStreamWriter(System.out, StandardCharsets.UTF_8));
+
+    try {
+      String line;
+      while ((line = reader.readLine()) != null) {
+        writer.println(filterLine(filter, line));
+      }
+    } catch (final IOException e) {
+      throw new TerminateToolException(1, "Error reading from stdin: " + e.getMessage(), e);
+    } finally {
+      // Push out whatever was produced so far, even when reading failed.
+      writer.flush();
+    }
+  }
+
+  /**
+   * Applies {@code filter} to one input line.
+   *
+   * @return The kept tokens joined by single spaces, or an empty string if the
+   *     line holds no tokens (empty or whitespace-only).
+   */
+  private static String filterLine(final StopwordFilter filter, final String line) {
+    final String[] tokens = tokenize(line);
+    if (tokens.length == 0) {
+      return "";
+    }
+    return String.join(" ", filter.filter(tokens));
+  }
+
+  /**
+   * Splits {@code line} on runs of whitespace without producing empty tokens.
+   * The split yields an empty first element when the line is empty or starts
+   * with whitespace; that element is not a token and is dropped here.
+   */
+  private static String[] tokenize(final String line) {
+    final String[] parts = WHITESPACE.split(line);
+    if (parts.length == 0 || !parts[0].isEmpty()) {
+      return parts;
+    }
+    final String[] tokens = new String[parts.length - 1];
+    System.arraycopy(parts, 1, tokens, 0, tokens.length);
+    return tokens;
+  }
+
+  /**
+   * Resolves the {@code <lang|file>} argument to a {@link StopwordFilter}. A
+   * bundled language code is preferred; otherwise the argument is treated as a
+   * path to a custom stopword list file loaded via
+   * {@link StopwordLists#load(InputStream, java.nio.charset.Charset, boolean)}.
+   *
+   * @throws TerminateToolException if {@code source} is neither a bundled
+   *     language code nor a readable file; the underlying exception is
+   *     attached as the cause.
+   */
+  private static StopwordFilter resolveFilter(final String source) {
+    final StopwordFilter bundled = tryBundled(source);
+    if (bundled != null) {
+      return bundled;
+    }
+
+    final Path path;
+    try {
+      path = Paths.get(source);
+    } catch (final InvalidPathException e) {
+      throw new TerminateToolException(1, neitherMessage(source), e);
+    }
+
+    try (InputStream in = Files.newInputStream(path)) {
+      return StopwordLists.load(in, StandardCharsets.UTF_8, false);
+    } catch (final NoSuchFileException e) {
+      throw new TerminateToolException(1, neitherMessage(source), e);
+    } catch (final IOException e) {
+      throw new TerminateToolException(1,
+          "Error reading stopword list file '" + source + "': " + e.getMessage(), e);
+    }
+  }
+
+  /**
+   * @return A bundled {@link StopwordFilter} for {@code code}, or {@code null}
+   *     if {@code code} is not a supported bundled ISO 639 language code.
+   */
+  private static StopwordFilter tryBundled(final String code) {
+    try {
+      return StopwordLists.forLanguage(code);
+    } catch (final IllegalArgumentException ignored) {
+      // Not a bundled language code; the caller falls back to treating the
+      // argument as a file path, so this is an expected outcome, not an error.
+      return null;
+    }
+  }
+
+  /** Builds the error text used when the argument is neither a language code nor a file. */
+  private static String neitherMessage(final String source) {
+    return "'" + source + "' is neither a supported language code "
+        + StopwordLists.supportedLanguages() + " nor an existing file.";
+  }
+}