Merged
2 changes: 2 additions & 0 deletions solr/CHANGES.txt
@@ -21,6 +21,8 @@ New Features

* SOLR-17780: Add support for scalar quantized dense vectors (Kevin Liang via Alessandro Benedetti)

* SOLR-17023: Use Modern NLP Models from Apache OpenNLP with Solr (Jeff Zemerick, Eric Pugh)

Improvements
---------------------

3 changes: 2 additions & 1 deletion solr/modules/analysis-extras/README.md
@@ -24,7 +24,8 @@ upon large dependencies/dictionaries.
It includes integration with ICU for multilingual support,
analyzers for Chinese and Polish, and integration with
OpenNLP for multilingual tokenization, part-of-speech tagging
lemmatization, phrase chunking, and named-entity recognition.
lemmatization, phrase chunking, and named-entity recognition,
including the ability to run models sourced from Hugging Face.

Each of the jars below relies upon including `/modules/analysis-extras/lib/solr-analysis-extras-X.Y.Z.jar`
in the `solrconfig.xml`
@@ -54,6 +54,55 @@
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
* Classifies text using an OpenNLP <code>modelFile</code>, taking the values found in any
* matching <code>source</code> field and writing the resulting category to a configured
* <code>dest</code> field.
*
* <p>See the <a
* href="https://solr.apache.org/guide/solr/latest/getting-started/tutorial-opennlp.html">Tutorial</a>
* for a step-by-step guide.
*
* <p>The <code>source</code> field(s) can be configured as either:
*
* <ul>
* <li>One or more <code>&lt;str&gt;</code>
* <li>An <code>&lt;arr&gt;</code> of <code>&lt;str&gt;</code>
* <li>A <code>&lt;lst&gt;</code> containing {@link FieldMutatingUpdateProcessor
* FieldMutatingUpdateProcessorFactory style selector arguments}
* </ul>
*
* <p>The <code>dest</code> field can be a single <code>&lt;str&gt;</code> containing the literal
* name of a destination field, or it may be a <code>&lt;lst&gt;</code> specifying a regex <code>
* pattern</code> and a <code>replacement</code> string. If the pattern + replacement option is used
* the pattern will be matched against all fields matched by the source selector, and the
* replacement string (including any capture groups specified from the pattern) will be evaluated
* using {@link Matcher#replaceAll(String)} to generate the literal name of the destination field.
*
* <p>If the resolved <code>dest</code> field already exists in the document, then the
* classification values produced from the <code>source</code> fields will be added to it.
*
* <p>In the example below:
*
* <ul>
* <li>Classification will be performed on the <code>text</code> field and added to the <code>
* text_sentiment</code> field
* </ul>
*
* <pre class="prettyprint">
* &lt;updateRequestProcessorChain name="sentimentClassifier"&gt;
* &lt;processor class="solr.processor.DocumentCategorizerUpdateProcessorFactory"&gt;
* &lt;str name="modelFile"&gt;models/sentiment/model.onnx&lt;/str&gt;
* &lt;str name="vocabFile"&gt;models/sentiment/vocab.txt&lt;/str&gt;
* &lt;str name="source"&gt;text&lt;/str&gt;
* &lt;str name="dest"&gt;text_sentiment&lt;/str&gt;
* &lt;/processor&gt;
* &lt;processor class="solr.LogUpdateProcessorFactory" /&gt;
* &lt;processor class="solr.RunUpdateProcessorFactory" /&gt;
* &lt;/updateRequestProcessorChain&gt;
* </pre>
*
* @since 10.0.0
*/
public class DocumentCategorizerUpdateProcessorFactory extends UpdateRequestProcessorFactory
implements SolrCoreAware {

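The pattern + replacement destination resolution described in the Javadoc above can be sketched as follows; the field names and regex here are hypothetical examples, not values taken from this PR:

```java
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class DestResolutionSketch {

    // Resolve a destination field name from a source field name, the way the
    // <lst name="dest"> pattern/replacement option is described: match the
    // pattern against the source field name and expand capture-group
    // references in the replacement via Matcher#replaceAll.
    public static String resolveDest(String sourceField, String pattern, String replacement) {
        Matcher matcher = Pattern.compile(pattern).matcher(sourceField);
        return matcher.replaceAll(replacement);
    }

    public static void main(String[] args) {
        // Hypothetical config: pattern "^(.*)$", replacement "$1_sentiment"
        System.out.println(resolveDest("text_en", "^(.*)$", "$1_sentiment")); // -> text_en_sentiment
    }
}
```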
@@ -69,16 +118,15 @@ public class DocumentCategorizerUpdateProcessorFactory extends UpdateRequestProc
private Path solrHome;

private SelectorParams srcInclusions = new SelectorParams();
private Collection<SelectorParams> srcExclusions = new ArrayList<>();
private final Collection<SelectorParams> srcExclusions = new ArrayList<>();

private FieldNameSelector srcSelector = null;

private String model = null;
private String vocab = null;
private String analyzerFieldType = null;

/**
* If pattern is null, this this is a literal field name. If pattern is non-null then this is a
* If pattern is null, then this is a literal field name. If pattern is non-null then this is a
* replacement string that may contain meta-characters (ie: capture group identifiers)
*
* @see #pattern
@@ -277,11 +325,10 @@ private void initSourceSelectorSyntax(NamedList<?> args) {
throw new SolrException(
SERVER_ERROR, "Init param '" + SOURCE_PARAM + "' child 'exclude' can not be null");
}
if (!(excObj instanceof NamedList)) {
if (!(excObj instanceof NamedList<?> exc)) {
throw new SolrException(
SERVER_ERROR, "Init param '" + SOURCE_PARAM + "' child 'exclude' must be <lst/>");
}
NamedList<?> exc = (NamedList<?>) excObj;
srcExclusions.add(parseSelectorParams(exc));
if (0 < exc.size()) {
throw new SolrException(
@@ -328,8 +375,7 @@ private void initSourceSelectorSyntax(NamedList<?> args) {
+ "for OpenNLPExtractNamedEntitiesUpdateProcessor for further details.");
}

if (d instanceof NamedList) {
NamedList<?> destList = (NamedList<?>) d;
if (d instanceof NamedList<?> destList) {

Object patt = destList.remove(PATTERN_PARAM);
Object replacement = destList.remove(REPLACEMENT_PARAM);
@@ -450,9 +496,7 @@ public final UpdateRequestProcessor getInstance(
getCategories(),
new AverageClassificationScoringStrategy(),
new InferenceOptions());
} catch (IOException e) {
log.warn("Attempted to initialize documentCategorizerDL", e);
} catch (OrtException e) {
} catch (IOException | OrtException e) {
log.warn("Attempted to initialize documentCategorizerDL", e);
}
}
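The hunk above folds two catch blocks with identical bodies into a single Java 7+ multi-catch; a minimal self-contained sketch of that pattern (the exception type standing in for `OrtException` is a local stub, not the real onnxruntime class):

```java
import java.io.IOException;

public class MultiCatchSketch {

    // Local stub standing in for ai.onnxruntime.OrtException
    static class OrtException extends Exception {
        OrtException(String msg) { super(msg); }
    }

    // One handler covers both checked exception types, as in the factory's init code.
    static String tryInit(boolean failWithIo) {
        try {
            if (failWithIo) throw new IOException("model file missing");
            throw new OrtException("bad ONNX session");
        } catch (IOException | OrtException e) {
            return "warned: " + e.getClass().getSimpleName();
        }
    }

    public static void main(String[] args) {
        System.out.println(tryInit(true));  // -> warned: IOException
        System.out.println(tryInit(false)); // -> warned: OrtException
    }
}
```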
@@ -490,31 +534,28 @@ public void processAdd(AddUpdateCommand cmd) throws IOException {

for (Object val : srcFieldValues) {
for (Pair<String, String> entity : classify(val)) {
SolrInputField destField = null;
SolrInputField destField;
// String classification = entity.first();
String classificationValue = entity.second();
final String resolved = resolvedDest;
if (doc.containsKey(resolved)) {
destField = doc.getField(resolved);
if (doc.containsKey(resolvedDest)) {
destField = doc.getField(resolvedDest);
} else {
SolrInputField targetField = destMap.get(resolved);
SolrInputField targetField = destMap.get(resolvedDest);
if (targetField == null) {
destField = new SolrInputField(resolved);
destField = new SolrInputField(resolvedDest);
} else {
destField = targetField;
}
}
destField.addValue(classificationValue);

// put it in map to avoid concurrent modification...
destMap.put(resolved, destField);
destMap.put(resolvedDest, destField);
}
}
}

for (Map.Entry<String, SolrInputField> entry : destMap.entrySet()) {
doc.put(entry.getKey(), entry.getValue());
}
doc.putAll(destMap);
super.processAdd(cmd);
}

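The same hunk replaces an explicit `entrySet()` loop with `Map.putAll`, which has the same effect in one call; a small sketch using plain Strings and hypothetical field values in place of `SolrInputField`:

```java
import java.util.HashMap;
import java.util.Map;

public class PutAllSketch {

    // Merge accumulated destination fields back into the document in one call,
    // equivalent to iterating entrySet() and calling put() per entry.
    public static Map<String, String> merge(Map<String, String> doc, Map<String, String> destMap) {
        doc.putAll(destMap);
        return doc;
    }

    public static void main(String[] args) {
        Map<String, String> doc = new HashMap<>();
        doc.put("text", "what a great film");

        Map<String, String> destMap = new HashMap<>();
        destMap.put("text_sentiment", "positive");

        System.out.println(merge(doc, destMap).get("text_sentiment")); // -> positive
    }
}
```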
@@ -28,6 +28,7 @@
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
@@ -74,7 +75,7 @@
* </pre>
*
* <p>See the <a href="https://opennlp.apache.org/models.html">OpenNLP website</a> for information
* on downloading pre-trained models. Note that in order to use model files larger than 1MB on
* on downloading pre-trained models. Note that in order to use model files larger than 1 MB on
* SolrCloud, <a
* href="https://solr.apache.org/guide/solr/latest/deployment-guide/zookeeper-ensemble.html#increasing-the-file-size-limit">ZooKeeper
* server and client configuration is required</a>.
@@ -186,15 +187,15 @@ public class OpenNLPExtractNamedEntitiesUpdateProcessorFactory extends UpdateReq
public static final String ENTITY_TYPE = "{EntityType}";

private SelectorParams srcInclusions = new SelectorParams();
private Collection<SelectorParams> srcExclusions = new ArrayList<>();
private final Collection<SelectorParams> srcExclusions = new ArrayList<>();

private FieldNameSelector srcSelector = null;

private String modelFile = null;
private String analyzerFieldType = null;

/**
* If pattern is null, this this is a literal field name. If pattern is non-null then this is a
* If pattern is null, then this is a literal field name. If pattern is non-null then this is a
* replacement string that may contain meta-characters (ie: capture group identifiers)
*
* @see #pattern
@@ -358,9 +359,8 @@ private void initSourceSelectorSyntax(NamedList<?> args) {
//
// source != null && dest != null

// if we got here we know we had source and dest, now check for the other two so that we can
// give a better
// message than "unexpected"
// if we got here we know we have source and dest, now check for the other two so that we can
// give a better message than "unexpected"
if (0 <= args.indexOf(PATTERN_PARAM, 0) || 0 <= args.indexOf(REPLACEMENT_PARAM, 0)) {
throw new SolrException(
SERVER_ERROR,
@@ -419,7 +419,7 @@ private void initSourceSelectorSyntax(NamedList<?> args) {
+ "' contains unexpected child param(s): "
+ selectorConfig);
}
// consume from the named list so it doesn't interfere with subsequent processing
// consume from the named list, so it doesn't interfere with subsequent processing
sources.remove(0);
}
}
@@ -537,7 +537,7 @@ public final UpdateRequestProcessor getInstance(
final FieldNameSelector srcSelector = getSourceSelector();
return new UpdateRequestProcessor(next) {
private final NLPNERTaggerOp nerTaggerOp;
private Analyzer analyzer = null;
private final Analyzer analyzer;

{
try {
@@ -590,19 +590,16 @@ public void processAdd(AddUpdateCommand cmd) throws IOException {

for (Object val : srcFieldValues) {
for (Pair<String, String> entity : extractTypedNamedEntities(val)) {
SolrInputField destField = null;
SolrInputField destField;
String entityName = entity.first();
String entityType = entity.second();
final String resolved = resolvedDest.replace(ENTITY_TYPE, entityType);
if (doc.containsKey(resolved)) {
destField = doc.getField(resolved);
} else {
SolrInputField targetField = destMap.get(resolved);
if (targetField == null) {
destField = new SolrInputField(resolved);
} else {
destField = targetField;
}
destField =
Objects.requireNonNullElseGet(targetField, () -> new SolrInputField(resolved));
}
destField.addValue(entityName);

@@ -612,9 +609,7 @@ public void processAdd(AddUpdateCommand cmd) throws IOException {
}
}

for (Map.Entry<String, SolrInputField> entry : destMap.entrySet()) {
doc.put(entry.getKey(), entry.getValue());
}
doc.putAll(destMap);
super.processAdd(cmd);
}

@@ -16,6 +16,8 @@
// specific language governing permissions and limitations
// under the License.

:onnx: https://onnx.ai/

Every update request received by Solr is run through a chain of plugins known as Update Request Processors, or _URPs_.

This can be useful, for example, to add a field to the document being indexed; to change the value of a particular field; or to drop an update if the incoming document doesn't fulfill certain criteria.
@@ -430,6 +432,10 @@ The {solr-javadocs}/modules/analysis-extras/index.html[`analysis-extras`] module
{solr-javadocs}/modules/analysis-extras/org/apache/solr/update/processor/OpenNLPExtractNamedEntitiesUpdateProcessorFactory.html[OpenNLPExtractNamedEntitiesUpdateProcessorFactory]::: Update document(s) to be indexed with named entities extracted using an OpenNLP NER model.
Note that in order to use model files larger than 1MB on SolrCloud, you must xref:deployment-guide:zookeeper-ensemble#increasing-the-file-size-limit[configure both ZooKeeper server and clients].

{solr-javadocs}/modules/analysis-extras/org/apache/solr/update/processor/DocumentCategorizerUpdateProcessorFactory.html[DocumentCategorizerUpdateProcessorFactory]::: Classify the text in fields using a document categorization model. The model must be in {onnx}[ONNX] format; models sourced from Hugging Face can be run directly in Solr via OpenNLP.
Learn more by following the xref:getting-started:tutorial-opennlp.adoc[sentiment analysis tutorial with OpenNLP and ONNX models].


=== Update Processor Factories You Should _Not_ Modify or Remove

These are listed for completeness, but are part of the Solr infrastructure, particularly SolrCloud.
@@ -33,6 +33,7 @@
** xref:tutorial-paramsets.adoc[]
** xref:tutorial-vectors.adoc[]
** xref:tutorial-solrcloud.adoc[]
** xref:tutorial-opennlp.adoc[]
** xref:tutorial-aws.adoc[]

* xref:solr-admin-ui.adoc[]
@@ -29,7 +29,7 @@ The xref:tutorial-films.adoc[second exercise] works with a different set of data
The xref:tutorial-diy.adoc[third exercise] encourages you to begin to work with your own data and start a plan for your implementation.

The tutorial also includes other, more advanced, exercises that introduce you to xref:tutorial-paramsets.adoc[ParamSets],
xref:tutorial-vectors.adoc[vector search], xref:tutorial-solrcloud.adoc[SolrCloud], and xref:tutorial-aws.adoc[deploying Solr to AWS].
xref:tutorial-vectors.adoc[vector search], xref:tutorial-opennlp.adoc[sentiment analysis with OpenNLP], xref:tutorial-solrcloud.adoc[SolrCloud], and xref:tutorial-aws.adoc[deploying Solr to AWS].

Finally, we'll introduce <<Spatial Queries,spatial search>>, and show you how to get your Solr instance back into a clean state.
