From 3eff02b12ccc450312609234219e1dee1c69656e Mon Sep 17 00:00:00 2001 From: ohtwadi Date: Wed, 13 Jun 2018 15:11:07 -0600 Subject: [PATCH] Add an example explaining how to use --- .../processor/URLClassifyProcessor.java | 54 +++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/solr/core/src/java/org/apache/solr/update/processor/URLClassifyProcessor.java b/solr/core/src/java/org/apache/solr/update/processor/URLClassifyProcessor.java index 0844b6023fca..82f1946c44d0 100644 --- a/solr/core/src/java/org/apache/solr/update/processor/URLClassifyProcessor.java +++ b/solr/core/src/java/org/apache/solr/update/processor/URLClassifyProcessor.java @@ -33,14 +33,68 @@ import org.slf4j.LoggerFactory; /** + *

* Update processor which examines a URL and outputs to various other fields * characteristics of that URL, including length, number of path levels, whether * it is a top level URL (levels==0), whether it looks like a landing/index page, * a canonical representation of the URL (e.g. stripping index.html), the domain * and path parts of the URL etc. + *

+ * *

* This processor is intended to be used in connection with processing web resources, * and helping to produce values which may be used for boosting or filtering later. + *

+ * + *

+ * In the example configuration below, we construct a custom + * updateRequestProcessorChain and then instruct the + * /update request handler to use it for every incoming document. + *

+ *

+ * <updateRequestProcessorChain name="urlProcessor">
+ *   <processor class="org.apache.solr.update.processor.URLClassifyProcessorFactory">
+ *     <bool name="enabled">true</bool>
+ *     <str name="inputField">id</str>
+ *     <str name="domainOutputField">hostname</str>
+ *   </processor>
+ *   <processor class="solr.RunUpdateProcessorFactory" />
+ * </updateRequestProcessorChain>
+ * 
+ * <requestHandler name="/update" class="solr.UpdateRequestHandler">
+ *   <lst name="defaults">
+ *     <str name="update.chain">urlProcessor</str>
+ *   </lst>
+ * </requestHandler>
+ *
+ *

+ * Then, at index time, Solr will look at the id field value and extract + * its domain portion into a new hostname field. By default, the + * following fields will also be added + *

+ *

+ *

+ * For example, adding the following document + *

+ * { "id":"http://www.mydomain.com/subpath/document.html" }
+ * 
+ * will produce the following document in Solr + *
  */
 public class URLClassifyProcessor extends UpdateRequestProcessor {