GH-44: Add support for ephemeral (per-request) word and label filters.
dweiss committed Dec 1, 2020
1 parent 04b64ec commit ecd161c
Showing 73 changed files with 1,146 additions and 516 deletions.
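In short: the clustering algorithms gain a dictionaries attribute that accepts ephemeral (per-request) exclusion dictionaries, compiled only for the duration of a single cluster() call. A minimal sketch of the new usage, assembled from the useEphemeralDictionaries example added below (imports and the ExamplesData helper as in that example file):

// Load default English components, restricted to what Lingo requires.
LanguageComponents english =
    LanguageComponents.loader()
        .limitToLanguages("English")
        .limitToAlgorithms(new LingoClusteringAlgorithm())
        .load()
        .language("English");

LingoClusteringAlgorithm algorithm = new LingoClusteringAlgorithm();

// Attach a per-request (ephemeral) label exclusion dictionary;
// it is compiled for this clustering call only.
DefaultDictionaryImpl labelFilter = new DefaultDictionaryImpl();
labelFilter.regexp.set("(?i).*mining.*");
algorithm.dictionaries.labelFilters.set(List.of(labelFilter));

List<Cluster<Document>> clusters =
    algorithm.cluster(ExamplesData.documentStream(), english);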
@@ -11,6 +11,7 @@
package org.carrot2.examples;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
@@ -19,6 +20,7 @@
import java.util.Map;
import java.util.ServiceLoader;
import java.util.Set;
import java.util.TreeMap;
import java.util.function.Supplier;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.icu.segmentation.DefaultICUTokenizerConfig;
@@ -27,10 +29,12 @@
import org.carrot2.clustering.Cluster;
import org.carrot2.clustering.Document;
import org.carrot2.clustering.lingo.LingoClusteringAlgorithm;
import org.carrot2.language.DefaultDictionaryImpl;
import org.carrot2.language.LabelFilter;
import org.carrot2.language.LanguageComponents;
import org.carrot2.language.LanguageComponentsProvider;
import org.carrot2.language.LexicalData;
import org.carrot2.language.Stemmer;
import org.carrot2.language.StopwordFilter;
import org.carrot2.language.Tokenizer;
import org.carrot2.language.extras.LuceneAnalyzerTokenizerAdapter;
import org.carrot2.text.preprocessing.LabelFormatter;
@@ -60,16 +64,25 @@ public void listAllAvailableComponents() throws IOException {
// fragment-start{component-enumeration}
ServiceLoader<LanguageComponentsProvider> providers =
ServiceLoader.load(LanguageComponentsProvider.class);
for (LanguageComponentsProvider prov : providers) {
System.out.println("Provider class: " + prov.name());

for (String language : prov.languages()) {
System.out.println(" > " + language);
for (Class<?> componentClass : prov.componentTypes()) {
System.out.println(" Component: " + componentClass.getName());
}
Map<String, List<LanguageComponentsProvider>> langToProviders = new TreeMap<>();
for (LanguageComponentsProvider prov : providers) {
for (String lang : prov.languages()) {
langToProviders.computeIfAbsent(lang, (k) -> new ArrayList<>()).add(prov);
}
}

langToProviders.forEach(
(language, provList) -> {
System.out.println(" > " + language);
provList.forEach(
provider -> {
System.out.println(" [Provider: " + provider.name() + "]");
for (Class<?> componentClass : provider.componentTypes()) {
System.out.println(" Component: " + componentClass.getName());
}
});
});
// fragment-end{component-enumeration}
}

@@ -96,6 +109,39 @@ public void tweakDefaultEnglishResources() throws IOException {
ExamplesCommon.printClusters(clusters);
}

@Test
public void useEphemeralDictionaries() throws IOException {
// It is often the case that clustering should be run against
// temporary, ephemeral lexical data. In this example we will supply such resources
// directly to the algorithm. Please note that there is a non-zero cost to compile
// ephemeral dictionaries for each clustering call. If these
// resources remain static, the LanguageComponents object should be overridden or modified
// instead.

// fragment-start{use-ephemeral-dictionary}
// Load the default dictionaries for English.
LanguageComponents english =
LanguageComponents.loader()
.limitToLanguages("English")
.limitToAlgorithms(new LingoClusteringAlgorithm())
.load()
.language("English");

LingoClusteringAlgorithm algorithm = new LingoClusteringAlgorithm();

// Create an ephemeral label filter by providing a dictionary with a
// few regexp exclusion patterns.
DefaultDictionaryImpl labelFilter = new DefaultDictionaryImpl();
labelFilter.regexp.set("(?i).*data.*", "(?i).*mining.*");
algorithm.dictionaries.labelFilters.set(List.of(labelFilter));
// fragment-end{use-ephemeral-dictionary}

algorithm.desiredClusterCount.set(10);
List<Cluster<Document>> clusters = algorithm.cluster(ExamplesData.documentStream(), english);
System.out.println("Clusters:");
ExamplesCommon.printClusters(clusters);
}
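
  // The commit title also covers ephemeral *word* filters. A complementary sketch,
  // assuming EphemeralDictionaries exposes a wordFilters attribute analogous to the
  // labelFilters attribute used above (the word-filter attribute is not shown in
  // this excerpt):
  //
  //   DefaultDictionaryImpl wordFilter = new DefaultDictionaryImpl();
  //   wordFilter.regexp.set("(?i)^mining$");
  //   algorithm.dictionaries.wordFilters.set(List.of(wordFilter));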

@Test
public void overrideDefaultComponents() throws IOException {
// There are language-specific components required for clustering and each algorithm may
@@ -106,30 +152,29 @@ public void overrideDefaultComponents() throws IOException {
// requirements. Here, we modify the stemmer and lexical data for the default English
// component set, leaving any other components as they were originally defined for English.

// We override the suppliers of Stemmer and LexicalData interfaces. These suppliers must be
// thread-safe, but the instances of corresponding components will not be reused across threads.
// We override the suppliers of stemming, stop word filtering and label filtering interfaces.
// These suppliers must be thread-safe, but the instances of corresponding components will not
// be reused across threads.

// fragment-start{custom-stemmer}
Supplier<Stemmer> stemmerSupplier;
stemmerSupplier = () -> (word) -> word.toString().toLowerCase(Locale.ROOT);
// fragment-end{custom-stemmer}

// fragment-start{custom-lexical-data}
// Ignore words from the list and anything shorter than 4 characters.
final Set<String> ignored = new HashSet<>(Arrays.asList("from", "what"));
Supplier<LexicalData> lexicalDataSupplier =
() ->
new LexicalData() {
@Override
public boolean ignoreLabel(CharSequence candidate) {
// Ignore any label that has a substring 'data' in it.
return candidate.toString().toLowerCase(Locale.ROOT).contains("data");
}

@Override
public boolean ignoreWord(CharSequence word) {
return word.length() < 4 || ignored.contains(word.toString());
}
};
final StopwordFilter wordFilter =
(word) -> {
// Ignore any word shorter than 4 characters or on the explicit exclusion list.
return word.length() < 4 || ignored.contains(word.toString());
};

final LabelFilter labelFilter =
(label) -> {
// Ignore any label that has a substring 'data' in it.
return label.toString().toLowerCase(Locale.ROOT).contains("data");
};
// fragment-end{custom-lexical-data}

// fragment-start{custom-overrides}
@@ -138,7 +183,10 @@ public boolean ignoreWord(CharSequence word) {
.load()
.language("English")
.override(Stemmer.class, stemmerSupplier)
.override(LexicalData.class, lexicalDataSupplier);
// Word and label filters are thread-safe here so we
// supply the same instance all the time.
.override(StopwordFilter.class, () -> wordFilter)
.override(LabelFilter.class, () -> labelFilter);
// fragment-end{custom-overrides}

LingoClusteringAlgorithm algorithm = new LingoClusteringAlgorithm();
@@ -160,23 +208,19 @@ public void customLanguagePipeline() throws IOException {
Stemmer.class,
(Supplier<Stemmer>) () -> ((word) -> word.toString().toLowerCase(Locale.ROOT)));

suppliers.put(
LexicalData.class,
() ->
new LexicalData() {
Set<String> ignored = new HashSet<>(Arrays.asList("from", "what"));

@Override
public boolean ignoreLabel(CharSequence labelCandidate) {
// Ignore any label that has a substring 'data' in it; example.
return labelCandidate.toString().toLowerCase(Locale.ROOT).contains("data");
}

@Override
public boolean ignoreWord(CharSequence word) {
return word.length() <= 3 || ignored.contains(word.toString());
}
});
final Set<String> ignored = new HashSet<>(Arrays.asList("from", "what"));
final StopwordFilter wordFilter =
(word) -> {
return word.length() <= 3 || ignored.contains(word.toString());
};
suppliers.put(StopwordFilter.class, () -> wordFilter);

final LabelFilter labelFilter =
(label) -> {
// Ignore any label that has a substring 'data' in it.
return label.toString().toLowerCase(Locale.ROOT).contains("data");
};
suppliers.put(LabelFilter.class, () -> labelFilter);

// Use an ICU analyzer from Lucene and an adapter to Tokenizer interface.
class ICUAnalyzer extends Analyzer {
4 changes: 1 addition & 3 deletions core/src/main/java/org/carrot2/attrs/AttrObjectArray.java
@@ -55,9 +55,7 @@ public Class<T> getInterfaceClass() {
public boolean isDefaultClass(Object value) {
Objects.requireNonNull(value);
T def = newDefaultEntryValue();
return def != null
&& Objects.equals(def.getClass(), value.getClass())
&& Objects.equals(clazz, value.getClass());
return def != null && Objects.equals(def.getClass(), value.getClass());
}

public T newDefaultEntryValue() {
8 changes: 8 additions & 0 deletions core/src/main/java/org/carrot2/attrs/AttrStringArray.java
@@ -23,6 +23,14 @@ public void set(String value, String... values) {
super.set(Stream.concat(Stream.of(value), Stream.of(values)).toArray(String[]::new));
}

/**
* @return Return true if the value of this attribute is {@code null} or an empty array.
* @since 4.1.0
*/
public boolean isEmpty() {
return get() == null || get().length == 0;
}
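
  /*
   * A possible call site (hypothetical sketch): given an AttrStringArray attr that
   * holds exclusion patterns, a consumer can skip the compilation step entirely when
   * no patterns were supplied:
   *
   *   if (attr.isEmpty()) {
   *     return;
   *   }
   */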

public static class Builder extends BuilderScaffold<String[]> {
public AttrStringArray defaultValue(String value, String... values) {
return defaultValue(
@@ -18,6 +18,7 @@
import org.carrot2.clustering.lingo.SimpleLabelAssigner;
import org.carrot2.clustering.lingo.UniqueLabelAssigner;
import org.carrot2.clustering.stc.STCClusteringAlgorithm;
import org.carrot2.language.EphemeralDictionaries;
import org.carrot2.math.matrix.KMeansMatrixFactorizationFactory;
import org.carrot2.math.matrix.LocalNonnegativeMatrixFactorizationFactory;
import org.carrot2.math.matrix.NonnegativeMatrixFactorizationEDFactory;
@@ -107,7 +108,8 @@ public AliasMapper mapper() {
.alias(
"PartialSingularValueDecompositionFactory",
PartialSingularValueDecompositionFactory.class,
PartialSingularValueDecompositionFactory::new);
PartialSingularValueDecompositionFactory::new)
.alias("EphemeralDictionaries", EphemeralDictionaries.class, EphemeralDictionaries::new);
}

@Override
@@ -37,9 +37,12 @@
import org.carrot2.clustering.ClusteringAlgorithm;
import org.carrot2.clustering.Document;
import org.carrot2.clustering.SharedInfrastructure;
import org.carrot2.internal.clustering.ClusteringAlgorithmUtilities;
import org.carrot2.language.EphemeralDictionaries;
import org.carrot2.language.LabelFilter;
import org.carrot2.language.LanguageComponents;
import org.carrot2.language.LexicalData;
import org.carrot2.language.Stemmer;
import org.carrot2.language.StopwordFilter;
import org.carrot2.language.Tokenizer;
import org.carrot2.math.mahout.function.Functions;
import org.carrot2.math.mahout.matrix.DoubleMatrix1D;
@@ -64,7 +67,12 @@ public class BisectingKMeansClusteringAlgorithm extends AttrComposite
implements ClusteringAlgorithm {
private static final Set<Class<?>> REQUIRED_LANGUAGE_COMPONENTS =
new HashSet<>(
Arrays.asList(Stemmer.class, Tokenizer.class, LexicalData.class, LabelFormatter.class));
Arrays.asList(
Stemmer.class,
Tokenizer.class,
StopwordFilter.class,
LabelFilter.class,
LabelFormatter.class));

public static final String NAME = "Bisecting K-Means";

@@ -146,6 +154,18 @@ public class BisectingKMeansClusteringAlgorithm extends AttrComposite
.defaultValue(BasicPreprocessingPipeline::new));
}

/**
* Per-request overrides of language components (dictionaries).
*
* @since 4.1.0
*/
public EphemeralDictionaries dictionaries;

{
ClusteringAlgorithmUtilities.registerDictionaries(
attributes, () -> dictionaries, (v) -> dictionaries = v);
}

@Override
public Set<Class<?>> requiredLanguageComponents() {
return REQUIRED_LANGUAGE_COMPONENTS;
@@ -156,6 +176,11 @@ public <T extends Document> List<Cluster<T>> cluster(
Stream<? extends T> docStream, LanguageComponents languageComponents) {
List<T> documents = docStream.collect(Collectors.toList());

// Apply ephemeral dictionaries.
if (this.dictionaries != null) {
languageComponents = this.dictionaries.override(languageComponents);
}

// Preprocessing of documents
final PreprocessingContext preprocessingContext =
preprocessing.preprocess(documents.stream(), queryHint.get(), languageComponents);
@@ -27,9 +27,12 @@
import org.carrot2.clustering.ClusteringAlgorithm;
import org.carrot2.clustering.Document;
import org.carrot2.clustering.SharedInfrastructure;
import org.carrot2.internal.clustering.ClusteringAlgorithmUtilities;
import org.carrot2.language.EphemeralDictionaries;
import org.carrot2.language.LabelFilter;
import org.carrot2.language.LanguageComponents;
import org.carrot2.language.LexicalData;
import org.carrot2.language.Stemmer;
import org.carrot2.language.StopwordFilter;
import org.carrot2.language.Tokenizer;
import org.carrot2.text.preprocessing.CompletePreprocessingPipeline;
import org.carrot2.text.preprocessing.LabelFormatter;
@@ -49,7 +52,12 @@ public class LingoClusteringAlgorithm extends AttrComposite implements Clusterin

private static final Set<Class<?>> REQUIRED_LANGUAGE_COMPONENTS =
new HashSet<>(
Arrays.asList(Stemmer.class, Tokenizer.class, LexicalData.class, LabelFormatter.class));
Arrays.asList(
Stemmer.class,
Tokenizer.class,
StopwordFilter.class,
LabelFilter.class,
LabelFormatter.class));

/**
* Balance between cluster score and size during cluster sorting. Value equal to 0.0 will cause
@@ -119,6 +127,18 @@ public class LingoClusteringAlgorithm extends AttrComposite implements Clusterin
.defaultValue(ClusterBuilder::new));
}

/**
* Per-request overrides of language components (dictionaries).
*
* @since 4.1.0
*/
public EphemeralDictionaries dictionaries;

{
ClusteringAlgorithmUtilities.registerDictionaries(
attributes, () -> dictionaries, (v) -> dictionaries = v);
}

/**
* Query terms used to retrieve documents being clustered. The query is used as a hint to avoid
* creating trivial clusters consisting only of query words.
@@ -137,6 +157,11 @@ public <T extends Document> List<Cluster<T>> cluster(
Stream<? extends T> docStream, LanguageComponents languageComponents) {
List<T> documents = docStream.collect(Collectors.toList());

// Apply ephemeral dictionaries.
if (this.dictionaries != null) {
languageComponents = this.dictionaries.override(languageComponents);
}

// Preprocessing of documents
final PreprocessingContext context =
preprocessing.preprocess(documents.stream(), queryHint.get(), languageComponents);