GH-44: Add support for ephemeral (per-request) word and label filters.
dweiss committed Dec 1, 2020
1 parent 04b64ec commit ecd161c
Showing 73 changed files with 1,146 additions and 516 deletions.
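In short: the clustering algorithms gain a dictionaries attribute that accepts ephemeral (per-request) exclusion dictionaries, compiled only for the duration of a single cluster() call. A minimal sketch of the new usage, assembled from the useEphemeralDictionaries example added below (imports and the ExamplesData helper as in that example file):

// Load default English components, restricted to what Lingo requires.
LanguageComponents english =
    LanguageComponents.loader()
        .limitToLanguages("English")
        .limitToAlgorithms(new LingoClusteringAlgorithm())
        .load()
        .language("English");

LingoClusteringAlgorithm algorithm = new LingoClusteringAlgorithm();

// Attach a per-request (ephemeral) label exclusion dictionary;
// it is compiled for this clustering call only.
DefaultDictionaryImpl labelFilter = new DefaultDictionaryImpl();
labelFilter.regexp.set("(?i).*mining.*");
algorithm.dictionaries.labelFilters.set(List.of(labelFilter));

List<Cluster<Document>> clusters =
    algorithm.cluster(ExamplesData.documentStream(), english);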
@@ -11,6 +11,7 @@
package org.carrot2.examples;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
@@ -19,6 +20,7 @@
import java.util.Map;
import java.util.ServiceLoader;
import java.util.Set;
import java.util.TreeMap;
import java.util.function.Supplier;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.icu.segmentation.DefaultICUTokenizerConfig;
@@ -27,10 +29,12 @@
import org.carrot2.clustering.Cluster;
import org.carrot2.clustering.Document;
import org.carrot2.clustering.lingo.LingoClusteringAlgorithm;
import org.carrot2.language.DefaultDictionaryImpl;
import org.carrot2.language.LabelFilter;
import org.carrot2.language.LanguageComponents;
import org.carrot2.language.LanguageComponentsProvider;
import org.carrot2.language.LexicalData;
import org.carrot2.language.Stemmer;
import org.carrot2.language.StopwordFilter;
import org.carrot2.language.Tokenizer;
import org.carrot2.language.extras.LuceneAnalyzerTokenizerAdapter;
import org.carrot2.text.preprocessing.LabelFormatter;
@@ -60,16 +64,25 @@ public void listAllAvailableComponents() throws IOException {
// fragment-start{component-enumeration}
ServiceLoader<LanguageComponentsProvider> providers =
ServiceLoader.load(LanguageComponentsProvider.class);
for (LanguageComponentsProvider prov : providers) {
System.out.println("Provider class: " + prov.name());

for (String language : prov.languages()) {
System.out.println(" > " + language);
for (Class<?> componentClass : prov.componentTypes()) {
System.out.println(" Component: " + componentClass.getName());
}
Map<String, List<LanguageComponentsProvider>> langToProviders = new TreeMap<>();
for (LanguageComponentsProvider prov : providers) {
for (String lang : prov.languages()) {
langToProviders.computeIfAbsent(lang, (k) -> new ArrayList<>()).add(prov);
}
}

langToProviders.forEach(
(language, provList) -> {
System.out.println(" > " + language);
provList.forEach(
provider -> {
System.out.println(" [Provider: " + provider.name() + "]");
for (Class<?> componentClass : provider.componentTypes()) {
System.out.println(" Component: " + componentClass.getName());
}
});
});
// fragment-end{component-enumeration}
}

@@ -96,6 +109,39 @@ public void tweakDefaultEnglishResources() throws IOException {
ExamplesCommon.printClusters(clusters);
}

@Test
public void useEphemeralDictionaries() throws IOException {
// It is often the case that clustering should be run against
// temporary, ephemeral lexical data. In this example we will supply such resources
// directly to the algorithm. Please note that there is a non-zero cost to compile
// ephemeral dictionaries for each clustering call. If these
// resources remain static, the LanguageComponents object should be overridden or modified
// instead.

// fragment-start{use-ephemeral-dictionary}
// Load the default dictionaries for English.
LanguageComponents english =
LanguageComponents.loader()
.limitToLanguages("English")
.limitToAlgorithms(new LingoClusteringAlgorithm())
.load()
.language("English");

LingoClusteringAlgorithm algorithm = new LingoClusteringAlgorithm();

// Create an ephemeral label filter by providing a dictionary with a
// few regexp exclusion patterns.
DefaultDictionaryImpl labelFilter = new DefaultDictionaryImpl();
labelFilter.regexp.set("(?i).*data.*", "(?i).*mining.*");
algorithm.dictionaries.labelFilters.set(List.of(labelFilter));
// fragment-end{use-ephemeral-dictionary}

algorithm.desiredClusterCount.set(10);
List<Cluster<Document>> clusters = algorithm.cluster(ExamplesData.documentStream(), english);
System.out.println("Clusters:");
ExamplesCommon.printClusters(clusters);
}
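
  // The commit title also covers ephemeral *word* filters. A complementary sketch,
  // assuming EphemeralDictionaries exposes a wordFilters attribute analogous to the
  // labelFilters attribute used above (the word-filter attribute is not shown in
  // this excerpt):
  //
  //   DefaultDictionaryImpl wordFilter = new DefaultDictionaryImpl();
  //   wordFilter.regexp.set("(?i)^mining$");
  //   algorithm.dictionaries.wordFilters.set(List.of(wordFilter));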

@Test
public void overrideDefaultComponents() throws IOException {
// There are language-specific components required for clustering and each algorithm may
@@ -106,30 +152,29 @@ public void overrideDefaultComponents() throws IOException {
// requirements. Here, we modify the stemmer and lexical data for the default English
// component set, leaving any other components as they were originally defined for English.

// We override the suppliers of Stemmer and LexicalData interfaces. These suppliers must be
// thread-safe, but the instances of corresponding components will not be reused across threads.
// We override the suppliers of stemming, stop word filtering and label filtering interfaces.
// These suppliers must be thread-safe, but the instances of corresponding components will not
// be reused across threads.

// fragment-start{custom-stemmer}
Supplier<Stemmer> stemmerSupplier;
stemmerSupplier = () -> (word) -> word.toString().toLowerCase(Locale.ROOT);
// fragment-end{custom-stemmer}

// fragment-start{custom-lexical-data}
// Ignore words from the list and anything shorter than 4 characters.
final Set<String> ignored = new HashSet<>(Arrays.asList("from", "what"));
Supplier<LexicalData> lexicalDataSupplier =
() ->
new LexicalData() {
@Override
public boolean ignoreLabel(CharSequence candidate) {
// Ignore any label that has a substring 'data' in it.
return candidate.toString().toLowerCase(Locale.ROOT).contains("data");
}

@Override
public boolean ignoreWord(CharSequence word) {
return word.length() < 4 || ignored.contains(word.toString());
}
};
final StopwordFilter wordFilter =
(word) -> {
// Ignore any word shorter than 4 characters or on the explicit exclusion list.
return word.length() < 4 || ignored.contains(word.toString());
};

final LabelFilter labelFilter =
(label) -> {
// Ignore any label that has a substring 'data' in it.
return label.toString().toLowerCase(Locale.ROOT).contains("data");
};
// fragment-end{custom-lexical-data}

// fragment-start{custom-overrides}
@@ -138,7 +183,10 @@ public boolean ignoreWord(CharSequence word) {
.load()
.language("English")
.override(Stemmer.class, stemmerSupplier)
.override(LexicalData.class, lexicalDataSupplier);
// Word and label filters are thread-safe here so we
// supply the same instance all the time.
.override(StopwordFilter.class, () -> wordFilter)
.override(LabelFilter.class, () -> labelFilter);
// fragment-end{custom-overrides}

LingoClusteringAlgorithm algorithm = new LingoClusteringAlgorithm();
@@ -160,23 +208,19 @@ public void customLanguagePipeline() throws IOException {
Stemmer.class,
(Supplier<Stemmer>) () -> ((word) -> word.toString().toLowerCase(Locale.ROOT)));

suppliers.put(
LexicalData.class,
() ->
new LexicalData() {
Set<String> ignored = new HashSet<>(Arrays.asList("from", "what"));

@Override
public boolean ignoreLabel(CharSequence labelCandidate) {
// Ignore any label that has a substring 'data' in it; example.
return labelCandidate.toString().toLowerCase(Locale.ROOT).contains("data");
}

@Override
public boolean ignoreWord(CharSequence word) {
return word.length() <= 3 || ignored.contains(word.toString());
}
});
final Set<String> ignored = new HashSet<>(Arrays.asList("from", "what"));
final StopwordFilter wordFilter =
(word) -> {
return word.length() <= 3 || ignored.contains(word.toString());
};
suppliers.put(StopwordFilter.class, () -> wordFilter);

final LabelFilter labelFilter =
(label) -> {
// Ignore any label that has a substring 'data' in it.
return label.toString().toLowerCase(Locale.ROOT).contains("data");
};
suppliers.put(LabelFilter.class, () -> labelFilter);

// Use an ICU analyzer from Lucene and an adapter to Tokenizer interface.
class ICUAnalyzer extends Analyzer {
4 changes: 1 addition & 3 deletions core/src/main/java/org/carrot2/attrs/AttrObjectArray.java
@@ -55,9 +55,7 @@ public Class<T> getInterfaceClass() {
public boolean isDefaultClass(Object value) {
Objects.requireNonNull(value);
T def = newDefaultEntryValue();
return def != null
&& Objects.equals(def.getClass(), value.getClass())
&& Objects.equals(clazz, value.getClass());
return def != null && Objects.equals(def.getClass(), value.getClass());
}

public T newDefaultEntryValue() {
8 changes: 8 additions & 0 deletions core/src/main/java/org/carrot2/attrs/AttrStringArray.java
@@ -23,6 +23,14 @@ public void set(String value, String... values) {
super.set(Stream.concat(Stream.of(value), Stream.of(values)).toArray(String[]::new));
}

/**
* @return Return true if the value of this attribute is {@code null} or an empty array.
* @since 4.1.0
*/
public boolean isEmpty() {
return get() == null || get().length == 0;
}
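
  /*
   * A possible call site (hypothetical sketch): given an AttrStringArray attr that
   * holds exclusion patterns, a consumer can skip the compilation step entirely when
   * no patterns were supplied:
   *
   *   if (attr.isEmpty()) {
   *     return;
   *   }
   */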

public static class Builder extends BuilderScaffold<String[]> {
public AttrStringArray defaultValue(String value, String... values) {
return defaultValue(
@@ -18,6 +18,7 @@
import org.carrot2.clustering.lingo.SimpleLabelAssigner;
import org.carrot2.clustering.lingo.UniqueLabelAssigner;
import org.carrot2.clustering.stc.STCClusteringAlgorithm;
import org.carrot2.language.EphemeralDictionaries;
import org.carrot2.math.matrix.KMeansMatrixFactorizationFactory;
import org.carrot2.math.matrix.LocalNonnegativeMatrixFactorizationFactory;
import org.carrot2.math.matrix.NonnegativeMatrixFactorizationEDFactory;
@@ -107,7 +108,8 @@ public AliasMapper mapper() {
.alias(
"PartialSingularValueDecompositionFactory",
PartialSingularValueDecompositionFactory.class,
PartialSingularValueDecompositionFactory::new);
PartialSingularValueDecompositionFactory::new)
.alias("EphemeralDictionaries", EphemeralDictionaries.class, EphemeralDictionaries::new);
}

@Override
@@ -37,9 +37,12 @@
import org.carrot2.clustering.ClusteringAlgorithm;
import org.carrot2.clustering.Document;
import org.carrot2.clustering.SharedInfrastructure;
import org.carrot2.internal.clustering.ClusteringAlgorithmUtilities;
import org.carrot2.language.EphemeralDictionaries;
import org.carrot2.language.LabelFilter;
import org.carrot2.language.LanguageComponents;
import org.carrot2.language.LexicalData;
import org.carrot2.language.Stemmer;
import org.carrot2.language.StopwordFilter;
import org.carrot2.language.Tokenizer;
import org.carrot2.math.mahout.function.Functions;
import org.carrot2.math.mahout.matrix.DoubleMatrix1D;
@@ -64,7 +67,12 @@ public class BisectingKMeansClusteringAlgorithm extends AttrComposite
implements ClusteringAlgorithm {
private static final Set<Class<?>> REQUIRED_LANGUAGE_COMPONENTS =
new HashSet<>(
Arrays.asList(Stemmer.class, Tokenizer.class, LexicalData.class, LabelFormatter.class));
Arrays.asList(
Stemmer.class,
Tokenizer.class,
StopwordFilter.class,
LabelFilter.class,
LabelFormatter.class));

public static final String NAME = "Bisecting K-Means";

@@ -146,6 +154,18 @@ public class BisectingKMeansClusteringAlgorithm extends AttrComposite
.defaultValue(BasicPreprocessingPipeline::new));
}

/**
* Per-request overrides of language components (dictionaries).
*
* @since 4.1.0
*/
public EphemeralDictionaries dictionaries;

{
ClusteringAlgorithmUtilities.registerDictionaries(
attributes, () -> dictionaries, (v) -> dictionaries = v);
}

@Override
public Set<Class<?>> requiredLanguageComponents() {
return REQUIRED_LANGUAGE_COMPONENTS;
@@ -156,6 +176,11 @@ public <T extends Document> List<Cluster<T>> cluster(
Stream<? extends T> docStream, LanguageComponents languageComponents) {
List<T> documents = docStream.collect(Collectors.toList());

// Apply ephemeral dictionaries.
if (this.dictionaries != null) {
languageComponents = this.dictionaries.override(languageComponents);
}

// Preprocessing of documents
final PreprocessingContext preprocessingContext =
preprocessing.preprocess(documents.stream(), queryHint.get(), languageComponents);
@@ -27,9 +27,12 @@
import org.carrot2.clustering.ClusteringAlgorithm;
import org.carrot2.clustering.Document;
import org.carrot2.clustering.SharedInfrastructure;
import org.carrot2.internal.clustering.ClusteringAlgorithmUtilities;
import org.carrot2.language.EphemeralDictionaries;
import org.carrot2.language.LabelFilter;
import org.carrot2.language.LanguageComponents;
import org.carrot2.language.LexicalData;
import org.carrot2.language.Stemmer;
import org.carrot2.language.StopwordFilter;
import org.carrot2.language.Tokenizer;
import org.carrot2.text.preprocessing.CompletePreprocessingPipeline;
import org.carrot2.text.preprocessing.LabelFormatter;
@@ -49,7 +52,12 @@ public class LingoClusteringAlgorithm extends AttrComposite implements Clusterin

private static final Set<Class<?>> REQUIRED_LANGUAGE_COMPONENTS =
new HashSet<>(
Arrays.asList(Stemmer.class, Tokenizer.class, LexicalData.class, LabelFormatter.class));
Arrays.asList(
Stemmer.class,
Tokenizer.class,
StopwordFilter.class,
LabelFilter.class,
LabelFormatter.class));

/**
* Balance between cluster score and size during cluster sorting. Value equal to 0.0 will cause
@@ -119,6 +127,18 @@ public class LingoClusteringAlgorithm extends AttrComposite implements Clusterin
.defaultValue(ClusterBuilder::new));
}

/**
* Per-request overrides of language components (dictionaries).
*
* @since 4.1.0
*/
public EphemeralDictionaries dictionaries;

{
ClusteringAlgorithmUtilities.registerDictionaries(
attributes, () -> dictionaries, (v) -> dictionaries = v);
}

/**
* Query terms used to retrieve documents being clustered. The query is used as a hint to avoid
* creating trivial clusters consisting only of query words.
@@ -137,6 +157,11 @@ public <T extends Document> List<Cluster<T>> cluster(
Stream<? extends T> docStream, LanguageComponents languageComponents) {
List<T> documents = docStream.collect(Collectors.toList());

// Apply ephemeral dictionaries.
if (this.dictionaries != null) {
languageComponents = this.dictionaries.override(languageComponents);
}

// Preprocessing of documents
final PreprocessingContext context =
preprocessing.preprocess(documents.stream(), queryHint.get(), languageComponents);