Skip to content

Commit

Permalink
GH-57: turn labelFilter and StopwordFilter into predicates.
Browse files Browse the repository at this point in the history
  • Loading branch information
dweiss committed Dec 15, 2020
1 parent a4dfb11 commit ad7d07e
Show file tree
Hide file tree
Showing 20 changed files with 85 additions and 132 deletions.
Expand Up @@ -167,13 +167,13 @@ public void overrideDefaultComponents() throws IOException {
final StopwordFilter wordFilter =
(word) -> {
// Ignore any word shorter than 4 characters or on the explicit exclusion list.
return word.length() < 4 || ignored.contains(word.toString());
return word.length() >= 4 && !ignored.contains(word.toString());
};

final LabelFilter labelFilter =
(label) -> {
// Ignore any label that has a substring 'data' in it.
return label.toString().toLowerCase(Locale.ROOT).contains("data");
return !label.toString().toLowerCase(Locale.ROOT).contains("data");
};
// fragment-end{custom-lexical-data}

Expand Down Expand Up @@ -211,14 +211,14 @@ public void customLanguagePipeline() throws IOException {
final Set<String> ignored = new HashSet<>(Arrays.asList("from", "what"));
final StopwordFilter wordFilter =
(word) -> {
return word.length() <= 3 || ignored.contains(word.toString());
return word.length() > 3 && !ignored.contains(word.toString());
};
suppliers.put(StopwordFilter.class, () -> wordFilter);

final LabelFilter labelFilter =
(label) -> {
// Ignore any label that has a substring 'data' in it.
return label.toString().toLowerCase(Locale.ROOT).contains("data");
return !label.toString().toLowerCase(Locale.ROOT).contains("data");
};
suppliers.put(LabelFilter.class, () -> labelFilter);

Expand Down
Expand Up @@ -424,7 +424,7 @@ protected void visit(int state, int cardinality, BitSet documents, IntStack path
// Build the candidate cluster's label for filtering. This may be costly, so
// we only do it for base clusters that are promoted to the merging phase.
assert cc.phrases.size() == 1;
if (!labelFilter.ignoreLabel(buildLabel(cc.phrases.get(0)))) {
if (labelFilter.test(buildLabel(cc.phrases.get(0)))) {
candidates.set(j++, cc);
}
}
Expand Down
33 changes: 0 additions & 33 deletions core/src/main/java/org/carrot2/language/ChainedLabelFilter.java

This file was deleted.

33 changes: 0 additions & 33 deletions core/src/main/java/org/carrot2/language/ChainedWordFilter.java

This file was deleted.

Expand Up @@ -51,14 +51,14 @@ public class DefaultDictionaryImpl extends AttrComposite

@Override
public StopwordFilter compileStopwordFilter() {
Predicate<String> precompiled = compile();
return (word) -> precompiled.test(word.toString());
Predicate<String> compiled = compile();
return (t) -> !compiled.test(t.toString());
}

@Override
public LabelFilter compileLabelFilter() {
Predicate<String> precompiled = compile();
return (label) -> precompiled.test(label.toString());
Predicate<String> compiled = compile();
return (t) -> !compiled.test(t.toString());
}

private Predicate<String> compile() {
Expand Down
34 changes: 14 additions & 20 deletions core/src/main/java/org/carrot2/language/EphemeralDictionaries.java
Expand Up @@ -10,10 +10,8 @@
*/
package org.carrot2.language;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.stream.Collectors;
import org.carrot2.attrs.AttrComposite;
import org.carrot2.attrs.AttrObjectArray;

Expand Down Expand Up @@ -44,7 +42,7 @@ public class EphemeralDictionaries extends AttrComposite {
*
* @see DefaultDictionaryImpl
* @see StopwordFilterDictionary
* @see StopwordFilter#ignoreWord
* @see StopwordFilter
*/
public AttrObjectArray<StopwordFilterDictionary> wordFilters =
attributes.register(
Expand Down Expand Up @@ -72,7 +70,7 @@ public class EphemeralDictionaries extends AttrComposite {
*
* @see DefaultDictionaryImpl
* @see LabelFilterDictionary
* @see LabelFilter#ignoreLabel
* @see LabelFilter
*/
public AttrObjectArray<LabelFilterDictionary> labelFilters =
attributes.register(
Expand All @@ -87,49 +85,45 @@ public class EphemeralDictionaries extends AttrComposite {
*/
public LanguageComponents override(LanguageComponents languageComponents) {
List<StopwordFilterDictionary> wordFilterAttrs = this.wordFilters.get();
if (wordFilterAttrs != null || !wordFilterAttrs.isEmpty()) {
List<StopwordFilter> wordFilters =
if (wordFilterAttrs != null && !wordFilterAttrs.isEmpty()) {
StopwordFilter dictFilter =
wordFilterAttrs.stream()
.map(StopwordFilterDictionary::compileStopwordFilter)
.collect(Collectors.toList());
.reduce(StopwordFilter::and)
.get();

languageComponents =
languageComponents.override(
StopwordFilter.class,
(previous) ->
() -> {
List<StopwordFilter> filters;
StopwordFilter previousFilter = previous.get();
if (previousFilter != null) {
filters = new ArrayList<>(wordFilters);
filters.add(previousFilter);
return new ChainedWordFilter(filters);
return previousFilter.and(dictFilter);
} else {
return new ChainedWordFilter(wordFilters);
return dictFilter;
}
});
}

List<LabelFilterDictionary> labelFilterAttrs = this.labelFilters.get();
if (labelFilterAttrs != null || !labelFilterAttrs.isEmpty()) {
List<LabelFilter> labelFilters =
if (labelFilterAttrs != null && !labelFilterAttrs.isEmpty()) {
LabelFilter dictFilter =
labelFilterAttrs.stream()
.map(LabelFilterDictionary::compileLabelFilter)
.collect(Collectors.toList());
.reduce(LabelFilter::and)
.get();

languageComponents =
languageComponents.override(
LabelFilter.class,
(previous) ->
() -> {
List<LabelFilter> filters;
LabelFilter previousFilter = previous.get();
if (previousFilter != null) {
filters = new ArrayList<>(labelFilters);
filters.add(previousFilter);
return new ChainedLabelFilter(filters);
return previousFilter.and(dictFilter);
} else {
return new ChainedLabelFilter(labelFilters);
return dictFilter;
}
});
}
Expand Down
21 changes: 17 additions & 4 deletions core/src/main/java/org/carrot2/language/LabelFilter.java
Expand Up @@ -10,15 +10,28 @@
*/
package org.carrot2.language;

import java.util.Objects;
import java.util.function.Predicate;

/**
* A cluster label candidate filter.
*
* @since 4.1.0
*/
@FunctionalInterface
// fragment-start{label-filter}
public interface LabelFilter {
/** @return Return true if the label candidate should be ignored in processing. */
boolean ignoreLabel(CharSequence labelCandidate);
public interface LabelFilter extends Predicate<CharSequence> {
/**
* @param label The label to test. Input labels may have mixed case, depending on the algorithm
* and their surface forms collected from input documents.
* @return {@code false} if the label candidate should be ignored in processing.
*/
boolean test(CharSequence label);
// fragment-end{label-filter}

@Override
default LabelFilter and(Predicate<? super CharSequence> other) {
Objects.requireNonNull(other);
return (t) -> test(t) && other.test(t);
}
}
// fragment-end{label-filter}
21 changes: 17 additions & 4 deletions core/src/main/java/org/carrot2/language/StopwordFilter.java
Expand Up @@ -10,6 +10,9 @@
*/
package org.carrot2.language;

import java.util.Objects;
import java.util.function.Predicate;

/**
* A stop word filter.
*
Expand All @@ -18,8 +21,18 @@
*/
@FunctionalInterface
// fragment-start{word-filter}
public interface StopwordFilter {
/** @return Return true if the provided term should be ignored in processing. */
boolean ignoreWord(CharSequence word);
public interface StopwordFilter extends Predicate<CharSequence> {
/**
* @param word The word to test. Input words are guaranteed to be in lower case (consistent with
* {@link Character#toLowerCase(int)}).
* @return {@code false} if the provided term should be ignored in processing.
*/
boolean test(CharSequence word);
// fragment-end{word-filter}

@Override
default StopwordFilter and(Predicate<? super CharSequence> other) {
Objects.requireNonNull(other);
return (t) -> test(t) && other.test(t);
}
}
// fragment-end{word-filter}
Expand Up @@ -43,7 +43,7 @@ public void mark(PreprocessingContext context) {

CharArrayUtils.toLowerCase(word, buffer);
mutableCharArray.reset(buffer, 0, word.length);
if (lexData.ignoreWord(mutableCharArray)) {
if (!lexData.test(mutableCharArray)) {
types[i] |= Tokenizer.TF_COMMON_WORD;
}
}
Expand Down
Expand Up @@ -38,12 +38,12 @@ public void filter(
public boolean acceptPhrase(PreprocessingContext context, int phraseIndex) {
final String formatedLabel =
context.format(labelFormatter, phraseIndex + context.allWords.image.length);
return !labelFilter.ignoreLabel(formatedLabel);
return labelFilter.test(formatedLabel);
}

@Override
public boolean acceptWord(PreprocessingContext context, int wordIndex) {
final String formattedLabel = context.format(labelFormatter, wordIndex);
return !labelFilter.ignoreLabel(formattedLabel);
return labelFilter.test(formattedLabel);
}
}
Expand Up @@ -10,7 +10,7 @@
*/
package org.carrot2.language;

import static org.junit.Assert.assertTrue;
import static org.junit.Assert.assertFalse;

import java.io.IOException;
import java.io.StringReader;
Expand Down Expand Up @@ -119,7 +119,7 @@ private void check(String language, String[][] stemmingData, String[] commonWord

StopwordFilter wordFilter = components.get(StopwordFilter.class);
for (String word : commonWords) {
assertTrue(wordFilter.ignoreWord(new MutableCharArray(word)));
assertFalse(wordFilter.test(new MutableCharArray(word)));
}
}
}
Expand Up @@ -64,11 +64,11 @@ public void testDefaultFilterAttrImpl() {
filter.regexp.set("foo.+");

LabelFilter labelFilter = filter.compileLabelFilter();
Assertions.assertThat(labelFilter.ignoreLabel("word1")).isTrue();
Assertions.assertThat(labelFilter.ignoreLabel("word2")).isTrue();
Assertions.assertThat(labelFilter.ignoreLabel("word3")).isFalse();
Assertions.assertThat(labelFilter.ignoreLabel("foobar")).isTrue();
Assertions.assertThat(labelFilter.ignoreLabel("prefix-foobar")).isFalse();
Assertions.assertThat(labelFilter.test("word1")).isFalse();
Assertions.assertThat(labelFilter.test("word2")).isFalse();
Assertions.assertThat(labelFilter.test("word3")).isTrue();
Assertions.assertThat(labelFilter.test("foobar")).isFalse();
Assertions.assertThat(labelFilter.test("prefix-foobar")).isTrue();
}

@Test
Expand Down Expand Up @@ -128,21 +128,21 @@ Entry negative(String... patterns) {
StopwordFilter swFilter = filter.compileStopwordFilter();

for (String positiveExample : e.positive) {
Assertions.assertThat(labelFilter.ignoreLabel(positiveExample))
Assertions.assertThat(labelFilter.test(positiveExample))
.as(e.patterns.toString() + " :: " + positiveExample)
.isTrue();
Assertions.assertThat(swFilter.ignoreWord(positiveExample))
.isFalse();
Assertions.assertThat(swFilter.test(positiveExample))
.as(e.patterns.toString() + " :: " + positiveExample)
.isTrue();
.isFalse();
}

for (String negativeExample : e.negative) {
Assertions.assertThat(labelFilter.ignoreLabel(negativeExample))
Assertions.assertThat(labelFilter.test(negativeExample))
.as(e.patterns.toString() + " :: " + negativeExample)
.isFalse();
Assertions.assertThat(swFilter.ignoreWord(negativeExample))
.isTrue();
Assertions.assertThat(swFilter.test(negativeExample))
.as(e.patterns.toString() + " :: " + negativeExample)
.isFalse();
.isTrue();
}
}
}
Expand Down
Expand Up @@ -21,8 +21,8 @@ public TestsLanguageComponentsFactoryVariant1() {

registerResourceless(Stemmer.class, () -> (word) -> null);
registerResourceless(Tokenizer.class, ExtendedWhitespaceTokenizer::new);
registerResourceless(StopwordFilter.class, () -> (word) -> false);
registerResourceless(LabelFilter.class, () -> (label) -> false);
registerResourceless(StopwordFilter.class, () -> (word) -> true);
registerResourceless(LabelFilter.class, () -> (label) -> true);
registerResourceless(LabelFormatter.class, () -> new LabelFormatterImpl(" "));
}

Expand Down
Expand Up @@ -21,9 +21,9 @@ public TestsLanguageComponentsFactoryVariant2() {

registerResourceless(Stemmer.class, this::createStemmer);
registerResourceless(Tokenizer.class, ExtendedWhitespaceTokenizer::new);
StopwordFilter wordFilter = (word) -> word.toString().contains("stop");
StopwordFilter wordFilter = (word) -> !word.toString().contains("stop");
registerResourceless(StopwordFilter.class, () -> wordFilter);
LabelFilter labelFilter = (label) -> label.toString().startsWith("stoplabel");
LabelFilter labelFilter = (label) -> !label.toString().startsWith("stoplabel");
registerResourceless(LabelFilter.class, () -> labelFilter);
registerResourceless(LabelFormatter.class, () -> new LabelFormatterImpl(" "));
}
Expand Down

0 comments on commit ad7d07e

Please sign in to comment.