From 9d2570566a50878e1297c2c85806cf30ff44b70c Mon Sep 17 00:00:00 2001 From: jzonthemtn Date: Mon, 13 Feb 2017 07:57:21 -0500 Subject: [PATCH] OPENNLP-983: Makes suffix/prefix length configurable. --- .../util/featuregen/GeneratorFactory.java | 22 ++++- .../featuregen/PrefixFeatureGenerator.java | 32 +++++-- .../featuregen/SuffixFeatureGenerator.java | 33 +++++-- .../PrefixFeatureGeneratorTest.java | 95 ++++++++++++++++++ .../SuffixFeatureGeneratorTest.java | 96 +++++++++++++++++++ 5 files changed, 258 insertions(+), 20 deletions(-) create mode 100644 opennlp-tools/src/test/java/opennlp/tools/util/featuregen/PrefixFeatureGeneratorTest.java create mode 100644 opennlp-tools/src/test/java/opennlp/tools/util/featuregen/SuffixFeatureGeneratorTest.java diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/GeneratorFactory.java b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/GeneratorFactory.java index fa97f43a6..ef08cfb28 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/GeneratorFactory.java +++ b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/GeneratorFactory.java @@ -555,7 +555,16 @@ static class PrefixFeatureGeneratorFactory implements XmlFeatureGeneratorFactory public AdaptiveFeatureGenerator create(Element generatorElement, FeatureGeneratorResourceProvider resourceManager) { - return new PrefixFeatureGenerator(); + + String attribute = generatorElement.getAttribute("length"); + + int prefixLength = PrefixFeatureGenerator.DEFAULT_MAX_LENGTH; + + if (!Objects.equals(attribute, "")) { + prefixLength = Integer.parseInt(attribute); + } + + return new PrefixFeatureGenerator(prefixLength); } static void register(Map factoryMap) { @@ -570,7 +579,16 @@ static class SuffixFeatureGeneratorFactory implements XmlFeatureGeneratorFactory public AdaptiveFeatureGenerator create(Element generatorElement, FeatureGeneratorResourceProvider resourceManager) { - return new SuffixFeatureGenerator(); + + String attribute = generatorElement.getAttribute("length"); + + int suffixLength = SuffixFeatureGenerator.DEFAULT_MAX_LENGTH; + + if (!Objects.equals(attribute, "")) { + suffixLength = Integer.parseInt(attribute); + } + + return new SuffixFeatureGenerator(suffixLength); } static void register(Map factoryMap) { diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/PrefixFeatureGenerator.java b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/PrefixFeatureGenerator.java index 8cdd48f88..a47330fd8 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/PrefixFeatureGenerator.java +++ b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/PrefixFeatureGenerator.java @@ -21,21 +21,35 @@ public class PrefixFeatureGenerator implements AdaptiveFeatureGenerator { - private static final int PREFIX_LENGTH = 4; - - private static String[] getPrefixes(String lex) { - String[] prefs = new String[PREFIX_LENGTH]; - for (int li = 0; li < PREFIX_LENGTH; li++) { - prefs[li] = lex.substring(0, Math.min(li + 1, lex.length())); - } - return prefs; + public static final int DEFAULT_MAX_LENGTH = 4; + + private final int prefixLength; + + public PrefixFeatureGenerator() { + prefixLength = DEFAULT_MAX_LENGTH; + } + + public PrefixFeatureGenerator(int prefixLength) { + this.prefixLength = prefixLength; } + @Override public void createFeatures(List features, String[] tokens, int index, String[] previousOutcomes) { - String[] prefs = PrefixFeatureGenerator.getPrefixes(tokens[index]); + String[] prefs = getPrefixes(tokens[index]); for (String pref : prefs) { features.add("pre=" + pref); } } + + private String[] getPrefixes(String lex) { + + int prefixes = Math.min(prefixLength, lex.length()); + + String[] prefs = new String[prefixes]; + for (int li = 0; li < prefixes; li++) { + prefs[li] = lex.substring(0, Math.min(li + 1, lex.length())); + } + return prefs; + } } diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/SuffixFeatureGenerator.java b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/SuffixFeatureGenerator.java index a17fd4742..0ad266d28 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/SuffixFeatureGenerator.java +++ b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/SuffixFeatureGenerator.java @@ -21,21 +21,36 @@ public class SuffixFeatureGenerator implements AdaptiveFeatureGenerator { - private static final int SUFFIX_LENGTH = 4; - - public static String[] getSuffixes(String lex) { - String[] suffs = new String[SUFFIX_LENGTH]; - for (int li = 0; li < SUFFIX_LENGTH; li++) { - suffs[li] = lex.substring(Math.max(lex.length() - li - 1, 0)); - } - return suffs; + public static final int DEFAULT_MAX_LENGTH = 4; + + private final int suffixLength; + + public SuffixFeatureGenerator() { + suffixLength = DEFAULT_MAX_LENGTH; + } + + public SuffixFeatureGenerator(int suffixLength) { + this.suffixLength = suffixLength; } + @Override public void createFeatures(List features, String[] tokens, int index, String[] previousOutcomes) { - String[] suffs = SuffixFeatureGenerator.getSuffixes(tokens[index]); + String[] suffs = getSuffixes(tokens[index]); for (String suff : suffs) { features.add("suf=" + suff); } } + + private String[] getSuffixes(String lex) { + + int suffixes = Math.min(suffixLength, lex.length()); + + String[] suffs = new String[suffixes]; + for (int li = 0; li < suffixes; li++) { + suffs[li] = lex.substring(Math.max(lex.length() - li - 1, 0)); + } + return suffs; + } + } diff --git a/opennlp-tools/src/test/java/opennlp/tools/util/featuregen/PrefixFeatureGeneratorTest.java b/opennlp-tools/src/test/java/opennlp/tools/util/featuregen/PrefixFeatureGeneratorTest.java new file mode 100644 index 000000000..2f83e4acc --- /dev/null +++ b/opennlp-tools/src/test/java/opennlp/tools/util/featuregen/PrefixFeatureGeneratorTest.java @@ -0,0 +1,95 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.util.featuregen; + +import java.util.ArrayList; +import java.util.List; + +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + +import opennlp.tools.util.featuregen.AdaptiveFeatureGenerator; + +public class PrefixFeatureGeneratorTest { + + private List features; + + @Before + public void setUp() throws Exception { + features = new ArrayList<>(); + } + + @Test + public void lengthTest1() { + + String[] testSentence = new String[] {"This", "is", "an", "example", "sentence"}; + + int testTokenIndex = 0; + int suffixLength = 2; + + AdaptiveFeatureGenerator generator = new PrefixFeatureGenerator(suffixLength); + + generator.createFeatures(features, testSentence, testTokenIndex, null); + + Assert.assertEquals(2, features.size()); + Assert.assertEquals("pre=T", features.get(0)); + Assert.assertEquals("pre=Th", features.get(1)); + + } + + @Test + public void lengthTest2() { + + String[] testSentence = new String[] {"This", "is", "an", "example", "sentence"}; + + int testTokenIndex = 3; + int suffixLength = 5; + + AdaptiveFeatureGenerator generator = new PrefixFeatureGenerator(suffixLength); + + generator.createFeatures(features, testSentence, testTokenIndex, null); + + Assert.assertEquals(5, features.size()); + Assert.assertEquals("pre=e", features.get(0)); + Assert.assertEquals("pre=ex", features.get(1)); + Assert.assertEquals("pre=exa", features.get(2)); + Assert.assertEquals("pre=exam", features.get(3)); + Assert.assertEquals("pre=examp", features.get(4)); + + } + + @Test + public void lengthTest3() { + + String[] testSentence = new String[] {"This", "is", "an", "example", "sentence"}; + + int testTokenIndex = 1; + int suffixLength = 5; + + AdaptiveFeatureGenerator generator = new PrefixFeatureGenerator(suffixLength); + + generator.createFeatures(features, testSentence, testTokenIndex, null); + + Assert.assertEquals(2, features.size()); + Assert.assertEquals("pre=i", features.get(0)); + Assert.assertEquals("pre=is", features.get(1)); + + } + +} diff --git a/opennlp-tools/src/test/java/opennlp/tools/util/featuregen/SuffixFeatureGeneratorTest.java b/opennlp-tools/src/test/java/opennlp/tools/util/featuregen/SuffixFeatureGeneratorTest.java new file mode 100644 index 000000000..bdfa61db2 --- /dev/null +++ b/opennlp-tools/src/test/java/opennlp/tools/util/featuregen/SuffixFeatureGeneratorTest.java @@ -0,0 +1,96 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.util.featuregen; + +import java.util.ArrayList; +import java.util.List; + +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + +import opennlp.tools.util.featuregen.AdaptiveFeatureGenerator; +import opennlp.tools.util.featuregen.SuffixFeatureGenerator; + +public class SuffixFeatureGeneratorTest { + + private List features; + + @Before + public void setUp() throws Exception { + features = new ArrayList<>(); + } + + @Test + public void lengthTest1() { + + String[] testSentence = new String[] {"This", "is", "an", "example", "sentence"}; + + int testTokenIndex = 0; + int suffixLength = 2; + + AdaptiveFeatureGenerator generator = new SuffixFeatureGenerator(suffixLength); + + generator.createFeatures(features, testSentence, testTokenIndex, null); + + Assert.assertEquals(2, features.size()); + Assert.assertEquals("suf=s", features.get(0)); + Assert.assertEquals("suf=is", features.get(1)); + + } + + @Test + public void lengthTest2() { + + String[] testSentence = new String[] {"This", "is", "an", "example", "sentence"}; + + int testTokenIndex = 3; + int suffixLength = 5; + + AdaptiveFeatureGenerator generator = new SuffixFeatureGenerator(suffixLength); + + generator.createFeatures(features, testSentence, testTokenIndex, null); + + Assert.assertEquals(5, features.size()); + Assert.assertEquals("suf=e", features.get(0)); + Assert.assertEquals("suf=le", features.get(1)); + Assert.assertEquals("suf=ple", features.get(2)); + Assert.assertEquals("suf=mple", features.get(3)); + Assert.assertEquals("suf=ample", features.get(4)); + + } + + @Test + public void lengthTest3() { + + String[] testSentence = new String[] {"This", "is", "an", "example", "sentence"}; + + int testTokenIndex = 1; + int suffixLength = 5; + + AdaptiveFeatureGenerator generator = new SuffixFeatureGenerator(suffixLength); + + generator.createFeatures(features, testSentence, testTokenIndex, null); + + Assert.assertEquals(2, features.size()); + Assert.assertEquals("suf=s", features.get(0)); + Assert.assertEquals("suf=is", features.get(1)); + + } + +}