From 09830bb445136472578c69d130b7de9d16e175fc Mon Sep 17 00:00:00 2001 From: smarthi Date: Thu, 26 Jan 2017 16:30:58 -0500 Subject: [PATCH] OPENNLP-962: Dictionary should implement SerializableArtifact --- .../opennlp/tools/dictionary/Dictionary.java | 52 ++++++++----------- ...zer.java => DictionaryEntryPersistor.java} | 10 ++-- .../tools/dictionary/serializer/Entry.java | 4 +- .../java/opennlp/tools/ngram/NGramModel.java | 6 +-- .../opennlp/tools/postag/POSDictionary.java | 6 +-- .../tokenize/DetokenizationDictionary.java | 6 +-- .../tools/util/model/ArtifactSerializer.java | 5 +- .../util/model/DictionarySerializer.java | 2 +- .../DummySentenceDetectorFactory.java | 21 ++++---- .../tools/tokenize/DummyTokenizerFactory.java | 13 +++-- .../tools/tokenize/TokenizerFactoryTest.java | 9 ++-- .../uima/normalizer/StringDictionary.java | 14 ++--- 12 files changed, 66 insertions(+), 82 deletions(-) rename opennlp-tools/src/main/java/opennlp/tools/dictionary/serializer/{DictionarySerializer.java => DictionaryEntryPersistor.java} (96%) diff --git a/opennlp-tools/src/main/java/opennlp/tools/dictionary/Dictionary.java b/opennlp-tools/src/main/java/opennlp/tools/dictionary/Dictionary.java index e662bd332..3fd898694 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/dictionary/Dictionary.java +++ b/opennlp-tools/src/main/java/opennlp/tools/dictionary/Dictionary.java @@ -29,15 +29,17 @@ import java.util.StringTokenizer; import opennlp.tools.dictionary.serializer.Attributes; -import opennlp.tools.dictionary.serializer.DictionarySerializer; +import opennlp.tools.dictionary.serializer.DictionaryEntryPersistor; import opennlp.tools.dictionary.serializer.Entry; import opennlp.tools.util.StringList; import opennlp.tools.util.StringUtil; +import opennlp.tools.util.model.DictionarySerializer; +import opennlp.tools.util.model.SerializableArtifact; /** * This class is a dictionary. */ -public class Dictionary implements Iterable { +public class Dictionary implements Iterable, SerializableArtifact { private class StringListWrapper { @@ -108,29 +110,11 @@ public Dictionary(boolean caseSensitive) { /** * Initializes the {@link Dictionary} from an existing dictionary resource. * - * @param in + * @param in {@link InputStream} * @throws IOException */ public Dictionary(InputStream in) throws IOException { - isCaseSensitive = DictionarySerializer.create(in, entry -> put(entry.getTokens())); - } - - /** - * Loads a Dictionary from a XML file. - * - * @deprecated This constructor is deprecated. Passing the case sensitivity - * flag has no effect. Use - * {@link Dictionary#Dictionary(InputStream)} instead and set the - * case sensitivity during the dictionary creation. - * - * @param in - * the dictionary in its XML format - * @param caseSensitive - * has no effect - * @throws IOException - */ - public Dictionary(InputStream in, boolean caseSensitive) throws IOException { - this(in); + isCaseSensitive = DictionaryEntryPersistor.create(in, entry -> put(entry.getTokens())); } /** @@ -163,7 +147,7 @@ public int getMaxTokenCount() { /** * Checks if this dictionary has the given entry. * - * @param tokens + * @param tokens query * @return true if it contains the entry otherwise false */ public boolean contains(StringList tokens) { @@ -173,7 +157,7 @@ public boolean contains(StringList tokens) { /** * Removes the given tokens form the current instance. * - * @param tokens + * @param tokens filter tokens */ public void remove(StringList tokens) { entrySet.remove(new StringListWrapper(tokens)); @@ -215,13 +199,12 @@ public int size() { /** * Writes the current instance to the given {@link OutputStream}. * - * @param out + * @param out {@link OutputStream} * @throws IOException */ public void serialize(OutputStream out) throws IOException { - Iterator entryIterator = new Iterator() - { + Iterator entryIterator = new Iterator() { private Iterator dictionaryIterator = Dictionary.this.iterator(); public boolean hasNext() { @@ -241,7 +224,7 @@ public void remove() { }; - DictionarySerializer.serialize(out, entryIterator, isCaseSensitive); + DictionaryEntryPersistor.serialize(out, entryIterator, isCaseSensitive); } @Override @@ -278,10 +261,8 @@ public String toString() { * Reads a dictionary which has one entry per line. The tokens inside an * entry are whitespace delimited. * - * @param in - * + * @param in {@link Reader} * @return the parsed dictionary - * * @throws IOException */ public static Dictionary parseOneEntryPerLine(Reader in) throws IOException { @@ -361,4 +342,13 @@ public boolean contains(Object obj) { } }; } + + /** + * Gets the Serializer Class for {@link Dictionary} + * @return {@link DictionarySerializer} + */ + @Override + public Class getArtifactSerializerClass() { + return DictionarySerializer.class; + } } diff --git a/opennlp-tools/src/main/java/opennlp/tools/dictionary/serializer/DictionarySerializer.java b/opennlp-tools/src/main/java/opennlp/tools/dictionary/serializer/DictionaryEntryPersistor.java similarity index 96% rename from opennlp-tools/src/main/java/opennlp/tools/dictionary/serializer/DictionarySerializer.java rename to opennlp-tools/src/main/java/opennlp/tools/dictionary/serializer/DictionaryEntryPersistor.java index 5cf68f072..42d460e06 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/dictionary/serializer/DictionarySerializer.java +++ b/opennlp-tools/src/main/java/opennlp/tools/dictionary/serializer/DictionaryEntryPersistor.java @@ -46,7 +46,7 @@ /** * This class is used by for reading and writing dictionaries of all kinds. */ -public class DictionarySerializer { +public class DictionaryEntryPersistor { // TODO: should check for invalid format, make it save private static class DictionaryContenthandler implements ContentHandler { @@ -238,12 +238,12 @@ public static boolean create(InputStream in, EntryInserter inserter) * * @throws IOException If an I/O error occurs * @deprecated Use - * {@link DictionarySerializer#serialize(java.io.OutputStream, java.util.Iterator, boolean)} instead + * {@link DictionaryEntryPersistor#serialize(java.io.OutputStream, java.util.Iterator, boolean)} instead */ @Deprecated public static void serialize(OutputStream out, Iterator entries) throws IOException { - DictionarySerializer.serialize(out, entries, true); + DictionaryEntryPersistor.serialize(out, entries, true); } /** @@ -319,11 +319,11 @@ private static void serializeEntry(TransformerHandler hd, Entry entry) StringList tokens = entry.getTokens(); - for (String token1 : tokens) { + for (String token : tokens) { hd.startElement("", "", TOKEN_ELEMENT, new AttributesImpl()); - hd.characters(token1.toCharArray(), 0, token1.length()); + hd.characters(token.toCharArray(), 0, token.length()); hd.endElement("", "", TOKEN_ELEMENT); } diff --git a/opennlp-tools/src/main/java/opennlp/tools/dictionary/serializer/Entry.java b/opennlp-tools/src/main/java/opennlp/tools/dictionary/serializer/Entry.java index 3f77a0df0..fbdfd36b5 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/dictionary/serializer/Entry.java +++ b/opennlp-tools/src/main/java/opennlp/tools/dictionary/serializer/Entry.java @@ -24,9 +24,9 @@ * An {@link Entry} is a {@link StringList} which can * optionally be mapped to attributes. * - * {@link Entry}s is a read and written by the {@link DictionarySerializer}. + * {@link Entry}s is a read and written by the {@link DictionaryEntryPersistor}. * - * @see DictionarySerializer + * @see DictionaryEntryPersistor * @see Attributes */ public class Entry { diff --git a/opennlp-tools/src/main/java/opennlp/tools/ngram/NGramModel.java b/opennlp-tools/src/main/java/opennlp/tools/ngram/NGramModel.java index 25fb15943..7005dc475 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/ngram/NGramModel.java +++ b/opennlp-tools/src/main/java/opennlp/tools/ngram/NGramModel.java @@ -27,7 +27,7 @@ import opennlp.tools.dictionary.Dictionary; import opennlp.tools.dictionary.serializer.Attributes; -import opennlp.tools.dictionary.serializer.DictionarySerializer; +import opennlp.tools.dictionary.serializer.DictionaryEntryPersistor; import opennlp.tools.dictionary.serializer.Entry; import opennlp.tools.util.InvalidFormatException; import opennlp.tools.util.StringList; @@ -57,7 +57,7 @@ public NGramModel() { * @throws IOException */ public NGramModel(InputStream in) throws IOException { - DictionarySerializer.create(in, entry -> { + DictionaryEntryPersistor.create(in, entry -> { int count; String countValueString = null; @@ -327,7 +327,7 @@ public void remove() { }; - DictionarySerializer.serialize(out, entryIterator, false); + DictionaryEntryPersistor.serialize(out, entryIterator, false); } @Override diff --git a/opennlp-tools/src/main/java/opennlp/tools/postag/POSDictionary.java b/opennlp-tools/src/main/java/opennlp/tools/postag/POSDictionary.java index f51828f43..f103450e8 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/postag/POSDictionary.java +++ b/opennlp-tools/src/main/java/opennlp/tools/postag/POSDictionary.java @@ -27,7 +27,7 @@ import java.util.Objects; import opennlp.tools.dictionary.serializer.Attributes; -import opennlp.tools.dictionary.serializer.DictionarySerializer; +import opennlp.tools.dictionary.serializer.DictionaryEntryPersistor; import opennlp.tools.dictionary.serializer.Entry; import opennlp.tools.util.InvalidFormatException; import opennlp.tools.util.StringList; @@ -152,7 +152,7 @@ public void remove() { } }; - DictionarySerializer.serialize(out, entries, caseSensitive); + DictionaryEntryPersistor.serialize(out, entries, caseSensitive); } @Override @@ -224,7 +224,7 @@ public static POSDictionary create(InputStream in) throws IOException { final POSDictionary newPosDict = new POSDictionary(); - boolean isCaseSensitive = DictionarySerializer.create(in, entry -> { + boolean isCaseSensitive = DictionaryEntryPersistor.create(in, entry -> { String tagString = entry.getAttributes().getValue("tags"); diff --git a/opennlp-tools/src/main/java/opennlp/tools/tokenize/DetokenizationDictionary.java b/opennlp-tools/src/main/java/opennlp/tools/tokenize/DetokenizationDictionary.java index ff3bc9b12..9ffe64905 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/tokenize/DetokenizationDictionary.java +++ b/opennlp-tools/src/main/java/opennlp/tools/tokenize/DetokenizationDictionary.java @@ -28,7 +28,7 @@ import java.util.Map; import opennlp.tools.dictionary.serializer.Attributes; -import opennlp.tools.dictionary.serializer.DictionarySerializer; +import opennlp.tools.dictionary.serializer.DictionaryEntryPersistor; import opennlp.tools.dictionary.serializer.Entry; import opennlp.tools.util.InvalidFormatException; import opennlp.tools.util.StringList; @@ -118,7 +118,7 @@ public DetokenizationDictionary(File file) throws IOException { } private void init(InputStream in) throws IOException { - DictionarySerializer.create(in, entry -> { + DictionaryEntryPersistor.create(in, entry -> { String operationString = entry.getAttributes().getValue("operation"); @@ -166,6 +166,6 @@ public void remove() { } }; - DictionarySerializer.serialize(out, entries, false); + DictionaryEntryPersistor.serialize(out, entries, false); } } diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/model/ArtifactSerializer.java b/opennlp-tools/src/main/java/opennlp/tools/util/model/ArtifactSerializer.java index f28cb0031..851dd6dcf 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/util/model/ArtifactSerializer.java +++ b/opennlp-tools/src/main/java/opennlp/tools/util/model/ArtifactSerializer.java @@ -22,8 +22,6 @@ import java.io.InputStream; import java.io.OutputStream; -import opennlp.tools.util.InvalidFormatException; - /** * Responsible to create an artifact from an {@link InputStream}. */ @@ -37,9 +35,8 @@ public interface ArtifactSerializer { * @return the artifact * * @throws IOException - * @throws InvalidFormatException */ - T create(InputStream in) throws IOException, InvalidFormatException; + T create(InputStream in) throws IOException; /** * Serializes the artifact to the provided {@link OutputStream}. diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/model/DictionarySerializer.java b/opennlp-tools/src/main/java/opennlp/tools/util/model/DictionarySerializer.java index 9b5911242..092fa8eab 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/util/model/DictionarySerializer.java +++ b/opennlp-tools/src/main/java/opennlp/tools/util/model/DictionarySerializer.java @@ -24,7 +24,7 @@ import opennlp.tools.dictionary.Dictionary; -class DictionarySerializer implements ArtifactSerializer { +public class DictionarySerializer implements ArtifactSerializer { public Dictionary create(InputStream in) throws IOException { return new Dictionary(in); diff --git a/opennlp-tools/src/test/java/opennlp/tools/sentdetect/DummySentenceDetectorFactory.java b/opennlp-tools/src/test/java/opennlp/tools/sentdetect/DummySentenceDetectorFactory.java index 15e0760d3..8873d86aa 100644 --- a/opennlp-tools/src/test/java/opennlp/tools/sentdetect/DummySentenceDetectorFactory.java +++ b/opennlp-tools/src/test/java/opennlp/tools/sentdetect/DummySentenceDetectorFactory.java @@ -24,7 +24,6 @@ import java.util.Set; import opennlp.tools.dictionary.Dictionary; -import opennlp.tools.util.InvalidFormatException; import opennlp.tools.util.model.ArtifactSerializer; public class DummySentenceDetectorFactory extends SentenceDetectorFactory { @@ -69,9 +68,7 @@ public EndOfSentenceScanner getEndOfSentenceScanner() { @Override @SuppressWarnings("rawtypes") public Map createArtifactSerializersMap() { - Map serializers = super - .createArtifactSerializersMap(); - + Map serializers = super.createArtifactSerializersMap(); serializers.put(DUMMY_DICT, new DummyDictionarySerializer()); return serializers; } @@ -79,16 +76,15 @@ public Map createArtifactSerializersMap() { @Override public Map createArtifactMap() { Map artifactMap = super.createArtifactMap(); - if (this.dict != null) + if (this.dict != null) { artifactMap.put(DUMMY_DICT, this.dict); + } return artifactMap; } - static class DummyDictionarySerializer implements - ArtifactSerializer { + public static class DummyDictionarySerializer implements ArtifactSerializer { - public DummyDictionary create(InputStream in) throws IOException, - InvalidFormatException { + public DummyDictionary create(InputStream in) throws IOException { return new DummyDictionary(in); } @@ -98,7 +94,7 @@ public void serialize(DummyDictionary artifact, OutputStream out) } } - static class DummyDictionary extends Dictionary { + public static class DummyDictionary extends Dictionary { private Dictionary indict; public DummyDictionary(Dictionary dict) { @@ -116,6 +112,11 @@ public void serialize(OutputStream out) throws IOException { public Set asStringSet() { return indict.asStringSet(); } + + @Override + public Class getArtifactSerializerClass() { + return DummyDictionarySerializer.class; + } } static class DummySDContextGenerator extends DefaultSDContextGenerator { diff --git a/opennlp-tools/src/test/java/opennlp/tools/tokenize/DummyTokenizerFactory.java b/opennlp-tools/src/test/java/opennlp/tools/tokenize/DummyTokenizerFactory.java index 468e4402a..937930dbe 100644 --- a/opennlp-tools/src/test/java/opennlp/tools/tokenize/DummyTokenizerFactory.java +++ b/opennlp-tools/src/test/java/opennlp/tools/tokenize/DummyTokenizerFactory.java @@ -66,9 +66,7 @@ public TokenContextGenerator getContextGenerator() { @Override @SuppressWarnings("rawtypes") public Map createArtifactSerializersMap() { - Map serializers = super - .createArtifactSerializersMap(); - + Map serializers = super.createArtifactSerializersMap(); serializers.put(DUMMY_DICT, new DummyDictionarySerializer()); return serializers; } @@ -81,7 +79,7 @@ public Map createArtifactMap() { return artifactMap; } - static class DummyDictionarySerializer implements + public static class DummyDictionarySerializer implements ArtifactSerializer { public DummyDictionary create(InputStream in) throws IOException { @@ -94,7 +92,7 @@ public void serialize(DummyDictionary artifact, OutputStream out) } } - static class DummyDictionary extends Dictionary { + public static class DummyDictionary extends Dictionary { private Dictionary indict; public DummyDictionary(Dictionary dict) { @@ -112,6 +110,11 @@ public void serialize(OutputStream out) throws IOException { public Set asStringSet() { return indict.asStringSet(); } + + @Override + public Class getArtifactSerializerClass() { + return DummyDictionarySerializer.class; + } } static class DummyContextGenerator extends DefaultTokenContextGenerator { diff --git a/opennlp-tools/src/test/java/opennlp/tools/tokenize/TokenizerFactoryTest.java b/opennlp-tools/src/test/java/opennlp/tools/tokenize/TokenizerFactoryTest.java index 056529a8a..b34459605 100644 --- a/opennlp-tools/src/test/java/opennlp/tools/tokenize/TokenizerFactoryTest.java +++ b/opennlp-tools/src/test/java/opennlp/tools/tokenize/TokenizerFactoryTest.java @@ -74,8 +74,7 @@ public void testDefault() throws IOException { Assert.assertTrue(factory.getAbbreviationDictionary() != null); Assert.assertTrue(factory.getContextGenerator() instanceof DefaultTokenContextGenerator); - Assert.assertEquals(Factory.DEFAULT_ALPHANUMERIC, factory.getAlphaNumericPattern() - .pattern()); + Assert.assertEquals(Factory.DEFAULT_ALPHANUMERIC, factory.getAlphaNumericPattern().pattern()); Assert.assertEquals(lang, factory.getLanguageCode()); Assert.assertEquals(lang, model.getLanguage()); Assert.assertFalse(factory.isUseAlphaNumericOptmization()); @@ -90,8 +89,7 @@ public void testDefault() throws IOException { Assert.assertTrue(factory.getAbbreviationDictionary() != null); Assert.assertTrue(factory.getContextGenerator() instanceof DefaultTokenContextGenerator); - Assert.assertEquals(Factory.DEFAULT_ALPHANUMERIC, factory.getAlphaNumericPattern() - .pattern()); + Assert.assertEquals(Factory.DEFAULT_ALPHANUMERIC, factory.getAlphaNumericPattern().pattern()); Assert.assertEquals(lang, factory.getLanguageCode()); Assert.assertEquals(lang, model.getLanguage()); Assert.assertFalse(factory.isUseAlphaNumericOptmization()); @@ -109,8 +107,7 @@ public void testNullDict() throws IOException { Assert.assertNull(factory.getAbbreviationDictionary()); Assert.assertTrue(factory.getContextGenerator() instanceof DefaultTokenContextGenerator); - Assert.assertEquals(Factory.DEFAULT_ALPHANUMERIC, factory.getAlphaNumericPattern() - .pattern()); + Assert.assertEquals(Factory.DEFAULT_ALPHANUMERIC, factory.getAlphaNumericPattern().pattern()); Assert.assertEquals(lang, factory.getLanguageCode()); Assert.assertEquals(lang, model.getLanguage()); Assert.assertFalse(factory.isUseAlphaNumericOptmization()); diff --git a/opennlp-uima/src/main/java/opennlp/uima/normalizer/StringDictionary.java b/opennlp-uima/src/main/java/opennlp/uima/normalizer/StringDictionary.java index 5631544e1..d477dbf14 100644 --- a/opennlp-uima/src/main/java/opennlp/uima/normalizer/StringDictionary.java +++ b/opennlp-uima/src/main/java/opennlp/uima/normalizer/StringDictionary.java @@ -25,10 +25,8 @@ import java.util.Map; import opennlp.tools.dictionary.serializer.Attributes; -import opennlp.tools.dictionary.serializer.DictionarySerializer; +import opennlp.tools.dictionary.serializer.DictionaryEntryPersistor; import opennlp.tools.dictionary.serializer.Entry; -import opennlp.tools.dictionary.serializer.EntryInserter; -import opennlp.tools.util.InvalidFormatException; import opennlp.tools.util.StringList; // lookup a string for given token list @@ -46,11 +44,9 @@ public StringDictionary() { * @throws IOException */ public StringDictionary(InputStream in) throws IOException { - DictionarySerializer.create(in, new EntryInserter() { - public void insert(Entry entry) throws InvalidFormatException { - String valueString = entry.getAttributes().getValue("value"); - put(entry.getTokens(), valueString); - } + DictionaryEntryPersistor.create(in, entry -> { + String valueString = entry.getAttributes().getValue("value"); + put(entry.getTokens(), valueString); }); } @@ -97,6 +93,6 @@ public void remove() { } }; - DictionarySerializer.serialize(out, entryIterator, true); + DictionaryEntryPersistor.serialize(out, entryIterator, true); } } \ No newline at end of file