From 6b3f5a48e017e37134702ce8887c800c99ebe88e Mon Sep 17 00:00:00 2001
From: jzonthemtn
Date: Wed, 19 Apr 2017 07:40:05 -0400
Subject: [PATCH] OPENNLP-1029: Add tests for InsufficientTrainingDataException.

---
 .../opennlp/tools/chunker/ChunkerMETest.java  | 18 ++++++++++
 .../doccat/DocumentCategorizerMETest.java     | 17 ++++++++++
 .../tools/lemmatizer/LemmatizerMETest.java    | 21 ++++++++++--
 .../TokenNameFinderCrossValidatorTest.java    | 33 ++++++++++++++++---
 .../opennlp/tools/postag/POSTaggerMETest.java | 24 ++++++++++++--
 .../sentdetect/SentenceDetectorMETest.java    | 28 ++++++++++++++--
 .../tools/tokenize/TokenizerMETest.java       | 26 +++++++++++++++
 .../tools/chunker/test-insufficient.txt       |  1 +
 .../lemmatizer/trial.old-insufficient.tsv     |  1 +
 .../AnnotatedSentencesInsufficient.txt        |  5 +++
 .../postag/AnnotatedSentencesInsufficient.txt |  1 +
 .../sentdetect/SentencesInsufficient.txt      |  1 +
 .../tools/tokenize/token-insufficient.train   |  1 +
 13 files changed, 166 insertions(+), 11 deletions(-)
 create mode 100644 opennlp-tools/src/test/resources/opennlp/tools/chunker/test-insufficient.txt
 create mode 100644 opennlp-tools/src/test/resources/opennlp/tools/lemmatizer/trial.old-insufficient.tsv
 create mode 100644 opennlp-tools/src/test/resources/opennlp/tools/namefind/AnnotatedSentencesInsufficient.txt
 create mode 100644 opennlp-tools/src/test/resources/opennlp/tools/postag/AnnotatedSentencesInsufficient.txt
 create mode 100644 opennlp-tools/src/test/resources/opennlp/tools/sentdetect/SentencesInsufficient.txt
 create mode 100644 opennlp-tools/src/test/resources/opennlp/tools/tokenize/token-insufficient.train

diff --git a/opennlp-tools/src/test/java/opennlp/tools/chunker/ChunkerMETest.java b/opennlp-tools/src/test/java/opennlp/tools/chunker/ChunkerMETest.java
index 51112df2c..facb408a6 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/chunker/ChunkerMETest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/chunker/ChunkerMETest.java
@@ -27,6 +27,7 @@
 
 import opennlp.tools.formats.ResourceAsStreamFactory;
 import opennlp.tools.namefind.NameFinderME;
+import opennlp.tools.util.InsufficientTrainingDataException;
 import opennlp.tools.util.ObjectStream;
 import opennlp.tools.util.PlainTextByLineStream;
 import opennlp.tools.util.Sequence;
@@ -128,5 +129,22 @@ public void testTokenProbMinScore() throws Exception {
     Assert.assertEquals(Arrays.asList(expect1), preds[0].getOutcomes());
     Assert.assertNotSame(Arrays.asList(expect1), preds[1].getOutcomes());
   }
+
+  @Test(expected = InsufficientTrainingDataException.class)
+  public void testInsufficientData() throws IOException {
+
+    ResourceAsStreamFactory in = new ResourceAsStreamFactory(getClass(),
+        "/opennlp/tools/chunker/test-insufficient.txt");
+
+    ObjectStream<ChunkSample> sampleStream = new ChunkSampleStream(
+        new PlainTextByLineStream(in, StandardCharsets.UTF_8));
+
+    TrainingParameters params = new TrainingParameters();
+    params.put(TrainingParameters.ITERATIONS_PARAM, "70");
+    params.put(TrainingParameters.CUTOFF_PARAM, "1");
+
+    ChunkerME.train("en", sampleStream, params, new ChunkerFactory());
+
+  }
 
 }
diff --git a/opennlp-tools/src/test/java/opennlp/tools/doccat/DocumentCategorizerMETest.java b/opennlp-tools/src/test/java/opennlp/tools/doccat/DocumentCategorizerMETest.java
index 220df875b..391125ea5 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/doccat/DocumentCategorizerMETest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/doccat/DocumentCategorizerMETest.java
@@ -24,6 +24,7 @@
 import org.junit.Assert;
 import org.junit.Test;
 
+import opennlp.tools.util.InsufficientTrainingDataException;
 import opennlp.tools.util.ObjectStream;
 import opennlp.tools.util.ObjectStreamUtils;
 import opennlp.tools.util.TrainingParameters;
@@ -61,4 +62,20 @@ public void testSimpleTraining() throws IOException {
     Set<String> cat = sortedScoreMap.get(sortedScoreMap.lastKey());
     Assert.assertEquals(1, cat.size());
   }
+
+  @Test(expected = InsufficientTrainingDataException.class)
+  public void insufficientTestData() throws IOException {
+
+    ObjectStream<DocumentSample> samples = ObjectStreamUtils.createObjectStream(
+        new DocumentSample("1", new String[]{"a", "b", "c"}));
+
+    TrainingParameters params = new TrainingParameters();
+    params.put(TrainingParameters.ITERATIONS_PARAM, "100");
+    params.put(TrainingParameters.CUTOFF_PARAM, "0");
+
+    DocumentCategorizerME.train("x-unspecified", samples,
+        params, new DoccatFactory());
+
+  }
+
 }
diff --git a/opennlp-tools/src/test/java/opennlp/tools/lemmatizer/LemmatizerMETest.java b/opennlp-tools/src/test/java/opennlp/tools/lemmatizer/LemmatizerMETest.java
index 4631763b9..f00f2b433 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/lemmatizer/LemmatizerMETest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/lemmatizer/LemmatizerMETest.java
@@ -24,6 +24,7 @@
 import org.junit.Before;
 import org.junit.Test;
 
+import opennlp.tools.util.InsufficientTrainingDataException;
 import opennlp.tools.util.MockInputStreamFactory;
 import opennlp.tools.util.ObjectStream;
 import opennlp.tools.util.PlainTextByLineStream;
@@ -68,8 +69,8 @@ public void startup() throws IOException {
         new File("opennlp/tools/lemmatizer/trial.old.tsv")), "UTF-8"));
 
     TrainingParameters params = new TrainingParameters();
-    params.put(TrainingParameters.ITERATIONS_PARAM, Integer.toString(100));
-    params.put(TrainingParameters.CUTOFF_PARAM, Integer.toString(5));
+    params.put(TrainingParameters.ITERATIONS_PARAM, "100");
+    params.put(TrainingParameters.CUTOFF_PARAM, "5");
 
     LemmatizerModel lemmatizerModel = LemmatizerME.train("en", sampleStream,
         params, new LemmatizerFactory());
@@ -84,5 +85,21 @@ public void testLemmasAsArray() throws Exception {
 
     Assert.assertArrayEquals(expect, lemmas);
   }
+
+  @Test(expected = InsufficientTrainingDataException.class)
+  public void testInsufficientData() throws IOException {
+
+    ObjectStream<LemmaSample> sampleStream = new LemmaSampleStream(
+        new PlainTextByLineStream(new MockInputStreamFactory(
+            new File("opennlp/tools/lemmatizer/trial.old-insufficient.tsv")),
+            "UTF-8"));
+
+    TrainingParameters params = new TrainingParameters();
+    params.put(TrainingParameters.ITERATIONS_PARAM, "100");
+    params.put(TrainingParameters.CUTOFF_PARAM, "5");
+
+    LemmatizerME.train("en", sampleStream, params, new LemmatizerFactory());
+
+  }
 
 }
diff --git a/opennlp-tools/src/test/java/opennlp/tools/namefind/TokenNameFinderCrossValidatorTest.java b/opennlp-tools/src/test/java/opennlp/tools/namefind/TokenNameFinderCrossValidatorTest.java
index 679726d52..9e31987fd 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/namefind/TokenNameFinderCrossValidatorTest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/namefind/TokenNameFinderCrossValidatorTest.java
@@ -28,6 +28,7 @@
 import opennlp.tools.cmdline.namefind.NameEvaluationErrorListener;
 import opennlp.tools.formats.ResourceAsStreamFactory;
 import opennlp.tools.util.InputStreamFactory;
+import opennlp.tools.util.InsufficientTrainingDataException;
 import opennlp.tools.util.ObjectStream;
 import opennlp.tools.util.PlainTextByLineStream;
 import opennlp.tools.util.TrainingParameters;
@@ -50,8 +51,8 @@ public void testWithNullResources() throws Exception {
         new PlainTextByLineStream(in, StandardCharsets.ISO_8859_1));
 
     TrainingParameters mlParams = new TrainingParameters();
-    mlParams.put(TrainingParameters.ITERATIONS_PARAM, Integer.toString(70));
-    mlParams.put(TrainingParameters.CUTOFF_PARAM, Integer.toString(1));
+    mlParams.put(TrainingParameters.ITERATIONS_PARAM, "70");
+    mlParams.put(TrainingParameters.CUTOFF_PARAM, "1");
 
     mlParams.put(TrainingParameters.ALGORITHM_PARAM,
         ModelType.MAXENT.toString());
@@ -77,8 +78,8 @@ public void testWithNameEvaluationErrorListener() throws Exception {
         new PlainTextByLineStream(in, StandardCharsets.ISO_8859_1));
 
     TrainingParameters mlParams = new TrainingParameters();
-    mlParams.put(TrainingParameters.ITERATIONS_PARAM, Integer.toString(70));
-    mlParams.put(TrainingParameters.CUTOFF_PARAM, Integer.toString(1));
+    mlParams.put(TrainingParameters.ITERATIONS_PARAM, "70");
+    mlParams.put(TrainingParameters.CUTOFF_PARAM, "1");
 
     mlParams.put(TrainingParameters.ALGORITHM_PARAM,
         ModelType.MAXENT.toString());
@@ -95,4 +96,28 @@
     Assert.assertTrue(out.size() > 0);
     Assert.assertNotNull(cv.getFMeasure());
   }
+
+  @Test(expected = InsufficientTrainingDataException.class)
+  public void testWithInsufficientData() throws Exception {
+
+    InputStreamFactory in = new ResourceAsStreamFactory(getClass(),
+        "/opennlp/tools/namefind/AnnotatedSentencesInsufficient.txt");
+
+    ObjectStream<NameSample> sampleStream = new NameSampleDataStream(
+        new PlainTextByLineStream(in, StandardCharsets.ISO_8859_1));
+
+    TrainingParameters mlParams = new TrainingParameters();
+    mlParams.put(TrainingParameters.ITERATIONS_PARAM, "70");
+    mlParams.put(TrainingParameters.CUTOFF_PARAM, "1");
+
+    mlParams.put(TrainingParameters.ALGORITHM_PARAM,
+        ModelType.MAXENT.toString());
+
+    TokenNameFinderCrossValidator cv = new TokenNameFinderCrossValidator("en",
+        TYPE, mlParams, null, (TokenNameFinderEvaluationMonitor) null);
+
+    cv.evaluate(sampleStream, 2);
+
+  }
+
 }
diff --git a/opennlp-tools/src/test/java/opennlp/tools/postag/POSTaggerMETest.java b/opennlp-tools/src/test/java/opennlp/tools/postag/POSTaggerMETest.java
index 51cae2c1d..e2bca4854 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/postag/POSTaggerMETest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/postag/POSTaggerMETest.java
@@ -25,6 +25,7 @@
 
 import opennlp.tools.formats.ResourceAsStreamFactory;
 import opennlp.tools.util.InputStreamFactory;
+import opennlp.tools.util.InsufficientTrainingDataException;
 import opennlp.tools.util.ObjectStream;
 import opennlp.tools.util.PlainTextByLineStream;
 import opennlp.tools.util.TrainingParameters;
@@ -50,8 +51,8 @@ private static ObjectStream<POSSample> createSampleStream() throws IOException {
   static POSModel trainPOSModel(ModelType type) throws IOException {
     TrainingParameters params = new TrainingParameters();
     params.put(TrainingParameters.ALGORITHM_PARAM, type.toString());
-    params.put(TrainingParameters.ITERATIONS_PARAM, Integer.toString(100));
-    params.put(TrainingParameters.CUTOFF_PARAM, Integer.toString(5));
+    params.put(TrainingParameters.ITERATIONS_PARAM, "100");
+    params.put(TrainingParameters.CUTOFF_PARAM, "5");
 
     return POSTaggerME.train("en", createSampleStream(), params,
         new POSTaggerFactory());
@@ -85,4 +86,23 @@ public void testBuildNGramDictionary() throws IOException {
     ObjectStream<POSSample> samples = createSampleStream();
     POSTaggerME.buildNGramDictionary(samples, 0);
   }
+
+  @Test(expected = InsufficientTrainingDataException.class)
+  public void insufficientTestData() throws IOException {
+
+    InputStreamFactory in = new ResourceAsStreamFactory(POSTaggerMETest.class,
+        "/opennlp/tools/postag/AnnotatedSentencesInsufficient.txt");
+
+    ObjectStream<POSSample> stream = new WordTagSampleStream(
+        new PlainTextByLineStream(in, StandardCharsets.UTF_8));
+
+    TrainingParameters params = new TrainingParameters();
+    params.put(TrainingParameters.ALGORITHM_PARAM, ModelType.MAXENT.name());
+    params.put(TrainingParameters.ITERATIONS_PARAM, "100");
+    params.put(TrainingParameters.CUTOFF_PARAM, "5");
+
+    POSTaggerME.train("en", stream, params, new POSTaggerFactory());
+
+  }
+
 }
diff --git a/opennlp-tools/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMETest.java b/opennlp-tools/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMETest.java
index 43d58294c..220650d20 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMETest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMETest.java
@@ -26,6 +26,7 @@
 
 import opennlp.tools.formats.ResourceAsStreamFactory;
 import opennlp.tools.util.InputStreamFactory;
+import opennlp.tools.util.InsufficientTrainingDataException;
 import opennlp.tools.util.PlainTextByLineStream;
 import opennlp.tools.util.Span;
 import opennlp.tools.util.TrainingParameters;
@@ -42,12 +43,14 @@ public void testSentenceDetector() throws IOException {
         "/opennlp/tools/sentdetect/Sentences.txt");
 
     TrainingParameters mlParams = new TrainingParameters();
-    mlParams.put(TrainingParameters.ITERATIONS_PARAM, Integer.toString(100));
-    mlParams.put(TrainingParameters.CUTOFF_PARAM, Integer.toString(0));
+    mlParams.put(TrainingParameters.ITERATIONS_PARAM, "100");
+    mlParams.put(TrainingParameters.CUTOFF_PARAM, "0");
+
+    SentenceDetectorFactory factory = new SentenceDetectorFactory("en", true, null, null);
 
     SentenceModel sentdetectModel = SentenceDetectorME.train(
         "en", new SentenceSampleStream(new PlainTextByLineStream(in,
-            StandardCharsets.UTF_8)), true, null, mlParams);
+            StandardCharsets.UTF_8)), factory, mlParams);
 
     Assert.assertEquals("en", sentdetectModel.getLanguage());
 
@@ -132,4 +135,23 @@
 
     Assert.assertEquals(new Span(16, 56), pos[1]);
   }
+
+  @Test(expected = InsufficientTrainingDataException.class)
+  public void testInsufficientData() throws IOException {
+
+    InputStreamFactory in = new ResourceAsStreamFactory(getClass(),
+        "/opennlp/tools/sentdetect/SentencesInsufficient.txt");
+
+    TrainingParameters mlParams = new TrainingParameters();
+    mlParams.put(TrainingParameters.ITERATIONS_PARAM, "100");
+    mlParams.put(TrainingParameters.CUTOFF_PARAM, "0");
+
+    SentenceDetectorFactory factory = new SentenceDetectorFactory("en", true, null, null);
+
+    SentenceDetectorME.train("en",
+        new SentenceSampleStream(
+            new PlainTextByLineStream(in, StandardCharsets.UTF_8)), factory, mlParams);
+
+  }
+
 }
diff --git a/opennlp-tools/src/test/java/opennlp/tools/tokenize/TokenizerMETest.java b/opennlp-tools/src/test/java/opennlp/tools/tokenize/TokenizerMETest.java
index 5a7a8119b..14b9185b7 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/tokenize/TokenizerMETest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/tokenize/TokenizerMETest.java
@@ -18,10 +18,18 @@
 package opennlp.tools.tokenize;
 
 import java.io.IOException;
+import java.nio.charset.StandardCharsets;
 
 import org.junit.Assert;
 import org.junit.Test;
 
+import opennlp.tools.formats.ResourceAsStreamFactory;
+import opennlp.tools.util.InputStreamFactory;
+import opennlp.tools.util.InsufficientTrainingDataException;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.PlainTextByLineStream;
+import opennlp.tools.util.TrainingParameters;
+
 /**
  * Tests for the {@link TokenizerME} class.
  *
@@ -65,4 +73,22 @@ public void testTokenizer() throws IOException {
     Assert.assertEquals("through", tokens[7]);
     Assert.assertEquals("!", tokens[8]);
   }
+
+  @Test(expected = InsufficientTrainingDataException.class)
+  public void testInsufficientData() throws IOException {
+
+    InputStreamFactory trainDataIn = new ResourceAsStreamFactory(
+        TokenizerModel.class, "/opennlp/tools/tokenize/token-insufficient.train");
+
+    ObjectStream<TokenSample> samples = new TokenSampleStream(
+        new PlainTextByLineStream(trainDataIn, StandardCharsets.UTF_8));
+
+    TrainingParameters mlParams = new TrainingParameters();
+    mlParams.put(TrainingParameters.ITERATIONS_PARAM, "100");
+    mlParams.put(TrainingParameters.CUTOFF_PARAM, "5");
+
+    TokenizerME.train(samples, TokenizerFactory.create(null, "en", null, true, null), mlParams);
+
+  }
+
 }
diff --git a/opennlp-tools/src/test/resources/opennlp/tools/chunker/test-insufficient.txt b/opennlp-tools/src/test/resources/opennlp/tools/chunker/test-insufficient.txt
new file mode 100644
index 000000000..a57859025
--- /dev/null
+++ b/opennlp-tools/src/test/resources/opennlp/tools/chunker/test-insufficient.txt
@@ -0,0 +1 @@
+Rockwell NNP B-NP
\ No newline at end of file
diff --git a/opennlp-tools/src/test/resources/opennlp/tools/lemmatizer/trial.old-insufficient.tsv b/opennlp-tools/src/test/resources/opennlp/tools/lemmatizer/trial.old-insufficient.tsv
new file mode 100644
index 000000000..89c2aeea3
--- /dev/null
+++ b/opennlp-tools/src/test/resources/opennlp/tools/lemmatizer/trial.old-insufficient.tsv
@@ -0,0 +1 @@
+The DT the
\ No newline at end of file
diff --git a/opennlp-tools/src/test/resources/opennlp/tools/namefind/AnnotatedSentencesInsufficient.txt b/opennlp-tools/src/test/resources/opennlp/tools/namefind/AnnotatedSentencesInsufficient.txt
new file mode 100644
index 000000000..c70ec6d18
--- /dev/null
+++ b/opennlp-tools/src/test/resources/opennlp/tools/namefind/AnnotatedSentencesInsufficient.txt
@@ -0,0 +1,5 @@
+Last September, I tried to find out the address of an old school friend whom I hadnt't seen for 15 years.
+I just knew his name , Alan McKennedy , and I'd heard the rumour that he'd moved to Scotland, the country of his ancestors.
+So I called Julie , a friend who's still in contact with him.
+She told me that he lived in 23213 Edinburgh, Worcesterstreet 12.
+I wrote him a letter right away and he answered soon, sounding very happy and delighted.
\ No newline at end of file
diff --git a/opennlp-tools/src/test/resources/opennlp/tools/postag/AnnotatedSentencesInsufficient.txt b/opennlp-tools/src/test/resources/opennlp/tools/postag/AnnotatedSentencesInsufficient.txt
new file mode 100644
index 000000000..786f182d1
--- /dev/null
+++ b/opennlp-tools/src/test/resources/opennlp/tools/postag/AnnotatedSentencesInsufficient.txt
@@ -0,0 +1 @@
+Find_VB out_RP.
\ No newline at end of file
diff --git a/opennlp-tools/src/test/resources/opennlp/tools/sentdetect/SentencesInsufficient.txt b/opennlp-tools/src/test/resources/opennlp/tools/sentdetect/SentencesInsufficient.txt
new file mode 100644
index 000000000..0465ce206
--- /dev/null
+++ b/opennlp-tools/src/test/resources/opennlp/tools/sentdetect/SentencesInsufficient.txt
@@ -0,0 +1 @@
+Last September, I tried to find out the address of an old school friend whom I hadnt't seen for 15 years.
diff --git a/opennlp-tools/src/test/resources/opennlp/tools/tokenize/token-insufficient.train b/opennlp-tools/src/test/resources/opennlp/tools/tokenize/token-insufficient.train
new file mode 100644
index 000000000..db4a49d2d
--- /dev/null
+++ b/opennlp-tools/src/test/resources/opennlp/tools/tokenize/token-insufficient.train
@@ -0,0 +1 @@
+I tried to find out the address of an old school .
\ No newline at end of file
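
Below the patch, a minimal sketch of how calling code can react to the condition
these tests exercise: every trainer above throws InsufficientTrainingDataException
when the sample stream is too small to build a model. The sketch reuses the
ChunkerME/TrainingParameters API from the patch; the class and method names
(TrainingExample, trainOrNull) are illustrative only and not part of this change.

    import java.io.IOException;

    import opennlp.tools.chunker.ChunkSample;
    import opennlp.tools.chunker.ChunkerFactory;
    import opennlp.tools.chunker.ChunkerME;
    import opennlp.tools.chunker.ChunkerModel;
    import opennlp.tools.util.InsufficientTrainingDataException;
    import opennlp.tools.util.ObjectStream;
    import opennlp.tools.util.TrainingParameters;

    public class TrainingExample {

      // Returns null when the corpus is too small to train a model.
      static ChunkerModel trainOrNull(ObjectStream<ChunkSample> samples,
          TrainingParameters params) throws IOException {
        try {
          return ChunkerME.train("en", samples, params, new ChunkerFactory());
        } catch (InsufficientTrainingDataException e) {
          // Thrown by the trainer when the stream holds too few samples,
          // which is exactly what the new tests assert.
          System.err.println("Not enough training data: " + e.getMessage());
          return null;
        }
      }
    }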