From 6b3f5a48e017e37134702ce8887c800c99ebe88e Mon Sep 17 00:00:00 2001
From: jzonthemtn
Date: Wed, 19 Apr 2017 07:40:05 -0400
Subject: [PATCH] OPENNLP-1029: Add tests for InsufficientTrainingDataException.

---
 .../opennlp/tools/chunker/ChunkerMETest.java  | 18 ++++++++++
 .../doccat/DocumentCategorizerMETest.java     | 17 ++++++++++
 .../tools/lemmatizer/LemmatizerMETest.java    | 21 ++++++++++--
 .../TokenNameFinderCrossValidatorTest.java    | 33 ++++++++++++++++---
 .../opennlp/tools/postag/POSTaggerMETest.java | 24 ++++++++++++--
 .../sentdetect/SentenceDetectorMETest.java    | 28 ++++++++++++++--
 .../tools/tokenize/TokenizerMETest.java       | 26 +++++++++++++++
 .../tools/chunker/test-insufficient.txt       |  1 +
 .../lemmatizer/trial.old-insufficient.tsv     |  1 +
 .../AnnotatedSentencesInsufficient.txt        |  5 +++
 .../postag/AnnotatedSentencesInsufficient.txt |  1 +
 .../sentdetect/SentencesInsufficient.txt      |  1 +
 .../tools/tokenize/token-insufficient.train   |  1 +
 13 files changed, 166 insertions(+), 11 deletions(-)
 create mode 100644 opennlp-tools/src/test/resources/opennlp/tools/chunker/test-insufficient.txt
 create mode 100644 opennlp-tools/src/test/resources/opennlp/tools/lemmatizer/trial.old-insufficient.tsv
 create mode 100644 opennlp-tools/src/test/resources/opennlp/tools/namefind/AnnotatedSentencesInsufficient.txt
 create mode 100644 opennlp-tools/src/test/resources/opennlp/tools/postag/AnnotatedSentencesInsufficient.txt
 create mode 100644 opennlp-tools/src/test/resources/opennlp/tools/sentdetect/SentencesInsufficient.txt
 create mode 100644 opennlp-tools/src/test/resources/opennlp/tools/tokenize/token-insufficient.train

diff --git a/opennlp-tools/src/test/java/opennlp/tools/chunker/ChunkerMETest.java b/opennlp-tools/src/test/java/opennlp/tools/chunker/ChunkerMETest.java
index 51112df2c..facb408a6 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/chunker/ChunkerMETest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/chunker/ChunkerMETest.java
@@ -27,6 +27,7 @@
 
 import opennlp.tools.formats.ResourceAsStreamFactory;
 import opennlp.tools.namefind.NameFinderME;
+import opennlp.tools.util.InsufficientTrainingDataException;
 import opennlp.tools.util.ObjectStream;
 import opennlp.tools.util.PlainTextByLineStream;
 import opennlp.tools.util.Sequence;
@@ -128,5 +129,22 @@ public void testTokenProbMinScore() throws Exception {
     Assert.assertEquals(Arrays.asList(expect1), preds[0].getOutcomes());
     Assert.assertNotSame(Arrays.asList(expect1), preds[1].getOutcomes());
   }
+
+  @Test(expected = InsufficientTrainingDataException.class)
+  public void testInsufficientData() throws IOException {
+
+    ResourceAsStreamFactory in = new ResourceAsStreamFactory(getClass(),
+        "/opennlp/tools/chunker/test-insufficient.txt");
+
+    ObjectStream<ChunkSample> sampleStream = new ChunkSampleStream(
+        new PlainTextByLineStream(in, StandardCharsets.UTF_8));
+
+    TrainingParameters params = new TrainingParameters();
+    params.put(TrainingParameters.ITERATIONS_PARAM, "70");
+    params.put(TrainingParameters.CUTOFF_PARAM, "1");
+
+    ChunkerME.train("en", sampleStream, params, new ChunkerFactory());
+
+  }
 
 }
diff --git a/opennlp-tools/src/test/java/opennlp/tools/doccat/DocumentCategorizerMETest.java b/opennlp-tools/src/test/java/opennlp/tools/doccat/DocumentCategorizerMETest.java
index 220df875b..391125ea5 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/doccat/DocumentCategorizerMETest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/doccat/DocumentCategorizerMETest.java
@@ -24,6 +24,7 @@
 import org.junit.Assert;
 import org.junit.Test;
 
+import opennlp.tools.util.InsufficientTrainingDataException;
 import opennlp.tools.util.ObjectStream;
 import opennlp.tools.util.ObjectStreamUtils;
 import opennlp.tools.util.TrainingParameters;
@@ -61,4 +62,20 @@ public void testSimpleTraining() throws IOException {
     Set<String> cat = sortedScoreMap.get(sortedScoreMap.lastKey());
     Assert.assertEquals(1, cat.size());
   }
+
+  @Test(expected = InsufficientTrainingDataException.class)
+  public void insufficientTestData() throws IOException {
+
+    ObjectStream<DocumentSample> samples = ObjectStreamUtils.createObjectStream(
+        new DocumentSample("1", new String[]{"a", "b", "c"}));
+
+    TrainingParameters params = new TrainingParameters();
+    params.put(TrainingParameters.ITERATIONS_PARAM, "100");
+    params.put(TrainingParameters.CUTOFF_PARAM, "0");
+
+    DocumentCategorizerME.train("x-unspecified", samples,
+        params, new DoccatFactory());
+
+  }
+
 }
diff --git a/opennlp-tools/src/test/java/opennlp/tools/lemmatizer/LemmatizerMETest.java b/opennlp-tools/src/test/java/opennlp/tools/lemmatizer/LemmatizerMETest.java
index 4631763b9..f00f2b433 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/lemmatizer/LemmatizerMETest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/lemmatizer/LemmatizerMETest.java
@@ -24,6 +24,7 @@
 import org.junit.Before;
 import org.junit.Test;
 
+import opennlp.tools.util.InsufficientTrainingDataException;
 import opennlp.tools.util.MockInputStreamFactory;
 import opennlp.tools.util.ObjectStream;
 import opennlp.tools.util.PlainTextByLineStream;
@@ -68,8 +69,8 @@ public void startup() throws IOException {
         new File("opennlp/tools/lemmatizer/trial.old.tsv")), "UTF-8"));
 
     TrainingParameters params = new TrainingParameters();
-    params.put(TrainingParameters.ITERATIONS_PARAM, Integer.toString(100));
-    params.put(TrainingParameters.CUTOFF_PARAM, Integer.toString(5));
+    params.put(TrainingParameters.ITERATIONS_PARAM, "100");
+    params.put(TrainingParameters.CUTOFF_PARAM, "5");
 
     LemmatizerModel lemmatizerModel = LemmatizerME.train("en", sampleStream,
         params, new LemmatizerFactory());
@@ -84,5 +85,21 @@ public void testLemmasAsArray() throws Exception {
 
     Assert.assertArrayEquals(expect, lemmas);
   }
+
+  @Test(expected = InsufficientTrainingDataException.class)
+  public void testInsufficientData() throws IOException {
+
+    ObjectStream<LemmaSample> sampleStream = new LemmaSampleStream(
+        new PlainTextByLineStream(new MockInputStreamFactory(
+            new File("opennlp/tools/lemmatizer/trial.old-insufficient.tsv")),
+            "UTF-8"));
+
+    TrainingParameters params = new TrainingParameters();
+    params.put(TrainingParameters.ITERATIONS_PARAM, "100");
+    params.put(TrainingParameters.CUTOFF_PARAM, "5");
+
+    LemmatizerME.train("en", sampleStream, params, new LemmatizerFactory());
+
+  }
 
 }
diff --git a/opennlp-tools/src/test/java/opennlp/tools/namefind/TokenNameFinderCrossValidatorTest.java b/opennlp-tools/src/test/java/opennlp/tools/namefind/TokenNameFinderCrossValidatorTest.java
index 679726d52..9e31987fd 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/namefind/TokenNameFinderCrossValidatorTest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/namefind/TokenNameFinderCrossValidatorTest.java
@@ -28,6 +28,7 @@
 import opennlp.tools.cmdline.namefind.NameEvaluationErrorListener;
 import opennlp.tools.formats.ResourceAsStreamFactory;
 import opennlp.tools.util.InputStreamFactory;
+import opennlp.tools.util.InsufficientTrainingDataException;
 import opennlp.tools.util.ObjectStream;
 import opennlp.tools.util.PlainTextByLineStream;
 import opennlp.tools.util.TrainingParameters;
@@ -50,8 +51,8 @@ public void testWithNullResources() throws Exception {
         new PlainTextByLineStream(in, StandardCharsets.ISO_8859_1));
 
     TrainingParameters mlParams = new TrainingParameters();
-    mlParams.put(TrainingParameters.ITERATIONS_PARAM, Integer.toString(70));
-    mlParams.put(TrainingParameters.CUTOFF_PARAM, Integer.toString(1));
+    mlParams.put(TrainingParameters.ITERATIONS_PARAM, "70");
+    mlParams.put(TrainingParameters.CUTOFF_PARAM, "1");
 
     mlParams.put(TrainingParameters.ALGORITHM_PARAM,
         ModelType.MAXENT.toString());
@@ -77,8 +78,8 @@ public void testWithNameEvaluationErrorListener() throws Exception {
         new PlainTextByLineStream(in, StandardCharsets.ISO_8859_1));
 
     TrainingParameters mlParams = new TrainingParameters();
-    mlParams.put(TrainingParameters.ITERATIONS_PARAM, Integer.toString(70));
-    mlParams.put(TrainingParameters.CUTOFF_PARAM, Integer.toString(1));
+    mlParams.put(TrainingParameters.ITERATIONS_PARAM, "70");
+    mlParams.put(TrainingParameters.CUTOFF_PARAM, "1");
 
     mlParams.put(TrainingParameters.ALGORITHM_PARAM,
         ModelType.MAXENT.toString());
@@ -95,4 +96,28 @@
     Assert.assertTrue(out.size() > 0);
     Assert.assertNotNull(cv.getFMeasure());
   }
+
+  @Test(expected = InsufficientTrainingDataException.class)
+  public void testWithInsufficientData() throws Exception {
+
+    InputStreamFactory in = new ResourceAsStreamFactory(getClass(),
+        "/opennlp/tools/namefind/AnnotatedSentencesInsufficient.txt");
+
+    ObjectStream<NameSample> sampleStream = new NameSampleDataStream(
+        new PlainTextByLineStream(in, StandardCharsets.ISO_8859_1));
+
+    TrainingParameters mlParams = new TrainingParameters();
+    mlParams.put(TrainingParameters.ITERATIONS_PARAM, "70");
+    mlParams.put(TrainingParameters.CUTOFF_PARAM, "1");
+
+    mlParams.put(TrainingParameters.ALGORITHM_PARAM,
+        ModelType.MAXENT.toString());
+
+    TokenNameFinderCrossValidator cv = new TokenNameFinderCrossValidator("en",
+        TYPE, mlParams, null, (TokenNameFinderEvaluationMonitor) null);
+
+    cv.evaluate(sampleStream, 2);
+
+  }
+
 }
diff --git a/opennlp-tools/src/test/java/opennlp/tools/postag/POSTaggerMETest.java b/opennlp-tools/src/test/java/opennlp/tools/postag/POSTaggerMETest.java
index 51cae2c1d..e2bca4854 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/postag/POSTaggerMETest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/postag/POSTaggerMETest.java
@@ -25,6 +25,7 @@
 
 import opennlp.tools.formats.ResourceAsStreamFactory;
 import opennlp.tools.util.InputStreamFactory;
+import opennlp.tools.util.InsufficientTrainingDataException;
 import opennlp.tools.util.ObjectStream;
 import opennlp.tools.util.PlainTextByLineStream;
 import opennlp.tools.util.TrainingParameters;
@@ -50,8 +51,8 @@ private static ObjectStream<POSSample> createSampleStream() throws IOException {
   static POSModel trainPOSModel(ModelType type) throws IOException {
     TrainingParameters params = new TrainingParameters();
     params.put(TrainingParameters.ALGORITHM_PARAM, type.toString());
-    params.put(TrainingParameters.ITERATIONS_PARAM, Integer.toString(100));
-    params.put(TrainingParameters.CUTOFF_PARAM, Integer.toString(5));
+    params.put(TrainingParameters.ITERATIONS_PARAM, "100");
+    params.put(TrainingParameters.CUTOFF_PARAM, "5");
 
     return POSTaggerME.train("en", createSampleStream(), params,
         new POSTaggerFactory());
@@ -85,4 +86,23 @@ public void testBuildNGramDictionary() throws IOException {
     ObjectStream<POSSample> samples = createSampleStream();
     POSTaggerME.buildNGramDictionary(samples, 0);
   }
+
+  @Test(expected = InsufficientTrainingDataException.class)
+  public void insufficientTestData() throws IOException {
+
+    InputStreamFactory in = new ResourceAsStreamFactory(POSTaggerMETest.class,
+        "/opennlp/tools/postag/AnnotatedSentencesInsufficient.txt");
+
+    ObjectStream<POSSample> stream = new WordTagSampleStream(
+        new PlainTextByLineStream(in, StandardCharsets.UTF_8));
+
+    TrainingParameters params = new TrainingParameters();
+    params.put(TrainingParameters.ALGORITHM_PARAM, ModelType.MAXENT.name());
+    params.put(TrainingParameters.ITERATIONS_PARAM, "100");
+    params.put(TrainingParameters.CUTOFF_PARAM, "5");
+
+    POSTaggerME.train("en", stream, params, new POSTaggerFactory());
+
+  }
+
 }
diff --git a/opennlp-tools/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMETest.java b/opennlp-tools/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMETest.java
index 43d58294c..220650d20 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMETest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMETest.java
@@ -26,6 +26,7 @@
 
 import opennlp.tools.formats.ResourceAsStreamFactory;
 import opennlp.tools.util.InputStreamFactory;
+import opennlp.tools.util.InsufficientTrainingDataException;
 import opennlp.tools.util.PlainTextByLineStream;
 import opennlp.tools.util.Span;
 import opennlp.tools.util.TrainingParameters;
@@ -42,12 +43,14 @@ public void testSentenceDetector() throws IOException {
         "/opennlp/tools/sentdetect/Sentences.txt");
 
     TrainingParameters mlParams = new TrainingParameters();
-    mlParams.put(TrainingParameters.ITERATIONS_PARAM, Integer.toString(100));
-    mlParams.put(TrainingParameters.CUTOFF_PARAM, Integer.toString(0));
+    mlParams.put(TrainingParameters.ITERATIONS_PARAM, "100");
+    mlParams.put(TrainingParameters.CUTOFF_PARAM, "0");
+
+    SentenceDetectorFactory factory = new SentenceDetectorFactory("en", true, null, null);
 
     SentenceModel sentdetectModel = SentenceDetectorME.train(
         "en", new SentenceSampleStream(new PlainTextByLineStream(in,
-            StandardCharsets.UTF_8)), true, null, mlParams);
+            StandardCharsets.UTF_8)), factory, mlParams);
 
     Assert.assertEquals("en", sentdetectModel.getLanguage());
 
@@ -132,4 +135,23 @@
 
     Assert.assertEquals(new Span(16, 56), pos[1]);
   }
+
+  @Test(expected = InsufficientTrainingDataException.class)
+  public void testInsufficientData() throws IOException {
+
+    InputStreamFactory in = new ResourceAsStreamFactory(getClass(),
+        "/opennlp/tools/sentdetect/SentencesInsufficient.txt");
+
+    TrainingParameters mlParams = new TrainingParameters();
+    mlParams.put(TrainingParameters.ITERATIONS_PARAM, "100");
+    mlParams.put(TrainingParameters.CUTOFF_PARAM, "0");
+
+    SentenceDetectorFactory factory = new SentenceDetectorFactory("en", true, null, null);
+
+    SentenceDetectorME.train("en",
+        new SentenceSampleStream(
+            new PlainTextByLineStream(in, StandardCharsets.UTF_8)), factory, mlParams);
+
+  }
+
 }
diff --git a/opennlp-tools/src/test/java/opennlp/tools/tokenize/TokenizerMETest.java b/opennlp-tools/src/test/java/opennlp/tools/tokenize/TokenizerMETest.java
index 5a7a8119b..14b9185b7 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/tokenize/TokenizerMETest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/tokenize/TokenizerMETest.java
@@ -18,10 +18,18 @@
 package opennlp.tools.tokenize;
 
 import java.io.IOException;
+import java.nio.charset.StandardCharsets;
 
 import org.junit.Assert;
 import org.junit.Test;
 
+import opennlp.tools.formats.ResourceAsStreamFactory;
+import opennlp.tools.util.InputStreamFactory;
+import opennlp.tools.util.InsufficientTrainingDataException;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.PlainTextByLineStream;
+import opennlp.tools.util.TrainingParameters;
+
 /**
  * Tests for the {@link TokenizerME} class.
  *
@@ -65,4 +73,22 @@ public void testTokenizer() throws IOException {
     Assert.assertEquals("through", tokens[7]);
     Assert.assertEquals("!", tokens[8]);
   }
+
+  @Test(expected = InsufficientTrainingDataException.class)
+  public void testInsufficientData() throws IOException {
+
+    InputStreamFactory trainDataIn = new ResourceAsStreamFactory(
+        TokenizerModel.class, "/opennlp/tools/tokenize/token-insufficient.train");
+
+    ObjectStream<TokenSample> samples = new TokenSampleStream(
+        new PlainTextByLineStream(trainDataIn, StandardCharsets.UTF_8));
+
+    TrainingParameters mlParams = new TrainingParameters();
+    mlParams.put(TrainingParameters.ITERATIONS_PARAM, "100");
+    mlParams.put(TrainingParameters.CUTOFF_PARAM, "5");
+
+    TokenizerME.train(samples, TokenizerFactory.create(null, "en", null, true, null), mlParams);
+
+  }
+
 }
diff --git a/opennlp-tools/src/test/resources/opennlp/tools/chunker/test-insufficient.txt b/opennlp-tools/src/test/resources/opennlp/tools/chunker/test-insufficient.txt
new file mode 100644
index 000000000..a57859025
--- /dev/null
+++ b/opennlp-tools/src/test/resources/opennlp/tools/chunker/test-insufficient.txt
@@ -0,0 +1 @@
+Rockwell NNP B-NP
\ No newline at end of file
diff --git a/opennlp-tools/src/test/resources/opennlp/tools/lemmatizer/trial.old-insufficient.tsv b/opennlp-tools/src/test/resources/opennlp/tools/lemmatizer/trial.old-insufficient.tsv
new file mode 100644
index 000000000..89c2aeea3
--- /dev/null
+++ b/opennlp-tools/src/test/resources/opennlp/tools/lemmatizer/trial.old-insufficient.tsv
@@ -0,0 +1 @@
+The DT the
\ No newline at end of file
diff --git a/opennlp-tools/src/test/resources/opennlp/tools/namefind/AnnotatedSentencesInsufficient.txt b/opennlp-tools/src/test/resources/opennlp/tools/namefind/AnnotatedSentencesInsufficient.txt
new file mode 100644
index 000000000..c70ec6d18
--- /dev/null
+++ b/opennlp-tools/src/test/resources/opennlp/tools/namefind/AnnotatedSentencesInsufficient.txt
@@ -0,0 +1,5 @@
+Last September, I tried to find out the address of an old school friend whom I hadnt't seen for 15 years.
+I just knew his name , Alan McKennedy , and I'd heard the rumour that he'd moved to Scotland, the country of his ancestors.
+So I called Julie , a friend who's still in contact with him.
+She told me that he lived in 23213 Edinburgh, Worcesterstreet 12.
+I wrote him a letter right away and he answered soon, sounding very happy and delighted.
\ No newline at end of file
diff --git a/opennlp-tools/src/test/resources/opennlp/tools/postag/AnnotatedSentencesInsufficient.txt b/opennlp-tools/src/test/resources/opennlp/tools/postag/AnnotatedSentencesInsufficient.txt
new file mode 100644
index 000000000..786f182d1
--- /dev/null
+++ b/opennlp-tools/src/test/resources/opennlp/tools/postag/AnnotatedSentencesInsufficient.txt
@@ -0,0 +1 @@
+Find_VB out_RP.
\ No newline at end of file
diff --git a/opennlp-tools/src/test/resources/opennlp/tools/sentdetect/SentencesInsufficient.txt b/opennlp-tools/src/test/resources/opennlp/tools/sentdetect/SentencesInsufficient.txt
new file mode 100644
index 000000000..0465ce206
--- /dev/null
+++ b/opennlp-tools/src/test/resources/opennlp/tools/sentdetect/SentencesInsufficient.txt
@@ -0,0 +1 @@
+Last September, I tried to find out the address of an old school friend whom I hadnt't seen for 15 years.
diff --git a/opennlp-tools/src/test/resources/opennlp/tools/tokenize/token-insufficient.train b/opennlp-tools/src/test/resources/opennlp/tools/tokenize/token-insufficient.train
new file mode 100644
index 000000000..db4a49d2d
--- /dev/null
+++ b/opennlp-tools/src/test/resources/opennlp/tools/tokenize/token-insufficient.train
@@ -0,0 +1 @@
+I tried to find out the address of an old school .
\ No newline at end of file
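
Below the patch, a minimal sketch of how calling code can react to the condition
these tests exercise: every trainer above throws InsufficientTrainingDataException
when the sample stream is too small to build a model. The sketch reuses the
ChunkerME/TrainingParameters API from the patch; the class and method names
(TrainingExample, trainOrNull) are illustrative only and not part of this change.

    import java.io.IOException;

    import opennlp.tools.chunker.ChunkSample;
    import opennlp.tools.chunker.ChunkerFactory;
    import opennlp.tools.chunker.ChunkerME;
    import opennlp.tools.chunker.ChunkerModel;
    import opennlp.tools.util.InsufficientTrainingDataException;
    import opennlp.tools.util.ObjectStream;
    import opennlp.tools.util.TrainingParameters;

    public class TrainingExample {

      // Returns null when the corpus is too small to train a model.
      static ChunkerModel trainOrNull(ObjectStream<ChunkSample> samples,
          TrainingParameters params) throws IOException {
        try {
          return ChunkerME.train("en", samples, params, new ChunkerFactory());
        } catch (InsufficientTrainingDataException e) {
          // Thrown by the trainer when the stream holds too few samples,
          // which is exactly what the new tests assert.
          System.err.println("Not enough training data: " + e.getMessage());
          return null;
        }
      }
    }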