From 8b493e4ad219dd811143d9858ecc2e104662a8e7 Mon Sep 17 00:00:00 2001
From: smarthi
Date: Sat, 15 Apr 2017 13:11:46 -0400
Subject: [PATCH] OPENNLP-1022: Fix documentation to remove references to
 'Save XXXModel to database'

---
 opennlp-docs/src/docbkx/chunker.xml           | 29 ++-------
 opennlp-docs/src/docbkx/doccat.xml            | 44 ++-----------
 opennlp-docs/src/docbkx/introduction.xml      | 17 +----
 opennlp-docs/src/docbkx/lemmatizer.xml        | 38 +++---------
 opennlp-docs/src/docbkx/namefinder.xml        | 36 +++--------
 opennlp-docs/src/docbkx/parser.xml            |  2 +-
 opennlp-docs/src/docbkx/postagger.xml         | 62 ++-----------------
 opennlp-docs/src/docbkx/sentdetect.xml        | 33 ++--------
 opennlp-docs/src/docbkx/tokenizer.xml         | 15 +----
 .../java/opennlp/tools/ml/BeamSearch.java     | 23 +++----
 10 files changed, 46 insertions(+), 253 deletions(-)

diff --git a/opennlp-docs/src/docbkx/chunker.xml b/opennlp-docs/src/docbkx/chunker.xml
index 0c04e8a47..b67a7fdc7 100644
--- a/opennlp-docs/src/docbkx/chunker.xml
+++ b/opennlp-docs/src/docbkx/chunker.xml
@@ -81,19 +81,8 @@ Rockwell_NNP said_VBD the_DT agreement_NN calls_VBZ for_IN it_PRP to_TO supply_V
-InputStream modelIn = null;
 ChunkerModel model = null;
-try {
-  modelIn = new FileInputStream("en-chunker.bin");
+try (InputStream modelIn = new FileInputStream("en-chunker.bin")) {
   model = new ChunkerModel(modelIn);
-} catch (IOException e) {
-  // Model loading failed, handle the error
-  e.printStackTrace();
-} finally {
-  if (modelIn != null) {
-    try {
-      modelIn.close();
-    } catch (IOException e) {
-    }
-  }
 }]]>
         After the model is loaded a Chunker can be instantiated.
@@ -242,28 +231,18 @@ $ opennlp ChunkerTrainerME -model en-chunker.bin -lang en -data en-chunker.train
         illustrates how to do it:
 ObjectStream<String> lineStream =
-    new PlainTextByLineStream(new FileInputStream("en-chunker.train"),charset);
-ObjectStream<ChunkSample> sampleStream = new ChunkSampleStream(lineStream);
+    new PlainTextByLineStream(new FileInputStream("en-chunker.train"), StandardCharsets.UTF_8);
 ChunkerModel model;
-try {
+try (ObjectStream<ChunkSample> sampleStream = new ChunkSampleStream(lineStream)) {
   model = ChunkerME.train("en", sampleStream,
       new DefaultChunkerContextGenerator(), TrainingParameters.defaultParams());
 }
-finally {
-  sampleStream.close();
-}
-OutputStream modelOut = null;
-try {
-  modelOut = new BufferedOutputStream(new FileOutputStream(modelFile));
+try (OutputStream modelOut = new BufferedOutputStream(new FileOutputStream(modelFile))) {
   model.serialize(modelOut);
-} finally {
-  if (modelOut != null)
-    modelOut.close();
 }]]>
diff --git a/opennlp-docs/src/docbkx/doccat.xml b/opennlp-docs/src/docbkx/doccat.xml
index 7fe3f1fd9..c056732eb 100644
--- a/opennlp-docs/src/docbkx/doccat.xml
+++ b/opennlp-docs/src/docbkx/doccat.xml
@@ -127,33 +127,16 @@ $ opennlp DoccatTrainer -model en-doccat.bin -lang en -data en-doccat.train -enc
 DoccatModel model = null;
-InputStream dataIn = null;
-try {
-  dataIn = new FileInputStream("en-doccat.train");
+try (InputStream dataIn = new FileInputStream("en-doccat.train")) {
   ObjectStream<String> lineStream = new PlainTextByLineStream(dataIn, "UTF-8");
   ObjectStream<DocumentSample> sampleStream = new DocumentSampleStream(lineStream);
   model = DocumentCategorizerME.train("en", sampleStream);
 }
-catch (IOException e) {
-  // Failed to read or parse training data, training failed
-  e.printStackTrace();
-}
-finally {
-  if (dataIn != null) {
-    try {
-      dataIn.close();
-    }
-    catch (IOException e) {
-      // Not an issue, training already finished.
-      // The exception should be logged and investigated
-      // if part of a production system.
-      e.printStackTrace();
-    }
-  }
-}]]>
+]]>
         Now might be a good time to cruise over to Hulu or something, because this
         could take a while if you've got a large training set. You may see a lot of
         output as well.
         Once you're done, you can pretty quickly step to classification directly,
@@ -162,27 +145,10 @@
+]]>
diff --git a/opennlp-docs/src/docbkx/introduction.xml b/opennlp-docs/src/docbkx/introduction.xml
index a3bd48225..65fcd9dfa 100644
--- a/opennlp-docs/src/docbkx/introduction.xml
+++ b/opennlp-docs/src/docbkx/introduction.xml
@@ -65,23 +65,10 @@ under the License.
         constructor of the model class:
+]]>
diff --git a/opennlp-docs/src/docbkx/lemmatizer.xml b/opennlp-docs/src/docbkx/lemmatizer.xml
index 34668d0c8..1fa554000 100644
--- a/opennlp-docs/src/docbkx/lemmatizer.xml
+++ b/opennlp-docs/src/docbkx/lemmatizer.xml
@@ -88,22 +88,11 @@ signed VBD sign
         In the example below it is loaded from disk:
+}
+]]>
         After the model is loaded a LemmatizerME can be instantiated.
@@ -174,22 +163,10 @@ shrapnel NN shrapnel
+]]>
         After the dictionary is loaded the DictionaryLemmatizer can be instantiated.
@@ -303,8 +280,7 @@ $ opennlp LemmatizerTrainerME -model en-lemmatizer.bin -params PerceptronTrainer
 TrainingParameters mlParams = CmdLineUtil.loadTrainingParameters(params.getParams(), false);
 if (mlParams == null) {
   mlParams = ModelUtil.createDefaultTrainingParameters();
- }
-]]>
+ }]]>
         Then we read the training data:
diff --git a/opennlp-docs/src/docbkx/namefinder.xml b/opennlp-docs/src/docbkx/namefinder.xml
index 1e72a8239..2f68c4739 100644
--- a/opennlp-docs/src/docbkx/namefinder.xml
+++ b/opennlp-docs/src/docbkx/namefinder.xml
@@ -80,23 +80,10 @@ Mr . Vinken is chairman of Elsevier N.V. , the Dutch publis
         In the sample below it is loaded from disk.
+]]>
         There is a number of reasons why the model loading can fail:
@@ -274,33 +261,24 @@ $ opennlp TokenNameFinderTrainer -featuregen brown.xml -sequenceCodec BILOU -res
         Call the NameFinderME.train method
-        Save the TokenNameFinderModel to a file or database
+        Save the TokenNameFinderModel to a file
         The three steps are illustrated by the following sample code:
 ObjectStream<String> lineStream =
-    new PlainTextByLineStream(new FileInputStream("en-ner-person.train"), charset);
-ObjectStream<NameSample> sampleStream = new NameSampleDataStream(lineStream);
+    new PlainTextByLineStream(new FileInputStream("en-ner-person.train"), StandardCharsets.UTF_8);
 TokenNameFinderModel model;
-try {
+try (ObjectStream<NameSample> sampleStream = new NameSampleDataStream(lineStream)) {
-  model = NameFinderME.train("en", "person", sampleStream,
-      TrainingParameters.defaultParams(), TokenNameFinderFactory nameFinderFactory);
+  model = NameFinderME.train("en", "person", sampleStream,
+      TrainingParameters.defaultParams(), new TokenNameFinderFactory());
 }
-finally {
-  sampleStream.close();
-}
-try {
-  modelOut = new BufferedOutputStream(new FileOutputStream(modelFile));
+try (OutputStream modelOut = new BufferedOutputStream(new FileOutputStream(modelFile))) {
   model.serialize(modelOut);
-} finally {
-  if (modelOut != null)
-    modelOut.close();
 }]]>
@@ -542,7 +520,7 @@ System.out.println(result.toString());]]>
-ObjectStream<String> sampleStream = new PlainTextByLineStream(sampleDataIn.getChannel(), "UTF-8");
+ObjectStream<String> sampleStream = new PlainTextByLineStream(sampleDataIn.getChannel(), StandardCharsets.UTF_8);
 TokenNameFinderCrossValidator evaluator = new TokenNameFinderCrossValidator("en", 100, 5);
 evaluator.evaluate(sampleStream, 10);
diff --git a/opennlp-docs/src/docbkx/parser.xml b/opennlp-docs/src/docbkx/parser.xml
index a81c07893..614293b96 100644
--- a/opennlp-docs/src/docbkx/parser.xml
+++ b/opennlp-docs/src/docbkx/parser.xml
@@ -218,7 +218,7 @@ $ opennlp TaggerModelReplacer en-parser-chunking.bin en-pos-maxent.bin]]>
         Call a Parser train method: This can be either the CHUNKING or the TREEINSERT parser.
-        Save the ParseModel to a file or database.
+        Save the ParseModel to a file.
         The following code snippet shows how to instantiate the HeadRules:
diff --git a/opennlp-docs/src/docbkx/postagger.xml b/opennlp-docs/src/docbkx/postagger.xml
index e981c3a25..b623d2eeb 100644
--- a/opennlp-docs/src/docbkx/postagger.xml
+++ b/opennlp-docs/src/docbkx/postagger.xml
@@ -69,24 +69,8 @@ Mr._NNP Vinken_NNP is_VBZ chairman_NN of_IN Elsevier_NNP N.V._NNP ,_, the_DT Dut
         In the sample below it's loaded from disk.
         After the model is loaded the POSTaggerME can be instantiated.
@@ -214,7 +198,7 @@ $ opennlp POSTaggerTrainer -type maxent -model en-pos-maxent.bin \
         Call the POSTagger.train method
-        Save the POSModel to a file or database
+        Save the POSModel to a file
         The following code illustrates that:
@@ -222,30 +206,11 @@ $ opennlp POSTaggerTrainer -type maxent -model en-pos-maxent.bin \
 POSModel model = null;
-InputStream dataIn = null;
-try {
-  dataIn = new FileInputStream("en-pos.train");
-  ObjectStream<String> lineStream = new PlainTextByLineStream(dataIn, "UTF-8");
+try (InputStream dataIn = new FileInputStream("en-pos.train")) {
+  ObjectStream<String> lineStream = new PlainTextByLineStream(dataIn, StandardCharsets.UTF_8);
   ObjectStream<POSSample> sampleStream = new WordTagSampleStream(lineStream);
   model = POSTaggerME.train("en", sampleStream, TrainingParameters.defaultParams(), null, null);
-}
-catch (IOException e) {
-  // Failed to read or parse training data, training failed
-  e.printStackTrace();
-}
-finally {
-  if (dataIn != null) {
-    try {
-      dataIn.close();
-    }
-    catch (IOException e) {
-      // Not an issue, training already finished.
-      // The exception should be logged and investigated
-      // if part of a production system.
-      e.printStackTrace();
-    }
-  }
 }]]>
         The above code performs the first two steps, opening the data and training
@@ -253,25 +218,8 @@ finally {
         the sample below it is written into a file.
diff --git a/opennlp-docs/src/docbkx/sentdetect.xml b/opennlp-docs/src/docbkx/sentdetect.xml
index 0c67b51c8..aacd4d389 100644
--- a/opennlp-docs/src/docbkx/sentdetect.xml
+++ b/opennlp-docs/src/docbkx/sentdetect.xml
@@ -81,22 +81,9 @@ $ opennlp SentenceDetector en-sent.bin < input.txt > output.txt]]>
         To instantiate the Sentence Detector the sentence model must be loaded first.
         After the model is loaded the SentenceDetectorME can be instantiated.
@@ -123,7 +110,7 @@ Span sentences[] = sentenceDetector.sentPosDetect(" First sentence. Second sent
         Sentence Detector Training
-
+
         Training Tool
@@ -220,27 +207,17 @@ Path: en-sent.bin
         The following sample code illustrates these steps:
 ObjectStream<String> lineStream =
-    new PlainTextByLineStream(new FileInputStream("en-sent.train"), charset);
-ObjectStream<SentenceSample> sampleStream = new SentenceSampleStream(lineStream);
+    new PlainTextByLineStream(new FileInputStream("en-sent.train"), StandardCharsets.UTF_8);
 SentenceModel model;
-try {
+try (ObjectStream<SentenceSample> sampleStream = new SentenceSampleStream(lineStream)) {
   model = SentenceDetectorME.train("en", sampleStream, true, null,
       TrainingParameters.defaultParams());
 }
-finally {
-  sampleStream.close();
-}
-OutputStream modelOut = null;
-try {
-  modelOut = new BufferedOutputStream(new FileOutputStream(modelFile));
+try (OutputStream modelOut = new BufferedOutputStream(new FileOutputStream(modelFile))) {
   model.serialize(modelOut);
-} finally {
-  if (modelOut != null)
-    modelOut.close();
 }]]>
diff --git a/opennlp-docs/src/docbkx/tokenizer.xml b/opennlp-docs/src/docbkx/tokenizer.xml
index d8df4771a..6d54c3c40 100644
--- a/opennlp-docs/src/docbkx/tokenizer.xml
+++ b/opennlp-docs/src/docbkx/tokenizer.xml
@@ -154,22 +154,9 @@ London share prices were bolstered largely by continued gains on Wall Street
         and can be loaded.
         After the model is loaded the TokenizerME can be instantiated.
diff --git a/opennlp-tools/src/main/java/opennlp/tools/ml/BeamSearch.java b/opennlp-tools/src/main/java/opennlp/tools/ml/BeamSearch.java
index 949a40826..7987b9f0a 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/ml/BeamSearch.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/ml/BeamSearch.java
@@ -105,13 +105,8 @@ public Sequence[] bestSequences(int numSequences, T[] sequence,
       String[] contexts = cg.getContext(i, sequence, outcomes, additionalContext);
       double[] scores;
       if (contextsCache != null) {
-        scores = contextsCache.get(contexts);
-        if (scores == null) {
-          scores = model.eval(contexts, probs);
-          contextsCache.put(contexts,scores);
-        }
-      }
-      else {
+        scores = contextsCache.computeIfAbsent(contexts, c -> model.eval(c, probs));
+      } else {
         scores = model.eval(contexts, probs);
       }
@@ -123,13 +118,13 @@ public Sequence[] bestSequences(int numSequences, T[] sequence,
       double min = temp_scores[Math.max(0,scores.length - size)];
       for (int p = 0; p < scores.length; p++) {
-        if (scores[p] < min)
-          continue; //only advance first "size" outcomes
-        String out = model.getOutcome(p);
-        if (validator.validSequence(i, sequence, outcomes, out)) {
-          Sequence ns = new Sequence(top, out, scores[p]);
-          if (ns.getScore() > minSequenceScore) {
-            next.add(ns);
+        if (scores[p] >= min) { // only advance first "size" outcomes
+          String out = model.getOutcome(p);
+          if (validator.validSequence(i, sequence, outcomes, out)) {
+            Sequence ns = new Sequence(top, out, scores[p]);
+            if (ns.getScore() > minSequenceScore) {
+              next.add(ns);
+            }
           }
         }
       }
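
Every documentation snippet touched above converges on the same try-with-resources idiom. For reference, a minimal self-contained sketch of that idiom, using the tokenizer API for illustration; the model path "en-token.bin" and the class name are assumptions for the example, not something this patch ships:

import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;

import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;

public class TryWithResourcesLoadingSketch {

  public static void main(String[] args) throws IOException {
    TokenizerModel model;
    // try-with-resources closes modelIn automatically, replacing the
    // explicit catch/finally/close() boilerplate this patch deletes.
    // "en-token.bin" is an illustrative path; any tokenizer model works.
    try (InputStream modelIn = new FileInputStream("en-token.bin")) {
      model = new TokenizerModel(modelIn);
    }

    TokenizerME tokenizer = new TokenizerME(model);
    for (String token : tokenizer.tokenize("An input sample sentence.")) {
      System.out.println(token);
    }
  }
}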
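The BeamSearch change collapses a get/null-check/put sequence into a single Map.computeIfAbsent call. A small sketch of the equivalence on a plain HashMap; the names here are illustrative stand-ins, not the OpenNLP fields:

import java.util.HashMap;
import java.util.Map;

public class ComputeIfAbsentSketch {

  // Stand-in for model.eval(contexts, probs); always returns non-null.
  private static double[] eval(String context) {
    return new double[] {0.25, 0.75};
  }

  public static void main(String[] args) {
    Map<String, double[]> cache = new HashMap<>();

    // Old shape: look up, evaluate on a miss, store the result.
    double[] scores = cache.get("ctx");
    if (scores == null) {
      scores = eval("ctx");
      cache.put("ctx", scores);
    }

    // New shape: one call that evaluates and caches only on a miss.
    double[] cached = cache.computeIfAbsent("ctx", c -> eval(c));

    // Both paths hand back the same cached array.
    System.out.println(scores == cached);
  }
}

One subtlety: computeIfAbsent skips the store when the mapping function returns null, so the rewrite is behavior-preserving only because the evaluation never returns null, which holds for model.eval here.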