From 22955b5b2368b17aab97d032024f2c5e4f55c625 Mon Sep 17 00:00:00 2001 From: William D C M SILVA Date: Mon, 2 Jan 2017 23:18:11 -0200 Subject: [PATCH] Adds cache to Chunker See issue OPENNLP-274 --- .../opennlp/tools/chunker/ChunkerFactory.java | 4 ++ .../java/opennlp/tools/chunker/ChunkerME.java | 6 +-- .../DefaultChunkerContextGenerator.java | 51 +++++++++++++++++++ 3 files changed, 58 insertions(+), 3 deletions(-) diff --git a/opennlp-tools/src/main/java/opennlp/tools/chunker/ChunkerFactory.java b/opennlp-tools/src/main/java/opennlp/tools/chunker/ChunkerFactory.java index 1cb772fc1..132328444 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/chunker/ChunkerFactory.java +++ b/opennlp-tools/src/main/java/opennlp/tools/chunker/ChunkerFactory.java @@ -60,4 +60,8 @@ public SequenceValidator getSequenceValidator() { public ChunkerContextGenerator getContextGenerator() { return new DefaultChunkerContextGenerator(); } + + public ChunkerContextGenerator getContextGenerator(int cache) { + return new DefaultChunkerContextGenerator(cache); + } } diff --git a/opennlp-tools/src/main/java/opennlp/tools/chunker/ChunkerME.java b/opennlp-tools/src/main/java/opennlp/tools/chunker/ChunkerME.java index 3ed4f9c2c..3e0272ff0 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/chunker/ChunkerME.java +++ b/opennlp-tools/src/main/java/opennlp/tools/chunker/ChunkerME.java @@ -94,7 +94,7 @@ private ChunkerME(ChunkerModel model, int beamSize, SequenceValidator se @Deprecated private ChunkerME(ChunkerModel model, int beamSize) { - contextGenerator = model.getFactory().getContextGenerator(); + contextGenerator = model.getFactory().getContextGenerator(beamSize); sequenceValidator = model.getFactory().getSequenceValidator(); if (model.getChunkerSequenceModel() != null) { @@ -177,7 +177,7 @@ public static ChunkerModel train(String lang, ObjectStream in, SequenceClassificationModel seqChunkerModel = null; if (TrainerType.EVENT_MODEL_TRAINER.equals(trainerType)) { - ObjectStream es = new 
ChunkerEventStream(in, factory.getContextGenerator()); + ObjectStream es = new ChunkerEventStream(in, factory.getContextGenerator(beamSize)); EventTrainer trainer = TrainerFactory.getEventTrainer(mlParams.getSettings(), manifestInfoEntries); chunkerModel = trainer.train(es); @@ -188,7 +188,7 @@ else if (TrainerType.SEQUENCE_TRAINER.equals(trainerType)) { // TODO: This will probably cause issue, since the feature generator uses the outcomes array - ChunkSampleSequenceStream ss = new ChunkSampleSequenceStream(in, factory.getContextGenerator()); + ChunkSampleSequenceStream ss = new ChunkSampleSequenceStream(in, factory.getContextGenerator(beamSize)); seqChunkerModel = trainer.train(ss); } else { diff --git a/opennlp-tools/src/main/java/opennlp/tools/chunker/DefaultChunkerContextGenerator.java b/opennlp-tools/src/main/java/opennlp/tools/chunker/DefaultChunkerContextGenerator.java index 3bf4ba452..728049c6a 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/chunker/DefaultChunkerContextGenerator.java +++ b/opennlp-tools/src/main/java/opennlp/tools/chunker/DefaultChunkerContextGenerator.java @@ -18,16 +18,38 @@ package opennlp.tools.chunker; +import java.util.Objects; + +import opennlp.tools.util.Cache; + /** Features based on chunking model described in Fei Sha and Fernando Pereira. Shallow * parsing with conditional random fields. In Proceedings of HLT-NAACL 2003. Association * for Computational Linguistics, 2003. */ public class DefaultChunkerContextGenerator implements ChunkerContextGenerator { + private Cache contextsCache; + private Object wordsKey; + + protected static final String SB = "*SB*"; + /** * Creates the default context generator a chunker. */ public DefaultChunkerContextGenerator() { + this(0); + } + + + /** + * Creates the default context generator for a chunker.
+ * + * @param cacheSize the size of the context cache; no cache is created unless this is greater than zero + */ + public DefaultChunkerContextGenerator(int cacheSize) { + if (cacheSize > 0) { + contextsCache = new Cache<>(cacheSize); + } } public String[] getContext(int index, String[] sequence, String[] priorDecisions, Object[] additionalContext) { @@ -35,6 +57,8 @@ public String[] getContext(int index, String[] sequence, String[] priorDecisions } public String[] getContext(int i, String[] toks, String[] tags, String[] preds) { + String predprev = null, predprevprev = null; + // Words in a 5-word window String w_2, w_1, w0, w1, w2; @@ -43,6 +67,30 @@ public String[] getContext(int i, String[] toks, String[] tags, String[] preds) // Previous predictions String p_2, p_1; + + String cacheKey = null; + if (contextsCache != null) { + if (i - 1 >= 0) { + predprev = preds[i - 1]; + + if (i - 2 >= 0) { + predprevprev = preds[i - 2]; + } + } + + cacheKey = i + predprev + predprevprev; + if (Objects.equals(wordsKey, toks)) { + // same sentence + String[] cachedContexts = contextsCache.get(cacheKey); + if (cachedContexts != null) { + return cachedContexts; + } + } else { + // new sentence + contextsCache.clear(); + wordsKey = toks; + } + } if (i < 2) { w_2 = "w_2=bos"; @@ -140,6 +188,9 @@ public String[] getContext(int i, String[] toks, String[] tags, String[] preds) p_1 + w0 + w1 }; + if (contextsCache != null) { + contextsCache.put(cacheKey,features); + } return features; } }