diff --git a/src/translator/sentence_ranges.cpp b/src/translator/sentence_ranges.cpp index 053eeaa48..51be5b75b 100644 --- a/src/translator/sentence_ranges.cpp +++ b/src/translator/sentence_ranges.cpp @@ -6,48 +6,38 @@ namespace marian { namespace bergamot { void Annotation::addSentence(std::vector &sentence) { - size_t size = flatByteRanges_.size(); flatByteRanges_.insert(std::end(flatByteRanges_), std::begin(sentence), std::end(sentence)); - sentenceBeginIds_.push_back(size); + size_t size = flatByteRanges_.size(); + sentenceEndIds_.push_back(size); } size_t Annotation::numWords(size_t sentenceIdx) const { auto terminals = sentenceTerminalIds(sentenceIdx); - return terminals.second - terminals.first + 1; + return terminals.second - terminals.first; } std::pair Annotation::sentenceTerminalIds(size_t sentenceIdx) const { size_t bosId, eosId; - bosId = sentenceBeginIds_[sentenceIdx]; - eosId = sentenceIdx + 1 < numSentences() - ? sentenceBeginIds_[sentenceIdx + 1] - 1 - : flatByteRanges_.size() - 1; + bosId = (sentenceIdx == 0) + ? 0 // Avoid -1 access + : sentenceEndIds_[sentenceIdx - 1]; // Half-open: prev end == begin - // Out of bound checks. 
- assert(bosId < flatByteRanges_.size()); - assert(eosId < flatByteRanges_.size()); + eosId = sentenceEndIds_[sentenceIdx]; return std::make_pair(bosId, eosId); } -std::pair -Annotation::sentenceTerminals(size_t sentenceIdx) const { - auto terminals = sentenceTerminalIds(sentenceIdx); - return std::make_pair(flatByteRanges_[terminals.first], - flatByteRanges_[terminals.second]); -} - ByteRange Annotation::sentence(size_t sentenceIdx) const { - auto terminals = sentenceTerminals(sentenceIdx); - return (ByteRange){terminals.first.begin, terminals.second.end}; + auto terminals = sentenceTerminalIds(sentenceIdx); + auto bos = flatByteRanges_[terminals.first]; + auto eos = flatByteRanges_[terminals.second - 1]; + return (ByteRange){bos.begin, eos.end}; } ByteRange Annotation::word(size_t sentenceIdx, size_t wordIdx) const { - size_t offset = sentenceBeginIds_[sentenceIdx]; - // auto terminals = sentenceTerminals(sentenceIdx); - // assert(offset + wordIdx <= terminals.second); - return flatByteRanges_[offset + wordIdx]; + size_t bosOffset = (sentenceIdx == 0) ? 0 : sentenceEndIds_[sentenceIdx - 1]; + return flatByteRanges_[bosOffset + wordIdx]; } string_view AnnotatedText::word(size_t sentenceIdx, size_t wordIdx) const { diff --git a/src/translator/sentence_ranges.h b/src/translator/sentence_ranges.h index a0dc8c9a9..8f9bf3c26 100644 --- a/src/translator/sentence_ranges.h +++ b/src/translator/sentence_ranges.h @@ -28,7 +28,7 @@ class Annotation { Annotation() {} /// Returns the number of sentences annotated in a text. - size_t numSentences() const { return sentenceBeginIds_.size(); } + size_t numSentences() const { return sentenceEndIds_.size(); } /// Returns number of words in the sentece identified by sentenceIdx. size_t numWords(size_t sentenceIdx) const; @@ -46,18 +46,13 @@ class Annotation { private: /// A flat storage for ByteRanges. 
Composed of word ByteRanges, extra - /// information in sentenceBeginIds_ to denote sentence boundary markers as + /// information in sentenceEndIds_ to denote sentence boundary markers as /// indices. std::vector flatByteRanges_; - /// Stores indices where sentences begin - std::vector sentenceBeginIds_; - - /// Returns ByteRanges corresponding to beginning and end words of sentence - /// corresponding to sentenceIdx. This is useful in using the information to - /// construct a ByteRange of a sentence taking the begin from the first and - /// end from the second. - std::pair sentenceTerminals(size_t sentenceIdx) const; + /// Stores indices where sentences end (exclusive: one past the last word, + /// matching the C++ half-open [begin, end) convention) + std::vector sentenceEndIds_; /// Returns indices of terminal (word) ByteRanges in sentenceIds_ of a /// sentence corresponding to sentenceIdx. The distance can be used to compute