Skip to content

Commit

Permalink
LUCENE-8650: Fix end() and reset() in ConcatenatingTokenStream
Browse files Browse the repository at this point in the history
  • Loading branch information
romseygeek committed Jan 28, 2019
1 parent f543b4e commit 7713a4f
Show file tree
Hide file tree
Showing 3 changed files with 53 additions and 0 deletions.
4 changes: 4 additions & 0 deletions lucene/CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -286,6 +286,10 @@ Bug fixes:
* LUCENE-8654: Polygon2D#relateTriangle returns the wrong answer if polygon is inside
the triangle. (Ignacio Vera)

* LUCENE-8650: ConcatenatingTokenStream did not correctly clear its state in reset(), and
was not propagating final position increments from its child streams correctly.
(Dan Meehl, Alan Woodward)

New Features

* LUCENE-8026: ExitableDirectoryReader may now time out queries that run on
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.Attribute;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.IOUtils;
Expand All @@ -39,10 +40,13 @@ public final class ConcatenatingTokenStream extends TokenStream {

private final TokenStream[] sources;
private final OffsetAttribute[] sourceOffsets;
private final PositionIncrementAttribute[] sourceIncrements;
private final OffsetAttribute offsetAtt;
private final PositionIncrementAttribute posIncAtt;

private int currentSource;
private int offsetIncrement;
private int initialPositionIncrement = 1;

/**
* Create a new ConcatenatingTokenStream from a set of inputs
Expand All @@ -52,9 +56,12 @@ public ConcatenatingTokenStream(TokenStream... sources) {
super(combineSources(sources));
this.sources = sources;
this.offsetAtt = addAttribute(OffsetAttribute.class);
this.posIncAtt = addAttribute(PositionIncrementAttribute.class);
this.sourceOffsets = new OffsetAttribute[sources.length];
this.sourceIncrements = new PositionIncrementAttribute[sources.length];
for (int i = 0; i < sources.length; i++) {
this.sourceOffsets[i] = sources[i].addAttribute(OffsetAttribute.class);
this.sourceIncrements[i] = sources[i].addAttribute(PositionIncrementAttribute.class);
}
}

Expand All @@ -78,27 +85,38 @@ private static AttributeSource combineSources(TokenStream... sources) {

@Override
public boolean incrementToken() throws IOException {
  // Position increment carried over from exhausted source stream(s), to be
  // folded into the first token emitted from the next source.  This must
  // ACCUMULATE across loop iterations: if several consecutive sources are
  // exhausted in this same call (e.g. empty token streams), each one's final
  // position increment from end() would otherwise be silently dropped by a
  // plain overwrite, losing position gaps.
  int pendingPosInc = 0;
  boolean newSource = false;
  while (sources[currentSource].incrementToken() == false) {
    if (currentSource >= sources.length - 1)
      return false;   // last source exhausted: no more tokens
    // Finish the exhausted source so its end() state (final offset and final
    // position increment) is populated before we read it.
    sources[currentSource].end();
    pendingPosInc += sourceIncrements[currentSource].getPositionIncrement();
    // Keep the (private) field in sync for any other readers of this state.
    initialPositionIncrement = pendingPosInc;
    OffsetAttribute att = sourceOffsets[currentSource];
    if (att != null)
      offsetIncrement += att.endOffset();
    currentSource++;
    newSource = true;
  }

  // Copy the current source token's attributes onto this stream, then shift
  // its offsets by the accumulated length of all previously consumed sources.
  clearAttributes();
  sources[currentSource].copyTo(this);
  offsetAtt.setOffset(offsetAtt.startOffset() + offsetIncrement, offsetAtt.endOffset() + offsetIncrement);
  if (newSource) {
    // First token from a new source: add the carried-over increment(s) so
    // positions remain contiguous across stream boundaries.
    int posInc = posIncAtt.getPositionIncrement();
    posIncAtt.setPositionIncrement(posInc + pendingPosInc);
  }

  return true;
}

@Override
public void end() throws IOException {
  // Finish the last source so its final offset / position increment state is
  // populated, and capture both BEFORE super.end() resets this stream's
  // attributes — the statement order here is significant.
  sources[currentSource].end();
  int finalOffset = sourceOffsets[currentSource].endOffset() + offsetIncrement;
  int finalPosInc = sourceIncrements[currentSource].getPositionIncrement();
  super.end();
  // Re-apply the combined end state: the final offset is the last source's
  // end offset shifted by the total length of all earlier sources, and the
  // final position increment is propagated from the last child stream.
  offsetAtt.setOffset(finalOffset, finalOffset);
  posIncAtt.setPositionIncrement(finalPosInc);
}

@Override
Expand All @@ -107,6 +125,8 @@ public void reset() throws IOException {
source.reset();
}
super.reset();
currentSource = 0;
offsetIncrement = 0;
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,9 @@
import java.io.StringReader;

import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CannedTokenStream;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
Expand All @@ -46,6 +48,33 @@ public void testBasic() throws IOException {
new int[]{ 0, 6, 12, 19, 25, 31 },
new int[]{ 5, 11, 18, 24, 30, 36 });

// test re-use
first.setReader(new StringReader("first words "));
second.setReader(new StringReader("second words"));
third.setReader(new StringReader(" third words"));
assertTokenStreamContents(ts,
new String[] { "first", "words", "second", "words", "third", "words" },
new int[]{ 0, 6, 12, 19, 25, 31 },
new int[]{ 5, 11, 18, 24, 30, 36 },
new int[]{ 1, 1, 1, 1, 1, 1 });

}

public void testOffsetGaps() throws IOException {
  // Each canned stream ends with a final position increment of 2 and a final
  // offset of 10, simulating trailing holes (e.g. stopwords) at the end of
  // the stream.
  CannedTokenStream first = new CannedTokenStream(2, 10,
      new Token("a", 0, 1), new Token("b", 2, 3));
  CannedTokenStream second = new CannedTokenStream(2, 10,
      new Token("c", 0, 1), new Token("d", 2, 3));

  TokenStream concatenated = new ConcatenatingTokenStream(first, second);

  // Expectations: offsets of the second stream are shifted by the first
  // stream's final offset (10); the first token after the boundary ("c")
  // carries the first stream's final position increment (1 + 2 = 3); the
  // combined stream ends at offset 20 with a final position increment of 2.
  assertTokenStreamContents(concatenated,
      new String[] { "a", "b", "c", "d" },
      new int[]{ 0, 2, 10, 12 },
      new int[]{ 1, 3, 11, 13 },
      null,
      new int[]{ 1, 1, 3, 1 },
      null, 20, 2, null, false, null
      );
}

public void testInconsistentAttributes() throws IOException {
Expand Down

0 comments on commit 7713a4f

Please sign in to comment.