LUCENE-7626: IndexWriter no longer accepts broken offsets
mikemccand committed Jan 13, 2017
1 parent 5b3565e commit 64b8633
Showing 19 changed files with 480 additions and 72 deletions.
3 changes: 3 additions & 0 deletions lucene/CHANGES.txt
@@ -29,6 +29,9 @@ API Changes

Bug Fixes

* LUCENE-7626: IndexWriter will no longer accept broken token offsets
  (Mike McCandless)

Improvements

* LUCENE-7489: Better storage of sparse doc-values fields with the default
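In practice, the change means an analysis chain whose offsets go backwards now fails at addDocument time instead of silently producing a broken index. A minimal sketch of the rejected case (CannedTokenStream and Token come from Lucene's test framework; the field name, temp directory, and the exact exception are my reading of the change, not something spelled out in this commit):

import java.nio.file.Files;

import org.apache.lucene.analysis.CannedTokenStream;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class BrokenOffsetsDemo {
  public static void main(String[] args) throws Exception {
    try (Directory dir = FSDirectory.open(Files.createTempDirectory("broken-offsets-demo"));
         IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig())) {
      Document doc = new Document();
      // The second token starts at offset 1 even though the first started at 5,
      // so the offsets go backwards within the stream.
      doc.add(new TextField("body",
          new CannedTokenStream(new Token("bar", 5, 10), new Token("bar", 1, 4))));
      writer.addDocument(doc);  // expected to throw IllegalArgumentException after this change
    }
  }
}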
@@ -0,0 +1,78 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.lucene.analysis.miscellaneous;

import java.io.IOException;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

/**
* A filter to correct offsets that illegally go backwards.
*
* @deprecated Fix the token filters that create broken offsets in the first place.
*/
@Deprecated
public final class FixBrokenOffsetsFilter extends TokenFilter {

  private int lastStartOffset;
  private int lastEndOffset;

  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

  public FixBrokenOffsetsFilter(TokenStream in) {
    super(in);
  }

  @Override
  public boolean incrementToken() throws IOException {
    if (input.incrementToken() == false) {
      return false;
    }
    fixOffsets();
    return true;
  }

  @Override
  public void end() throws IOException {
    super.end();
    fixOffsets();
  }

  @Override
  public void reset() throws IOException {
    super.reset();
    lastStartOffset = 0;
    lastEndOffset = 0;
  }

  private void fixOffsets() {
    int startOffset = offsetAtt.startOffset();
    int endOffset = offsetAtt.endOffset();
    if (startOffset < lastStartOffset) {
      startOffset = lastStartOffset;
    }
    if (endOffset < startOffset) {
      endOffset = startOffset;
    }
    offsetAtt.setOffset(startOffset, endOffset);
    lastStartOffset = startOffset;
    lastEndOffset = endOffset;
  }
}
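At index time the deprecated filter can serve as a stop-gap: wrap the part of the analysis chain that emits the bad offsets, and the clamped values are what IndexWriter sees. A minimal sketch, assuming a whitespace tokenizer and a custom Analyzer subclass (both illustrative, not part of this commit):

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.FixBrokenOffsetsFilter;

public class FixedOffsetsAnalyzer extends Analyzer {
  @Override
  protected TokenStreamComponents createComponents(String fieldName) {
    Tokenizer source = new WhitespaceTokenizer();
    // ...filters that might emit backwards offsets would be chained here...
    // Clamp offsets so they never move backwards before indexing:
    TokenStream sink = new FixBrokenOffsetsFilter(source);
    return new TokenStreamComponents(source, sink);
  }
}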
@@ -0,0 +1,39 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.lucene.analysis.miscellaneous;

import java.util.Map;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.TokenFilterFactory;

/**
* Factory for {@link FixBrokenOffsetsFilter}.
*/
public class FixBrokenOffsetsFilterFactory extends TokenFilterFactory {

  /** Sole constructor */
  public FixBrokenOffsetsFilterFactory(Map<String,String> args) {
    super(args);
  }

  @Override
  public TokenStream create(TokenStream input) {
    return new FixBrokenOffsetsFilter(input);
  }
}
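Since the factory is registered with the analysis SPI in the services file below, it should also be reachable by name from CustomAnalyzer. A sketch, assuming the SPI name derived from the class is fixBrokenOffsets (factory names are looked up case-insensitively; the name here is an assumption, not spelled out in the commit):

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.custom.CustomAnalyzer;

public class FixBrokenOffsetsAnalyzerDemo {
  public static Analyzer build() throws IOException {
    // "fixBrokenOffsets" is the name I would expect the SPI loader to derive
    // from FixBrokenOffsetsFilterFactory; lookup is case-insensitive.
    return CustomAnalyzer.builder()
        .withTokenizer("whitespace")
        .addTokenFilter("fixBrokenOffsets")
        .build();
  }
}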
@@ -64,6 +64,7 @@ org.apache.lucene.analysis.miscellaneous.CapitalizationFilterFactory
org.apache.lucene.analysis.miscellaneous.CodepointCountFilterFactory
org.apache.lucene.analysis.miscellaneous.DateRecognizerFilterFactory
org.apache.lucene.analysis.miscellaneous.FingerprintFilterFactory
org.apache.lucene.analysis.miscellaneous.FixBrokenOffsetsFilterFactory
org.apache.lucene.analysis.miscellaneous.HyphenatedWordsFilterFactory
org.apache.lucene.analysis.miscellaneous.KeepWordFilterFactory
org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilterFactory
@@ -0,0 +1,50 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.lucene.analysis.miscellaneous;

import java.io.IOException;

import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CannedTokenStream;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;

public class TestFixBrokenOffsetsFilter extends BaseTokenStreamTestCase {

  public void testBogusTermVectors() throws IOException {
    Directory dir = newDirectory();
    IndexWriter iw = new IndexWriter(dir, newIndexWriterConfig(null));
    Document doc = new Document();
    FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
    ft.setStoreTermVectors(true);
    ft.setStoreTermVectorOffsets(true);
    Field field = new Field("foo", "", ft);
    field.setTokenStream(new FixBrokenOffsetsFilter(new CannedTokenStream(
        new Token("bar", 5, 10), new Token("bar", 1, 4)
    )));
    doc.add(field);
    iw.addDocument(doc);
    iw.close();
    dir.close();
  }
}
@@ -0,0 +1,125 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.index;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.List;

import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.SuppressForbidden;

/**
* Command-line tool that reads from a source index and
* writes to a dest index, correcting any broken offsets
* in the process.
*
* @lucene.experimental
*/
public class FixBrokenOffsets {
  public SegmentInfos infos;

  FSDirectory fsDir;

  Path dir;

  @SuppressForbidden(reason = "System.out required: command line tool")
  public static void main(String[] args) throws IOException {
    if (args.length < 2) {
      System.err.println("Usage: FixBrokenOffsets <srcDir> <destDir>");
      return;
    }
    Path srcPath = Paths.get(args[0]);
    if (!Files.exists(srcPath)) {
      throw new RuntimeException("srcPath " + srcPath.toAbsolutePath() + " doesn't exist");
    }
    Path destPath = Paths.get(args[1]);
    if (Files.exists(destPath)) {
      throw new RuntimeException("destPath " + destPath.toAbsolutePath() + " already exists; please remove it and re-run");
    }
    Directory srcDir = FSDirectory.open(srcPath);
    DirectoryReader reader = DirectoryReader.open(srcDir);

    List<LeafReaderContext> leaves = reader.leaves();
    CodecReader[] filtered = new CodecReader[leaves.size()];
    for(int i=0;i<leaves.size();i++) {
      // Wrap each segment so that term vector offsets are clamped forward
      // (never allowed to go backwards) as they are read and re-written:
      filtered[i] = SlowCodecReaderWrapper.wrap(new FilterLeafReader(leaves.get(i).reader()) {
        @Override
        public Fields getTermVectors(int docID) throws IOException {
          Fields termVectors = in.getTermVectors(docID);
          if (termVectors == null) {
            return null;
          }
          return new FilterFields(termVectors) {
            @Override
            public Terms terms(String field) throws IOException {
              return new FilterTerms(super.terms(field)) {
                @Override
                public TermsEnum iterator() throws IOException {
                  return new FilterTermsEnum(super.iterator()) {
                    @Override
                    public PostingsEnum postings(PostingsEnum reuse, int flags) throws IOException {
                      return new FilterPostingsEnum(super.postings(reuse, flags)) {
                        int nextLastStartOffset = 0;
                        int lastStartOffset = 0;

                        @Override
                        public int nextPosition() throws IOException {
                          int pos = super.nextPosition();
                          lastStartOffset = nextLastStartOffset;
                          nextLastStartOffset = startOffset();
                          return pos;
                        }

                        @Override
                        public int startOffset() throws IOException {
                          int offset = super.startOffset();
                          if (offset < lastStartOffset) {
                            offset = lastStartOffset;
                          }
                          return offset;
                        }

                        @Override
                        public int endOffset() throws IOException {
                          int offset = super.endOffset();
                          if (offset < lastStartOffset) {
                            offset = lastStartOffset;
                          }
                          return offset;
                        }
                      };
                    }
                  };
                }
              };
            }
          };
        }
      });
    }

    Directory destDir = FSDirectory.open(destPath);
    IndexWriter writer = new IndexWriter(destDir, new IndexWriterConfig());
    writer.addIndexes(filtered);
    IOUtils.close(writer, reader, srcDir, destDir);
  }
}
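The tool rewrites the whole index into a fresh directory via addIndexes, so it is meant to be run once, offline, with the source and destination paths as its two arguments. A sketch of invoking it programmatically with placeholder paths (running java org.apache.lucene.index.FixBrokenOffsets <srcDir> <destDir> from the command line is equivalent):

import org.apache.lucene.index.FixBrokenOffsets;

public class FixMyIndex {
  public static void main(String[] args) throws Exception {
    // Placeholder paths: an existing index that may contain backwards offsets,
    // and a destination directory that must not exist yet.
    FixBrokenOffsets.main(new String[] { "/indexes/old-index", "/indexes/fixed-index" });
  }
}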
@@ -0,0 +1,27 @@
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<!-- not a package-info.java, because we already defined this package in core/ -->
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
<title>Tools for handling backwards compatibility issues with indices.</title>
</head>
<body>
Tools for handling backwards compatibility issues with indices.
</body>
</html>
