Skip to content

Commit

Permalink
ORC-1362: [Java] Enable ENABLE_INDEXES (#1387)
Browse files Browse the repository at this point in the history
### What changes were proposed in this pull request?
Enable the functionality behind `orc.create.index`.

### Why are the changes needed?
In #1373, we disabled the configuration "orc.create.index" because the orc reader cannot work well when orc.create.index=false.

During the write of a file with `orc.create.index=false` and `orc.row.index.stride > 0` no indexes are created in the ORC file. The `orc.row.index.stride` is persisted into the file metadata. During a read that requires the use of indexes e.g. in the presence of search arguments we get a read failure as the reader was expecting indexes based on the value of `orc.row.index.stride` but none exist.

Two changes are proposed here:
* Enable consistent writes i.e. assume orc.row.index.stride=0 if orc.create.index=false or if orc.row.index.stride >0 then assume orc.create.index=true
* Enable the reader to handle the absence of row index information. During the process of selecting row groups in a stripe if the index information is seen as missing then we assume that all row groups should be read as the index information is missing.

### How was this patch tested?
UT
  • Loading branch information
deshanxiao committed Feb 20, 2023
1 parent e6a9fe2 commit 7f90bbd
Show file tree
Hide file tree
Showing 7 changed files with 72 additions and 6 deletions.
Binary file not shown.
Binary file not shown.
15 changes: 15 additions & 0 deletions java/core/src/java/org/apache/orc/OrcFile.java
Expand Up @@ -564,6 +564,21 @@ public WriterOptions blockSize(long value) {
*/
public WriterOptions rowIndexStride(int value) {
rowIndexStrideValue = value;
if (rowIndexStrideValue <= 0) {
buildIndex = false;
}
return this;
}

/**
* Sets whether build the index. The default value is true. If the value is
* set to false, rowIndexStrideValue will be set to zero.
*/
public WriterOptions buildIndex(boolean value) {
buildIndex = value;
if (!buildIndex) {
rowIndexStrideValue = 0;
}
return this;
}

Expand Down
3 changes: 2 additions & 1 deletion java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java
Expand Up @@ -1176,7 +1176,8 @@ public boolean[] pickRowGroups(StripeInformation stripe,
leafValues[pred] = exceptionAnswer[pred];
} else {
if (indexes[columnIx] == null) {
throw new AssertionError("Index is not populated for " + columnIx);
LOG.warn("Index is not populated for " + columnIx);
return READ_ALL_RGS;
}
OrcProto.RowIndexEntry entry = indexes[columnIx].getEntry(rowGroup);
if (entry == null) {
Expand Down
5 changes: 3 additions & 2 deletions java/core/src/java/org/apache/orc/impl/WriterImpl.java
Expand Up @@ -183,14 +183,15 @@ public WriterImpl(FileSystem fs,
this.encodingStrategy = opts.getEncodingStrategy();
this.compressionStrategy = opts.getCompressionStrategy();

if (opts.getRowIndexStride() >= 0) {
// ORC-1362: if isBuildIndex=false, then rowIndexStride will be set to 0.
if (opts.getRowIndexStride() >= 0 && opts.isBuildIndex()) {
this.rowIndexStride = opts.getRowIndexStride();
} else {
this.rowIndexStride = 0;
}

// ORC-1343: We ignore `opts.isBuildIndex` due to the lack of reader support
this.buildIndex = rowIndexStride > 0;

if (buildIndex && rowIndexStride < MIN_ROW_INDEX_STRIDE) {
throw new IllegalArgumentException("Row stride must be at least " +
MIN_ROW_INDEX_STRIDE);
Expand Down
37 changes: 37 additions & 0 deletions java/core/src/test/org/apache/orc/impl/TestReaderImpl.java
Expand Up @@ -527,4 +527,41 @@ public void testWithoutCompressionBlockSize() throws IOException {
}
}
}

@Test
public void testSargSkipPickupGroupWithoutIndex() throws IOException {
Configuration conf = new Configuration();
// We use ORC files in two languages to test, the previous Java version could not work
// well when orc.row.index.stride > 0 and orc.create.index=false, now it can skip these row groups.
Path[] paths = new Path[] {
// Writen by C++ API with schema struct<x:int,y:string> orc.row.index.stride=0
new Path(workDir, "TestOrcFile.testSargSkipPickupGroupWithoutIndexCPlusPlus.orc"),
// Writen by old Java API with schema struct<x:int,y:string> orc.row.index.stride=1000,orc.create.index=false
new Path(workDir, "TestOrcFile.testSargSkipPickupGroupWithoutIndexJava.orc"),
};
for (Path path: paths) {
FileSystem fs = path.getFileSystem(conf);
try (ReaderImpl reader = (ReaderImpl) OrcFile.createReader(path,
OrcFile.readerOptions(conf).filesystem(fs))) {

SearchArgument sarg = SearchArgumentFactory.newBuilder()
.startNot()
.lessThan("x", PredicateLeaf.Type.LONG, 100L)
.end().build();

try (RecordReader rows = reader.rows(reader.options().searchArgument(sarg, new String[]{"x"}))) {
TypeDescription schema = reader.getSchema();
assertEquals("struct<x:int,y:string>", schema.toString());
VectorizedRowBatch batch = schema.createRowBatchV2();
assertTrue(rows.nextBatch(batch), "No rows read out!");
assertEquals(1024, batch.size);
LongColumnVector col1 = (LongColumnVector) batch.cols[0];
for (int i = 0; i < batch.size; ++i) {
assertEquals(i, col1.vector[i]);
}
assertTrue(rows.nextBatch(batch));
}
}
}
}
}
18 changes: 15 additions & 3 deletions java/core/src/test/org/apache/orc/impl/TestWriterImpl.java
Expand Up @@ -32,13 +32,13 @@
import org.apache.orc.*;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;

import java.io.IOException;

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertThrows;
import static org.junit.jupiter.api.Assertions.assertFalse;

public class TestWriterImpl {

Expand Down Expand Up @@ -94,11 +94,10 @@ public void testNoBFIfNoIndex() throws Exception {
w.close();
}

@Disabled("ORC-1343: Disable ENABLE_INDEXES tests until reader supports it properly")
@Test
public void testNoIndexIfEnableIndexIsFalse() throws Exception {
conf.set(OrcConf.OVERWRITE_OUTPUT_FILE.getAttribute(), "true");
conf.set(OrcConf.ROW_INDEX_STRIDE.getAttribute(), "1000");
conf.set(OrcConf.ROW_INDEX_STRIDE.getAttribute(), "0");
conf.setBoolean(OrcConf.ENABLE_INDEXES.getAttribute(), false);
VectorizedRowBatch b = schema.createRowBatch();
LongColumnVector f1 = (LongColumnVector) b.cols[0];
Expand All @@ -121,6 +120,19 @@ public void testNoIndexIfEnableIndexIsFalse() throws Exception {
}
}

@Test
public void testEnableDisableIndex() {
conf.set(OrcConf.ROW_INDEX_STRIDE.getAttribute(), "10000");
OrcFile.WriterOptions writerOptions = OrcFile.writerOptions(conf);
writerOptions.buildIndex(false);
assertEquals(writerOptions.getRowIndexStride(), 0);

conf.set(OrcConf.ENABLE_INDEXES.getAttribute(), "true");
OrcFile.WriterOptions writerOptions2 = OrcFile.writerOptions(conf);
writerOptions2.rowIndexStride(0);
assertFalse(writerOptions2.isBuildIndex());
}

@Test
public void testStripes() throws Exception {
conf.set(OrcConf.OVERWRITE_OUTPUT_FILE.getAttribute(), "true");
Expand Down

0 comments on commit 7f90bbd

Please sign in to comment.