Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ARROW-398: Java file format requires bitmaps of all 1's to be written… #222

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,7 @@ public List<FieldVector> getChildrenFromFields() {

@Override
public void loadFieldBuffers(ArrowFieldNode fieldNode, List<ArrowBuf> ownBuffers) {
org.apache.arrow.vector.BaseDataValueVector.load(getFieldInnerVectors(), ownBuffers);
org.apache.arrow.vector.BaseDataValueVector.load(fieldNode, getFieldInnerVectors(), ownBuffers);
bits.valueCount = fieldNode.getLength();
}

Expand Down
2 changes: 1 addition & 1 deletion java/vector/src/main/codegen/templates/UnionVector.java
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ public List<FieldVector> getChildrenFromFields() {

@Override
public void loadFieldBuffers(ArrowFieldNode fieldNode, List<ArrowBuf> ownBuffers) {
BaseDataValueVector.load(getFieldInnerVectors(), ownBuffers);
BaseDataValueVector.load(fieldNode, getFieldInnerVectors(), ownBuffers);
this.valueCount = fieldNode.getLength();
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
import java.util.List;

import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.vector.schema.ArrowFieldNode;

import io.netty.buffer.ArrowBuf;

Expand All @@ -29,13 +30,13 @@ public abstract class BaseDataValueVector extends BaseValueVector implements Buf

protected final static byte[] emptyByteArray = new byte[]{}; // Nullable vectors use this

public static void load(List<BufferBacked> vectors, List<ArrowBuf> buffers) {
public static void load(ArrowFieldNode fieldNode, List<BufferBacked> vectors, List<ArrowBuf> buffers) {
int expectedSize = vectors.size();
if (buffers.size() != expectedSize) {
throw new IllegalArgumentException("Illegal buffer count, expected " + expectedSize + ", got: " + buffers.size());
}
for (int i = 0; i < expectedSize; i++) {
vectors.get(i).load(buffers.get(i));
vectors.get(i).load(fieldNode, buffers.get(i));
}
}

Expand Down Expand Up @@ -106,7 +107,7 @@ public ArrowBuf getBuffer() {
}

@Override
public void load(ArrowBuf data) {
public void load(ArrowFieldNode fieldNode, ArrowBuf data) {
this.data.release();
this.data = data.retain(allocator);
}
Expand Down
36 changes: 36 additions & 0 deletions java/vector/src/main/java/org/apache/arrow/vector/BitVector.java
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
import org.apache.arrow.vector.complex.reader.FieldReader;
import org.apache.arrow.vector.holders.BitHolder;
import org.apache.arrow.vector.holders.NullableBitHolder;
import org.apache.arrow.vector.schema.ArrowFieldNode;
import org.apache.arrow.vector.types.Types.MinorType;
import org.apache.arrow.vector.types.pojo.Field;
import org.apache.arrow.vector.util.OversizedAllocationException;
Expand All @@ -48,6 +49,41 @@ public BitVector(String name, BufferAllocator allocator) {
super(name, allocator);
}

@Override
public void load(ArrowFieldNode fieldNode, ArrowBuf data) {
// When the vector is all nulls or all defined, the content of the buffer can be omitted
if (data.readableBytes() == 0 && fieldNode.getLength() != 0) {
data.release();
int count = fieldNode.getLength();
allocateNew(count);
int n = getSizeFromCount(count);
if (fieldNode.getNullCount() == 0) {
// all defined
// create an all 1s buffer
// set full bytes
int fullBytesCount = count / 8;
for (int i = 0; i < fullBytesCount; ++i) {
this.data.setByte(i, 0xFF);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Will the superfluous 1's in the last byte (when the length is not a multiple of 8) cause any problems? e.g. in #207 I noted that the test cases do not cover the case where the size is not a multiple of 8

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

you're right they are going to cause problems if we keep appending to the vector afterwards.

}
int remainder = count % 8;
// set remaining bits
if (remainder > 0) {
byte bitMask = (byte) (0xFFL >>> ((8 - remainder) & 7));;
this.data.setByte(fullBytesCount, bitMask);
}
} else if (fieldNode.getNullCount() == fieldNode.getLength()) {
// all null
// create an all 0s buffer
zeroVector();
} else {
throw new IllegalArgumentException("The buffer can be empty only if there's no data or it's all null or all defined");
}
this.data.writerIndex(n);
} else {
super.load(fieldNode, data);
}
}

@Override
public Field getField() {
throw new UnsupportedOperationException("internal vector");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,14 +17,16 @@
*/
package org.apache.arrow.vector;

import org.apache.arrow.vector.schema.ArrowFieldNode;

import io.netty.buffer.ArrowBuf;

/**
* Content is backed by a buffer and can be loaded/unloaded
*/
public interface BufferBacked {

void load(ArrowBuf data);
void load(ArrowFieldNode fieldNode, ArrowBuf data);

ArrowBuf unLoad();

Expand Down
17 changes: 0 additions & 17 deletions java/vector/src/main/java/org/apache/arrow/vector/ValueVector.java
Original file line number Diff line number Diff line change
Expand Up @@ -130,13 +130,6 @@ public interface ValueVector extends Closeable, Iterable<ValueVector> {
*/
FieldReader getReader();

/**
* Get the metadata for this field. Used in serialization
*
* @return FieldMetadata for this field.
*/
// SerializedField getMetadata();

/**
* Returns the number of bytes that is used by this vector instance.
*/
Expand Down Expand Up @@ -166,16 +159,6 @@ public interface ValueVector extends Closeable, Iterable<ValueVector> {
*/
ArrowBuf[] getBuffers(boolean clear);

/**
* Load the data provided in the buffer. Typically used when deserializing from the wire.
*
* @param metadata
* Metadata used to decode the incoming buffer.
* @param buffer
* The buffer that contains the ValueVector.
*/
// void load(SerializedField metadata, DrillBuf buffer);

/**
* An abstraction that is used to read from this vector instance.
*/
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,6 @@ private void loadBuffers(FieldVector vector, Field field, Iterator<ArrowBuf> buf
try {
vector.loadFieldBuffers(fieldNode, ownBuffers);
} catch (RuntimeException e) {
e.printStackTrace();
throw new IllegalArgumentException("Could not load buffers for field " +
field + " error message" + e.getMessage(), e);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ public List<FieldVector> getChildrenFromFields() {

@Override
public void loadFieldBuffers(ArrowFieldNode fieldNode, List<ArrowBuf> ownBuffers) {
BaseDataValueVector.load(getFieldInnerVectors(), ownBuffers);
BaseDataValueVector.load(fieldNode, getFieldInnerVectors(), ownBuffers);
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ public NullableMapVector(String name, BufferAllocator allocator, CallBack callBa

@Override
public void loadFieldBuffers(ArrowFieldNode fieldNode, List<ArrowBuf> ownBuffers) {
BaseDataValueVector.load(getFieldInnerVectors(), ownBuffers);
BaseDataValueVector.load(fieldNode, getFieldInnerVectors(), ownBuffers);
this.valueCount = fieldNode.getLength();
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,13 @@
*/
package org.apache.arrow.vector;

import static java.util.Arrays.asList;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;

import java.io.IOException;
import java.util.Collections;
import java.util.List;

import org.apache.arrow.memory.BufferAllocator;
Expand All @@ -29,12 +35,17 @@
import org.apache.arrow.vector.complex.writer.BaseWriter.MapWriter;
import org.apache.arrow.vector.complex.writer.BigIntWriter;
import org.apache.arrow.vector.complex.writer.IntWriter;
import org.apache.arrow.vector.schema.ArrowFieldNode;
import org.apache.arrow.vector.schema.ArrowRecordBatch;
import org.apache.arrow.vector.types.pojo.ArrowType;
import org.apache.arrow.vector.types.pojo.Field;
import org.apache.arrow.vector.types.pojo.Schema;
import org.junit.AfterClass;
import org.junit.Assert;
import org.junit.Test;

import io.netty.buffer.ArrowBuf;

public class TestVectorUnloadLoad {

static final BufferAllocator allocator = new RootAllocator(Integer.MAX_VALUE);
Expand Down Expand Up @@ -88,6 +99,64 @@ public void testUnloadLoad() throws IOException {
}
}

/**
* The validity buffer can be empty if:
* - all values are defined
* - all values are null
* @throws IOException
*/
@Test
public void testLoadEmptyValidityBuffer() throws IOException {
Schema schema = new Schema(asList(
new Field("intDefined", true, new ArrowType.Int(32, true), Collections.<Field>emptyList()),
new Field("intNull", true, new ArrowType.Int(32, true), Collections.<Field>emptyList())
));
int count = 10;
ArrowBuf validity = allocator.getEmpty();
ArrowBuf values = allocator.buffer(count * 4); // integers
for (int i = 0; i < count; i++) {
values.setInt(i * 4, i);
}
try (
ArrowRecordBatch recordBatch = new ArrowRecordBatch(count, asList(new ArrowFieldNode(count, 0), new ArrowFieldNode(count, count)), asList(validity, values, validity, values));
BufferAllocator finalVectorsAllocator = allocator.newChildAllocator("final vectors", 0, Integer.MAX_VALUE);
VectorSchemaRoot newRoot = new VectorSchemaRoot(schema, finalVectorsAllocator);
) {

// load it
VectorLoader vectorLoader = new VectorLoader(newRoot);

vectorLoader.load(recordBatch);

NullableIntVector intDefinedVector = (NullableIntVector)newRoot.getVector("intDefined");
NullableIntVector intNullVector = (NullableIntVector)newRoot.getVector("intNull");
for (int i = 0; i < count; i++) {
assertFalse("#" + i, intDefinedVector.getAccessor().isNull(i));
assertEquals("#" + i, i, intDefinedVector.getAccessor().get(i));
assertTrue("#" + i, intNullVector.getAccessor().isNull(i));
}
intDefinedVector.getMutator().setSafe(count + 10, 1234);
assertTrue(intDefinedVector.getAccessor().isNull(count + 1));
// empty slots should still default to unset
intDefinedVector.getMutator().setSafe(count + 1, 789);
assertFalse(intDefinedVector.getAccessor().isNull(count + 1));
assertEquals(789, intDefinedVector.getAccessor().get(count + 1));
assertTrue(intDefinedVector.getAccessor().isNull(count));
assertTrue(intDefinedVector.getAccessor().isNull(count + 2));
assertTrue(intDefinedVector.getAccessor().isNull(count + 3));
assertTrue(intDefinedVector.getAccessor().isNull(count + 4));
assertTrue(intDefinedVector.getAccessor().isNull(count + 5));
assertTrue(intDefinedVector.getAccessor().isNull(count + 6));
assertTrue(intDefinedVector.getAccessor().isNull(count + 7));
assertTrue(intDefinedVector.getAccessor().isNull(count + 8));
assertTrue(intDefinedVector.getAccessor().isNull(count + 9));
assertFalse(intDefinedVector.getAccessor().isNull(count + 10));
assertEquals(1234, intDefinedVector.getAccessor().get(count + 10));
} finally {
values.release();
}
}

public static VectorUnloader newVectorUnloader(FieldVector root) {
Schema schema = new Schema(root.getField().getChildren());
int valueCount = root.getAccessor().getValueCount();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,10 +30,6 @@

import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.vector.file.ArrowBlock;
import org.apache.arrow.vector.file.ArrowFooter;
import org.apache.arrow.vector.file.ArrowReader;
import org.apache.arrow.vector.file.ArrowWriter;
import org.apache.arrow.vector.schema.ArrowFieldNode;
import org.apache.arrow.vector.schema.ArrowRecordBatch;
import org.apache.arrow.vector.types.pojo.ArrowType;
Expand Down