-
Notifications
You must be signed in to change notification settings - Fork 4.1k
[Java] IPC Writers/readers are not always setting validity bits correctly #18741
Copy link
Copy link
Closed
Milestone
Description
When writing multiple batches to a Stream/File Writer, the first validity bit can get garbled between writing and reading. I couldn't pinpoint the exact issue, but I was able to re-create it with a fairly simple unit test.
in TestArrowStream.java:
/**
 * Reproduces the garbled-validity-bit problem with the stream writer/reader: two batches are
 * written from the same vector (5 rows, then 3 rows), then both are read back and every slot
 * is checked, including the null at index 0 of each batch.
 */
@Test
public void testReadWriteMultipleBatches() throws IOException {
  ByteArrayOutputStream os = new ByteArrayOutputStream();

  try (IntVector vector = new IntVector("foo", allocator)) {
    Schema schema = new Schema(Collections.singletonList(vector.getField()), null);
    try (VectorSchemaRoot root = new VectorSchemaRoot(schema, Collections.singletonList((FieldVector) vector), vector.getValueCount());
         ArrowStreamWriter writer = new ArrowStreamWriter(root, new MapDictionaryProvider(), Channels.newChannel(os))) {
      writer.start();

      // First batch: [null, 1, 2, null, 1]
      vector.setNull(0);
      vector.setSafe(1, 1);
      vector.setSafe(2, 2);
      vector.setNull(3);
      vector.setSafe(4, 1);
      vector.setValueCount(5);
      root.setRowCount(5);
      writer.writeBatch();

      // Second batch, written from the same vector: [null, 1, 2]
      vector.setNull(0);
      vector.setSafe(1, 1);
      vector.setSafe(2, 2);
      vector.setValueCount(3);
      root.setRowCount(3);
      writer.writeBatch();
    }
  }

  ByteArrayInputStream in = new ByteArrayInputStream(os.toByteArray());
  try (ArrowStreamReader reader = new ArrowStreamReader(in, allocator)) {
    IntVector read = (IntVector) reader.getVectorSchemaRoot().getFieldVectors().get(0);

    // Assertions use (expected, actual) order so failure messages report the right sides.
    reader.loadNextBatch();
    assertEquals(5, read.getValueCount());
    assertNull(read.getObject(0));
    assertEquals(Integer.valueOf(1), read.getObject(1));
    assertEquals(Integer.valueOf(2), read.getObject(2));
    assertNull(read.getObject(3));
    assertEquals(Integer.valueOf(1), read.getObject(4));

    reader.loadNextBatch();
    assertEquals(3, read.getValueCount());
    assertNull(read.getObject(0));
    assertEquals(Integer.valueOf(1), read.getObject(1));
    assertEquals(Integer.valueOf(2), read.getObject(2));
  }
}
in TestArrowFile.java:
/**
 * Reproduces the garbled-validity-bit problem with the file writer/reader: two batches are
 * written from the same vector (5 rows, then 3 rows) to a file under {@code target/}, then
 * both are read back and every slot is checked, including the null at index 0 of each batch.
 */
@Test
public void testReadWriteMultipleBatches() throws IOException {
  File file = new File("target/mytest_nulls_multibatch.arrow");

  try (IntVector vector = new IntVector("foo", allocator)) {
    Schema schema = new Schema(Collections.singletonList(vector.getField()), null);
    try (FileOutputStream fileOutputStream = new FileOutputStream(file);
         VectorSchemaRoot root = new VectorSchemaRoot(schema, Collections.singletonList((FieldVector) vector), vector.getValueCount());
         ArrowFileWriter writer = new ArrowFileWriter(root, new MapDictionaryProvider(), fileOutputStream.getChannel())) {
      writer.start();

      // First batch: [null, 1, 2, null, 1]
      vector.setNull(0);
      vector.setSafe(1, 1);
      vector.setSafe(2, 2);
      vector.setNull(3);
      vector.setSafe(4, 1);
      vector.setValueCount(5);
      root.setRowCount(5);
      writer.writeBatch();

      // Second batch, written from the same vector: [null, 1, 2]
      vector.setNull(0);
      vector.setSafe(1, 1);
      vector.setSafe(2, 2);
      vector.setValueCount(3);
      root.setRowCount(3);
      writer.writeBatch();
    }
  }

  try (FileInputStream fileInputStream = new FileInputStream(file);
       ArrowFileReader reader = new ArrowFileReader(fileInputStream.getChannel(), allocator)) {
    IntVector read = (IntVector) reader.getVectorSchemaRoot().getFieldVectors().get(0);

    // Assertions use (expected, actual) order so failure messages report the right sides.
    reader.loadNextBatch();
    assertEquals(5, read.getValueCount());
    assertNull(read.getObject(0));
    assertEquals(Integer.valueOf(1), read.getObject(1));
    assertEquals(Integer.valueOf(2), read.getObject(2));
    assertNull(read.getObject(3));
    assertEquals(Integer.valueOf(1), read.getObject(4));

    reader.loadNextBatch();
    assertEquals(3, read.getValueCount());
    assertNull(read.getObject(0));
    assertEquals(Integer.valueOf(1), read.getObject(1));
    assertEquals(Integer.valueOf(2), read.getObject(2));
  }
}
Reporter: Emilio Lahr-Vivaz / @elahrvivaz
Assignee: Bo Meng / @bomeng
PRs and other links:
Note: This issue was originally created as ARROW-2500. Please see the migration documentation for further details.
Reactions are currently unavailable
Metadata
Metadata
Assignees
Type
Fields
Give feedback
No fields configured for issues without a type.