Skip to content
Permalink
Browse files

revert [SPARK-22785][SQL] remove ColumnVector.anyNullsSet

## What changes were proposed in this pull request?

In #19980 , we thought `anyNullsSet` can be simply implemented by `numNulls() > 0`. This is logically true, but may have performance problems.

`OrcColumnVector` is an example. It doesn't have the `numNulls` property, only has a `noNulls` property. We will lose a lot of performance if we use `numNulls() > 0` to check null.

This PR simply revert #19980, with a renaming to call it `hasNull`. Better name suggestions are welcome, e.g. `nullable`?

## How was this patch tested?

existing test

Author: Wenchen Fan <wenchen@databricks.com>

Closes #20452 from cloud-fan/null.

(cherry picked from commit 48dd6a4)
Signed-off-by: Wenchen Fan <wenchen@databricks.com>
  • Loading branch information...
cloud-fan committed Jan 31, 2018
1 parent c83246c commit 33f17b28b3448a6f0389d2ee93bb5d49a02f288c
@@ -77,6 +77,11 @@ public void close() {

}

@Override
public boolean hasNull() {
return !baseData.noNulls;
}

@Override
public int numNulls() {
if (baseData.isRepeating) {
@@ -123,7 +123,7 @@ public void putNulls(int rowId, int count) {

@Override
public void putNotNulls(int rowId, int count) {
if (numNulls == 0) return;
if (!hasNull()) return;
long offset = nulls + rowId;
for (int i = 0; i < count; ++i, ++offset) {
Platform.putByte(null, offset, (byte) 0);
@@ -119,7 +119,7 @@ public void putNulls(int rowId, int count) {

@Override
public void putNotNulls(int rowId, int count) {
if (numNulls == 0) return;
if (!hasNull()) return;
for (int i = 0; i < count; ++i) {
nulls[rowId + i] = (byte)0;
}
@@ -59,8 +59,8 @@ public void reset() {
elementsAppended = 0;
if (numNulls > 0) {
putNotNulls(0, capacity);
numNulls = 0;
}
numNulls = 0;
}

@Override
@@ -102,6 +102,11 @@ private void throwUnsupportedException(int requiredCapacity, Throwable cause) {
throw new RuntimeException(message, cause);
}

@Override
public boolean hasNull() {
return numNulls > 0;
}

@Override
public int numNulls() { return numNulls; }

@@ -37,6 +37,11 @@
private final ArrowVectorAccessor accessor;
private ArrowColumnVector[] childColumns;

@Override
public boolean hasNull() {
return accessor.getNullCount() > 0;
}

@Override
public int numNulls() {
return accessor.getNullCount();
@@ -65,6 +65,11 @@
@Override
public abstract void close();

/**
* Returns true if this column vector contains any null values.
*/
public abstract boolean hasNull();

/**
* Returns the number of nulls in this column vector.
*/
@@ -42,6 +42,7 @@ class ArrowColumnVectorSuite extends SparkFunSuite {

val columnVector = new ArrowColumnVector(vector)
assert(columnVector.dataType === BooleanType)
assert(columnVector.hasNull)
assert(columnVector.numNulls === 1)

(0 until 10).foreach { i =>
@@ -69,6 +70,7 @@ class ArrowColumnVectorSuite extends SparkFunSuite {

val columnVector = new ArrowColumnVector(vector)
assert(columnVector.dataType === ByteType)
assert(columnVector.hasNull)
assert(columnVector.numNulls === 1)

(0 until 10).foreach { i =>
@@ -96,6 +98,7 @@ class ArrowColumnVectorSuite extends SparkFunSuite {

val columnVector = new ArrowColumnVector(vector)
assert(columnVector.dataType === ShortType)
assert(columnVector.hasNull)
assert(columnVector.numNulls === 1)

(0 until 10).foreach { i =>
@@ -123,6 +126,7 @@ class ArrowColumnVectorSuite extends SparkFunSuite {

val columnVector = new ArrowColumnVector(vector)
assert(columnVector.dataType === IntegerType)
assert(columnVector.hasNull)
assert(columnVector.numNulls === 1)

(0 until 10).foreach { i =>
@@ -150,6 +154,7 @@ class ArrowColumnVectorSuite extends SparkFunSuite {

val columnVector = new ArrowColumnVector(vector)
assert(columnVector.dataType === LongType)
assert(columnVector.hasNull)
assert(columnVector.numNulls === 1)

(0 until 10).foreach { i =>
@@ -177,6 +182,7 @@ class ArrowColumnVectorSuite extends SparkFunSuite {

val columnVector = new ArrowColumnVector(vector)
assert(columnVector.dataType === FloatType)
assert(columnVector.hasNull)
assert(columnVector.numNulls === 1)

(0 until 10).foreach { i =>
@@ -204,6 +210,7 @@ class ArrowColumnVectorSuite extends SparkFunSuite {

val columnVector = new ArrowColumnVector(vector)
assert(columnVector.dataType === DoubleType)
assert(columnVector.hasNull)
assert(columnVector.numNulls === 1)

(0 until 10).foreach { i =>
@@ -232,6 +239,7 @@ class ArrowColumnVectorSuite extends SparkFunSuite {

val columnVector = new ArrowColumnVector(vector)
assert(columnVector.dataType === StringType)
assert(columnVector.hasNull)
assert(columnVector.numNulls === 1)

(0 until 10).foreach { i =>
@@ -258,6 +266,7 @@ class ArrowColumnVectorSuite extends SparkFunSuite {

val columnVector = new ArrowColumnVector(vector)
assert(columnVector.dataType === BinaryType)
assert(columnVector.hasNull)
assert(columnVector.numNulls === 1)

(0 until 10).foreach { i =>
@@ -300,6 +309,7 @@ class ArrowColumnVectorSuite extends SparkFunSuite {

val columnVector = new ArrowColumnVector(vector)
assert(columnVector.dataType === ArrayType(IntegerType))
assert(columnVector.hasNull)
assert(columnVector.numNulls === 1)

val array0 = columnVector.getArray(0)
@@ -344,6 +354,7 @@ class ArrowColumnVectorSuite extends SparkFunSuite {

val columnVector = new ArrowColumnVector(vector)
assert(columnVector.dataType === schema)
assert(!columnVector.hasNull)
assert(columnVector.numNulls === 0)

val row0 = columnVector.getStruct(0)
@@ -396,6 +407,7 @@ class ArrowColumnVectorSuite extends SparkFunSuite {

val columnVector = new ArrowColumnVector(vector)
assert(columnVector.dataType === schema)
assert(columnVector.hasNull)
assert(columnVector.numNulls === 1)

val row0 = columnVector.getStruct(0)
@@ -66,41 +66,49 @@ class ColumnarBatchSuite extends SparkFunSuite {
column =>
val reference = mutable.ArrayBuffer.empty[Boolean]
var idx = 0
assert(!column.hasNull)
assert(column.numNulls() == 0)

column.appendNotNull()
reference += false
assert(!column.hasNull)
assert(column.numNulls() == 0)

column.appendNotNulls(3)
(1 to 3).foreach(_ => reference += false)
assert(!column.hasNull)
assert(column.numNulls() == 0)

column.appendNull()
reference += true
assert(column.hasNull)
assert(column.numNulls() == 1)

column.appendNulls(3)
(1 to 3).foreach(_ => reference += true)
assert(column.hasNull)
assert(column.numNulls() == 4)

idx = column.elementsAppended

column.putNotNull(idx)
reference += false
idx += 1
assert(column.hasNull)
assert(column.numNulls() == 4)

column.putNull(idx)
reference += true
idx += 1
assert(column.hasNull)
assert(column.numNulls() == 5)

column.putNulls(idx, 3)
reference += true
reference += true
reference += true
idx += 3
assert(column.hasNull)
assert(column.numNulls() == 8)

column.putNotNulls(idx, 4)
@@ -109,6 +117,7 @@ class ColumnarBatchSuite extends SparkFunSuite {
reference += false
reference += false
idx += 4
assert(column.hasNull)
assert(column.numNulls() == 8)

reference.zipWithIndex.foreach { v =>

0 comments on commit 33f17b2

Please sign in to comment.
You can’t perform that action at this time.