[SPARK-9876][SQL][FOLLOWUP] Enable string and binary tests for Parquet predicate pushdown and replace deprecated fromByteArray.

## What changes were proposed in this pull request?

It seems Parquet has been upgraded to 1.8.1 by #13280. So, this PR enables the string and binary predicate push-down that was disabled due to [SPARK-11153](https://issues.apache.org/jira/browse/SPARK-11153) and [PARQUET-251](https://issues.apache.org/jira/browse/PARQUET-251), and cleans up some comments that were left behind (I think by mistake).
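For illustration, a query like the sketch below can now have its string predicate evaluated inside the Parquet reader rather than in Spark (hypothetical path and column name; assumes a `SparkSession` named `spark` and `spark.sql.parquet.filterPushdown` left at its default of `true`):

```scala
import spark.implicits._

// Hypothetical data: with this change, the string comparison below is
// compiled by ParquetFilters into a Parquet-level predicate instead of
// being applied row-by-row after scanning.
val df = spark.read.parquet("/tmp/people.parquet")  // illustrative path
df.filter($"name" === "foo").show()
```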

This PR also replaces the `fromByteArray()` API, which was deprecated in [PARQUET-251](https://issues.apache.org/jira/browse/PARQUET-251).
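For reference, [PARQUET-251](https://issues.apache.org/jira/browse/PARQUET-251) split the deprecated factory into two variants with different copy semantics; a minimal sketch of the distinction (semantics as documented by Parquet, example buffers are illustrative):

```scala
import org.apache.parquet.io.api.Binary

// A buffer the caller overwrites for every record: the "reused" variant tells
// Parquet it must copy the bytes if it keeps them (e.g. for min/max statistics).
val rowBuffer = new Array[Byte](12)
val reused = Binary.fromReusedByteArray(rowBuffer)

// Bytes that never change after creation (e.g. a decoded page buffer): the
// "constant" variant lets Parquet hold a reference without a defensive copy.
val pageBytes = "foo".getBytes("UTF-8")
val constant = Binary.fromConstantByteArray(pageBytes, 0, pageBytes.length)
```

This is why the vectorized reader below switches to `fromConstantByteArray`, while the write path uses `fromReusedByteArray`.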

## How was this patch tested?

Unit tests in `ParquetFilterSuite`.

Author: hyukjinkwon <gurwls223@gmail.com>

Closes #13389 from HyukjinKwon/parquet-1.8-followup.
HyukjinKwon authored and liancheng committed Jul 5, 2016
1 parent 7f7eb39 commit 07d9c53
Showing 4 changed files with 9 additions and 18 deletions.
VectorizedPlainValuesReader.java
@@ -170,7 +170,7 @@ public final void readBinary(int total, ColumnVector v, int rowId) {
 
   @Override
   public final Binary readBinary(int len) {
-    Binary result = Binary.fromByteArray(buffer, offset - Platform.BYTE_ARRAY_OFFSET, len);
+    Binary result = Binary.fromConstantByteArray(buffer, offset - Platform.BYTE_ARRAY_OFFSET, len);
     offset += len;
     return result;
   }
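A side note on the arithmetic in this hunk: the reader keeps `offset` as an Unsafe-style cursor that includes the array's base offset, so converting it back to a plain array index for `fromConstantByteArray` means subtracting `Platform.BYTE_ARRAY_OFFSET`. A minimal sketch (names follow the hunk; the buffer is illustrative):

```scala
import org.apache.spark.unsafe.Platform

val buffer = new Array[Byte](16)
// An Unsafe-style cursor starts at the first element's memory offset...
var offset: Long = Platform.BYTE_ARRAY_OFFSET
// ...so the plain index into `buffer` is recovered by subtraction.
val index = (offset - Platform.BYTE_ARRAY_OFFSET).toInt  // 0, i.e. buffer(0)
```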
CatalystWriteSupport.scala
@@ -150,7 +150,8 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi
 
       case StringType =>
         (row: SpecializedGetters, ordinal: Int) =>
-          recordConsumer.addBinary(Binary.fromByteArray(row.getUTF8String(ordinal).getBytes))
+          recordConsumer.addBinary(
+            Binary.fromReusedByteArray(row.getUTF8String(ordinal).getBytes))
 
       case TimestampType =>
         (row: SpecializedGetters, ordinal: Int) => {
@@ -165,12 +166,12 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi
           val (julianDay, timeOfDayNanos) = DateTimeUtils.toJulianDay(row.getLong(ordinal))
           val buf = ByteBuffer.wrap(timestampBuffer)
           buf.order(ByteOrder.LITTLE_ENDIAN).putLong(timeOfDayNanos).putInt(julianDay)
-          recordConsumer.addBinary(Binary.fromByteArray(timestampBuffer))
+          recordConsumer.addBinary(Binary.fromReusedByteArray(timestampBuffer))
         }
 
       case BinaryType =>
         (row: SpecializedGetters, ordinal: Int) =>
-          recordConsumer.addBinary(Binary.fromByteArray(row.getBinary(ordinal)))
+          recordConsumer.addBinary(Binary.fromReusedByteArray(row.getBinary(ordinal)))
 
       case DecimalType.Fixed(precision, scale) =>
         makeDecimalWriter(precision, scale)
@@ -227,7 +228,7 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi
           shift -= 8
         }
 
-        recordConsumer.addBinary(Binary.fromByteArray(decimalBuffer, 0, numBytes))
+        recordConsumer.addBinary(Binary.fromReusedByteArray(decimalBuffer, 0, numBytes))
       }
 
     val binaryWriterUsingUnscaledBytes =
@@ -248,7 +249,7 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi
           decimalBuffer
         }
 
-        recordConsumer.addBinary(Binary.fromByteArray(fixedLengthBytes, 0, numBytes))
+        recordConsumer.addBinary(Binary.fromReusedByteArray(fixedLengthBytes, 0, numBytes))
       }
 
     writeLegacyParquetFormat match {
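`timestampBuffer` and `decimalBuffer` above are reused across records, which is exactly the situation `fromReusedByteArray` exists for; a hedged sketch of the hazard the copying variant avoids:

```scala
import org.apache.parquet.io.api.Binary

// Illustrative reused buffer, rewritten for every record.
val timestampBuffer = new Array[Byte](12)

// Safe: Parquet copies these bytes whenever it needs to retain them, so
// overwriting timestampBuffer for the next record cannot corrupt values
// already handed to the record consumer (e.g. column statistics).
def currentValue(): Binary = Binary.fromReusedByteArray(timestampBuffer)
```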
ParquetFilters.scala
@@ -50,7 +50,6 @@ private[sql] object ParquetFilters {
       (n: String, v: Any) => FilterApi.eq(floatColumn(n), v.asInstanceOf[java.lang.Float])
     case DoubleType =>
       (n: String, v: Any) => FilterApi.eq(doubleColumn(n), v.asInstanceOf[java.lang.Double])
-
     // Binary.fromString and Binary.fromByteArray don't accept null values
     case StringType =>
       (n: String, v: Any) => FilterApi.eq(
@@ -73,7 +72,6 @@ private[sql] object ParquetFilters {
       (n: String, v: Any) => FilterApi.notEq(floatColumn(n), v.asInstanceOf[java.lang.Float])
     case DoubleType =>
       (n: String, v: Any) => FilterApi.notEq(doubleColumn(n), v.asInstanceOf[java.lang.Double])
-
     case StringType =>
       (n: String, v: Any) => FilterApi.notEq(
         binaryColumn(n),
@@ -93,7 +91,6 @@ private[sql] object ParquetFilters {
       (n: String, v: Any) => FilterApi.lt(floatColumn(n), v.asInstanceOf[java.lang.Float])
     case DoubleType =>
       (n: String, v: Any) => FilterApi.lt(doubleColumn(n), v.asInstanceOf[java.lang.Double])
-
     case StringType =>
       (n: String, v: Any) =>
         FilterApi.lt(binaryColumn(n),
@@ -112,7 +109,6 @@ private[sql] object ParquetFilters {
       (n: String, v: Any) => FilterApi.ltEq(floatColumn(n), v.asInstanceOf[java.lang.Float])
     case DoubleType =>
       (n: String, v: Any) => FilterApi.ltEq(doubleColumn(n), v.asInstanceOf[java.lang.Double])
-
     case StringType =>
       (n: String, v: Any) =>
         FilterApi.ltEq(binaryColumn(n),
@@ -131,8 +127,6 @@ private[sql] object ParquetFilters {
       (n: String, v: Any) => FilterApi.gt(floatColumn(n), v.asInstanceOf[java.lang.Float])
     case DoubleType =>
       (n: String, v: Any) => FilterApi.gt(doubleColumn(n), v.asInstanceOf[java.lang.Double])
-
-    // See https://issues.apache.org/jira/browse/SPARK-11153
     case StringType =>
       (n: String, v: Any) =>
         FilterApi.gt(binaryColumn(n),
@@ -151,7 +145,6 @@ private[sql] object ParquetFilters {
       (n: String, v: Any) => FilterApi.gtEq(floatColumn(n), v.asInstanceOf[java.lang.Float])
     case DoubleType =>
       (n: String, v: Any) => FilterApi.gtEq(doubleColumn(n), v.asInstanceOf[java.lang.Double])
-
     case StringType =>
       (n: String, v: Any) =>
         FilterApi.gtEq(binaryColumn(n),
@@ -174,7 +167,6 @@ private[sql] object ParquetFilters {
     case DoubleType =>
       (n: String, v: Set[Any]) =>
         FilterApi.userDefined(doubleColumn(n), SetInFilter(v.asInstanceOf[Set[java.lang.Double]]))
-
     case StringType =>
       (n: String, v: Set[Any]) =>
         FilterApi.userDefined(binaryColumn(n),
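With these cases live again, a source filter such as `EqualTo("name", "foo")` is compiled into a binary-column predicate roughly like the sketch below (illustrative column name; strings are compared as UTF-8 bytes, and the `Option` guard exists because `Binary.fromString` rejects null):

```scala
import org.apache.parquet.filter2.predicate.FilterApi
import org.apache.parquet.io.api.Binary

val v: Any = "foo"  // the pushed-down literal
val predicate = FilterApi.eq(
  FilterApi.binaryColumn("name"),
  Option(v).map(s => Binary.fromString(s.asInstanceOf[String])).orNull)
```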
ParquetFilterSuite.scala
@@ -229,8 +229,7 @@ class ParquetFilterSuite extends QueryTest with ParquetTest with SharedSQLContex
     }
   }
 
-  // See https://issues.apache.org/jira/browse/SPARK-11153
-  ignore("filter pushdown - string") {
+  test("filter pushdown - string") {
     withParquetDataFrame((1 to 4).map(i => Tuple1(i.toString))) { implicit df =>
       checkFilterPredicate('_1.isNull, classOf[Eq[_]], Seq.empty[Row])
       checkFilterPredicate(
@@ -258,8 +257,7 @@ class ParquetFilterSuite extends QueryTest with ParquetTest with SharedSQLContex
     }
   }
 
-  // See https://issues.apache.org/jira/browse/SPARK-11153
-  ignore("filter pushdown - binary") {
+  test("filter pushdown - binary") {
     implicit class IntToBinary(int: Int) {
       def b: Array[Byte] = int.toString.getBytes(StandardCharsets.UTF_8)
     }
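A usage-level sketch of what the re-enabled tests exercise end to end (illustrative path and data; assumes a `SparkSession` named `spark`): write strings to Parquet, then filter; with push-down enabled the comparison is evaluated by the Parquet reader, and binary columns behave the same way.

```scala
import spark.implicits._

val path = "/tmp/pushdown-strings.parquet"  // illustrative path
spark.range(1, 5).selectExpr("cast(id as string) as _1")
  .write.mode("overwrite").parquet(path)

// With string push-down enabled, this `>` comparison becomes a Parquet
// FilterApi.gt predicate over the column's UTF-8 bytes.
spark.read.parquet(path).filter($"_1" > "2").show()
```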
