From 45a123d24231b82a244e30d5f3bfc5fdf0e629c1 Mon Sep 17 00:00:00 2001 From: Reynold Xin Date: Wed, 10 Jun 2015 00:22:26 -0700 Subject: [PATCH 01/15] [SPARK-8286] Rewrite UTF8String in Java and move it into unsafe package. Unit test is still in Scala. --- .../sql/catalyst/expressions/UnsafeRow.java | 2 +- .../sql/catalyst/CatalystTypeConverters.scala | 4 +- .../spark/sql/catalyst/ScalaReflection.scala | 1 + .../spark/sql/catalyst/expressions/Cast.scala | 9 +- .../expressions/SpecificMutableRow.scala | 4 +- .../expressions/UnsafeRowConverter.scala | 1 + .../expressions/codegen/CodeGenerator.scala | 2 + .../sql/catalyst/expressions/literals.scala | 3 +- .../spark/sql/catalyst/expressions/rows.scala | 7 +- .../expressions/stringOperations.scala | 1 + .../apache/spark/sql/types/StringType.scala | 2 + .../apache/spark/sql/types/UTF8String.scala | 221 ------------------ .../expressions/ComplexTypeSuite.scala | 1 + .../UnsafeFixedWidthAggregationMapSuite.scala | 6 +- .../spark/sql/types/UTF8StringSuite.scala | 50 ++-- .../spark/sql/columnar/ColumnStats.scala | 1 + .../spark/sql/columnar/ColumnType.scala | 4 +- .../sql/execution/SparkSqlSerializer2.scala | 4 +- .../spark/sql/execution/debug/package.scala | 1 + .../spark/sql/execution/pythonUdfs.scala | 8 +- .../org/apache/spark/sql/jdbc/JDBCRDD.scala | 1 + .../apache/spark/sql/json/JacksonParser.scala | 11 +- .../org/apache/spark/sql/json/JsonRDD.scala | 8 +- .../spark/sql/parquet/ParquetConverter.scala | 7 +- .../spark/sql/parquet/ParquetFilters.scala | 1 + .../sql/parquet/ParquetTableSupport.scala | 1 + .../sql/sources/DataSourceStrategy.scala | 3 +- .../spark/sql/columnar/ColumnTypeSuite.scala | 8 +- .../sql/columnar/ColumnarTestUtils.scala | 7 +- .../spark/sql/hive/HiveInspectors.scala | 13 +- .../apache/spark/unsafe/types/UTF8String.java | 198 ++++++++++++++++ 31 files changed, 307 insertions(+), 283 deletions(-) delete mode 100644 sql/catalyst/src/main/scala/org/apache/spark/sql/types/UTF8String.scala create mode 100644 unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java index ec97fe603c44f..143acc9f5e36f 100644 --- a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java +++ b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java @@ -30,7 +30,7 @@ import org.apache.spark.sql.BaseMutableRow; import org.apache.spark.sql.types.DataType; import org.apache.spark.sql.types.StructType; -import org.apache.spark.sql.types.UTF8String; +import org.apache.spark.unsafe.types.UTF8String; import org.apache.spark.unsafe.PlatformDependent; import org.apache.spark.unsafe.bitset.BitSetMethods; diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala index beb82dbc08642..3f13c368fc510 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala @@ -23,6 +23,8 @@ import java.sql.{Timestamp, Date} import java.util.{Map => JavaMap} import javax.annotation.Nullable +import org.apache.spark.unsafe.types.UTF8String + import scala.collection.mutable.HashMap import org.apache.spark.sql.catalyst.expressions._ @@ -257,7 +259,7 @@ object CatalystTypeConverters 
{ private object StringConverter extends CatalystTypeConverter[Any, String, Any] { override def toCatalystImpl(scalaValue: Any): UTF8String = scalaValue match { - case str: String => UTF8String(str) + case str: String => UTF8String.fromString(str) case utf8: UTF8String => utf8 } override def toScala(catalystValue: Any): String = catalystValue match { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala index 6998cc8d9666d..90698cd572de4 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala @@ -17,6 +17,7 @@ package org.apache.spark.sql.catalyst +import org.apache.spark.unsafe.types.UTF8String import org.apache.spark.util.Utils import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.logical.LocalRelation diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala index 8d93957fea2fc..7a9f48c962b90 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala @@ -24,6 +24,7 @@ import org.apache.spark.Logging import org.apache.spark.sql.catalyst.expressions.codegen.{CodeGenContext, GeneratedExpressionCode} import org.apache.spark.sql.catalyst.util.DateUtils import org.apache.spark.sql.types._ +import org.apache.spark.unsafe.types.UTF8String /** Cast the child expression to the target data type. */ case class Cast(child: Expression, dataType: DataType) extends UnaryExpression with Logging { @@ -111,11 +112,11 @@ case class Cast(child: Expression, dataType: DataType) extends UnaryExpression w // UDFToString private[this] def castToString(from: DataType): Any => Any = from match { - case BinaryType => buildCast[Array[Byte]](_, UTF8String(_)) - case DateType => buildCast[Int](_, d => UTF8String(DateUtils.toString(d))) + case BinaryType => buildCast[Array[Byte]](_, UTF8String.fromBytes) + case DateType => buildCast[Int](_, d => UTF8String.fromString(DateUtils.toString(d))) case TimestampType => buildCast[Long](_, - t => UTF8String(timestampToString(DateUtils.toJavaTimestamp(t)))) - case _ => buildCast[Any](_, o => UTF8String(o.toString)) + t => UTF8String.fromString(timestampToString(DateUtils.toJavaTimestamp(t)))) + case _ => buildCast[Any](_, o => UTF8String.fromString(o.toString)) } // BinaryConverter diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SpecificMutableRow.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SpecificMutableRow.scala index 2c884517d62a7..98eda61a80b40 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SpecificMutableRow.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SpecificMutableRow.scala @@ -18,6 +18,7 @@ package org.apache.spark.sql.catalyst.expressions import org.apache.spark.sql.types._ +import org.apache.spark.unsafe.types.UTF8String /** * A parent class for mutable container objects that are reused when the values are changed, @@ -240,7 +241,8 @@ final class SpecificMutableRow(val values: Array[MutableValue]) extends MutableR } } - override def setString(ordinal: Int, value: String): Unit = update(ordinal, UTF8String(value)) 
+ override def setString(ordinal: Int, value: String): Unit = + update(ordinal, UTF8String.fromString(value)) override def getString(ordinal: Int): String = apply(ordinal).toString diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverter.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverter.scala index 5b2c8572784bd..5350123bf4c01 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverter.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverter.scala @@ -20,6 +20,7 @@ package org.apache.spark.sql.catalyst.expressions import org.apache.spark.sql.types._ import org.apache.spark.unsafe.PlatformDependent import org.apache.spark.unsafe.array.ByteArrayMethods +import org.apache.spark.unsafe.types.UTF8String /** * Converts Rows into UnsafeRow format. This class is NOT thread-safe. diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala index 80aa8fa056146..b2705d35e95e7 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala @@ -17,6 +17,8 @@ package org.apache.spark.sql.catalyst.expressions.codegen +import org.apache.spark.unsafe.types.UTF8String + import scala.collection.mutable import scala.language.existentials diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala index 833c08a293dcb..370a7961d1caa 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala @@ -23,6 +23,7 @@ import org.apache.spark.sql.catalyst.CatalystTypeConverters import org.apache.spark.sql.catalyst.expressions.codegen.{CodeGenContext, GeneratedExpressionCode} import org.apache.spark.sql.catalyst.util.DateUtils import org.apache.spark.sql.types._ +import org.apache.spark.unsafe.types.UTF8String object Literal { def apply(v: Any): Literal = v match { @@ -32,7 +33,7 @@ object Literal { case f: Float => Literal(f, FloatType) case b: Byte => Literal(b, ByteType) case s: Short => Literal(s, ShortType) - case s: String => Literal(UTF8String(s), StringType) + case s: String => Literal(UTF8String.fromString(s), StringType) case b: Boolean => Literal(b, BooleanType) case d: BigDecimal => Literal(Decimal(d), DecimalType.Unlimited) case d: java.math.BigDecimal => Literal(Decimal(d), DecimalType.Unlimited) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/rows.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/rows.scala index 5fd892c42e69c..5d2d82077f0eb 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/rows.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/rows.scala @@ -17,7 +17,8 @@ package org.apache.spark.sql.catalyst.expressions -import org.apache.spark.sql.types.{UTF8String, DataType, StructType, AtomicType} +import org.apache.spark.sql.types.{DataType, StructType, AtomicType} +import org.apache.spark.unsafe.types.UTF8String /** * An extended interface to [[Row]] that 
allows the values for each column to be updated. Setting @@ -197,7 +198,9 @@ class GenericMutableRow(v: Array[Any]) extends GenericRow(v) with MutableRow { override def setFloat(ordinal: Int, value: Float): Unit = { values(ordinal) = value } override def setInt(ordinal: Int, value: Int): Unit = { values(ordinal) = value } override def setLong(ordinal: Int, value: Long): Unit = { values(ordinal) = value } - override def setString(ordinal: Int, value: String) { values(ordinal) = UTF8String(value)} + override def setString(ordinal: Int, value: String) { + values(ordinal) = UTF8String.fromString(value) + } override def setNullAt(i: Int): Unit = { values(i) = null } override def setShort(ordinal: Int, value: Short): Unit = { values(ordinal) = value } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala index 856f56488c7a5..d3056071e51d1 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala @@ -22,6 +22,7 @@ import java.util.regex.Pattern import org.apache.spark.sql.catalyst.analysis.UnresolvedException import org.apache.spark.sql.catalyst.expressions.codegen._ import org.apache.spark.sql.types._ +import org.apache.spark.unsafe.types.UTF8String trait StringRegexExpression extends ExpectsInputTypes { self: BinaryExpression => diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StringType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StringType.scala index 134ab0af4e0de..d8d6db79eee93 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StringType.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StringType.scala @@ -17,6 +17,8 @@ package org.apache.spark.sql.types +import org.apache.spark.unsafe.types.UTF8String + import scala.math.Ordering import scala.reflect.runtime.universe.typeTag diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/UTF8String.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/UTF8String.scala deleted file mode 100644 index f5d8fcced362b..0000000000000 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/UTF8String.scala +++ /dev/null @@ -1,221 +0,0 @@ -/* -* Licensed to the Apache Software Foundation (ASF) under one or more -* contributor license agreements. See the NOTICE file distributed with -* this work for additional information regarding copyright ownership. -* The ASF licenses this file to You under the Apache License, Version 2.0 -* (the "License"); you may not use this file except in compliance with -* the License. You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
-*/ - -package org.apache.spark.sql.types - -import java.util.Arrays - -import org.apache.spark.annotation.DeveloperApi - -/** - * :: DeveloperApi :: - * A UTF-8 String, as internal representation of StringType in SparkSQL - * - * A String encoded in UTF-8 as an Array[Byte], which can be used for comparison, - * search, see http://en.wikipedia.org/wiki/UTF-8 for details. - * - * Note: This is not designed for general use cases, should not be used outside SQL. - */ -@DeveloperApi -final class UTF8String extends Ordered[UTF8String] with Serializable { - - private[this] var bytes: Array[Byte] = _ - - /** - * Update the UTF8String with String. - */ - def set(str: String): UTF8String = { - bytes = str.getBytes("utf-8") - this - } - - /** - * Update the UTF8String with Array[Byte], which should be encoded in UTF-8 - */ - def set(bytes: Array[Byte]): UTF8String = { - this.bytes = bytes - this - } - - /** - * Return the number of bytes for a code point with the first byte as `b` - * @param b The first byte of a code point - */ - @inline - private[this] def numOfBytes(b: Byte): Int = { - val offset = (b & 0xFF) - 192 - if (offset >= 0) UTF8String.bytesOfCodePointInUTF8(offset) else 1 - } - - /** - * Return the number of code points in it. - * - * This is only used by Substring() when `start` is negative. - */ - def length(): Int = { - var len = 0 - var i: Int = 0 - while (i < bytes.length) { - i += numOfBytes(bytes(i)) - len += 1 - } - len - } - - def getBytes: Array[Byte] = { - bytes - } - - /** - * Return a substring of this, - * @param start the position of first code point - * @param until the position after last code point - */ - def slice(start: Int, until: Int): UTF8String = { - if (until <= start || start >= bytes.length || bytes == null) { - new UTF8String - } - - var c = 0 - var i: Int = 0 - while (c < start && i < bytes.length) { - i += numOfBytes(bytes(i)) - c += 1 - } - var j = i - while (c < until && j < bytes.length) { - j += numOfBytes(bytes(j)) - c += 1 - } - UTF8String(Arrays.copyOfRange(bytes, i, j)) - } - - def contains(sub: UTF8String): Boolean = { - val b = sub.getBytes - if (b.length == 0) { - return true - } - var i: Int = 0 - while (i <= bytes.length - b.length) { - // In worst case, it's O(N*K), but should works fine with SQL - if (bytes(i) == b(0) && Arrays.equals(Arrays.copyOfRange(bytes, i, i + b.length), b)) { - return true - } - i += 1 - } - false - } - - def startsWith(prefix: UTF8String): Boolean = { - val b = prefix.getBytes - if (b.length > bytes.length) { - return false - } - Arrays.equals(Arrays.copyOfRange(bytes, 0, b.length), b) - } - - def endsWith(suffix: UTF8String): Boolean = { - val b = suffix.getBytes - if (b.length > bytes.length) { - return false - } - Arrays.equals(Arrays.copyOfRange(bytes, bytes.length - b.length, bytes.length), b) - } - - def toUpperCase(): UTF8String = { - // upper case depends on locale, fallback to String. - UTF8String(toString().toUpperCase) - } - - def toLowerCase(): UTF8String = { - // lower case depends on locale, fallback to String. 
- UTF8String(toString().toLowerCase) - } - - override def toString(): String = { - new String(bytes, "utf-8") - } - - override def clone(): UTF8String = new UTF8String().set(this.bytes) - - override def compare(other: UTF8String): Int = { - var i: Int = 0 - val b = other.getBytes - while (i < bytes.length && i < b.length) { - val res = bytes(i).compareTo(b(i)) - if (res != 0) return res - i += 1 - } - bytes.length - b.length - } - - override def compareTo(other: UTF8String): Int = { - compare(other) - } - - override def equals(other: Any): Boolean = other match { - case s: UTF8String => - Arrays.equals(bytes, s.getBytes) - case s: String => - // This is only used for Catalyst unit tests - // fail fast - bytes.length >= s.length && length() == s.length && toString() == s - case _ => - false - } - - override def hashCode(): Int = { - Arrays.hashCode(bytes) - } -} - -/** - * :: DeveloperApi :: - */ -@DeveloperApi -object UTF8String { - // number of tailing bytes in a UTF8 sequence for a code point - // see http://en.wikipedia.org/wiki/UTF-8, 192-256 of Byte 1 - private[types] val bytesOfCodePointInUTF8: Array[Int] = Array(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, - 4, 4, 4, 4, 4, 4, 4, 4, - 5, 5, 5, 5, - 6, 6, 6, 6) - - /** - * Create a UTF-8 String from String - */ - def apply(s: String): UTF8String = { - if (s != null) { - new UTF8String().set(s) - } else { - null - } - } - - /** - * Create a UTF-8 String from Array[Byte], which should be encoded in UTF-8 - */ - def apply(bytes: Array[Byte]): UTF8String = { - if (bytes != null) { - new UTF8String().set(bytes) - } else { - null - } - } -} diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ComplexTypeSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ComplexTypeSuite.scala index f151dd2a47f78..bcc594cb7c193 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ComplexTypeSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ComplexTypeSuite.scala @@ -21,6 +21,7 @@ import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.analysis.UnresolvedExtractValue import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.types._ +import org.apache.spark.unsafe.types.UTF8String class ComplexTypeSuite extends SparkFunSuite with ExpressionEvalHelper { diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeFixedWidthAggregationMapSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeFixedWidthAggregationMapSuite.scala index 88a36aa121b55..3355bf2616211 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeFixedWidthAggregationMapSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeFixedWidthAggregationMapSuite.scala @@ -17,6 +17,8 @@ package org.apache.spark.sql.catalyst.expressions +import org.apache.spark.unsafe.types.UTF8String + import scala.collection.JavaConverters._ import scala.util.Random @@ -82,7 +84,7 @@ class UnsafeFixedWidthAggregationMapSuite 1024, // initial capacity false // disable perf metrics ) - val groupKey = new GenericRow(Array[Any](UTF8String("cats"))) + val groupKey = new GenericRow(Array[Any](UTF8String.fromString("cats"))) // Looking up a key stores a zero-entry in the map (like Python Counters or DefaultDicts) 
map.getAggregationBuffer(groupKey) @@ -111,7 +113,7 @@ class UnsafeFixedWidthAggregationMapSuite val rand = new Random(42) val groupKeys: Set[String] = Seq.fill(512)(rand.nextString(1024)).toSet groupKeys.foreach { keyString => - map.getAggregationBuffer(new GenericRow(Array[Any](UTF8String(keyString)))) + map.getAggregationBuffer(new GenericRow(Array[Any](UTF8String.fromString(keyString)))) } val seenKeys: Set[String] = map.iterator().asScala.map { entry => entry.key.getString(0) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/types/UTF8StringSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/types/UTF8StringSuite.scala index 81d7ab010f394..1ee798529f937 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/types/UTF8StringSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/types/UTF8StringSuite.scala @@ -18,22 +18,24 @@ package org.apache.spark.sql.types import org.apache.spark.SparkFunSuite +import org.apache.spark.unsafe.types.UTF8String // scalastyle:off class UTF8StringSuite extends SparkFunSuite { test("basic") { def check(str: String, len: Int) { - assert(UTF8String(str).length == len) - assert(UTF8String(str.getBytes("utf8")).length() == len) + assert(UTF8String.fromString(str).length === len) + assert(UTF8String.fromBytes(str.getBytes("utf8")).length() === len) - assert(UTF8String(str) == str) - assert(UTF8String(str.getBytes("utf8")) == str) - assert(UTF8String(str).toString == str) - assert(UTF8String(str.getBytes("utf8")).toString == str) - assert(UTF8String(str.getBytes("utf8")) == UTF8String(str)) + assert(UTF8String.fromString(str) === str) + assert(UTF8String.fromBytes(str.getBytes("utf8")) === str) + assert(UTF8String.fromString(str).toString === str) + assert(UTF8String.fromBytes(str.getBytes("utf8")).toString == str) + assert(UTF8String.fromBytes(str.getBytes("utf8")) === UTF8String.fromString(str)) - assert(UTF8String(str).hashCode() == UTF8String(str.getBytes("utf8")).hashCode()) + assert(UTF8String.fromString(str).hashCode() === + UTF8String.fromBytes(str.getBytes("utf8")).hashCode()) } check("hello", 5) @@ -41,30 +43,30 @@ class UTF8StringSuite extends SparkFunSuite { } test("contains") { - assert(UTF8String("hello").contains(UTF8String("ello"))) - assert(!UTF8String("hello").contains(UTF8String("vello"))) - assert(UTF8String("大千世界").contains(UTF8String("千世"))) - assert(!UTF8String("大千世界").contains(UTF8String("世千"))) + assert(UTF8String.fromString("hello").contains(UTF8String.fromString("ello"))) + assert(!UTF8String.fromString("hello").contains(UTF8String.fromString("vello"))) + assert(UTF8String.fromString("大千世界").contains(UTF8String.fromString("千世"))) + assert(!UTF8String.fromString("大千世界").contains(UTF8String.fromString("世千"))) } test("prefix") { - assert(UTF8String("hello").startsWith(UTF8String("hell"))) - assert(!UTF8String("hello").startsWith(UTF8String("ell"))) - assert(UTF8String("大千世界").startsWith(UTF8String("大千"))) - assert(!UTF8String("大千世界").startsWith(UTF8String("千"))) + assert(UTF8String.fromString("hello").startsWith(UTF8String.fromString("hell"))) + assert(!UTF8String.fromString("hello").startsWith(UTF8String.fromString("ell"))) + assert(UTF8String.fromString("大千世界").startsWith(UTF8String.fromString("大千"))) + assert(!UTF8String.fromString("大千世界").startsWith(UTF8String.fromString("千"))) } test("suffix") { - assert(UTF8String("hello").endsWith(UTF8String("ello"))) - assert(!UTF8String("hello").endsWith(UTF8String("ellov"))) - assert(UTF8String("大千世界").endsWith(UTF8String("世界"))) - 
assert(!UTF8String("大千世界").endsWith(UTF8String("世"))) + assert(UTF8String.fromString("hello").endsWith(UTF8String.fromString("ello"))) + assert(!UTF8String.fromString("hello").endsWith(UTF8String.fromString("ellov"))) + assert(UTF8String.fromString("大千世界").endsWith(UTF8String.fromString("世界"))) + assert(!UTF8String.fromString("大千世界").endsWith(UTF8String.fromString("世"))) } test("slice") { - assert(UTF8String("hello").slice(1, 3) == UTF8String("el")) - assert(UTF8String("大千世界").slice(0, 1) == UTF8String("大")) - assert(UTF8String("大千世界").slice(1, 3) == UTF8String("千世")) - assert(UTF8String("大千世界").slice(3, 5) == UTF8String("界")) + assert(UTF8String.fromString("hello").slice(1, 3) == UTF8String.fromString("el")) + assert(UTF8String.fromString("大千世界").slice(0, 1) == UTF8String.fromString("大")) + assert(UTF8String.fromString("大千世界").slice(1, 3) == UTF8String.fromString("千世")) + assert(UTF8String.fromString("大千世界").slice(3, 5) == UTF8String.fromString("界")) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/columnar/ColumnStats.scala b/sql/core/src/main/scala/org/apache/spark/sql/columnar/ColumnStats.scala index 83881a3687090..11c79c865f11a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/columnar/ColumnStats.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/columnar/ColumnStats.scala @@ -20,6 +20,7 @@ package org.apache.spark.sql.columnar import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeMap, AttributeReference} import org.apache.spark.sql.types._ +import org.apache.spark.unsafe.types.UTF8String private[sql] class ColumnStatisticsSchema(a: Attribute) extends Serializable { val upperBound = AttributeReference(a.name + ".upperBound", a.dataType, nullable = true)() diff --git a/sql/core/src/main/scala/org/apache/spark/sql/columnar/ColumnType.scala b/sql/core/src/main/scala/org/apache/spark/sql/columnar/ColumnType.scala index c9c4d630fb5f4..6ebbf93f608b9 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/columnar/ColumnType.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/columnar/ColumnType.scala @@ -19,6 +19,8 @@ package org.apache.spark.sql.columnar import java.nio.ByteBuffer +import org.apache.spark.unsafe.types.UTF8String + import scala.reflect.runtime.universe.TypeTag import org.apache.spark.sql.Row @@ -320,7 +322,7 @@ private[sql] object STRING extends NativeColumnType(StringType, 7, 8) { val length = buffer.getInt() val stringBytes = new Array[Byte](length) buffer.get(stringBytes, 0, length) - UTF8String(stringBytes) + UTF8String.fromBytes(stringBytes) } override def setField(row: MutableRow, ordinal: Int, value: UTF8String): Unit = { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlSerializer2.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlSerializer2.scala index 60f3b2d539ffe..8f64dcacad598 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlSerializer2.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlSerializer2.scala @@ -21,6 +21,8 @@ import java.io._ import java.math.{BigDecimal, BigInteger} import java.nio.ByteBuffer +import org.apache.spark.unsafe.types.UTF8String + import scala.reflect.ClassTag import org.apache.spark.Logging @@ -434,7 +436,7 @@ private[sql] object SparkSqlSerializer2 { val length = in.readInt() val bytes = new Array[Byte](length) in.readFully(bytes) - mutableRow.update(i, UTF8String(bytes)) + mutableRow.update(i, UTF8String.fromBytes(bytes)) } case BinaryType => diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala index 720b529d5946f..83c1f65d5c96f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala @@ -19,6 +19,7 @@ package org.apache.spark.sql.execution import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.expressions.Attribute +import org.apache.spark.unsafe.types.UTF8String import scala.collection.mutable.HashSet diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/pythonUdfs.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/pythonUdfs.scala index 955b478a4882f..476cb306df6c0 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/pythonUdfs.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/pythonUdfs.scala @@ -19,6 +19,8 @@ package org.apache.spark.sql.execution import java.util.{List => JList, Map => JMap} +import org.apache.spark.unsafe.types.UTF8String + import scala.collection.JavaConversions._ import scala.collection.JavaConverters._ @@ -204,8 +206,10 @@ object EvaluatePython { case (c: Long, IntegerType) => c.toInt case (c: Int, LongType) => c.toLong case (c: Double, FloatType) => c.toFloat - case (c: String, StringType) => UTF8String(c) - case (c, StringType) if !c.isInstanceOf[String] => UTF8String(c.toString) + case (c: String, StringType) => UTF8String.fromString(c) + case (c, StringType) => + // If we get here, c is not a string. Call toString on it. + UTF8String.fromString(c.toString) case (c, _) => c } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JDBCRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JDBCRDD.scala index 9028d5ed72c92..d8e96e85d8e52 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JDBCRDD.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JDBCRDD.scala @@ -21,6 +21,7 @@ import java.sql.{Connection, DriverManager, ResultSet, ResultSetMetaData, SQLExc import java.util.Properties import org.apache.commons.lang3.StringUtils +import org.apache.spark.unsafe.types.UTF8String import org.apache.spark.{Logging, Partition, SparkContext, TaskContext} import org.apache.spark.rdd.RDD diff --git a/sql/core/src/main/scala/org/apache/spark/sql/json/JacksonParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/json/JacksonParser.scala index 4e07cf36ae434..82ad831c70957 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/json/JacksonParser.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/json/JacksonParser.scala @@ -19,6 +19,8 @@ package org.apache.spark.sql.json import java.io.ByteArrayOutputStream +import org.apache.spark.unsafe.types.UTF8String + import scala.collection.Map import com.fasterxml.jackson.core._ @@ -54,7 +56,7 @@ private[sql] object JacksonParser { convertField(factory, parser, schema) case (VALUE_STRING, StringType) => - UTF8String(parser.getText) + UTF8String.fromString(parser.getText) case (VALUE_STRING, _) if parser.getTextLength < 1 => // guard the non string type @@ -74,7 +76,7 @@ private[sql] object JacksonParser { val generator = factory.createGenerator(writer, JsonEncoding.UTF8) generator.copyCurrentStructure(parser) generator.close() - UTF8String(writer.toByteArray) + UTF8String.fromBytes(writer.toByteArray) case (VALUE_NUMBER_INT | VALUE_NUMBER_FLOAT, FloatType) => parser.getFloatValue @@ -152,7 +154,8 @@ private[sql] object JacksonParser { 
valueType: DataType): Map[UTF8String, Any] = { val builder = Map.newBuilder[UTF8String, Any] while (nextUntil(parser, JsonToken.END_OBJECT)) { - builder += UTF8String(parser.getCurrentName) -> convertField(factory, parser, valueType) + builder += + UTF8String.fromString(parser.getCurrentName) -> convertField(factory, parser, valueType) } builder.result() @@ -180,7 +183,7 @@ private[sql] object JacksonParser { val row = new GenericMutableRow(schema.length) for (corruptIndex <- schema.getFieldIndex(columnNameOfCorruptRecords)) { require(schema(corruptIndex).dataType == StringType) - row.update(corruptIndex, UTF8String(record)) + row.update(corruptIndex, UTF8String.fromString(record)) } Seq(row) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/json/JsonRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/json/JsonRDD.scala index fb0d137bdbfdb..e4acf1ddaf173 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/json/JsonRDD.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/json/JsonRDD.scala @@ -30,6 +30,8 @@ import org.apache.spark.sql.catalyst.analysis.HiveTypeCoercion import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.util.DateUtils import org.apache.spark.sql.types._ +import org.apache.spark.unsafe.types.UTF8String + private[sql] object JsonRDD extends Logging { @@ -317,7 +319,7 @@ private[sql] object JsonRDD extends Logging { parsed } catch { case e: JsonProcessingException => - Map(columnNameOfCorruptRecords -> UTF8String(record)) :: Nil + Map(columnNameOfCorruptRecords -> UTF8String.fromString(record)) :: Nil } } }) @@ -409,7 +411,7 @@ private[sql] object JsonRDD extends Logging { null } else { desiredType match { - case StringType => UTF8String(toString(value)) + case StringType => UTF8String.fromString(toString(value)) case _ if value == null || value == "" => null // guard the non string type case IntegerType => value.asInstanceOf[IntegerType.InternalType] case LongType => toLong(value) @@ -423,7 +425,7 @@ private[sql] object JsonRDD extends Logging { val map = value.asInstanceOf[Map[String, Any]] map.map { case (k, v) => - (UTF8String(k), enforceCorrectType(v, valueType)) + (UTF8String.fromString(k), enforceCorrectType(v, valueType)) }.map(identity) case struct: StructType => asRow(value.asInstanceOf[Map[String, Any]], struct) case DateType => toDate(value) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetConverter.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetConverter.scala index ddc5097f88fb1..ab9f878d1e936 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetConverter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetConverter.scala @@ -32,6 +32,7 @@ import org.apache.spark.sql.catalyst.util.DateUtils import org.apache.spark.sql.parquet.CatalystConverter.FieldType import org.apache.spark.sql.types._ import org.apache.spark.sql.parquet.timestamp.NanoTime +import org.apache.spark.unsafe.types.UTF8String /** * Collection of converters of Parquet types (group and primitive types) that @@ -222,7 +223,7 @@ private[parquet] abstract class CatalystConverter extends GroupConverter { updateField(fieldIndex, value.getBytes) protected[parquet] def updateString(fieldIndex: Int, value: Array[Byte]): Unit = - updateField(fieldIndex, UTF8String(value)) + updateField(fieldIndex, UTF8String.fromBytes(value)) protected[parquet] def updateTimestamp(fieldIndex: Int, value: Binary): Unit = updateField(fieldIndex, readTimestamp(value)) @@ -423,7 +424,7 @@ 
private[parquet] class CatalystPrimitiveRowConverter( current.update(fieldIndex, value.getBytes) override protected[parquet] def updateString(fieldIndex: Int, value: Array[Byte]): Unit = - current.update(fieldIndex, UTF8String(value)) + current.update(fieldIndex, UTF8String.fromBytes(value)) override protected[parquet] def updateTimestamp(fieldIndex: Int, value: Binary): Unit = current.setLong(fieldIndex, readTimestamp(value)) @@ -719,7 +720,7 @@ private[parquet] class CatalystNativeArrayConverter( override protected[parquet] def updateString(fieldIndex: Int, value: Array[Byte]): Unit = { checkGrowBuffer() - buffer(elements) = UTF8String(value).asInstanceOf[NativeType] + buffer(elements) = UTF8String.fromBytes(value).asInstanceOf[NativeType] elements += 1 } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetFilters.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetFilters.scala index 88ae88e9684c8..4d659f261a3b7 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetFilters.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetFilters.scala @@ -31,6 +31,7 @@ import org.apache.spark.SparkEnv import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.sources import org.apache.spark.sql.types._ +import org.apache.spark.unsafe.types.UTF8String private[sql] object ParquetFilters { val PARQUET_FILTER_DATA = "org.apache.spark.sql.parquet.row.filter" diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala index e03dbdec0491d..c62c592b3f3e4 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala @@ -31,6 +31,7 @@ import org.apache.spark.Logging import org.apache.spark.sql.catalyst.expressions.{Attribute, Row} import org.apache.spark.sql.catalyst.util.DateUtils import org.apache.spark.sql.types._ +import org.apache.spark.unsafe.types.UTF8String /** * A `parquet.io.api.RecordMaterializer` for Rows. 
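The change repeated across these files is mechanical: calls to the Scala companion-object constructors, UTF8String(str) and UTF8String(bytes), become calls to explicit factory methods on the new Java class. A minimal Scala sketch of the calling pattern (illustrative only; the null handling shown matches the factories defined later in this patch):

    import org.apache.spark.unsafe.types.UTF8String

    // Before: UTF8String("hello") via the Scala apply methods, now deleted.
    // After: explicit, null-tolerant factory methods on the Java class.
    val fromStr: UTF8String = UTF8String.fromString("hello")
    val fromBytes: UTF8String = UTF8String.fromBytes("hello".getBytes("utf-8"))

    // Both factories pass null through rather than throwing.
    assert(UTF8String.fromString(null) == null)
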
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/DataSourceStrategy.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/DataSourceStrategy.scala index c6a4dabbab05e..ec9ec67e9fdc2 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/sources/DataSourceStrategy.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/DataSourceStrategy.scala @@ -17,6 +17,7 @@ package org.apache.spark.sql.sources +import org.apache.spark.unsafe.types.UTF8String import org.apache.spark.{Logging, SerializableWritable, TaskContext} import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.rdd.{MapPartitionsRDD, RDD, UnionRDD} @@ -26,7 +27,7 @@ import org.apache.spark.sql.catalyst.planning.PhysicalOperation import org.apache.spark.sql.catalyst.plans.logical import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.execution.SparkPlan -import org.apache.spark.sql.types.{StringType, StructType, UTF8String} +import org.apache.spark.sql.types.{StringType, StructType} import org.apache.spark.sql.{SaveMode, Strategy, execution, sources} import org.apache.spark.util.Utils diff --git a/sql/core/src/test/scala/org/apache/spark/sql/columnar/ColumnTypeSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/columnar/ColumnTypeSuite.scala index 8421e670ff05d..6daddfb2c4804 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/columnar/ColumnTypeSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/columnar/ColumnTypeSuite.scala @@ -22,12 +22,14 @@ import java.nio.ByteBuffer import com.esotericsoftware.kryo.io.{Input, Output} import com.esotericsoftware.kryo.{Kryo, Serializer} +import org.apache.spark.{Logging, SparkConf, SparkFunSuite} import org.apache.spark.serializer.KryoRegistrator import org.apache.spark.sql.catalyst.expressions.GenericMutableRow import org.apache.spark.sql.columnar.ColumnarTestUtils._ import org.apache.spark.sql.execution.SparkSqlSerializer import org.apache.spark.sql.types._ -import org.apache.spark.{Logging, SparkConf, SparkFunSuite} +import org.apache.spark.unsafe.types.UTF8String + class ColumnTypeSuite extends SparkFunSuite with Logging { val DEFAULT_BUFFER_SIZE = 512 @@ -66,7 +68,7 @@ class ColumnTypeSuite extends SparkFunSuite with Logging { checkActualSize(FLOAT, Float.MaxValue, 4) checkActualSize(FIXED_DECIMAL(15, 10), Decimal(0, 15, 10), 8) checkActualSize(BOOLEAN, true, 1) - checkActualSize(STRING, UTF8String("hello"), 4 + "hello".getBytes("utf-8").length) + checkActualSize(STRING, UTF8String.fromString("hello"), 4 + "hello".getBytes("utf-8").length) checkActualSize(DATE, 0, 4) checkActualSize(TIMESTAMP, 0L, 8) @@ -118,7 +120,7 @@ class ColumnTypeSuite extends SparkFunSuite with Logging { val length = buffer.getInt() val bytes = new Array[Byte](length) buffer.get(bytes) - UTF8String(bytes) + UTF8String.fromBytes(bytes) }) testColumnType[BinaryType.type, Array[Byte]]( diff --git a/sql/core/src/test/scala/org/apache/spark/sql/columnar/ColumnarTestUtils.scala b/sql/core/src/test/scala/org/apache/spark/sql/columnar/ColumnarTestUtils.scala index c5d38595c0bec..1bc7eb36311bb 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/columnar/ColumnarTestUtils.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/columnar/ColumnarTestUtils.scala @@ -22,7 +22,10 @@ import scala.util.Random import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.expressions.GenericMutableRow -import org.apache.spark.sql.types.{AtomicType, DataType, Decimal, UTF8String} +import 
org.apache.spark.sql.types.{AtomicType, DataType, Decimal} +import org.apache.spark.sql.types.{DataType, Decimal, AtomicType} +import org.apache.spark.unsafe.types.UTF8String + object ColumnarTestUtils { def makeNullRow(length: Int): GenericMutableRow = { @@ -46,7 +49,7 @@ object ColumnarTestUtils { case FLOAT => Random.nextFloat() case DOUBLE => Random.nextDouble() case FIXED_DECIMAL(precision, scale) => Decimal(Random.nextLong() % 100, precision, scale) - case STRING => UTF8String(Random.nextString(Random.nextInt(32))) + case STRING => UTF8String.fromString(Random.nextString(Random.nextInt(32))) case BOOLEAN => Random.nextBoolean() case BINARY => randomBytes(Random.nextInt(32)) case DATE => Random.nextInt() diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala index 1f14cba78f479..fd01a8722bce6 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala @@ -28,6 +28,7 @@ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.util.DateUtils import org.apache.spark.sql.types import org.apache.spark.sql.types._ +import org.apache.spark.unsafe.types.UTF8String /* Implicit conversions */ import scala.collection.JavaConversions._ @@ -242,9 +243,9 @@ private[hive] trait HiveInspectors { def unwrap(data: Any, oi: ObjectInspector): Any = oi match { case coi: ConstantObjectInspector if coi.getWritableConstantValue == null => null case poi: WritableConstantStringObjectInspector => - UTF8String(poi.getWritableConstantValue.toString) + UTF8String.fromString(poi.getWritableConstantValue.toString) case poi: WritableConstantHiveVarcharObjectInspector => - UTF8String(poi.getWritableConstantValue.getHiveVarchar.getValue) + UTF8String.fromString(poi.getWritableConstantValue.getHiveVarchar.getValue) case poi: WritableConstantHiveDecimalObjectInspector => HiveShim.toCatalystDecimal( PrimitiveObjectInspectorFactory.javaHiveDecimalObjectInspector, @@ -288,13 +289,13 @@ private[hive] trait HiveInspectors { case pi: PrimitiveObjectInspector => pi match { // We think HiveVarchar is also a String case hvoi: HiveVarcharObjectInspector if hvoi.preferWritable() => - UTF8String(hvoi.getPrimitiveWritableObject(data).getHiveVarchar.getValue) + UTF8String.fromString(hvoi.getPrimitiveWritableObject(data).getHiveVarchar.getValue) case hvoi: HiveVarcharObjectInspector => - UTF8String(hvoi.getPrimitiveJavaObject(data).getValue) + UTF8String.fromString(hvoi.getPrimitiveJavaObject(data).getValue) case x: StringObjectInspector if x.preferWritable() => - UTF8String(x.getPrimitiveWritableObject(data).toString) + UTF8String.fromString(x.getPrimitiveWritableObject(data).toString) case x: StringObjectInspector => - UTF8String(x.getPrimitiveJavaObject(data)) + UTF8String.fromString(x.getPrimitiveJavaObject(data)) case x: IntObjectInspector if x.preferWritable() => x.get(data) case x: BooleanObjectInspector if x.preferWritable() => x.get(data) case x: FloatObjectInspector if x.preferWritable() => x.get(data) diff --git a/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java new file mode 100644 index 0000000000000..cb9081b7dd9a7 --- /dev/null +++ b/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java @@ -0,0 +1,198 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor 
license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.unsafe.types;
+
+import java.io.Serializable;
+import java.io.UnsupportedEncodingException;
+import java.util.Arrays;
+import javax.annotation.Nullable;
+
+/**
+ * A UTF-8 String for internal Spark use.
+ * <p>
+ * A String encoded in UTF-8 as an Array[Byte], which can be used for comparison,
+ * search, see http://en.wikipedia.org/wiki/UTF-8 for details.
+ * <p>
+ * Note: This is not designed for general use cases, should not be used outside SQL.
+ */
+public final class UTF8String implements Comparable<UTF8String>, Serializable {
+
+  @Nullable
+  private byte[] bytes;
+
+  private int[] bytesOfCodePointInUTF8 = {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+    4, 4, 4, 4, 4, 4, 4, 4,
+    5, 5, 5, 5,
+    6, 6, 6, 6};
+
+  public static UTF8String fromBytes(byte[] bytes) {
+    return (bytes != null) ? new UTF8String().set(bytes) : null;
+  }
+
+  public static UTF8String fromString(String str) {
+    return (str != null) ? new UTF8String().set(str) : null;
+  }
+
+  /**
+   * Updates the UTF8String with String.
+   */
+  public UTF8String set(final String str) {
+    try {
+      // Encode explicitly as UTF-8: the platform default charset need not be UTF-8.
+      bytes = str.getBytes("utf-8");
+    } catch (UnsupportedEncodingException e) {
+      // Every JVM is required to support UTF-8, so this cannot happen in practice.
+      throw new IllegalStateException(e);
+    }
+    return this;
+  }
+
+  /**
+   * Updates the UTF8String with byte[], which should be encoded in UTF-8.
+   */
+  public UTF8String set(final byte[] bytes) {
+    this.bytes = bytes;
+    return this;
+  }
+
+  /**
+   * Returns the number of bytes for a code point with the first byte as `b`.
+   * @param b The first byte of a code point
+   */
+  public int numBytes(final byte b) {
+    final int offset = (b & 0xFF) - 192;
+    return (offset >= 0) ? bytesOfCodePointInUTF8[offset] : 1;
+  }
+
+  /**
+   * Returns the number of code points in it.
+   *
+   * This is only used by Substring() when `start` is negative.
+   */
+  public int length() {
+    int len = 0;
+    for (int i = 0; i < bytes.length; i += numBytes(bytes[i])) {
+      len += 1;
+    }
+    return len;
+  }
+
+  public byte[] getBytes() {
+    return bytes;
+  }
+
+  /**
+   * Returns a substring of this.
+   * @param start the position of first code point
+   * @param until the position after last code point
+   */
+  public UTF8String slice(final int start, final int until) {
+    if (until <= start || start >= bytes.length) {
+      return new UTF8String();
+    }
+
+    int i = 0;
+    int c = 0;
+    for (; i < bytes.length && c < start; i += numBytes(bytes[i])) {
+      c += 1;
+    }
+
+    int j = i;
+    // Advance by the width of the code point at j, not the one at i.
+    for (; j < bytes.length && c < until; j += numBytes(bytes[j])) {
+      c += 1;
+    }
+
+    return UTF8String.fromBytes(Arrays.copyOfRange(bytes, i, j));
+  }
+
+  public boolean contains(final UTF8String substring) {
+    final byte[] b = substring.getBytes();
+    if (b.length == 0) {
+      return true;
+    }
+
+    for (int i = 0; i <= bytes.length - b.length; i++) {
+      // TODO: Avoid copying.
+      if (bytes[i] == b[0] && Arrays.equals(Arrays.copyOfRange(bytes, i, i + b.length), b)) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  public boolean startsWith(final UTF8String prefix) {
+    final byte[] b = prefix.getBytes();
+    // TODO: Avoid copying.
+    // A prefix longer than the string itself can never match.
+    return b.length <= bytes.length && Arrays.equals(Arrays.copyOfRange(bytes, 0, b.length), b);
+  }
+
+  public boolean endsWith(final UTF8String suffix) {
+    final byte[] b = suffix.getBytes();
+    // A suffix longer than the string itself can never match.
+    return b.length <= bytes.length &&
+      Arrays.equals(Arrays.copyOfRange(bytes, bytes.length - b.length, bytes.length), b);
+  }
+
+  public UTF8String toUpperCase() {
+    return UTF8String.fromString(toString().toUpperCase());
+  }
+
+  public UTF8String toLowerCase() {
+    return UTF8String.fromString(toString().toLowerCase());
+  }
+
+  @Override
+  public String toString() {
+    try {
+      return new String(bytes, "utf-8");
+    } catch (UnsupportedEncodingException e) {
+      e.printStackTrace();
+      return "unsupported encoding";
+    }
+  }
+
+  @Override
+  public UTF8String clone() {
+    return new UTF8String().set(bytes);
+  }
+
+  @Override
+  public int compareTo(final UTF8String other) {
+    final byte[] b = other.getBytes();
+    for (int i = 0; i < bytes.length && i < b.length; i++) {
+      int res = bytes[i] - b[i];
+      if (res != 0) {
+        return res;
+      }
+    }
+    return bytes.length - b.length;
+  }
+
+  @Override
+  public boolean equals(final Object other) {
+    if (other instanceof UTF8String) {
+      return Arrays.equals(bytes, ((UTF8String) other).getBytes());
+    } else if (other instanceof String) {
+      // Used only in unit tests.
+      String s = (String) other;
+      return bytes.length >= s.length() && length() == s.length() && toString().equals(s);
+    } else {
+      return false;
+    }
+  }
+
+  @Override
+  public int hashCode() {
+    return Arrays.hashCode(bytes);
+  }
+}

From 0967ce6872a8e475b52de8c8e891e9105ed9ce73 Mon Sep 17 00:00:00 2001
From: Reynold Xin
Date: Wed, 10 Jun 2015 00:28:16 -0700
Subject: [PATCH 02/15] Fixed import ordering.

---
 .../spark/sql/catalyst/CatalystTypeConverters.scala | 3 +--
 .../sql/catalyst/expressions/codegen/CodeGenerator.scala | 4 ++--
 .../scala/org/apache/spark/sql/types/StringType.scala | 3 +--
 .../expressions/UnsafeFixedWidthAggregationMapSuite.scala | 8 ++++----
 .../scala/org/apache/spark/sql/columnar/ColumnType.scala | 3 +--
 .../apache/spark/sql/execution/SparkSqlSerializer2.scala | 3 +--
 .../scala/org/apache/spark/sql/execution/pythonUdfs.scala | 5 ++---
 .../main/scala/org/apache/spark/sql/jdbc/JDBCRDD.scala | 2 +-
 .../scala/org/apache/spark/sql/json/JacksonParser.scala | 4 ++--
 .../org/apache/spark/sql/sources/DataSourceStrategy.scala | 2 +-
 10 files changed, 16 insertions(+), 21 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala
index 3f13c368fc510..7e4b11a4951b8 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala
@@ -23,13 +23,12 @@ import java.sql.{Timestamp, Date}
 import java.util.{Map => JavaMap}
 import javax.annotation.Nullable
 
-import org.apache.spark.unsafe.types.UTF8String
-
 import scala.collection.mutable.HashMap
 
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.util.DateUtils
 import org.apache.spark.sql.types._
+import org.apache.spark.unsafe.types.UTF8String
 
 /**
  * Functions to convert Scala types to Catalyst types and vice versa.
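For reference, a short Scala sketch of the code-point-aware semantics that the Java rewrite preserves and that UTF8StringSuite above pins down (assuming the usual three-byte UTF-8 encoding of the CJK characters used in the tests):

    import org.apache.spark.unsafe.types.UTF8String

    val s = UTF8String.fromString("大千世界")

    // length() counts code points, not bytes: four characters, twelve UTF-8 bytes.
    assert(s.length() == 4)
    assert(s.getBytes.length == 12)

    // slice() positions are code points, so multi-byte characters are never split.
    assert(s.slice(1, 3) == UTF8String.fromString("千世"))
    assert(s.startsWith(UTF8String.fromString("大千")))
    assert(s.endsWith(UTF8String.fromString("世界")))
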
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala index b2705d35e95e7..0e17cc76e3fea 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala @@ -17,8 +17,6 @@ package org.apache.spark.sql.catalyst.expressions.codegen -import org.apache.spark.unsafe.types.UTF8String - import scala.collection.mutable import scala.language.existentials @@ -28,6 +26,8 @@ import org.codehaus.janino.ClassBodyEvaluator import org.apache.spark.Logging import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.types._ +import org.apache.spark.unsafe.types.UTF8String + // These classes are here to avoid issues with serialization and integration with quasiquotes. class IntegerHashSet extends org.apache.spark.util.collection.OpenHashSet[Int] diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StringType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StringType.scala index d8d6db79eee93..1e9476ad06656 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StringType.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StringType.scala @@ -17,13 +17,12 @@ package org.apache.spark.sql.types -import org.apache.spark.unsafe.types.UTF8String - import scala.math.Ordering import scala.reflect.runtime.universe.typeTag import org.apache.spark.annotation.DeveloperApi import org.apache.spark.sql.catalyst.ScalaReflectionLock +import org.apache.spark.unsafe.types.UTF8String /** * :: DeveloperApi :: diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeFixedWidthAggregationMapSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeFixedWidthAggregationMapSuite.scala index 3355bf2616211..72bbc4efeb8ef 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeFixedWidthAggregationMapSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeFixedWidthAggregationMapSuite.scala @@ -17,16 +17,16 @@ package org.apache.spark.sql.catalyst.expressions -import org.apache.spark.unsafe.types.UTF8String - import scala.collection.JavaConverters._ import scala.util.Random -import org.apache.spark.SparkFunSuite -import org.apache.spark.unsafe.memory.{ExecutorMemoryManager, TaskMemoryManager, MemoryAllocator} import org.scalatest.{BeforeAndAfterEach, Matchers} +import org.apache.spark.SparkFunSuite import org.apache.spark.sql.types._ +import org.apache.spark.unsafe.memory.{ExecutorMemoryManager, TaskMemoryManager, MemoryAllocator} +import org.apache.spark.unsafe.types.UTF8String + class UnsafeFixedWidthAggregationMapSuite extends SparkFunSuite diff --git a/sql/core/src/main/scala/org/apache/spark/sql/columnar/ColumnType.scala b/sql/core/src/main/scala/org/apache/spark/sql/columnar/ColumnType.scala index 6ebbf93f608b9..8e21020917768 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/columnar/ColumnType.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/columnar/ColumnType.scala @@ -19,14 +19,13 @@ package org.apache.spark.sql.columnar import java.nio.ByteBuffer -import org.apache.spark.unsafe.types.UTF8String - import scala.reflect.runtime.universe.TypeTag import org.apache.spark.sql.Row import 
org.apache.spark.sql.catalyst.expressions.MutableRow import org.apache.spark.sql.execution.SparkSqlSerializer import org.apache.spark.sql.types._ +import org.apache.spark.unsafe.types.UTF8String /** * An abstract class that represents type of a column. Used to append/extract Java objects into/from diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlSerializer2.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlSerializer2.scala index 8f64dcacad598..202e4488a64bf 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlSerializer2.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlSerializer2.scala @@ -21,8 +21,6 @@ import java.io._ import java.math.{BigDecimal, BigInteger} import java.nio.ByteBuffer -import org.apache.spark.unsafe.types.UTF8String - import scala.reflect.ClassTag import org.apache.spark.Logging @@ -30,6 +28,7 @@ import org.apache.spark.serializer._ import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.expressions.{GenericMutableRow, MutableRow, SpecificMutableRow} import org.apache.spark.sql.types._ +import org.apache.spark.unsafe.types.UTF8String /** * The serialization stream for [[SparkSqlSerializer2]]. It assumes that the object passed in diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/pythonUdfs.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/pythonUdfs.scala index 476cb306df6c0..660b231dd7d17 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/pythonUdfs.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/pythonUdfs.scala @@ -19,13 +19,12 @@ package org.apache.spark.sql.execution import java.util.{List => JList, Map => JMap} -import org.apache.spark.unsafe.types.UTF8String - import scala.collection.JavaConversions._ import scala.collection.JavaConverters._ import net.razorvine.pickle.{Pickler, Unpickler} +import org.apache.spark.{Accumulator, Logging => SparkLogging} import org.apache.spark.annotation.DeveloperApi import org.apache.spark.api.python.{PythonBroadcast, PythonRDD} import org.apache.spark.broadcast.Broadcast @@ -37,7 +36,7 @@ import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.catalyst.util.DateUtils import org.apache.spark.sql.types._ -import org.apache.spark.{Accumulator, Logging => SparkLogging} +import org.apache.spark.unsafe.types.UTF8String /** * A serialized version of a Python lambda function. Suitable for use in a [[PythonRDD]]. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JDBCRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JDBCRDD.scala index d8e96e85d8e52..e75e6681c5ff3 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JDBCRDD.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JDBCRDD.scala @@ -21,7 +21,6 @@ import java.sql.{Connection, DriverManager, ResultSet, ResultSetMetaData, SQLExc import java.util.Properties import org.apache.commons.lang3.StringUtils -import org.apache.spark.unsafe.types.UTF8String import org.apache.spark.{Logging, Partition, SparkContext, TaskContext} import org.apache.spark.rdd.RDD @@ -29,6 +28,7 @@ import org.apache.spark.sql.catalyst.expressions.{Row, SpecificMutableRow} import org.apache.spark.sql.catalyst.util.DateUtils import org.apache.spark.sql.types._ import org.apache.spark.sql.sources._ +import org.apache.spark.unsafe.types.UTF8String /** * Data corresponding to one partition of a JDBCRDD. 
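Every hunk in this commit is the same fix: the import of org.apache.spark.unsafe.types.UTF8String moves out of the top of the file and into its proper group. Spark's Scala style orders imports as java/javax, then scala, then third-party libraries, then org.apache.spark, with a blank line between groups, so a conforming header looks roughly like the ColumnType.scala result above:

    import java.nio.ByteBuffer

    import scala.reflect.runtime.universe.TypeTag

    import org.apache.spark.sql.Row
    import org.apache.spark.sql.types._
    import org.apache.spark.unsafe.types.UTF8String
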
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/json/JacksonParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/json/JacksonParser.scala index 82ad831c70957..f16075ce58ffa 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/json/JacksonParser.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/json/JacksonParser.scala @@ -19,8 +19,6 @@ package org.apache.spark.sql.json import java.io.ByteArrayOutputStream -import org.apache.spark.unsafe.types.UTF8String - import scala.collection.Map import com.fasterxml.jackson.core._ @@ -30,6 +28,8 @@ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.util.DateUtils import org.apache.spark.sql.json.JacksonUtils.nextUntil import org.apache.spark.sql.types._ +import org.apache.spark.unsafe.types.UTF8String + private[sql] object JacksonParser { def apply( diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/DataSourceStrategy.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/DataSourceStrategy.scala index ec9ec67e9fdc2..edda3f2017fe8 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/sources/DataSourceStrategy.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/DataSourceStrategy.scala @@ -17,7 +17,6 @@ package org.apache.spark.sql.sources -import org.apache.spark.unsafe.types.UTF8String import org.apache.spark.{Logging, SerializableWritable, TaskContext} import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.rdd.{MapPartitionsRDD, RDD, UnionRDD} @@ -30,6 +29,7 @@ import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.types.{StringType, StructType} import org.apache.spark.sql.{SaveMode, Strategy, execution, sources} import org.apache.spark.util.Utils +import org.apache.spark.unsafe.types.UTF8String /** * A Strategy for planning scans over data sources defined using the sources API. From ffedb624429709fa325572da1c5ecebc3bfc03d0 Mon Sep 17 00:00:00 2001 From: Reynold Xin Date: Wed, 10 Jun 2015 14:52:11 -0700 Subject: [PATCH 03/15] Code review feedback. Still need to fix test failure. --- .../java/org/apache/spark/unsafe/types/UTF8String.java | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java index cb9081b7dd9a7..8f3bcbd1c4d67 100644 --- a/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java +++ b/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java @@ -22,6 +22,8 @@ import java.util.Arrays; import javax.annotation.Nullable; +import org.apache.spark.unsafe.PlatformDependent; + /** * A UTF-8 String for internal Spark use. *

@@ -35,7 +37,7 @@ public final class UTF8String implements Comparable, Serializable { @Nullable private byte[] bytes; - private int[] bytesOfCodePointInUTF8 = {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + private static int[] bytesOfCodePointInUTF8 = {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, @@ -156,8 +158,10 @@ public String toString() { try { return new String(bytes, "utf-8"); } catch (UnsupportedEncodingException e) { - e.printStackTrace(); - return "unsupported encoding"; + // Turn the exception into unchecked so we can find out about it at runtime, but + // don't need to add lots of boilerplate code everywhere. + PlatformDependent.throwException(e); + return "unknown"; // we will never reach here. } } From 77c64bd4c1cc9426ae8fb8ef39ae92ec29c79969 Mon Sep 17 00:00:00 2001 From: Reynold Xin Date: Wed, 10 Jun 2015 14:56:06 -0700 Subject: [PATCH 04/15] Fixed string type codegen. --- .../spark/sql/catalyst/expressions/predicates.scala | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala index 2c49352874fc3..c9dc4f14db698 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala @@ -254,12 +254,12 @@ abstract class BinaryComparison extends BinaryExpression with Predicate { case dt: NumericType if ctx.isNativeType(dt) => defineCodeGen (ctx, ev, { (c1, c3) => s"$c1 $symbol $c3" }) - case DateType | TimestampType => defineCodeGen (ctx, ev, { - (c1, c3) => s"$c1 $symbol $c3" - }) - case other => defineCodeGen (ctx, ev, { - (c1, c2) => s"$c1.compare($c2) $symbol 0" - }) + case StringType => + defineCodeGen (ctx, ev, (c1, c2) => s"$c1.compareTo($c2) $symbol 0") + case DateType | TimestampType => + defineCodeGen (ctx, ev, (c1, c3) => s"$c1 $symbol $c3") + case _ => + defineCodeGen (ctx, ev, (c1, c2) => s"$c1.compare($c2) $symbol 0") } } From 8e89a3ca7582a4e470f60717e5d5e8d8ef679293 Mon Sep 17 00:00:00 2001 From: Reynold Xin Date: Wed, 10 Jun 2015 17:16:41 -0700 Subject: [PATCH 05/15] Fixed tests. 
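The length guards in startsWith and endsWith were inverted: b.length > bytes.length holds exactly when the prefix or suffix is longer than the string, which is the one case that must return false. As written, the old check could only "match" against the zero padding that Arrays.copyOfRange appends past the end of the array. A minimal sketch of the intended byte-level prefix test (illustrative only, not part of this patch; comparing in place would also avoid the copy flagged by the TODO):

    // Illustrative sketch: prefix check over raw UTF-8 bytes.
    // The guard must reject an over-long prefix up front.
    static boolean startsWith(byte[] bytes, byte[] prefix) {
      if (prefix.length > bytes.length) {
        return false;
      }
      for (int i = 0; i < prefix.length; i++) {
        if (bytes[i] != prefix[i]) {
          return false;
        }
      }
      return true;
    }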
--- .../scala/org/apache/spark/sql/types/UTF8StringSuite.scala | 1 - .../main/java/org/apache/spark/unsafe/types/UTF8String.java | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/types/UTF8StringSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/types/UTF8StringSuite.scala index 1ee798529f937..501a7c00d6f72 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/types/UTF8StringSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/types/UTF8StringSuite.scala @@ -24,7 +24,6 @@ import org.apache.spark.unsafe.types.UTF8String class UTF8StringSuite extends SparkFunSuite { test("basic") { def check(str: String, len: Int) { - assert(UTF8String.fromString(str).length === len) assert(UTF8String.fromBytes(str.getBytes("utf8")).length() === len) diff --git a/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java index 8f3bcbd1c4d67..f391cac28eb0c 100644 --- a/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java +++ b/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java @@ -136,12 +136,12 @@ public boolean contains(final UTF8String substring) { public boolean startsWith(final UTF8String prefix) { final byte[] b = prefix.getBytes(); // TODO: Avoid copying. - return b.length > bytes.length && Arrays.equals(Arrays.copyOfRange(bytes, 0, b.length), b); + return b.length <= bytes.length && Arrays.equals(Arrays.copyOfRange(bytes, 0, b.length), b); } public boolean endsWith(final UTF8String suffix) { final byte[] b = suffix.getBytes(); - return b.length > bytes.length && + return b.length <= bytes.length && Arrays.equals(Arrays.copyOfRange(bytes, bytes.length - b.length, bytes.length), b); } From 4eff7bdb7d2960756d089e1e4d779734da7d7f64 Mon Sep 17 00:00:00 2001 From: Reynold Xin Date: Wed, 10 Jun 2015 17:20:59 -0700 Subject: [PATCH 06/15] Improved unit test coverage. 
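The new cases exercise the boundaries the first round missed: a needle longer than the haystack for contains/startsWith/endsWith, and an empty range for slice. The empty-range branch matters beyond coverage: it used to return new UTF8String(), whose backing byte array is still null, while UTF8String.fromBytes(new byte[0]) keeps the bytes non-null so equality and hashing stay well defined. An illustrative check, using only the factories that appear in this series:

    // Illustrative: the empty-range result must carry a real (empty)
    // byte array, or comparisons against an empty string can break.
    UTF8String empty = UTF8String.fromBytes(new byte[0]);
    assert empty.equals(UTF8String.fromString(""));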
--- .../scala/org/apache/spark/sql/types/UTF8StringSuite.scala | 7 +++++++ .../java/org/apache/spark/unsafe/types/UTF8String.java | 4 ++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/types/UTF8StringSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/types/UTF8StringSuite.scala index 501a7c00d6f72..87d2fa08c69d0 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/types/UTF8StringSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/types/UTF8StringSuite.scala @@ -44,25 +44,32 @@ class UTF8StringSuite extends SparkFunSuite { test("contains") { assert(UTF8String.fromString("hello").contains(UTF8String.fromString("ello"))) assert(!UTF8String.fromString("hello").contains(UTF8String.fromString("vello"))) + assert(!UTF8String.fromString("hello").contains(UTF8String.fromString("hellooo"))) assert(UTF8String.fromString("大千世界").contains(UTF8String.fromString("千世"))) assert(!UTF8String.fromString("大千世界").contains(UTF8String.fromString("世千"))) + assert(!UTF8String.fromString("大千世界").contains(UTF8String.fromString("大千世界好"))) } test("prefix") { assert(UTF8String.fromString("hello").startsWith(UTF8String.fromString("hell"))) assert(!UTF8String.fromString("hello").startsWith(UTF8String.fromString("ell"))) + assert(!UTF8String.fromString("hello").startsWith(UTF8String.fromString("hellooo"))) assert(UTF8String.fromString("大千世界").startsWith(UTF8String.fromString("大千"))) assert(!UTF8String.fromString("大千世界").startsWith(UTF8String.fromString("千"))) + assert(!UTF8String.fromString("大千世界").startsWith(UTF8String.fromString("大千世界好"))) } test("suffix") { assert(UTF8String.fromString("hello").endsWith(UTF8String.fromString("ello"))) assert(!UTF8String.fromString("hello").endsWith(UTF8String.fromString("ellov"))) + assert(!UTF8String.fromString("hello").endsWith(UTF8String.fromString("hhhello"))) assert(UTF8String.fromString("大千世界").endsWith(UTF8String.fromString("世界"))) assert(!UTF8String.fromString("大千世界").endsWith(UTF8String.fromString("世"))) + assert(!UTF8String.fromString("大千世界").endsWith(UTF8String.fromString("我的大千世界"))) } test("slice") { + assert(UTF8String.fromString("hello").slice(0, 0) == UTF8String.fromString("")) assert(UTF8String.fromString("hello").slice(1, 3) == UTF8String.fromString("el")) assert(UTF8String.fromString("大千世界").slice(0, 1) == UTF8String.fromString("大")) assert(UTF8String.fromString("大千世界").slice(1, 3) == UTF8String.fromString("千世")) diff --git a/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java index f391cac28eb0c..d9fb9c56d0d41 100644 --- a/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java +++ b/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java @@ -97,11 +97,11 @@ public byte[] getBytes() { /** * Returns a substring of this. * @param start the position of first code point - * @param until the position after last code point + * @param until the position after last code point, exclusive. */ public UTF8String slice(final int start, final int until) { if (until <= start || start >= bytes.length) { - return new UTF8String(); + return UTF8String.fromBytes(new byte[0]); } int i = 0; From 911c450acc97ecadce107dfbc036c2a509fd0bc7 Mon Sep 17 00:00:00 2001 From: Reynold Xin Date: Wed, 10 Jun 2015 17:32:24 -0700 Subject: [PATCH 07/15] Moved unit test also to Java. 
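slice is renamed to substring to match java.lang.String, Substring in stringOperations.scala is updated to call it, and the Scala suite is replaced by a JUnit suite in the unsafe module, next to the class it covers. Both substring positions are code-point indexes, not byte offsets: the implementation walks the buffer one UTF-8 sequence at a time. A hedged sketch of that stepping logic (numBytesForFirstByte is an illustrative stand-in for a lookup against the bytesOfCodePointInUTF8 table; the real code may differ):

    // Illustrative: width of a UTF-8 sequence from its leading byte,
    // assuming well-formed input (continuation bytes never passed in).
    static int numBytesForFirstByte(byte b) {
      if (b >= 0) return 1;         // ASCII: high bit clear
      int u = b & 0xFF;
      if (u >= 0xF0) return 4;      // 11110xxx
      if (u >= 0xE0) return 3;      // 1110xxxx
      return 2;                     // 110xxxxx
    }

    // Illustrative: advance a byte offset past codePoints code points,
    // clamping at the end of the buffer.
    static int skipCodePoints(byte[] bytes, int fromByte, int codePoints) {
      int i = fromByte;
      while (codePoints > 0 && i < bytes.length) {
        i += numBytesForFirstByte(bytes[i]);
        codePoints--;
      }
      return i;
    }

This is why substring(3, 5) on the four-character 大千世界 yields 界: positions count characters, and the out-of-range end is clamped to the end of the string.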
--- .../expressions/stringOperations.scala | 2 +- .../spark/sql/types/UTF8StringSuite.scala | 78 ---------------- .../apache/spark/unsafe/types/UTF8String.java | 2 +- .../spark/unsafe/bitset/BitSetSuite.java | 1 - .../spark/unsafe/types/UTF8StringSuite.java | 93 +++++++++++++++++++ 5 files changed, 95 insertions(+), 81 deletions(-) delete mode 100644 sql/catalyst/src/test/scala/org/apache/spark/sql/types/UTF8StringSuite.scala create mode 100644 unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala index d3056071e51d1..e712fc3d73762 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala @@ -278,7 +278,7 @@ case class Substring(str: Expression, pos: Expression, len: Expression) ba.slice(st, end) case s: UTF8String => val (st, end) = slicePos(start, length, () => s.length()) - s.slice(st, end) + s.substring(st, end) } } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/types/UTF8StringSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/types/UTF8StringSuite.scala deleted file mode 100644 index 87d2fa08c69d0..0000000000000 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/types/UTF8StringSuite.scala +++ /dev/null @@ -1,78 +0,0 @@ -/* -* Licensed to the Apache Software Foundation (ASF) under one or more -* contributor license agreements. See the NOTICE file distributed with -* this work for additional information regarding copyright ownership. -* The ASF licenses this file to You under the Apache License, Version 2.0 -* (the "License"); you may not use this file except in compliance with -* the License. You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
-*/ - -package org.apache.spark.sql.types - -import org.apache.spark.SparkFunSuite -import org.apache.spark.unsafe.types.UTF8String - -// scalastyle:off -class UTF8StringSuite extends SparkFunSuite { - test("basic") { - def check(str: String, len: Int) { - assert(UTF8String.fromString(str).length === len) - assert(UTF8String.fromBytes(str.getBytes("utf8")).length() === len) - - assert(UTF8String.fromString(str) === str) - assert(UTF8String.fromBytes(str.getBytes("utf8")) === str) - assert(UTF8String.fromString(str).toString === str) - assert(UTF8String.fromBytes(str.getBytes("utf8")).toString == str) - assert(UTF8String.fromBytes(str.getBytes("utf8")) === UTF8String.fromString(str)) - - assert(UTF8String.fromString(str).hashCode() === - UTF8String.fromBytes(str.getBytes("utf8")).hashCode()) - } - - check("hello", 5) - check("世 界", 3) - } - - test("contains") { - assert(UTF8String.fromString("hello").contains(UTF8String.fromString("ello"))) - assert(!UTF8String.fromString("hello").contains(UTF8String.fromString("vello"))) - assert(!UTF8String.fromString("hello").contains(UTF8String.fromString("hellooo"))) - assert(UTF8String.fromString("大千世界").contains(UTF8String.fromString("千世"))) - assert(!UTF8String.fromString("大千世界").contains(UTF8String.fromString("世千"))) - assert(!UTF8String.fromString("大千世界").contains(UTF8String.fromString("大千世界好"))) - } - - test("prefix") { - assert(UTF8String.fromString("hello").startsWith(UTF8String.fromString("hell"))) - assert(!UTF8String.fromString("hello").startsWith(UTF8String.fromString("ell"))) - assert(!UTF8String.fromString("hello").startsWith(UTF8String.fromString("hellooo"))) - assert(UTF8String.fromString("大千世界").startsWith(UTF8String.fromString("大千"))) - assert(!UTF8String.fromString("大千世界").startsWith(UTF8String.fromString("千"))) - assert(!UTF8String.fromString("大千世界").startsWith(UTF8String.fromString("大千世界好"))) - } - - test("suffix") { - assert(UTF8String.fromString("hello").endsWith(UTF8String.fromString("ello"))) - assert(!UTF8String.fromString("hello").endsWith(UTF8String.fromString("ellov"))) - assert(!UTF8String.fromString("hello").endsWith(UTF8String.fromString("hhhello"))) - assert(UTF8String.fromString("大千世界").endsWith(UTF8String.fromString("世界"))) - assert(!UTF8String.fromString("大千世界").endsWith(UTF8String.fromString("世"))) - assert(!UTF8String.fromString("大千世界").endsWith(UTF8String.fromString("我的大千世界"))) - } - - test("slice") { - assert(UTF8String.fromString("hello").slice(0, 0) == UTF8String.fromString("")) - assert(UTF8String.fromString("hello").slice(1, 3) == UTF8String.fromString("el")) - assert(UTF8String.fromString("大千世界").slice(0, 1) == UTF8String.fromString("大")) - assert(UTF8String.fromString("大千世界").slice(1, 3) == UTF8String.fromString("千世")) - assert(UTF8String.fromString("大千世界").slice(3, 5) == UTF8String.fromString("界")) - } -} diff --git a/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java index d9fb9c56d0d41..322a1e54777cd 100644 --- a/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java +++ b/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java @@ -99,7 +99,7 @@ public byte[] getBytes() { * @param start the position of first code point * @param until the position after last code point, exclusive. 
*/ - public UTF8String slice(final int start, final int until) { + public UTF8String substring(final int start, final int until) { if (until <= start || start >= bytes.length) { return UTF8String.fromBytes(new byte[0]); } diff --git a/unsafe/src/test/java/org/apache/spark/unsafe/bitset/BitSetSuite.java b/unsafe/src/test/java/org/apache/spark/unsafe/bitset/BitSetSuite.java index 18393db9f382f..a93fc0ee297c4 100644 --- a/unsafe/src/test/java/org/apache/spark/unsafe/bitset/BitSetSuite.java +++ b/unsafe/src/test/java/org/apache/spark/unsafe/bitset/BitSetSuite.java @@ -18,7 +18,6 @@ package org.apache.spark.unsafe.bitset; import junit.framework.Assert; -import org.apache.spark.unsafe.bitset.BitSet; import org.junit.Test; import org.apache.spark.unsafe.memory.MemoryBlock; diff --git a/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java b/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java new file mode 100644 index 0000000000000..bf0197c645f49 --- /dev/null +++ b/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java @@ -0,0 +1,93 @@ +/* +* Licensed to the Apache Software Foundation (ASF) under one or more +* contributor license agreements. See the NOTICE file distributed with +* this work for additional information regarding copyright ownership. +* The ASF licenses this file to You under the Apache License, Version 2.0 +* (the "License"); you may not use this file except in compliance with +* the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*/ + +package org.apache.spark.unsafe.types; + +import java.io.UnsupportedEncodingException; + +import junit.framework.Assert; +import org.junit.Test; + +public class UTF8StringSuite { + + private void checkBasic(String str, int len) throws UnsupportedEncodingException { + Assert.assertEquals(UTF8String.fromString(str).length(), len); + Assert.assertEquals(UTF8String.fromBytes(str.getBytes("utf8")).length(), len); + + Assert.assertEquals(UTF8String.fromString(str), str); + Assert.assertEquals(UTF8String.fromBytes(str.getBytes("utf8")), str); + Assert.assertEquals(UTF8String.fromString(str).toString(), str); + Assert.assertEquals(UTF8String.fromBytes(str.getBytes("utf8")).toString(), str); + Assert.assertEquals(UTF8String.fromBytes(str.getBytes("utf8")), UTF8String.fromString(str)); + + Assert.assertEquals(UTF8String.fromString(str).hashCode(), + UTF8String.fromBytes(str.getBytes("utf8")).hashCode()); + } + + @Test + public void basicTest() throws UnsupportedEncodingException { + checkBasic("hello", 5); + checkBasic("世 界", 3); + } + + @Test + public void contains() { + Assert.assertTrue(UTF8String.fromString("hello").contains(UTF8String.fromString("ello"))); + Assert.assertFalse(UTF8String.fromString("hello").contains(UTF8String.fromString("vello"))); + Assert.assertFalse(UTF8String.fromString("hello").contains(UTF8String.fromString("hellooo"))); + Assert.assertTrue(UTF8String.fromString("大千世界").contains(UTF8String.fromString("千世"))); + Assert.assertFalse(UTF8String.fromString("大千世界").contains(UTF8String.fromString("世千"))); + Assert.assertFalse( + UTF8String.fromString("大千世界").contains(UTF8String.fromString("大千世界好"))); + } + + @Test + public void startsWith() { + Assert.assertTrue(UTF8String.fromString("hello").startsWith(UTF8String.fromString("hell"))); + Assert.assertFalse(UTF8String.fromString("hello").startsWith(UTF8String.fromString("ell"))); + Assert.assertFalse(UTF8String.fromString("hello").startsWith(UTF8String.fromString("hellooo"))); + Assert.assertTrue(UTF8String.fromString("大千世界").startsWith(UTF8String.fromString("大千"))); + Assert.assertFalse(UTF8String.fromString("大千世界").startsWith(UTF8String.fromString("千"))); + Assert.assertFalse( + UTF8String.fromString("大千世界").startsWith(UTF8String.fromString("大千世界好"))); + } + + @Test + public void endsWith() { + Assert.assertTrue(UTF8String.fromString("hello").endsWith(UTF8String.fromString("ello"))); + Assert.assertFalse(UTF8String.fromString("hello").endsWith(UTF8String.fromString("ellov"))); + Assert.assertFalse(UTF8String.fromString("hello").endsWith(UTF8String.fromString("hhhello"))); + Assert.assertTrue(UTF8String.fromString("大千世界").endsWith(UTF8String.fromString("世界"))); + Assert.assertFalse(UTF8String.fromString("大千世界").endsWith(UTF8String.fromString("世"))); + Assert.assertFalse( + UTF8String.fromString("大千世界").endsWith(UTF8String.fromString("我的大千世界"))); + } + + @Test + public void substring() { + Assert.assertEquals( + UTF8String.fromString("hello").substring(0, 0), UTF8String.fromString("")); + Assert.assertEquals( + UTF8String.fromString("hello").substring(1, 3), UTF8String.fromString("el")); + Assert.assertEquals( + UTF8String.fromString("大千世界").substring(0, 1), UTF8String.fromString("大")); + Assert.assertEquals( + UTF8String.fromString("大千世界").substring(1, 3), UTF8String.fromString("千世")); + Assert.assertEquals( + UTF8String.fromString("大千世界").substring(3, 5), UTF8String.fromString("界")); + } +} From 9a48e8dbd5f2c9917be65469367baa8e1c707bb5 Mon Sep 17 00:00:00 2001 From: Reynold Xin Date: Wed, 10 Jun 2015 18:39:24 -0700 
Subject: [PATCH 08/15] Fixed runtime compilation error. --- .../apache/spark/sql/catalyst/expressions/predicates.scala | 7 ++----- .../java/org/apache/spark/unsafe/types/UTF8String.java | 4 ++++ 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala index c9dc4f14db698..5729e6c95fd49 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala @@ -251,11 +251,8 @@ abstract class BinaryComparison extends BinaryExpression with Predicate { override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = { left.dataType match { - case dt: NumericType if ctx.isNativeType(dt) => defineCodeGen (ctx, ev, { - (c1, c3) => s"$c1 $symbol $c3" - }) - case StringType => - defineCodeGen (ctx, ev, (c1, c2) => s"$c1.compareTo($c2) $symbol 0") + case dt: NumericType if ctx.isNativeType(dt) => + defineCodeGen (ctx, ev, (c1, c3) => s"$c1 $symbol $c3") case DateType | TimestampType => defineCodeGen (ctx, ev, (c1, c3) => s"$c1 $symbol $c3") case _ => diff --git a/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java index 322a1e54777cd..ce50289600b0f 100644 --- a/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java +++ b/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java @@ -182,6 +182,10 @@ public int compareTo(final UTF8String other) { return bytes.length - b.length; } + public int compare(final UTF8String other) { + return compareTo(other); + } + @Override public boolean equals(final Object other) { if (other instanceof UTF8String) { From 53f8ef41b745ebe3832be6acbfeb3afd2d485e55 Mon Sep 17 00:00:00 2001 From: Reynold Xin Date: Wed, 10 Jun 2015 21:00:36 -0700 Subject: [PATCH 09/15] Hack Jenkins to run one test. --- dev/run-tests | 3 +++ 1 file changed, 3 insertions(+) diff --git a/dev/run-tests b/dev/run-tests index d178e2a4601ea..9b53be97550bf 100755 --- a/dev/run-tests +++ b/dev/run-tests @@ -180,6 +180,9 @@ echo "=========================================================================" CURRENT_BLOCK=$BLOCK_SPARK_UNIT_TESTS { + + sbt/sbt -Phive -Dspark.hive.whitelist="inputddl5.*" "hive/test-only org.apache.spark.sql.hive.execution.HiveCompatibilitySuite" + # If the Spark SQL tests are enabled, run the tests with the Hive profiles enabled. # This must be a single argument, as it is. if [ -n "$_RUN_SQL_TESTS" ]; then From 2cb3c699e43c4f7f838191245c078d7bf4e8bfe8 Mon Sep 17 00:00:00 2001 From: Reynold Xin Date: Thu, 11 Jun 2015 10:30:26 -0700 Subject: [PATCH 10/15] Use utf-8 encoding in set bytes. --- .../java/org/apache/spark/unsafe/types/UTF8String.java | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java index ce50289600b0f..a35168019549e 100644 --- a/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java +++ b/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java @@ -56,7 +56,13 @@ public static UTF8String fromString(String str) { * Updates the UTF8String with String. 
*/ public UTF8String set(final String str) { - bytes = str.getBytes(); + try { + bytes = str.getBytes("utf-8"); + } catch (UnsupportedEncodingException e) { + // Turn the exception into unchecked so we can find out about it at runtime, but + // don't need to add lots of boilerplate code everywhere. + PlatformDependent.throwException(e); + } return this; } From 82d58cc554240cc7ca579372fa8e88402edc70a4 Mon Sep 17 00:00:00 2001 From: Reynold Xin Date: Thu, 11 Jun 2015 10:31:43 -0700 Subject: [PATCH 11/15] Reset run-tests. --- dev/run-tests | 3 --- 1 file changed, 3 deletions(-) diff --git a/dev/run-tests b/dev/run-tests index 9b53be97550bf..d178e2a4601ea 100755 --- a/dev/run-tests +++ b/dev/run-tests @@ -180,9 +180,6 @@ echo "=========================================================================" CURRENT_BLOCK=$BLOCK_SPARK_UNIT_TESTS { - - sbt/sbt -Phive -Dspark.hive.whitelist="inputddl5.*" "hive/test-only org.apache.spark.sql.hive.execution.HiveCompatibilitySuite" - # If the Spark SQL tests are enabled, run the tests with the Hive profiles enabled. # This must be a single argument, as it is. if [ -n "$_RUN_SQL_TESTS" ]; then From 1ff7c823b573f17f0523db1d2b5a42a0690518ee Mon Sep 17 00:00:00 2001 From: Reynold Xin Date: Thu, 11 Jun 2015 12:46:43 -0700 Subject: [PATCH 12/15] Enable UTF-8 encoding. --- project/SparkBuild.scala | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala index d7e374558c5e2..a6298a71b15a5 100644 --- a/project/SparkBuild.scala +++ b/project/SparkBuild.scala @@ -149,7 +149,9 @@ object SparkBuild extends PomBuild { javacOptions in (Compile, doc) ++= { val Array(major, minor, _) = System.getProperty("java.version").split("\\.", 3) if (major.toInt >= 1 && minor.toInt >= 8) Seq("-Xdoclint:all", "-Xdoclint:-missing") else Seq.empty - } + }, + + javacOptions in (Compile, doc) += "-encoding UTF-8" ) def enable(settings: Seq[Setting[_]])(projectRef: ProjectRef) = { From a3b124db50291b00b41c12973fc67eefee0ff917 Mon Sep 17 00:00:00 2001 From: Reynold Xin Date: Thu, 11 Jun 2015 12:50:41 -0700 Subject: [PATCH 13/15] Try different UTF-8 encoded characters. 
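The suite previously used 大千世界 for every multi-byte case; swapping some occurrences for 数据砖头 checks that the encoding fixes are not specific to one literal. The underlying hazard: String.getBytes() with no argument uses the platform default charset, and javac likewise decodes source files with the platform default unless told otherwise, so non-ASCII literals can round-trip as the wrong bytes on a non-UTF-8 machine. A small self-contained illustration (not part of the patch):

    import java.nio.charset.StandardCharsets;
    import java.util.Arrays;

    public class EncodingDemo {
      public static void main(String[] args) {
        // On a JVM whose default charset is not UTF-8 (e.g. GBK or
        // ISO-8859-1), these two arrays differ, and any byte-level
        // comparison against UTF-8 data silently fails.
        byte[] platformBytes = "数据砖头".getBytes();
        byte[] utf8Bytes = "数据砖头".getBytes(StandardCharsets.UTF_8);
        System.out.println(Arrays.equals(platformBytes, utf8Bytes));
      }
    }

Patch 10 already pins set() to "utf-8" on the runtime side; patches 12 through 15 pin the compiler side.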
--- .../org/apache/spark/unsafe/types/UTF8StringSuite.java | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java b/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java index bf0197c645f49..80c179a1b5e75 100644 --- a/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java +++ b/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java @@ -60,7 +60,7 @@ public void startsWith() { Assert.assertTrue(UTF8String.fromString("hello").startsWith(UTF8String.fromString("hell"))); Assert.assertFalse(UTF8String.fromString("hello").startsWith(UTF8String.fromString("ell"))); Assert.assertFalse(UTF8String.fromString("hello").startsWith(UTF8String.fromString("hellooo"))); - Assert.assertTrue(UTF8String.fromString("大千世界").startsWith(UTF8String.fromString("大千"))); + Assert.assertTrue(UTF8String.fromString("数据砖头").startsWith(UTF8String.fromString("数据"))); Assert.assertFalse(UTF8String.fromString("大千世界").startsWith(UTF8String.fromString("千"))); Assert.assertFalse( UTF8String.fromString("大千世界").startsWith(UTF8String.fromString("大千世界好"))); @@ -74,7 +74,7 @@ public void endsWith() { Assert.assertTrue(UTF8String.fromString("大千世界").endsWith(UTF8String.fromString("世界"))); Assert.assertFalse(UTF8String.fromString("大千世界").endsWith(UTF8String.fromString("世"))); Assert.assertFalse( - UTF8String.fromString("大千世界").endsWith(UTF8String.fromString("我的大千世界"))); + UTF8String.fromString("数据砖头").endsWith(UTF8String.fromString("我的数据砖头"))); } @Test @@ -84,10 +84,10 @@ public void substring() { Assert.assertEquals( UTF8String.fromString("hello").substring(1, 3), UTF8String.fromString("el")); Assert.assertEquals( - UTF8String.fromString("大千世界").substring(0, 1), UTF8String.fromString("大")); + UTF8String.fromString("数据砖头").substring(0, 1), UTF8String.fromString("数")); Assert.assertEquals( - UTF8String.fromString("大千世界").substring(1, 3), UTF8String.fromString("千世")); + UTF8String.fromString("数据砖头").substring(1, 3), UTF8String.fromString("据砖")); Assert.assertEquals( - UTF8String.fromString("大千世界").substring(3, 5), UTF8String.fromString("界")); + UTF8String.fromString("数据砖头").substring(3, 5), UTF8String.fromString("头")); } } From 98e600bb8cca55ba370c29fd7bc0a43aee46ebc5 Mon Sep 17 00:00:00 2001 From: Reynold Xin Date: Thu, 11 Jun 2015 13:59:00 -0700 Subject: [PATCH 14/15] Another try with encoding setting .. --- project/SparkBuild.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala index 3d4b583677053..79752db0a806b 100644 --- a/project/SparkBuild.scala +++ b/project/SparkBuild.scala @@ -151,7 +151,7 @@ object SparkBuild extends PomBuild { if (major.toInt >= 1 && minor.toInt >= 8) Seq("-Xdoclint:all", "-Xdoclint:-missing") else Seq.empty }, - javacOptions in (Compile, doc) += "-encoding UTF-8" + javacOptions in Compile += "-encoding UTF-8" ) def enable(settings: Seq[Setting[_]])(projectRef: ProjectRef) = { From 562dc6e8a4cdbb483c26dd7b02577b8cb35c4921 Mon Sep 17 00:00:00 2001 From: Reynold Xin Date: Thu, 11 Jun 2015 14:12:26 -0700 Subject: [PATCH 15/15] Flag... 
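Patch 14 moved the option from the Javadoc task to the main Compile configuration but still appended "-encoding UTF-8" as a single element, so javac received one argument containing a space and rejected it as an invalid flag. The option and its value have to be separate argv elements, hence Seq("-encoding", "UTF-8"). The same rule applies whenever arguments are passed as a list rather than a shell string; an illustrative Java analogue (file names are hypothetical):

    import java.io.IOException;

    public class ArgvDemo {
      public static void main(String[] args) throws IOException {
        // Wrong: argv elements are not re-tokenized, so javac sees the
        // single bogus argument "-encoding UTF-8".
        new ProcessBuilder("javac", "-encoding UTF-8", "Foo.java").inheritIO().start();
        // Right: the flag and its value as two separate elements.
        new ProcessBuilder("javac", "-encoding", "UTF-8", "Foo.java").inheritIO().start();
      }
    }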
--- project/SparkBuild.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala index 79752db0a806b..41b7eba3a06c2 100644 --- a/project/SparkBuild.scala +++ b/project/SparkBuild.scala @@ -151,7 +151,7 @@ object SparkBuild extends PomBuild { if (major.toInt >= 1 && minor.toInt >= 8) Seq("-Xdoclint:all", "-Xdoclint:-missing") else Seq.empty }, - javacOptions in Compile += "-encoding UTF-8" + javacOptions in Compile ++= Seq("-encoding", "UTF-8") ) def enable(settings: Seq[Setting[_]])(projectRef: ProjectRef) = {