[SPARK-9024] Unsafe HashJoin/HashOuterJoin/HashSemiJoin #7480

Closed · wants to merge 19 commits · showing changes from 6 commits
@@ -17,10 +17,11 @@

package org.apache.spark.sql.catalyst.expressions;

import org.apache.spark.sql.catalyst.InternalRow;
import org.apache.spark.sql.catalyst.util.ObjectPool;
import org.apache.spark.unsafe.PlatformDependent;
import org.apache.spark.unsafe.array.ByteArrayMethods;
import org.apache.spark.unsafe.bitset.BitSetMethods;
import org.apache.spark.unsafe.hash.Murmur3_x86_32;
import org.apache.spark.unsafe.types.UTF8String;


@@ -345,7 +346,7 @@ public double getDouble(int i) {
* This method is only supported on UnsafeRows that do not use ObjectPools.
*/
@Override
public InternalRow copy() {
public UnsafeRow copy() {
if (pool != null) {
throw new UnsupportedOperationException(
"Copy is not supported for UnsafeRows that use object pools");
@@ -365,8 +366,50 @@ public InternalRow copy() {
}
}

@Override
public int hashCode() {
return Murmur3_x86_32.hashUnsafeWords(baseObject, baseOffset, sizeInBytes, 42);
}

@Override
public boolean equals(Object other) {
if (other instanceof UnsafeRow) {
UnsafeRow o = (UnsafeRow) other;
return ByteArrayMethods.arrayEquals(baseObject, baseOffset, o.baseObject, o.baseOffset,
sizeInBytes);
}
return false;
}

Contributor comment (on the arrayEquals call): I think that we should check whether the rows' sizeInBytes are equal before attempting to compare their contents.
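The guard the reviewer has in mind might look like this minimal Scala sketch; unsafeRowEquals is a hypothetical standalone helper written against the getBytes() added below (UnsafeRow itself is Java), not part of this patch:

import org.apache.spark.sql.catalyst.expressions.UnsafeRow

// Hypothetical helper: reject rows of different sizes before comparing bytes.
// ByteArrayMethods.arrayEquals takes a single length, so comparing with one
// row's sizeInBytes is only safe once both sizes are known to match.
def unsafeRowEquals(a: UnsafeRow, b: UnsafeRow): Boolean = {
  val aBytes = a.getBytes
  val bBytes = b.getBytes
  aBytes.length == bBytes.length && java.util.Arrays.equals(aBytes, bBytes)
}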

/**
* Returns the underline bytes for this UnsafeRow.
*/

Contributor comment (on the doc line above): "underline" -> "underlying"
public byte[] getBytes() {
if (baseObject instanceof byte[] && baseOffset == PlatformDependent.BYTE_ARRAY_OFFSET
&& (((byte[]) baseObject).length == sizeInBytes)) {
return (byte[]) baseObject;
} else {
byte[] bytes = new byte[sizeInBytes];
PlatformDependent.copyMemory(baseObject, baseOffset, bytes,
PlatformDependent.BYTE_ARRAY_OFFSET, sizeInBytes);
return bytes;
}
}

Contributor comment (on the byte[] fast path in getBytes): This is a nice optimization!

// This is for debugging
@Override
public String toString(){
Contributor comment: Style nit: space after ().

StringBuilder build = new StringBuilder("[");
for (int i = 0; i < sizeInBytes; i += 8) {
build.append(PlatformDependent.UNSAFE.getLong(baseObject, baseOffset + i));
build.append(',');
}
build.append(']');
return build.toString();
}

@Override
public boolean anyNull() {
return BitSetMethods.anySet(baseObject, baseOffset, bitSetWidthInBytes);
return BitSetMethods.anySet(baseObject, baseOffset, bitSetWidthInBytes / 8);
Contributor comment: can you add a unit test for this? i'd imagine it affects correctness

}
}
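On the unit-test request: anySet appears to expect a word count, so passing the byte width makes it scan eight times too many words, reading past the null bitset into the data region; a null-free row with non-zero data could then falsely report a null. A minimal regression-test sketch, assuming this branch's UnsafeProjection API (which suite it belongs in is left open):

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.UnsafeProjection
import org.apache.spark.sql.types._

val project = UnsafeProjection.create(Array[DataType](IntegerType, LongType))

// Non-zero data words but no nulls: before the fix this could report true.
assert(!project(InternalRow(1, 2L)).anyNull)

// A genuinely null field must still be detected.
assert(project(InternalRow(null, 2L)).anyNull)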
@@ -28,11 +28,10 @@
import org.apache.spark.TaskContext;
import org.apache.spark.sql.AbstractScalaRowIterator;
import org.apache.spark.sql.catalyst.InternalRow;
import org.apache.spark.sql.catalyst.expressions.UnsafeColumnWriter;
import org.apache.spark.sql.catalyst.expressions.UnsafeProjection;
import org.apache.spark.sql.catalyst.expressions.UnsafeRow;
import org.apache.spark.sql.catalyst.util.ObjectPool;
import org.apache.spark.sql.types.*;
import org.apache.spark.sql.types.StructType;
import org.apache.spark.unsafe.PlatformDependent;
import org.apache.spark.util.collection.unsafe.sort.PrefixComparator;
import org.apache.spark.util.collection.unsafe.sort.RecordComparator;
@@ -176,12 +175,7 @@ public Iterator<InternalRow> sort(Iterator<InternalRow> inputIterator) throws IO
*/
public static boolean supportsSchema(StructType schema) {
// TODO: add spilling note to explain why we do this for now:
for (StructField field : schema.fields()) {
if (!UnsafeColumnWriter.canEmbed(field.dataType())) {
return false;
}
}
return true;
return UnsafeProjection.canSupport(schema);
}

private static final class RowComparator extends RecordComparator {
@@ -21,7 +21,6 @@ import org.apache.spark.Logging
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.errors.attachTree
import org.apache.spark.sql.catalyst.expressions.codegen.{CodeGenContext, GeneratedExpressionCode}
import org.apache.spark.sql.catalyst.trees
import org.apache.spark.sql.types._

/**
@@ -34,7 +33,23 @@ case class BoundReference(ordinal: Int, dataType: DataType, nullable: Boolean)

override def toString: String = s"input[$ordinal]"

override def eval(input: InternalRow): Any = input(ordinal)
// Use special getter for primitive types (for UnsafeRow)
override def eval(input: InternalRow): Any = {
if (input.isNullAt(ordinal)) {
null
} else {
dataType match {
case BooleanType => input.getBoolean(ordinal)
case ByteType => input.getByte(ordinal)
case ShortType => input.getShort(ordinal)
case IntegerType | DateType => input.getInt(ordinal)
case LongType | TimestampType => input.getLong(ordinal)
case FloatType => input.getFloat(ordinal)
case DoubleType => input.getDouble(ordinal)
case _ => input.get(ordinal)
}
}
}

override def name: String = s"i[$ordinal]"

@@ -83,19 +83,32 @@ abstract class UnsafeProjection extends Projection {
}

object UnsafeProjection {
def canSupport(schema: StructType): Boolean = canSupport(schema.fields.map(_.dataType))
def canSupport(types: Seq[DataType]): Boolean = types.forall(UnsafeColumnWriter.canEmbed(_))
Contributor comment: You could even add a canSupport(exprs: Seq[Expression]) to be able to save some characters elsewhere.
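A sketch of that overload, hypothetical and mirroring the canSupport(types) one-liner above:

// Hypothetical overload for object UnsafeProjection: accept bound expressions
// directly instead of making callers map to data types first.
def canSupport(exprs: Seq[Expression]): Boolean =
  exprs.forall(e => UnsafeColumnWriter.canEmbed(e.dataType))

Call sites like buildHashRelation below could then write UnsafeProjection.canSupport(buildKeys) instead of mapping over _.dataType.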


def create(schema: StructType): UnsafeProjection = create(schema.fields.map(_.dataType))

def create(fields: Seq[DataType]): UnsafeProjection = {
def create(fields: Array[DataType]): UnsafeProjection = {
val exprs = fields.zipWithIndex.map(x => new BoundReference(x._2, x._1, true))
create(exprs)
}

def create(exprs: Seq[Expression]): UnsafeProjection = {
GenerateUnsafeProjection.generate(exprs)
}

def create(exprs: Seq[Expression], inputSchema: Seq[Attribute]): UnsafeProjection = {
create(exprs.map(BindReferences.bindReference(_, inputSchema)))
}
}

/**
* A projection that can turn an UnsafeRow back into a GenericInternalRow
*/
case class FromUnsafeProjection(fields: Seq[DataType]) extends Projection {

def this(schema: StructType) = this(schema.fields.map(_.dataType))

private[this] val expressions = fields.zipWithIndex.map { case (dt, idx) =>
new BoundReference(idx, dt, true)
}
@@ -111,7 +111,7 @@ class UnsafeRowConverter(fieldTypes: Array[DataType]) {
/**
* Function for writing a column into an UnsafeRow.
*/
private abstract class UnsafeColumnWriter {
abstract class UnsafeColumnWriter {
/**
* Write a value into an UnsafeRow.
*
@@ -130,7 +130,7 @@ private abstract class UnsafeColumnWriter {
def getSize(source: InternalRow, column: Int): Int
}

private object UnsafeColumnWriter {
object UnsafeColumnWriter {

def forType(dataType: DataType): UnsafeColumnWriter = {
dataType match {
@@ -353,7 +353,7 @@ class StringExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
test("FORMAT") {
val f = 'f.string.at(0)
val d1 = 'd.int.at(1)
val s1 = 's.int.at(2)
val s1 = 's.string.at(2)

val row1 = create_row("aa%d%s", 12, "cc")
val row2 = create_row(null, 12, "cc")
@@ -62,7 +62,7 @@ case class BroadcastHashJoin(
private val broadcastFuture = future {
// Note that we use .execute().collect() because we don't want to convert data to Scala types
val input: Array[InternalRow] = buildPlan.execute().map(_.copy()).collect()
val hashed = HashedRelation(input.iterator, buildSideKeyGenerator, input.length)
val hashed = buildHashRelation(input.iterator)
sparkContext.broadcast(hashed)
}(BroadcastHashJoin.broadcastHashJoinExecutionContext)

@@ -17,6 +17,9 @@

package org.apache.spark.sql.execution.joins

import scala.concurrent._
import scala.concurrent.duration._

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
@@ -26,10 +29,6 @@ import org.apache.spark.sql.catalyst.plans.{JoinType, LeftOuter, RightOuter}
import org.apache.spark.sql.execution.{BinaryNode, SparkPlan}
import org.apache.spark.util.ThreadUtils

import scala.collection.JavaConversions._
import scala.concurrent._
import scala.concurrent.duration._

/**
* :: DeveloperApi ::
* Performs an outer hash join for two child relations. When the output RDD of this operator is
@@ -58,28 +57,11 @@ case class BroadcastHashOuterJoin(
override def requiredChildDistribution: Seq[Distribution] =
UnspecifiedDistribution :: UnspecifiedDistribution :: Nil

private[this] lazy val (buildPlan, streamedPlan) = joinType match {
case RightOuter => (left, right)
case LeftOuter => (right, left)
case x =>
throw new IllegalArgumentException(
s"BroadcastHashOuterJoin should not take $x as the JoinType")
}

private[this] lazy val (buildKeys, streamedKeys) = joinType match {
case RightOuter => (leftKeys, rightKeys)
case LeftOuter => (rightKeys, leftKeys)
case x =>
throw new IllegalArgumentException(
s"BroadcastHashOuterJoin should not take $x as the JoinType")
}

@transient
private val broadcastFuture = future {
// Note that we use .execute().collect() because we don't want to convert data to Scala types
val input: Array[InternalRow] = buildPlan.execute().map(_.copy()).collect()
// buildHashTable uses code-generated rows as keys, which are not serializable
val hashed = buildHashTable(input.iterator, newProjection(buildKeys, buildPlan.output))
val hashed = buildHashRelation(input.iterator)
sparkContext.broadcast(hashed)
}(BroadcastHashOuterJoin.broadcastHashOuterJoinExecutionContext)

@@ -96,14 +78,14 @@ case class BroadcastHashOuterJoin(
streamedIter.flatMap(currentRow => {
val rowKey = keyGenerator(currentRow)
joinedRow.withLeft(currentRow)
leftOuterIterator(rowKey, joinedRow, hashTable.getOrElse(rowKey, EMPTY_LIST))
leftOuterIterator(rowKey, joinedRow, hashTable.get(rowKey))
})

case RightOuter =>
streamedIter.flatMap(currentRow => {
val rowKey = keyGenerator(currentRow)
joinedRow.withRight(currentRow)
rightOuterIterator(rowKey, hashTable.getOrElse(rowKey, EMPTY_LIST), joinedRow)
rightOuterIterator(rowKey, hashTable.get(rowKey), joinedRow)
})

case x =>
@@ -40,16 +40,15 @@ case class BroadcastLeftSemiJoinHash(
val buildIter = right.execute().map(_.copy()).collect().toIterator

if (condition.isEmpty) {
// rowKey may be not serializable (from codegen)
val hashSet = buildKeyHashSet(buildIter, copy = true)
val hashSet = buildKeyHashSet(buildIter)
Contributor comment: Orthogonal to this patch, we should work on removing BroadcastLeftSemiJoinHash, and just use an equi-join. Otherwise we have too many paths we need to optimize for.

val broadcastedRelation = sparkContext.broadcast(hashSet)

left.execute().mapPartitions { streamIter =>
hashSemiJoin(streamIter, broadcastedRelation.value)
}
} else {
val hashRelation = HashedRelation(buildIter, rightKeyGenerator)
val broadcastedRelation = sparkContext.broadcast(hashRelation)
val hashed = buildHashRelation(buildIter)
val broadcastedRelation = sparkContext.broadcast(hashed)

left.execute().mapPartitions { streamIter =>
hashSemiJoin(streamIter, broadcastedRelation.value)
@@ -103,4 +103,16 @@ trait HashJoin {
}
}
}

protected[this] def buildHashRelation(buildIter: Iterator[InternalRow]): HashedRelation = {
if (self.codegenEnabled && UnsafeProjection.canSupport(buildKeys.map(_.dataType))
&& UnsafeProjection.canSupport(buildPlan.output.map(_.dataType))) {
UnsafeHashedRelation(
buildIter,
buildKeys.map(BindReferences.bindReference(_, buildPlan.output)),
buildPlan.schema)
} else {
HashedRelation(buildIter, buildSideKeyGenerator)
}
}
}
@@ -38,7 +38,7 @@ trait HashOuterJoin {
val left: SparkPlan
val right: SparkPlan

override def outputPartitioning: Partitioning = joinType match {
override def outputPartitioning: Partitioning = joinType match {
case LeftOuter => left.outputPartitioning
case RightOuter => right.outputPartitioning
case FullOuter => UnknownPartitioning(left.outputPartitioning.numPartitions)
@@ -59,6 +59,30 @@ override def outputPartitioning: Partitioning = joinType match {
}
}

protected[this] lazy val (buildPlan, streamedPlan) = joinType match {
case RightOuter => (left, right)
case LeftOuter => (right, left)
case x =>
throw new IllegalArgumentException(
s"BroadcastHashOuterJoin should not take $x as the JoinType")
Contributor comment: Since this code is now in HashOuterJoin instead of BroadcastHashOuterJoin, I think we should update this error message to reference the new class.
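Concretely, the suggestion is just to rename the class in both messages, e.g.:

case x =>
  throw new IllegalArgumentException(
    s"HashOuterJoin should not take $x as the JoinType")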

}

protected[this] lazy val (buildKeys, streamedKeys) = joinType match {
case RightOuter => (leftKeys, rightKeys)
case LeftOuter => (rightKeys, leftKeys)
case x =>
throw new IllegalArgumentException(
s"BroadcastHashOuterJoin should not take $x as the JoinType")
}

protected[this] def streamedKeyGenerator(): Projection = {
if (self.codegenEnabled && UnsafeProjection.canSupport(streamedKeys.map(_.dataType))) {
UnsafeProjection.create(streamedKeys, streamedPlan.output)
} else {
newProjection(streamedKeys, streamedPlan.output)
}
}

@transient private[this] lazy val DUMMY_LIST = CompactBuffer[InternalRow](null)
@transient protected[this] lazy val EMPTY_LIST = CompactBuffer[InternalRow]()

@@ -76,8 +100,12 @@ override def outputPartitioning: Partitioning = joinType match {
rightIter: Iterable[InternalRow]): Iterator[InternalRow] = {
val ret: Iterable[InternalRow] = {
if (!key.anyNull) {
val temp = rightIter.collect {
case r if boundCondition(joinedRow.withRight(r)) => joinedRow.copy()
val temp = if (rightIter != null) {
rightIter.collect {
case r if boundCondition(joinedRow.withRight(r)) => joinedRow.copy()
}
} else {
List()
Contributor comment: I think that you can use List.empty here, which, as far as I know, returns an immutable singleton. Not sure if List() creates a new instance or not...

Contributor comment: Given that the old code seemed to make a special point of using an EMPTY_LIST constant I'm thinking that it may be important to make sure we're not creating new objects here.
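For what it's worth, in the Scala versions Spark targets both forms evaluate to the Nil singleton, so neither allocates per call; a quick REPL check, offered as a sketch:

// Both List.empty and List() return the Nil singleton object.
assert(List.empty[Int] eq Nil)
assert(List() eq Nil)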

}
if (temp.isEmpty) {
joinedRow.withRight(rightNullRow).copy :: Nil
Expand All @@ -97,9 +125,13 @@ override def outputPartitioning: Partitioning = joinType match {
joinedRow: JoinedRow): Iterator[InternalRow] = {
val ret: Iterable[InternalRow] = {
if (!key.anyNull) {
val temp = leftIter.collect {
case l if boundCondition(joinedRow.withLeft(l)) =>
joinedRow.copy()
val temp = if (leftIter != null) {
leftIter.collect {
case l if boundCondition(joinedRow.withLeft(l)) =>
joinedRow.copy()
}
} else {
List()
}
if (temp.isEmpty) {
joinedRow.withLeft(leftNullRow).copy :: Nil
@@ -178,4 +210,16 @@ override def outputPartitioning: Partitioning = joinType match {

hashTable
}

protected[this] def buildHashRelation(buildIter: Iterator[InternalRow]): HashedRelation = {
if (self.codegenEnabled && UnsafeProjection.canSupport(buildKeys.map(_.dataType))
&& UnsafeProjection.canSupport(buildPlan.output.map(_.dataType))) {
UnsafeHashedRelation(
buildIter,
buildKeys.map(BindReferences.bindReference(_, buildPlan.output)),
buildPlan.schema)
} else {
HashedRelation(buildIter, newProjection(buildKeys, buildPlan.output))
}
}
}