From b56b950f6b4e88e590706f48ec32b01d00bf29d7 Mon Sep 17 00:00:00 2001
From: Ilya Ganelin
Date: Mon, 15 Dec 2014 10:50:01 -0800
Subject: [PATCH 01/25] Initial stub

---
 .../apache/spark/rdd/PairRDDFunctions.scala | 26 +++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala
index c43e1f2fe135e..6f3a18f4089c0 100644
--- a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala
+++ b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala
@@ -21,6 +21,8 @@ import java.nio.ByteBuffer
 import java.text.SimpleDateFormat
 import java.util.{Date, HashMap => JHashMap}
 
+import org.apache.hadoop.mapred.lib.MultipleTextOutputFormat
+
 import scala.collection.{Map, mutable}
 import scala.collection.JavaConversions._
 import scala.collection.mutable.ArrayBuffer
@@ -853,6 +855,30 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)])
     saveAsHadoopFile(path, keyClass, valueClass, fm.runtimeClass.asInstanceOf[Class[F]])
   }
 
+  class RDDMultipleTextOutputFormat extends MultipleTextOutputFormat[Any, Any] {
+    override def generateActualKey(key: Any, value: Any): Any =
+      NullWritable.get()
+
+    override def generateFileNameForKeyValue(key: Any, value: Any, name: String): String =
+      key.asInstanceOf[String]
+  }
+
+  /**
+   * Output the RDD to multiple files by key on any Hadoop-supported file system, using a Hadoop
+   * `OutputFormat` class supporting the key and value types K and V in this RDD.
+   *
+   * Example:
+   * [('N', 'Nick'), ('N', 'Nancy'), ('B', 'Bob'), ('B', 'Ben'), ('F', 'Frankie')]
+   * /path/prefix/B [/part-1, /part-2, etc]
+   * /path/prefix/F [/part-1, /part-2, etc]
+   * /path/prefix/N [/part-1, /part-2, etc]
+   */
+  def saveAsHadoopFileByKey[F <: OutputFormat[K, V]](path: String)(implicit fm: ClassTag[F]) {
+    val paths = this.keys.map("path/" + "key_" + _.toString)
+
+    saveAsHadoopFile(path, keyClass, valueClass, fm.runtimeClass.asInstanceOf[Class[F]])
+  }
+
   /**
    * Output the RDD to any Hadoop-supported file system, using a Hadoop `OutputFormat` class
    * supporting the key and value types K and V in this RDD.
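For context on the Hadoop API the stub above builds on: MultipleTextOutputFormat routes each record to an output file derived from its key. The standalone sketch below exercises that pattern directly through saveAsHadoopFile, independent of the proposed wrapper; the class, object, and path names are illustrative, and the key is assumed to already be a String.

    import org.apache.hadoop.io.NullWritable
    import org.apache.hadoop.mapred.lib.MultipleTextOutputFormat
    import org.apache.spark.{SparkConf, SparkContext}
    import org.apache.spark.SparkContext._

    // Route each record to <outputPath>/<key>/<part-file>, and drop the key from
    // the record body since it is already encoded in the directory name.
    class KeyBasedOutputFormat extends MultipleTextOutputFormat[Any, Any] {
      override def generateActualKey(key: Any, value: Any): Any = NullWritable.get()
      override def generateFileNameForKeyValue(key: Any, value: Any, name: String): String =
        key.toString + "/" + name
    }

    object SplitByKeyExample {
      def main(args: Array[String]): Unit = {
        val sc = new SparkContext(new SparkConf().setAppName("split-by-key").setMaster("local[2]"))
        val pairs = sc.parallelize(Seq(
          ("N", "Nick"), ("N", "Nancy"), ("B", "Bob"), ("B", "Ben"), ("F", "Frankie")))
        pairs.saveAsHadoopFile(
          "/tmp/prefix", classOf[String], classOf[String], classOf[KeyBasedOutputFormat])
        sc.stop()
      }
    }

Unlike the stub, this sketch appends the original part-file name after the key so that two tasks holding the same key do not write to the same leaf file.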
Compress the result with the From 757ab3ded6892a840e925c4c59c5061be091023d Mon Sep 17 00:00:00 2001 From: Ilya Ganelin Date: Sun, 28 Dec 2014 11:35:33 -0500 Subject: [PATCH 02/25] Updating tests --- .../org/apache/spark/rdd/PairRDDFunctions.scala | 12 ++++++++---- .../apache/spark/rdd/PairRDDFunctionsSuite.scala | 15 +++++++++++---- 2 files changed, 19 insertions(+), 8 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala index 6f3a18f4089c0..b62c8df3882e5 100644 --- a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala +++ b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala @@ -21,6 +21,7 @@ import java.nio.ByteBuffer import java.text.SimpleDateFormat import java.util.{Date, HashMap => JHashMap} +import org.apache.hadoop.io.NullWritable import org.apache.hadoop.mapred.lib.MultipleTextOutputFormat import scala.collection.{Map, mutable} @@ -873,12 +874,15 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)]) * /path/prefix/F [/part-1, /part-2, etc] * /path/prefix/N [/part-1, /part-2, etc] */ - def saveAsHadoopFileByKey[F <: OutputFormat[K, V]](path: String)(implicit fm: ClassTag[F]) { - val paths = this.keys.map("path/" + "key_" + _.toString) - - saveAsHadoopFile(path, keyClass, valueClass, fm.runtimeClass.asInstanceOf[Class[F]]) + def saveAsHadoopFileByKey[F <: OutputFormat[K, V]](path: String, numPartitions : Int) + (implicit fm: ClassTag[F]) { + partitionBy(new HashPartitioner(numPartitions)). + saveAsHadoopFileByKey(path) } + def saveAsHadoopFileByKey[F <: OutputFormat[K, V]](path: String)(implicit fm: ClassTag[F]) { + saveAsHadoopFile(path, keyClass, valueClass, classOf[RDDMultipleTextOutputFormat]) + } /** * Output the RDD to any Hadoop-supported file system, using a Hadoop `OutputFormat` class * supporting the key and value types K and V in this RDD. 
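The (path, numPartitions) overload introduced above only hash-partitions by key before delegating to the single-argument form, so that all records sharing a key are written by one task. A hedged sketch of the equivalent call sequence against stock Spark, reusing the illustrative KeyBasedOutputFormat from the previous note:

    import org.apache.spark.HashPartitioner
    import org.apache.spark.SparkContext._
    import org.apache.spark.rdd.RDD

    // Colocate each key on a single partition, then write with the key-splitting
    // output format; `pairs`, `path`, and `numPartitions` come from the caller.
    def saveGroupedByKey(pairs: RDD[(String, String)], path: String, numPartitions: Int): Unit = {
      pairs
        .partitionBy(new HashPartitioner(numPartitions))
        .saveAsHadoopFile(path, classOf[String], classOf[String], classOf[KeyBasedOutputFormat])
    }

Because generateFileNameForKeyValue in the patch returns only the key, records for one key must not be spread across tasks; pre-partitioning appears to be how this overload avoids two tasks racing on the same output file.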
Compress the result with the diff --git a/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala b/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala index 108f70af43f37..dfbca76f490e4 100644 --- a/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala +++ b/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala @@ -17,23 +17,30 @@ package org.apache.spark.rdd -import org.apache.hadoop.fs.FileSystem -import org.apache.hadoop.mapred._ -import org.apache.hadoop.util.Progressable - import scala.collection.mutable.{ArrayBuffer, HashSet} +import scala.sys.process._ import scala.util.Random import org.apache.hadoop.conf.{Configurable, Configuration} +import org.apache.hadoop.fs.{Path, FileSystem} +import org.apache.hadoop.mapred._ import org.apache.hadoop.mapreduce.{JobContext => NewJobContext, OutputCommitter => NewOutputCommitter, OutputFormat => NewOutputFormat, RecordWriter => NewRecordWriter, TaskAttemptContext => NewTaskAttempContext} import org.apache.spark.{Partitioner, SharedSparkContext} import org.apache.spark.util.Utils +import org.apache.hadoop.util.Progressable import org.scalatest.FunSuite class PairRDDFunctionsSuite extends FunSuite with SharedSparkContext { + test("saveAsHadoopFileByKey should generate a text file per key") { + val pairs = sc.parallelize((1 to 20).zipWithIndex) + val conf = new JobConf() + + pairs.saveAsHadoopFileByKey("testPath") + } + test("aggregateByKey") { val pairs = sc.parallelize(Array((1, 1), (1, 1), (3, 2), (5, 1), (5, 3)), 2) From 37132457688935c2791c9c9701a7b5af4ff91dac Mon Sep 17 00:00:00 2001 From: Ilya Ganelin Date: Mon, 5 Jan 2015 16:17:31 -0500 Subject: [PATCH 03/25] Test still failing during reflection processing in hadoop utils --- .../org/apache/spark/rdd/PairRDDFunctions.scala | 17 ++++++++++++++++- .../spark/rdd/PairRDDFunctionsSuite.scala | 9 ++++++--- 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala index 6ae6d38b51048..399cccad2440e 100644 --- a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala +++ b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala @@ -21,6 +21,7 @@ import java.nio.ByteBuffer import java.text.SimpleDateFormat import java.util.{Date, HashMap => JHashMap} +import org.apache.commons.lang.ClassUtils import org.apache.hadoop.io.NullWritable import org.apache.hadoop.mapred.lib.MultipleTextOutputFormat @@ -860,14 +861,24 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)]) } class RDDMultipleTextOutputFormat extends MultipleTextOutputFormat[Any, Any] { + def init() = { + println("Initializing multiple text output format saver") + } + override def generateActualKey(key: Any, value: Any): Any = + { NullWritable.get() + } override def generateFileNameForKeyValue(key: Any, value: Any, name: String): String = + { key.asInstanceOf[String] + } } /** + * TODO: This only works if the key is a java Object (can't work with primitive types) + * * Output the RDD to multiple files by key on any Hadoop-supported file system, using a Hadoop * `OutputFormat` class supporting the key and value types K and V in this RDD. * @@ -879,13 +890,17 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)]) */ def saveAsHadoopFileByKey[F <: OutputFormat[K, V]](path: String, numPartitions : Int) (implicit fm: ClassTag[F]) { + partitionBy(new HashPartitioner(numPartitions)). 
saveAsHadoopFileByKey(path) } def saveAsHadoopFileByKey[F <: OutputFormat[K, V]](path: String)(implicit fm: ClassTag[F]) { - saveAsHadoopFile(path, keyClass, valueClass, classOf[RDDMultipleTextOutputFormat]) + saveAsHadoopFile(path, ClassUtils.primitiveToWrapper(keyClass), + ClassUtils.primitiveToWrapper(valueClass), + classOf[RDDMultipleTextOutputFormat]) } + /** * Output the RDD to any Hadoop-supported file system, using a Hadoop `OutputFormat` class * supporting the key and value types K and V in this RDD. Compress the result with the diff --git a/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala b/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala index dfbca76f490e4..0568c475a9171 100644 --- a/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala +++ b/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala @@ -36,9 +36,12 @@ import org.scalatest.FunSuite class PairRDDFunctionsSuite extends FunSuite with SharedSparkContext { test("saveAsHadoopFileByKey should generate a text file per key") { val pairs = sc.parallelize((1 to 20).zipWithIndex) - val conf = new JobConf() - - pairs.saveAsHadoopFileByKey("testPath") + val fs = FileSystem.get(new Configuration()) + val basePath = sc.conf.get("spark.local.dir", "/tmp") + val fullPath = basePath + "testPath" + fs.delete(new Path(fullPath), true) + pairs.saveAsHadoopFileByKey(fullPath) + fs.delete(new Path(fullPath), true) } test("aggregateByKey") { From 5e615a2b734abbe74bac6a8ef83335c0c392ab41 Mon Sep 17 00:00:00 2001 From: Ilya Ganelin Date: Tue, 6 Jan 2015 11:09:35 -0500 Subject: [PATCH 04/25] Added init function to try to resolve reflection error --- .../src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala index 399cccad2440e..9da1ff4d12f09 100644 --- a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala +++ b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala @@ -861,7 +861,7 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)]) } class RDDMultipleTextOutputFormat extends MultipleTextOutputFormat[Any, Any] { - def init() = { + def init() : Unit = { println("Initializing multiple text output format saver") } @@ -890,7 +890,6 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)]) */ def saveAsHadoopFileByKey[F <: OutputFormat[K, V]](path: String, numPartitions : Int) (implicit fm: ClassTag[F]) { - partitionBy(new HashPartitioner(numPartitions)). 
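Patch 03 also routes keyClass and valueClass through ClassUtils.primitiveToWrapper before handing them to Hadoop. The small demo below (object name is illustrative) shows what that call does with commons-lang 2.2+: primitive classes are boxed, and reference types pass through unchanged.

    import org.apache.commons.lang.ClassUtils

    object PrimitiveToWrapperDemo {
      def main(args: Array[String]): Unit = {
        // classOf[Int] is the primitive class `int`, which boxes to java.lang.Integer.
        println(ClassUtils.primitiveToWrapper(classOf[Int]))     // class java.lang.Integer
        // Reference types are returned as-is.
        println(ClassUtils.primitiveToWrapper(classOf[String]))  // class java.lang.String
      }
    }

This is relevant to the test above, whose RDD[(Int, Int)] yields a primitive runtime keyClass; boxing it looks like an attempt to get past the reflection failure named in the commit message.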
saveAsHadoopFileByKey(path) } From 3defe515ff7e337945f7687010e56246498c7ada Mon Sep 17 00:00:00 2001 From: Ilya Ganelin Date: Thu, 15 Jan 2015 14:59:59 -0800 Subject: [PATCH 05/25] Attempting fix --- .../main/scala/org/apache/spark/rdd/PairRDDFunctions.scala | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala index 9da1ff4d12f09..2411fa1bbbdf9 100644 --- a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala +++ b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala @@ -860,11 +860,7 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)]) saveAsHadoopFile(path, keyClass, valueClass, fm.runtimeClass.asInstanceOf[Class[F]]) } - class RDDMultipleTextOutputFormat extends MultipleTextOutputFormat[Any, Any] { - def init() : Unit = { - println("Initializing multiple text output format saver") - } - + class RDDMultipleTextOutputFormat extends MultipleTextOutputFormat[Any, Any]() { override def generateActualKey(key: Any, value: Any): Any = { NullWritable.get() From a8199f6dd4a7518816d9be32963647d27aa282b2 Mon Sep 17 00:00:00 2001 From: Ilya Ganelin Date: Wed, 4 Mar 2015 13:24:39 -0800 Subject: [PATCH 06/25] Updated to fix init bug --- .../apache/spark/rdd/PairRDDFunctions.scala | 17 +- .../apache/spark/sql/UDFRegistration.scala | 571 ------------------ 2 files changed, 9 insertions(+), 579 deletions(-) delete mode 100644 sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala diff --git a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala index 0db868a9d417d..6d0eeb24b605d 100644 --- a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala +++ b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala @@ -323,7 +323,7 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)]) @deprecated("Use reduceByKeyLocally", "1.0.0") def reduceByKeyToDriver(func: (V, V) => V): Map[K, V] = reduceByKeyLocally(func) - /** + /** * Count the number of elements for each key, collecting the results to a local Map. * * Note that this method should only be used if the resulting map is expected to be small, as @@ -867,7 +867,8 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)]) saveAsHadoopFile(path, keyClass, valueClass, fm.runtimeClass.asInstanceOf[Class[F]]) } - class RDDMultipleTextOutputFormat extends MultipleTextOutputFormat[Any, Any]() { + class RDDMultipleTextOutputFormat() extends MultipleTextOutputFormat[Any, Any]() { + override def generateActualKey(key: Any, value: Any): Any = { NullWritable.get() @@ -881,10 +882,10 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)]) /** * TODO: This only works if the key is a java Object (can't work with primitive types) - * - * Output the RDD to multiple files by key on any Hadoop-supported file system, using a Hadoop + * + * Output the RDD to multiple files by key on any Hadoop-supported file system, using a Hadoop * `OutputFormat` class supporting the key and value types K and V in this RDD. 
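Patches 04 through 06 chase a reflection failure around instantiating the output format. One constraint that is easy to verify, and plausibly the one at play here: JobConf.getOutputFormat builds the configured class via ReflectionUtils.newInstance, which needs a no-argument constructor, while a class nested inside PairRDDFunctions keeps a hidden reference to its enclosing instance. The sketch below (illustrative names) shows the instantiation succeeding for a top-level format class.

    import org.apache.hadoop.conf.Configuration
    import org.apache.hadoop.io.NullWritable
    import org.apache.hadoop.mapred.lib.MultipleTextOutputFormat
    import org.apache.hadoop.util.ReflectionUtils

    // A top-level class: its no-arg constructor is visible to ReflectionUtils.
    class StandaloneKeyedOutputFormat extends MultipleTextOutputFormat[Any, Any] {
      override def generateActualKey(key: Any, value: Any): Any = NullWritable.get()
      override def generateFileNameForKeyValue(key: Any, value: Any, name: String): String =
        key.toString
    }

    object ReflectionCheck {
      def main(args: Array[String]): Unit = {
        // Mirrors how Hadoop constructs the configured OutputFormat at write time.
        val format = ReflectionUtils.newInstance(classOf[StandaloneKeyedOutputFormat], new Configuration())
        println(format.getClass.getName)
      }
    }

Running the same newInstance call against an inner class of PairRDDFunctions would fail with NoSuchMethodException, because its compiled constructor takes the enclosing instance as an extra argument.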
- * + * * Example: * [('N', 'Nick'), ('N', 'Nancy'), ('B', 'Bob'), ('B', 'Ben'), ('F', 'Frankie')] * /path/prefix/B [/part-1, /part-2, etc] @@ -898,11 +899,11 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)]) } def saveAsHadoopFileByKey[F <: OutputFormat[K, V]](path: String)(implicit fm: ClassTag[F]) { - saveAsHadoopFile(path, ClassUtils.primitiveToWrapper(keyClass), - ClassUtils.primitiveToWrapper(valueClass), + saveAsHadoopFile(path, ClassUtils.primitiveToWrapper(keyClass), + ClassUtils.primitiveToWrapper(valueClass), classOf[RDDMultipleTextOutputFormat]) } - + /** * Output the RDD to any Hadoop-supported file system, using a Hadoop `OutputFormat` class * supporting the key and value types K and V in this RDD. Compress the result with the diff --git a/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala b/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala deleted file mode 100644 index 8051df299252c..0000000000000 --- a/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala +++ /dev/null @@ -1,571 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.sql - -import java.util.{List => JList, Map => JMap} - -import scala.reflect.runtime.universe.TypeTag - -import org.apache.spark.{Accumulator, Logging} -import org.apache.spark.api.python.PythonBroadcast -import org.apache.spark.broadcast.Broadcast -import org.apache.spark.sql.api.java._ -import org.apache.spark.sql.catalyst.ScalaReflection -import org.apache.spark.sql.catalyst.expressions.{Expression, ScalaUdf} -import org.apache.spark.sql.execution.PythonUDF -import org.apache.spark.sql.types.DataType - - -/** - * Functions for registering user-defined functions. Use [[SQLContext.udf]] to access this. 
- */ -class UDFRegistration private[sql] (sqlContext: SQLContext) extends Logging { - - private val functionRegistry = sqlContext.functionRegistry - - protected[sql] def registerPython( - name: String, - command: Array[Byte], - envVars: JMap[String, String], - pythonIncludes: JList[String], - pythonExec: String, - broadcastVars: JList[Broadcast[PythonBroadcast]], - accumulator: Accumulator[JList[Array[Byte]]], - stringDataType: String): Unit = { - log.debug( - s""" - | Registering new PythonUDF: - | name: $name - | command: ${command.toSeq} - | envVars: $envVars - | pythonIncludes: $pythonIncludes - | pythonExec: $pythonExec - | dataType: $stringDataType - """.stripMargin) - - - val dataType = sqlContext.parseDataType(stringDataType) - - def builder(e: Seq[Expression]) = - PythonUDF( - name, - command, - envVars, - pythonIncludes, - pythonExec, - broadcastVars, - accumulator, - dataType, - e) - - functionRegistry.registerFunction(name, builder) - } - - // scalastyle:off - - /* register 0-22 were generated by this script - - (0 to 22).map { x => - val types = (1 to x).foldRight("RT")((i, s) => {s"A$i, $s"}) - val typeTags = (1 to x).map(i => s"A${i}: TypeTag").foldLeft("RT: TypeTag")(_ + ", " + _) - println(s""" - /** - * Register a Scala closure of ${x} arguments as user-defined function (UDF). - * @tparam RT return type of UDF. - */ - def register[$typeTags](name: String, func: Function$x[$types]): UserDefinedFunction = { - val dataType = ScalaReflection.schemaFor[RT].dataType - def builder(e: Seq[Expression]) = ScalaUdf(func, dataType, e) - functionRegistry.registerFunction(name, builder) - UserDefinedFunction(func, dataType) - }""") - } - - (1 to 22).foreach { i => - val extTypeArgs = (1 to i).map(_ => "_").mkString(", ") - val anyTypeArgs = (1 to i).map(_ => "Any").mkString(", ") - val anyCast = s".asInstanceOf[UDF$i[$anyTypeArgs, Any]]" - val anyParams = (1 to i).map(_ => "_: Any").mkString(", ") - println(s""" - |/** - | * Register a user-defined function with ${i} arguments. - | */ - |def register(name: String, f: UDF$i[$extTypeArgs, _], returnType: DataType) = { - | functionRegistry.registerFunction( - | name, - | (e: Seq[Expression]) => ScalaUdf(f$anyCast.call($anyParams), returnType, e)) - |}""".stripMargin) - } - */ - - /** - * Register a Scala closure of 0 arguments as user-defined function (UDF). - * @tparam RT return type of UDF. - */ - def register[RT: TypeTag](name: String, func: Function0[RT]): UserDefinedFunction = { - val dataType = ScalaReflection.schemaFor[RT].dataType - def builder(e: Seq[Expression]) = ScalaUdf(func, dataType, e) - functionRegistry.registerFunction(name, builder) - UserDefinedFunction(func, dataType) - } - - /** - * Register a Scala closure of 1 arguments as user-defined function (UDF). - * @tparam RT return type of UDF. - */ - def register[RT: TypeTag, A1: TypeTag](name: String, func: Function1[A1, RT]): UserDefinedFunction = { - val dataType = ScalaReflection.schemaFor[RT].dataType - def builder(e: Seq[Expression]) = ScalaUdf(func, dataType, e) - functionRegistry.registerFunction(name, builder) - UserDefinedFunction(func, dataType) - } - - /** - * Register a Scala closure of 2 arguments as user-defined function (UDF). - * @tparam RT return type of UDF. 
- */ - def register[RT: TypeTag, A1: TypeTag, A2: TypeTag](name: String, func: Function2[A1, A2, RT]): UserDefinedFunction = { - val dataType = ScalaReflection.schemaFor[RT].dataType - def builder(e: Seq[Expression]) = ScalaUdf(func, dataType, e) - functionRegistry.registerFunction(name, builder) - UserDefinedFunction(func, dataType) - } - - /** - * Register a Scala closure of 3 arguments as user-defined function (UDF). - * @tparam RT return type of UDF. - */ - def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag](name: String, func: Function3[A1, A2, A3, RT]): UserDefinedFunction = { - val dataType = ScalaReflection.schemaFor[RT].dataType - def builder(e: Seq[Expression]) = ScalaUdf(func, dataType, e) - functionRegistry.registerFunction(name, builder) - UserDefinedFunction(func, dataType) - } - - /** - * Register a Scala closure of 4 arguments as user-defined function (UDF). - * @tparam RT return type of UDF. - */ - def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag](name: String, func: Function4[A1, A2, A3, A4, RT]): UserDefinedFunction = { - val dataType = ScalaReflection.schemaFor[RT].dataType - def builder(e: Seq[Expression]) = ScalaUdf(func, dataType, e) - functionRegistry.registerFunction(name, builder) - UserDefinedFunction(func, dataType) - } - - /** - * Register a Scala closure of 5 arguments as user-defined function (UDF). - * @tparam RT return type of UDF. - */ - def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag](name: String, func: Function5[A1, A2, A3, A4, A5, RT]): UserDefinedFunction = { - val dataType = ScalaReflection.schemaFor[RT].dataType - def builder(e: Seq[Expression]) = ScalaUdf(func, dataType, e) - functionRegistry.registerFunction(name, builder) - UserDefinedFunction(func, dataType) - } - - /** - * Register a Scala closure of 6 arguments as user-defined function (UDF). - * @tparam RT return type of UDF. - */ - def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag](name: String, func: Function6[A1, A2, A3, A4, A5, A6, RT]): UserDefinedFunction = { - val dataType = ScalaReflection.schemaFor[RT].dataType - def builder(e: Seq[Expression]) = ScalaUdf(func, dataType, e) - functionRegistry.registerFunction(name, builder) - UserDefinedFunction(func, dataType) - } - - /** - * Register a Scala closure of 7 arguments as user-defined function (UDF). - * @tparam RT return type of UDF. - */ - def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag](name: String, func: Function7[A1, A2, A3, A4, A5, A6, A7, RT]): UserDefinedFunction = { - val dataType = ScalaReflection.schemaFor[RT].dataType - def builder(e: Seq[Expression]) = ScalaUdf(func, dataType, e) - functionRegistry.registerFunction(name, builder) - UserDefinedFunction(func, dataType) - } - - /** - * Register a Scala closure of 8 arguments as user-defined function (UDF). - * @tparam RT return type of UDF. - */ - def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag, A8: TypeTag](name: String, func: Function8[A1, A2, A3, A4, A5, A6, A7, A8, RT]): UserDefinedFunction = { - val dataType = ScalaReflection.schemaFor[RT].dataType - def builder(e: Seq[Expression]) = ScalaUdf(func, dataType, e) - functionRegistry.registerFunction(name, builder) - UserDefinedFunction(func, dataType) - } - - /** - * Register a Scala closure of 9 arguments as user-defined function (UDF). 
- * @tparam RT return type of UDF. - */ - def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag, A8: TypeTag, A9: TypeTag](name: String, func: Function9[A1, A2, A3, A4, A5, A6, A7, A8, A9, RT]): UserDefinedFunction = { - val dataType = ScalaReflection.schemaFor[RT].dataType - def builder(e: Seq[Expression]) = ScalaUdf(func, dataType, e) - functionRegistry.registerFunction(name, builder) - UserDefinedFunction(func, dataType) - } - - /** - * Register a Scala closure of 10 arguments as user-defined function (UDF). - * @tparam RT return type of UDF. - */ - def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag, A8: TypeTag, A9: TypeTag, A10: TypeTag](name: String, func: Function10[A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, RT]): UserDefinedFunction = { - val dataType = ScalaReflection.schemaFor[RT].dataType - def builder(e: Seq[Expression]) = ScalaUdf(func, dataType, e) - functionRegistry.registerFunction(name, builder) - UserDefinedFunction(func, dataType) - } - - /** - * Register a Scala closure of 11 arguments as user-defined function (UDF). - * @tparam RT return type of UDF. - */ - def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag, A8: TypeTag, A9: TypeTag, A10: TypeTag, A11: TypeTag](name: String, func: Function11[A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, A11, RT]): UserDefinedFunction = { - val dataType = ScalaReflection.schemaFor[RT].dataType - def builder(e: Seq[Expression]) = ScalaUdf(func, dataType, e) - functionRegistry.registerFunction(name, builder) - UserDefinedFunction(func, dataType) - } - - /** - * Register a Scala closure of 12 arguments as user-defined function (UDF). - * @tparam RT return type of UDF. - */ - def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag, A8: TypeTag, A9: TypeTag, A10: TypeTag, A11: TypeTag, A12: TypeTag](name: String, func: Function12[A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, A11, A12, RT]): UserDefinedFunction = { - val dataType = ScalaReflection.schemaFor[RT].dataType - def builder(e: Seq[Expression]) = ScalaUdf(func, dataType, e) - functionRegistry.registerFunction(name, builder) - UserDefinedFunction(func, dataType) - } - - /** - * Register a Scala closure of 13 arguments as user-defined function (UDF). - * @tparam RT return type of UDF. - */ - def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag, A8: TypeTag, A9: TypeTag, A10: TypeTag, A11: TypeTag, A12: TypeTag, A13: TypeTag](name: String, func: Function13[A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, A11, A12, A13, RT]): UserDefinedFunction = { - val dataType = ScalaReflection.schemaFor[RT].dataType - def builder(e: Seq[Expression]) = ScalaUdf(func, dataType, e) - functionRegistry.registerFunction(name, builder) - UserDefinedFunction(func, dataType) - } - - /** - * Register a Scala closure of 14 arguments as user-defined function (UDF). - * @tparam RT return type of UDF. 
- */ - def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag, A8: TypeTag, A9: TypeTag, A10: TypeTag, A11: TypeTag, A12: TypeTag, A13: TypeTag, A14: TypeTag](name: String, func: Function14[A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, A11, A12, A13, A14, RT]): UserDefinedFunction = { - val dataType = ScalaReflection.schemaFor[RT].dataType - def builder(e: Seq[Expression]) = ScalaUdf(func, dataType, e) - functionRegistry.registerFunction(name, builder) - UserDefinedFunction(func, dataType) - } - - /** - * Register a Scala closure of 15 arguments as user-defined function (UDF). - * @tparam RT return type of UDF. - */ - def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag, A8: TypeTag, A9: TypeTag, A10: TypeTag, A11: TypeTag, A12: TypeTag, A13: TypeTag, A14: TypeTag, A15: TypeTag](name: String, func: Function15[A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, A11, A12, A13, A14, A15, RT]): UserDefinedFunction = { - val dataType = ScalaReflection.schemaFor[RT].dataType - def builder(e: Seq[Expression]) = ScalaUdf(func, dataType, e) - functionRegistry.registerFunction(name, builder) - UserDefinedFunction(func, dataType) - } - - /** - * Register a Scala closure of 16 arguments as user-defined function (UDF). - * @tparam RT return type of UDF. - */ - def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag, A8: TypeTag, A9: TypeTag, A10: TypeTag, A11: TypeTag, A12: TypeTag, A13: TypeTag, A14: TypeTag, A15: TypeTag, A16: TypeTag](name: String, func: Function16[A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, A11, A12, A13, A14, A15, A16, RT]): UserDefinedFunction = { - val dataType = ScalaReflection.schemaFor[RT].dataType - def builder(e: Seq[Expression]) = ScalaUdf(func, dataType, e) - functionRegistry.registerFunction(name, builder) - UserDefinedFunction(func, dataType) - } - - /** - * Register a Scala closure of 17 arguments as user-defined function (UDF). - * @tparam RT return type of UDF. - */ - def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag, A8: TypeTag, A9: TypeTag, A10: TypeTag, A11: TypeTag, A12: TypeTag, A13: TypeTag, A14: TypeTag, A15: TypeTag, A16: TypeTag, A17: TypeTag](name: String, func: Function17[A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, A11, A12, A13, A14, A15, A16, A17, RT]): UserDefinedFunction = { - val dataType = ScalaReflection.schemaFor[RT].dataType - def builder(e: Seq[Expression]) = ScalaUdf(func, dataType, e) - functionRegistry.registerFunction(name, builder) - UserDefinedFunction(func, dataType) - } - - /** - * Register a Scala closure of 18 arguments as user-defined function (UDF). - * @tparam RT return type of UDF. - */ - def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag, A8: TypeTag, A9: TypeTag, A10: TypeTag, A11: TypeTag, A12: TypeTag, A13: TypeTag, A14: TypeTag, A15: TypeTag, A16: TypeTag, A17: TypeTag, A18: TypeTag](name: String, func: Function18[A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, A11, A12, A13, A14, A15, A16, A17, A18, RT]): UserDefinedFunction = { - val dataType = ScalaReflection.schemaFor[RT].dataType - def builder(e: Seq[Expression]) = ScalaUdf(func, dataType, e) - functionRegistry.registerFunction(name, builder) - UserDefinedFunction(func, dataType) - } - - /** - * Register a Scala closure of 19 arguments as user-defined function (UDF). 
- * @tparam RT return type of UDF. - */ - def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag, A8: TypeTag, A9: TypeTag, A10: TypeTag, A11: TypeTag, A12: TypeTag, A13: TypeTag, A14: TypeTag, A15: TypeTag, A16: TypeTag, A17: TypeTag, A18: TypeTag, A19: TypeTag](name: String, func: Function19[A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, A11, A12, A13, A14, A15, A16, A17, A18, A19, RT]): UserDefinedFunction = { - val dataType = ScalaReflection.schemaFor[RT].dataType - def builder(e: Seq[Expression]) = ScalaUdf(func, dataType, e) - functionRegistry.registerFunction(name, builder) - UserDefinedFunction(func, dataType) - } - - /** - * Register a Scala closure of 20 arguments as user-defined function (UDF). - * @tparam RT return type of UDF. - */ - def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag, A8: TypeTag, A9: TypeTag, A10: TypeTag, A11: TypeTag, A12: TypeTag, A13: TypeTag, A14: TypeTag, A15: TypeTag, A16: TypeTag, A17: TypeTag, A18: TypeTag, A19: TypeTag, A20: TypeTag](name: String, func: Function20[A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, A11, A12, A13, A14, A15, A16, A17, A18, A19, A20, RT]): UserDefinedFunction = { - val dataType = ScalaReflection.schemaFor[RT].dataType - def builder(e: Seq[Expression]) = ScalaUdf(func, dataType, e) - functionRegistry.registerFunction(name, builder) - UserDefinedFunction(func, dataType) - } - - /** - * Register a Scala closure of 21 arguments as user-defined function (UDF). - * @tparam RT return type of UDF. - */ - def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag, A8: TypeTag, A9: TypeTag, A10: TypeTag, A11: TypeTag, A12: TypeTag, A13: TypeTag, A14: TypeTag, A15: TypeTag, A16: TypeTag, A17: TypeTag, A18: TypeTag, A19: TypeTag, A20: TypeTag, A21: TypeTag](name: String, func: Function21[A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, A11, A12, A13, A14, A15, A16, A17, A18, A19, A20, A21, RT]): UserDefinedFunction = { - val dataType = ScalaReflection.schemaFor[RT].dataType - def builder(e: Seq[Expression]) = ScalaUdf(func, dataType, e) - functionRegistry.registerFunction(name, builder) - UserDefinedFunction(func, dataType) - } - - /** - * Register a Scala closure of 22 arguments as user-defined function (UDF). - * @tparam RT return type of UDF. - */ - def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag, A8: TypeTag, A9: TypeTag, A10: TypeTag, A11: TypeTag, A12: TypeTag, A13: TypeTag, A14: TypeTag, A15: TypeTag, A16: TypeTag, A17: TypeTag, A18: TypeTag, A19: TypeTag, A20: TypeTag, A21: TypeTag, A22: TypeTag](name: String, func: Function22[A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, A11, A12, A13, A14, A15, A16, A17, A18, A19, A20, A21, A22, RT]): UserDefinedFunction = { - val dataType = ScalaReflection.schemaFor[RT].dataType - def builder(e: Seq[Expression]) = ScalaUdf(func, dataType, e) - functionRegistry.registerFunction(name, builder) - UserDefinedFunction(func, dataType) - } - - ////////////////////////////////////////////////////////////////////////////////////////////// - ////////////////////////////////////////////////////////////////////////////////////////////// - - /** - * Register a user-defined function with 1 arguments. 
- */ - def register(name: String, f: UDF1[_, _], returnType: DataType) = { - functionRegistry.registerFunction( - name, - (e: Seq[Expression]) => ScalaUdf(f.asInstanceOf[UDF1[Any, Any]].call(_: Any), returnType, e)) - } - - /** - * Register a user-defined function with 2 arguments. - */ - def register(name: String, f: UDF2[_, _, _], returnType: DataType) = { - functionRegistry.registerFunction( - name, - (e: Seq[Expression]) => ScalaUdf(f.asInstanceOf[UDF2[Any, Any, Any]].call(_: Any, _: Any), returnType, e)) - } - - /** - * Register a user-defined function with 3 arguments. - */ - def register(name: String, f: UDF3[_, _, _, _], returnType: DataType) = { - functionRegistry.registerFunction( - name, - (e: Seq[Expression]) => ScalaUdf(f.asInstanceOf[UDF3[Any, Any, Any, Any]].call(_: Any, _: Any, _: Any), returnType, e)) - } - - /** - * Register a user-defined function with 4 arguments. - */ - def register(name: String, f: UDF4[_, _, _, _, _], returnType: DataType) = { - functionRegistry.registerFunction( - name, - (e: Seq[Expression]) => ScalaUdf(f.asInstanceOf[UDF4[Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any), returnType, e)) - } - - /** - * Register a user-defined function with 5 arguments. - */ - def register(name: String, f: UDF5[_, _, _, _, _, _], returnType: DataType) = { - functionRegistry.registerFunction( - name, - (e: Seq[Expression]) => ScalaUdf(f.asInstanceOf[UDF5[Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any), returnType, e)) - } - - /** - * Register a user-defined function with 6 arguments. - */ - def register(name: String, f: UDF6[_, _, _, _, _, _, _], returnType: DataType) = { - functionRegistry.registerFunction( - name, - (e: Seq[Expression]) => ScalaUdf(f.asInstanceOf[UDF6[Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any), returnType, e)) - } - - /** - * Register a user-defined function with 7 arguments. - */ - def register(name: String, f: UDF7[_, _, _, _, _, _, _, _], returnType: DataType) = { - functionRegistry.registerFunction( - name, - (e: Seq[Expression]) => ScalaUdf(f.asInstanceOf[UDF7[Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any), returnType, e)) - } - - /** - * Register a user-defined function with 8 arguments. - */ - def register(name: String, f: UDF8[_, _, _, _, _, _, _, _, _], returnType: DataType) = { - functionRegistry.registerFunction( - name, - (e: Seq[Expression]) => ScalaUdf(f.asInstanceOf[UDF8[Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any), returnType, e)) - } - - /** - * Register a user-defined function with 9 arguments. - */ - def register(name: String, f: UDF9[_, _, _, _, _, _, _, _, _, _], returnType: DataType) = { - functionRegistry.registerFunction( - name, - (e: Seq[Expression]) => ScalaUdf(f.asInstanceOf[UDF9[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any), returnType, e)) - } - - /** - * Register a user-defined function with 10 arguments. - */ - def register(name: String, f: UDF10[_, _, _, _, _, _, _, _, _, _, _], returnType: DataType) = { - functionRegistry.registerFunction( - name, - (e: Seq[Expression]) => ScalaUdf(f.asInstanceOf[UDF10[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any), returnType, e)) - } - - /** - * Register a user-defined function with 11 arguments. 
- */ - def register(name: String, f: UDF11[_, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType) = { - functionRegistry.registerFunction( - name, - (e: Seq[Expression]) => ScalaUdf(f.asInstanceOf[UDF11[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any), returnType, e)) - } - - /** - * Register a user-defined function with 12 arguments. - */ - def register(name: String, f: UDF12[_, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType) = { - functionRegistry.registerFunction( - name, - (e: Seq[Expression]) => ScalaUdf(f.asInstanceOf[UDF12[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any), returnType, e)) - } - - /** - * Register a user-defined function with 13 arguments. - */ - def register(name: String, f: UDF13[_, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType) = { - functionRegistry.registerFunction( - name, - (e: Seq[Expression]) => ScalaUdf(f.asInstanceOf[UDF13[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any), returnType, e)) - } - - /** - * Register a user-defined function with 14 arguments. - */ - def register(name: String, f: UDF14[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType) = { - functionRegistry.registerFunction( - name, - (e: Seq[Expression]) => ScalaUdf(f.asInstanceOf[UDF14[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any), returnType, e)) - } - - /** - * Register a user-defined function with 15 arguments. - */ - def register(name: String, f: UDF15[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType) = { - functionRegistry.registerFunction( - name, - (e: Seq[Expression]) => ScalaUdf(f.asInstanceOf[UDF15[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any), returnType, e)) - } - - /** - * Register a user-defined function with 16 arguments. - */ - def register(name: String, f: UDF16[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType) = { - functionRegistry.registerFunction( - name, - (e: Seq[Expression]) => ScalaUdf(f.asInstanceOf[UDF16[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any), returnType, e)) - } - - /** - * Register a user-defined function with 17 arguments. - */ - def register(name: String, f: UDF17[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType) = { - functionRegistry.registerFunction( - name, - (e: Seq[Expression]) => ScalaUdf(f.asInstanceOf[UDF17[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any), returnType, e)) - } - - /** - * Register a user-defined function with 18 arguments. 
- */ - def register(name: String, f: UDF18[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType) = { - functionRegistry.registerFunction( - name, - (e: Seq[Expression]) => ScalaUdf(f.asInstanceOf[UDF18[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any), returnType, e)) - } - - /** - * Register a user-defined function with 19 arguments. - */ - def register(name: String, f: UDF19[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType) = { - functionRegistry.registerFunction( - name, - (e: Seq[Expression]) => ScalaUdf(f.asInstanceOf[UDF19[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any), returnType, e)) - } - - /** - * Register a user-defined function with 20 arguments. - */ - def register(name: String, f: UDF20[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType) = { - functionRegistry.registerFunction( - name, - (e: Seq[Expression]) => ScalaUdf(f.asInstanceOf[UDF20[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any), returnType, e)) - } - - /** - * Register a user-defined function with 21 arguments. - */ - def register(name: String, f: UDF21[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType) = { - functionRegistry.registerFunction( - name, - (e: Seq[Expression]) => ScalaUdf(f.asInstanceOf[UDF21[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any), returnType, e)) - } - - /** - * Register a user-defined function with 22 arguments. 
- */ - def register(name: String, f: UDF22[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType) = { - functionRegistry.registerFunction( - name, - (e: Seq[Expression]) => ScalaUdf(f.asInstanceOf[UDF22[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any), returnType, e)) - } - - // scalastyle:on -} From aa1e6dcb8f9711a08850648fbc7bbe3caf760723 Mon Sep 17 00:00:00 2001 From: Ilya Ganelin Date: Wed, 4 Mar 2015 14:03:07 -0800 Subject: [PATCH 07/25] Restored lost UDF Reg --- .../apache/spark/sql/UDFRegistration.scala | 571 ++++++++++++++++++ 1 file changed, 571 insertions(+) create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala diff --git a/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala b/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala new file mode 100644 index 0000000000000..8051df299252c --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala @@ -0,0 +1,571 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql + +import java.util.{List => JList, Map => JMap} + +import scala.reflect.runtime.universe.TypeTag + +import org.apache.spark.{Accumulator, Logging} +import org.apache.spark.api.python.PythonBroadcast +import org.apache.spark.broadcast.Broadcast +import org.apache.spark.sql.api.java._ +import org.apache.spark.sql.catalyst.ScalaReflection +import org.apache.spark.sql.catalyst.expressions.{Expression, ScalaUdf} +import org.apache.spark.sql.execution.PythonUDF +import org.apache.spark.sql.types.DataType + + +/** + * Functions for registering user-defined functions. Use [[SQLContext.udf]] to access this. 
+ */ +class UDFRegistration private[sql] (sqlContext: SQLContext) extends Logging { + + private val functionRegistry = sqlContext.functionRegistry + + protected[sql] def registerPython( + name: String, + command: Array[Byte], + envVars: JMap[String, String], + pythonIncludes: JList[String], + pythonExec: String, + broadcastVars: JList[Broadcast[PythonBroadcast]], + accumulator: Accumulator[JList[Array[Byte]]], + stringDataType: String): Unit = { + log.debug( + s""" + | Registering new PythonUDF: + | name: $name + | command: ${command.toSeq} + | envVars: $envVars + | pythonIncludes: $pythonIncludes + | pythonExec: $pythonExec + | dataType: $stringDataType + """.stripMargin) + + + val dataType = sqlContext.parseDataType(stringDataType) + + def builder(e: Seq[Expression]) = + PythonUDF( + name, + command, + envVars, + pythonIncludes, + pythonExec, + broadcastVars, + accumulator, + dataType, + e) + + functionRegistry.registerFunction(name, builder) + } + + // scalastyle:off + + /* register 0-22 were generated by this script + + (0 to 22).map { x => + val types = (1 to x).foldRight("RT")((i, s) => {s"A$i, $s"}) + val typeTags = (1 to x).map(i => s"A${i}: TypeTag").foldLeft("RT: TypeTag")(_ + ", " + _) + println(s""" + /** + * Register a Scala closure of ${x} arguments as user-defined function (UDF). + * @tparam RT return type of UDF. + */ + def register[$typeTags](name: String, func: Function$x[$types]): UserDefinedFunction = { + val dataType = ScalaReflection.schemaFor[RT].dataType + def builder(e: Seq[Expression]) = ScalaUdf(func, dataType, e) + functionRegistry.registerFunction(name, builder) + UserDefinedFunction(func, dataType) + }""") + } + + (1 to 22).foreach { i => + val extTypeArgs = (1 to i).map(_ => "_").mkString(", ") + val anyTypeArgs = (1 to i).map(_ => "Any").mkString(", ") + val anyCast = s".asInstanceOf[UDF$i[$anyTypeArgs, Any]]" + val anyParams = (1 to i).map(_ => "_: Any").mkString(", ") + println(s""" + |/** + | * Register a user-defined function with ${i} arguments. + | */ + |def register(name: String, f: UDF$i[$extTypeArgs, _], returnType: DataType) = { + | functionRegistry.registerFunction( + | name, + | (e: Seq[Expression]) => ScalaUdf(f$anyCast.call($anyParams), returnType, e)) + |}""".stripMargin) + } + */ + + /** + * Register a Scala closure of 0 arguments as user-defined function (UDF). + * @tparam RT return type of UDF. + */ + def register[RT: TypeTag](name: String, func: Function0[RT]): UserDefinedFunction = { + val dataType = ScalaReflection.schemaFor[RT].dataType + def builder(e: Seq[Expression]) = ScalaUdf(func, dataType, e) + functionRegistry.registerFunction(name, builder) + UserDefinedFunction(func, dataType) + } + + /** + * Register a Scala closure of 1 arguments as user-defined function (UDF). + * @tparam RT return type of UDF. + */ + def register[RT: TypeTag, A1: TypeTag](name: String, func: Function1[A1, RT]): UserDefinedFunction = { + val dataType = ScalaReflection.schemaFor[RT].dataType + def builder(e: Seq[Expression]) = ScalaUdf(func, dataType, e) + functionRegistry.registerFunction(name, builder) + UserDefinedFunction(func, dataType) + } + + /** + * Register a Scala closure of 2 arguments as user-defined function (UDF). + * @tparam RT return type of UDF. 
+ */ + def register[RT: TypeTag, A1: TypeTag, A2: TypeTag](name: String, func: Function2[A1, A2, RT]): UserDefinedFunction = { + val dataType = ScalaReflection.schemaFor[RT].dataType + def builder(e: Seq[Expression]) = ScalaUdf(func, dataType, e) + functionRegistry.registerFunction(name, builder) + UserDefinedFunction(func, dataType) + } + + /** + * Register a Scala closure of 3 arguments as user-defined function (UDF). + * @tparam RT return type of UDF. + */ + def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag](name: String, func: Function3[A1, A2, A3, RT]): UserDefinedFunction = { + val dataType = ScalaReflection.schemaFor[RT].dataType + def builder(e: Seq[Expression]) = ScalaUdf(func, dataType, e) + functionRegistry.registerFunction(name, builder) + UserDefinedFunction(func, dataType) + } + + /** + * Register a Scala closure of 4 arguments as user-defined function (UDF). + * @tparam RT return type of UDF. + */ + def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag](name: String, func: Function4[A1, A2, A3, A4, RT]): UserDefinedFunction = { + val dataType = ScalaReflection.schemaFor[RT].dataType + def builder(e: Seq[Expression]) = ScalaUdf(func, dataType, e) + functionRegistry.registerFunction(name, builder) + UserDefinedFunction(func, dataType) + } + + /** + * Register a Scala closure of 5 arguments as user-defined function (UDF). + * @tparam RT return type of UDF. + */ + def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag](name: String, func: Function5[A1, A2, A3, A4, A5, RT]): UserDefinedFunction = { + val dataType = ScalaReflection.schemaFor[RT].dataType + def builder(e: Seq[Expression]) = ScalaUdf(func, dataType, e) + functionRegistry.registerFunction(name, builder) + UserDefinedFunction(func, dataType) + } + + /** + * Register a Scala closure of 6 arguments as user-defined function (UDF). + * @tparam RT return type of UDF. + */ + def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag](name: String, func: Function6[A1, A2, A3, A4, A5, A6, RT]): UserDefinedFunction = { + val dataType = ScalaReflection.schemaFor[RT].dataType + def builder(e: Seq[Expression]) = ScalaUdf(func, dataType, e) + functionRegistry.registerFunction(name, builder) + UserDefinedFunction(func, dataType) + } + + /** + * Register a Scala closure of 7 arguments as user-defined function (UDF). + * @tparam RT return type of UDF. + */ + def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag](name: String, func: Function7[A1, A2, A3, A4, A5, A6, A7, RT]): UserDefinedFunction = { + val dataType = ScalaReflection.schemaFor[RT].dataType + def builder(e: Seq[Expression]) = ScalaUdf(func, dataType, e) + functionRegistry.registerFunction(name, builder) + UserDefinedFunction(func, dataType) + } + + /** + * Register a Scala closure of 8 arguments as user-defined function (UDF). + * @tparam RT return type of UDF. + */ + def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag, A8: TypeTag](name: String, func: Function8[A1, A2, A3, A4, A5, A6, A7, A8, RT]): UserDefinedFunction = { + val dataType = ScalaReflection.schemaFor[RT].dataType + def builder(e: Seq[Expression]) = ScalaUdf(func, dataType, e) + functionRegistry.registerFunction(name, builder) + UserDefinedFunction(func, dataType) + } + + /** + * Register a Scala closure of 9 arguments as user-defined function (UDF). 
+ * @tparam RT return type of UDF. + */ + def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag, A8: TypeTag, A9: TypeTag](name: String, func: Function9[A1, A2, A3, A4, A5, A6, A7, A8, A9, RT]): UserDefinedFunction = { + val dataType = ScalaReflection.schemaFor[RT].dataType + def builder(e: Seq[Expression]) = ScalaUdf(func, dataType, e) + functionRegistry.registerFunction(name, builder) + UserDefinedFunction(func, dataType) + } + + /** + * Register a Scala closure of 10 arguments as user-defined function (UDF). + * @tparam RT return type of UDF. + */ + def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag, A8: TypeTag, A9: TypeTag, A10: TypeTag](name: String, func: Function10[A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, RT]): UserDefinedFunction = { + val dataType = ScalaReflection.schemaFor[RT].dataType + def builder(e: Seq[Expression]) = ScalaUdf(func, dataType, e) + functionRegistry.registerFunction(name, builder) + UserDefinedFunction(func, dataType) + } + + /** + * Register a Scala closure of 11 arguments as user-defined function (UDF). + * @tparam RT return type of UDF. + */ + def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag, A8: TypeTag, A9: TypeTag, A10: TypeTag, A11: TypeTag](name: String, func: Function11[A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, A11, RT]): UserDefinedFunction = { + val dataType = ScalaReflection.schemaFor[RT].dataType + def builder(e: Seq[Expression]) = ScalaUdf(func, dataType, e) + functionRegistry.registerFunction(name, builder) + UserDefinedFunction(func, dataType) + } + + /** + * Register a Scala closure of 12 arguments as user-defined function (UDF). + * @tparam RT return type of UDF. + */ + def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag, A8: TypeTag, A9: TypeTag, A10: TypeTag, A11: TypeTag, A12: TypeTag](name: String, func: Function12[A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, A11, A12, RT]): UserDefinedFunction = { + val dataType = ScalaReflection.schemaFor[RT].dataType + def builder(e: Seq[Expression]) = ScalaUdf(func, dataType, e) + functionRegistry.registerFunction(name, builder) + UserDefinedFunction(func, dataType) + } + + /** + * Register a Scala closure of 13 arguments as user-defined function (UDF). + * @tparam RT return type of UDF. + */ + def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag, A8: TypeTag, A9: TypeTag, A10: TypeTag, A11: TypeTag, A12: TypeTag, A13: TypeTag](name: String, func: Function13[A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, A11, A12, A13, RT]): UserDefinedFunction = { + val dataType = ScalaReflection.schemaFor[RT].dataType + def builder(e: Seq[Expression]) = ScalaUdf(func, dataType, e) + functionRegistry.registerFunction(name, builder) + UserDefinedFunction(func, dataType) + } + + /** + * Register a Scala closure of 14 arguments as user-defined function (UDF). + * @tparam RT return type of UDF. 
+ */ + def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag, A8: TypeTag, A9: TypeTag, A10: TypeTag, A11: TypeTag, A12: TypeTag, A13: TypeTag, A14: TypeTag](name: String, func: Function14[A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, A11, A12, A13, A14, RT]): UserDefinedFunction = { + val dataType = ScalaReflection.schemaFor[RT].dataType + def builder(e: Seq[Expression]) = ScalaUdf(func, dataType, e) + functionRegistry.registerFunction(name, builder) + UserDefinedFunction(func, dataType) + } + + /** + * Register a Scala closure of 15 arguments as user-defined function (UDF). + * @tparam RT return type of UDF. + */ + def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag, A8: TypeTag, A9: TypeTag, A10: TypeTag, A11: TypeTag, A12: TypeTag, A13: TypeTag, A14: TypeTag, A15: TypeTag](name: String, func: Function15[A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, A11, A12, A13, A14, A15, RT]): UserDefinedFunction = { + val dataType = ScalaReflection.schemaFor[RT].dataType + def builder(e: Seq[Expression]) = ScalaUdf(func, dataType, e) + functionRegistry.registerFunction(name, builder) + UserDefinedFunction(func, dataType) + } + + /** + * Register a Scala closure of 16 arguments as user-defined function (UDF). + * @tparam RT return type of UDF. + */ + def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag, A8: TypeTag, A9: TypeTag, A10: TypeTag, A11: TypeTag, A12: TypeTag, A13: TypeTag, A14: TypeTag, A15: TypeTag, A16: TypeTag](name: String, func: Function16[A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, A11, A12, A13, A14, A15, A16, RT]): UserDefinedFunction = { + val dataType = ScalaReflection.schemaFor[RT].dataType + def builder(e: Seq[Expression]) = ScalaUdf(func, dataType, e) + functionRegistry.registerFunction(name, builder) + UserDefinedFunction(func, dataType) + } + + /** + * Register a Scala closure of 17 arguments as user-defined function (UDF). + * @tparam RT return type of UDF. + */ + def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag, A8: TypeTag, A9: TypeTag, A10: TypeTag, A11: TypeTag, A12: TypeTag, A13: TypeTag, A14: TypeTag, A15: TypeTag, A16: TypeTag, A17: TypeTag](name: String, func: Function17[A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, A11, A12, A13, A14, A15, A16, A17, RT]): UserDefinedFunction = { + val dataType = ScalaReflection.schemaFor[RT].dataType + def builder(e: Seq[Expression]) = ScalaUdf(func, dataType, e) + functionRegistry.registerFunction(name, builder) + UserDefinedFunction(func, dataType) + } + + /** + * Register a Scala closure of 18 arguments as user-defined function (UDF). + * @tparam RT return type of UDF. + */ + def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag, A8: TypeTag, A9: TypeTag, A10: TypeTag, A11: TypeTag, A12: TypeTag, A13: TypeTag, A14: TypeTag, A15: TypeTag, A16: TypeTag, A17: TypeTag, A18: TypeTag](name: String, func: Function18[A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, A11, A12, A13, A14, A15, A16, A17, A18, RT]): UserDefinedFunction = { + val dataType = ScalaReflection.schemaFor[RT].dataType + def builder(e: Seq[Expression]) = ScalaUdf(func, dataType, e) + functionRegistry.registerFunction(name, builder) + UserDefinedFunction(func, dataType) + } + + /** + * Register a Scala closure of 19 arguments as user-defined function (UDF). 
+ * @tparam RT return type of UDF. + */ + def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag, A8: TypeTag, A9: TypeTag, A10: TypeTag, A11: TypeTag, A12: TypeTag, A13: TypeTag, A14: TypeTag, A15: TypeTag, A16: TypeTag, A17: TypeTag, A18: TypeTag, A19: TypeTag](name: String, func: Function19[A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, A11, A12, A13, A14, A15, A16, A17, A18, A19, RT]): UserDefinedFunction = { + val dataType = ScalaReflection.schemaFor[RT].dataType + def builder(e: Seq[Expression]) = ScalaUdf(func, dataType, e) + functionRegistry.registerFunction(name, builder) + UserDefinedFunction(func, dataType) + } + + /** + * Register a Scala closure of 20 arguments as user-defined function (UDF). + * @tparam RT return type of UDF. + */ + def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag, A8: TypeTag, A9: TypeTag, A10: TypeTag, A11: TypeTag, A12: TypeTag, A13: TypeTag, A14: TypeTag, A15: TypeTag, A16: TypeTag, A17: TypeTag, A18: TypeTag, A19: TypeTag, A20: TypeTag](name: String, func: Function20[A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, A11, A12, A13, A14, A15, A16, A17, A18, A19, A20, RT]): UserDefinedFunction = { + val dataType = ScalaReflection.schemaFor[RT].dataType + def builder(e: Seq[Expression]) = ScalaUdf(func, dataType, e) + functionRegistry.registerFunction(name, builder) + UserDefinedFunction(func, dataType) + } + + /** + * Register a Scala closure of 21 arguments as user-defined function (UDF). + * @tparam RT return type of UDF. + */ + def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag, A8: TypeTag, A9: TypeTag, A10: TypeTag, A11: TypeTag, A12: TypeTag, A13: TypeTag, A14: TypeTag, A15: TypeTag, A16: TypeTag, A17: TypeTag, A18: TypeTag, A19: TypeTag, A20: TypeTag, A21: TypeTag](name: String, func: Function21[A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, A11, A12, A13, A14, A15, A16, A17, A18, A19, A20, A21, RT]): UserDefinedFunction = { + val dataType = ScalaReflection.schemaFor[RT].dataType + def builder(e: Seq[Expression]) = ScalaUdf(func, dataType, e) + functionRegistry.registerFunction(name, builder) + UserDefinedFunction(func, dataType) + } + + /** + * Register a Scala closure of 22 arguments as user-defined function (UDF). + * @tparam RT return type of UDF. + */ + def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag, A8: TypeTag, A9: TypeTag, A10: TypeTag, A11: TypeTag, A12: TypeTag, A13: TypeTag, A14: TypeTag, A15: TypeTag, A16: TypeTag, A17: TypeTag, A18: TypeTag, A19: TypeTag, A20: TypeTag, A21: TypeTag, A22: TypeTag](name: String, func: Function22[A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, A11, A12, A13, A14, A15, A16, A17, A18, A19, A20, A21, A22, RT]): UserDefinedFunction = { + val dataType = ScalaReflection.schemaFor[RT].dataType + def builder(e: Seq[Expression]) = ScalaUdf(func, dataType, e) + functionRegistry.registerFunction(name, builder) + UserDefinedFunction(func, dataType) + } + + ////////////////////////////////////////////////////////////////////////////////////////////// + ////////////////////////////////////////////////////////////////////////////////////////////// + + /** + * Register a user-defined function with 1 arguments. 
+ */ + def register(name: String, f: UDF1[_, _], returnType: DataType) = { + functionRegistry.registerFunction( + name, + (e: Seq[Expression]) => ScalaUdf(f.asInstanceOf[UDF1[Any, Any]].call(_: Any), returnType, e)) + } + + /** + * Register a user-defined function with 2 arguments. + */ + def register(name: String, f: UDF2[_, _, _], returnType: DataType) = { + functionRegistry.registerFunction( + name, + (e: Seq[Expression]) => ScalaUdf(f.asInstanceOf[UDF2[Any, Any, Any]].call(_: Any, _: Any), returnType, e)) + } + + /** + * Register a user-defined function with 3 arguments. + */ + def register(name: String, f: UDF3[_, _, _, _], returnType: DataType) = { + functionRegistry.registerFunction( + name, + (e: Seq[Expression]) => ScalaUdf(f.asInstanceOf[UDF3[Any, Any, Any, Any]].call(_: Any, _: Any, _: Any), returnType, e)) + } + + /** + * Register a user-defined function with 4 arguments. + */ + def register(name: String, f: UDF4[_, _, _, _, _], returnType: DataType) = { + functionRegistry.registerFunction( + name, + (e: Seq[Expression]) => ScalaUdf(f.asInstanceOf[UDF4[Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any), returnType, e)) + } + + /** + * Register a user-defined function with 5 arguments. + */ + def register(name: String, f: UDF5[_, _, _, _, _, _], returnType: DataType) = { + functionRegistry.registerFunction( + name, + (e: Seq[Expression]) => ScalaUdf(f.asInstanceOf[UDF5[Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any), returnType, e)) + } + + /** + * Register a user-defined function with 6 arguments. + */ + def register(name: String, f: UDF6[_, _, _, _, _, _, _], returnType: DataType) = { + functionRegistry.registerFunction( + name, + (e: Seq[Expression]) => ScalaUdf(f.asInstanceOf[UDF6[Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any), returnType, e)) + } + + /** + * Register a user-defined function with 7 arguments. + */ + def register(name: String, f: UDF7[_, _, _, _, _, _, _, _], returnType: DataType) = { + functionRegistry.registerFunction( + name, + (e: Seq[Expression]) => ScalaUdf(f.asInstanceOf[UDF7[Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any), returnType, e)) + } + + /** + * Register a user-defined function with 8 arguments. + */ + def register(name: String, f: UDF8[_, _, _, _, _, _, _, _, _], returnType: DataType) = { + functionRegistry.registerFunction( + name, + (e: Seq[Expression]) => ScalaUdf(f.asInstanceOf[UDF8[Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any), returnType, e)) + } + + /** + * Register a user-defined function with 9 arguments. + */ + def register(name: String, f: UDF9[_, _, _, _, _, _, _, _, _, _], returnType: DataType) = { + functionRegistry.registerFunction( + name, + (e: Seq[Expression]) => ScalaUdf(f.asInstanceOf[UDF9[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any), returnType, e)) + } + + /** + * Register a user-defined function with 10 arguments. + */ + def register(name: String, f: UDF10[_, _, _, _, _, _, _, _, _, _, _], returnType: DataType) = { + functionRegistry.registerFunction( + name, + (e: Seq[Expression]) => ScalaUdf(f.asInstanceOf[UDF10[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any), returnType, e)) + } + + /** + * Register a user-defined function with 11 arguments. 
+ */ + def register(name: String, f: UDF11[_, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType) = { + functionRegistry.registerFunction( + name, + (e: Seq[Expression]) => ScalaUdf(f.asInstanceOf[UDF11[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any), returnType, e)) + } + + /** + * Register a user-defined function with 12 arguments. + */ + def register(name: String, f: UDF12[_, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType) = { + functionRegistry.registerFunction( + name, + (e: Seq[Expression]) => ScalaUdf(f.asInstanceOf[UDF12[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any), returnType, e)) + } + + /** + * Register a user-defined function with 13 arguments. + */ + def register(name: String, f: UDF13[_, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType) = { + functionRegistry.registerFunction( + name, + (e: Seq[Expression]) => ScalaUdf(f.asInstanceOf[UDF13[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any), returnType, e)) + } + + /** + * Register a user-defined function with 14 arguments. + */ + def register(name: String, f: UDF14[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType) = { + functionRegistry.registerFunction( + name, + (e: Seq[Expression]) => ScalaUdf(f.asInstanceOf[UDF14[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any), returnType, e)) + } + + /** + * Register a user-defined function with 15 arguments. + */ + def register(name: String, f: UDF15[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType) = { + functionRegistry.registerFunction( + name, + (e: Seq[Expression]) => ScalaUdf(f.asInstanceOf[UDF15[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any), returnType, e)) + } + + /** + * Register a user-defined function with 16 arguments. + */ + def register(name: String, f: UDF16[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType) = { + functionRegistry.registerFunction( + name, + (e: Seq[Expression]) => ScalaUdf(f.asInstanceOf[UDF16[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any), returnType, e)) + } + + /** + * Register a user-defined function with 17 arguments. + */ + def register(name: String, f: UDF17[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType) = { + functionRegistry.registerFunction( + name, + (e: Seq[Expression]) => ScalaUdf(f.asInstanceOf[UDF17[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any), returnType, e)) + } + + /** + * Register a user-defined function with 18 arguments. 
+ */ + def register(name: String, f: UDF18[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType) = { + functionRegistry.registerFunction( + name, + (e: Seq[Expression]) => ScalaUdf(f.asInstanceOf[UDF18[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any), returnType, e)) + } + + /** + * Register a user-defined function with 19 arguments. + */ + def register(name: String, f: UDF19[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType) = { + functionRegistry.registerFunction( + name, + (e: Seq[Expression]) => ScalaUdf(f.asInstanceOf[UDF19[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any), returnType, e)) + } + + /** + * Register a user-defined function with 20 arguments. + */ + def register(name: String, f: UDF20[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType) = { + functionRegistry.registerFunction( + name, + (e: Seq[Expression]) => ScalaUdf(f.asInstanceOf[UDF20[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any), returnType, e)) + } + + /** + * Register a user-defined function with 21 arguments. + */ + def register(name: String, f: UDF21[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType) = { + functionRegistry.registerFunction( + name, + (e: Seq[Expression]) => ScalaUdf(f.asInstanceOf[UDF21[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any), returnType, e)) + } + + /** + * Register a user-defined function with 22 arguments. 
+ */ + def register(name: String, f: UDF22[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType) = { + functionRegistry.registerFunction( + name, + (e: Seq[Expression]) => ScalaUdf(f.asInstanceOf[UDF22[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any), returnType, e)) + } + + // scalastyle:on +} From ab6d8cb420e2b366665234ff5e3de114fc287261 Mon Sep 17 00:00:00 2001 From: Ilya Ganelin Date: Fri, 6 Mar 2015 09:54:35 -0800 Subject: [PATCH 08/25] Playing around with class --- .../scala/org/apache/spark/rdd/PairRDDFunctions.scala | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala index 062d88fddd7e9..80ea31d897db4 100644 --- a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala +++ b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala @@ -867,14 +867,14 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)]) saveAsHadoopFile(path, keyClass, valueClass, fm.runtimeClass.asInstanceOf[Class[F]]) } - class RDDMultipleTextOutputFormat() extends MultipleTextOutputFormat[Any, Any]() { + class RDDMultipleTextOutputFormat[K,V] extends MultipleTextOutputFormat[K, V]() { - override def generateActualKey(key: Any, value: Any): Any = + override def generateActualKey(key: K, value: V): K = { - NullWritable.get() + NullWritable.get().asInstanceOf[K] } - override def generateFileNameForKeyValue(key: Any, value: Any, name: String): String = + override def generateFileNameForKeyValue(key: K, value: V, name: String): String = { key.asInstanceOf[String] } @@ -901,7 +901,7 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)]) def saveAsHadoopFileByKey[F <: OutputFormat[K, V]](path: String)(implicit fm: ClassTag[F]) { saveAsHadoopFile(path, ClassUtils.primitiveToWrapper(keyClass), ClassUtils.primitiveToWrapper(valueClass), - classOf[RDDMultipleTextOutputFormat]) + classOf[RDDMultipleTextOutputFormat[K,V]]) } /** From 97e4a630e604b7c79ef1dd5cfdc673be5ac2407f Mon Sep 17 00:00:00 2001 From: Ilya Ganelin Date: Tue, 10 Mar 2015 14:40:34 -0700 Subject: [PATCH 09/25] Got around .() error by moving RDDMultipleTextOutputFormat out of PairRDDFunctions class (so it's no longer an inner class --- .../apache/spark/rdd/PairRDDFunctions.scala | 33 ++++++++-------- .../java/org/apache/spark/JavaAPISuite.java | 38 ------------------- 2 files changed, 15 insertions(+), 56 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala index 80ea31d897db4..65a27bd8d0729 100644 --- a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala +++ b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala @@ -52,6 +52,18 @@ import org.apache.spark.util.Utils import org.apache.spark.util.collection.CompactBuffer import org.apache.spark.util.random.StratifiedSamplingUtils +class RDDMultipleTextOutputFormat[K,V]() extends MultipleTextOutputFormat[K, V]() { + override def generateActualKey(key: K, value: V): K = + { + NullWritable.get().asInstanceOf[K] + } + + override def generateFileNameForKeyValue(key: K, value: V, name: String): String = + { + key.asInstanceOf[String] + } +} + /** * Extra 
functions available on RDDs of (key, value) pairs through an implicit conversion. */ @@ -867,22 +879,7 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)]) saveAsHadoopFile(path, keyClass, valueClass, fm.runtimeClass.asInstanceOf[Class[F]]) } - class RDDMultipleTextOutputFormat[K,V] extends MultipleTextOutputFormat[K, V]() { - - override def generateActualKey(key: K, value: V): K = - { - NullWritable.get().asInstanceOf[K] - } - - override def generateFileNameForKeyValue(key: K, value: V, name: String): String = - { - key.asInstanceOf[String] - } - } - - /** - * TODO: This only works if the key is a java Object (can't work with primitive types) - * + /* * Output the RDD to multiple files by key on any Hadoop-supported file system, using a Hadoop * `OutputFormat` class supporting the key and value types K and V in this RDD. * @@ -894,8 +891,8 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)]) */ def saveAsHadoopFileByKey[F <: OutputFormat[K, V]](path: String, numPartitions : Int) (implicit fm: ClassTag[F]) { - partitionBy(new HashPartitioner(numPartitions)). - saveAsHadoopFileByKey(path) + partitionBy(new HashPartitioner(numPartitions)) + .saveAsHadoopFileByKey(path) } def saveAsHadoopFileByKey[F <: OutputFormat[K, V]](path: String)(implicit fm: ClassTag[F]) { diff --git a/core/src/test/java/org/apache/spark/JavaAPISuite.java b/core/src/test/java/org/apache/spark/JavaAPISuite.java index 74e88c767ee07..098f446fc65e6 100644 --- a/core/src/test/java/org/apache/spark/JavaAPISuite.java +++ b/core/src/test/java/org/apache/spark/JavaAPISuite.java @@ -1581,42 +1581,4 @@ public void testAsyncActionErrorWrapping() throws Exception { Assert.assertTrue(future.isDone()); } - - /** - * Test for SPARK-3647. This test needs to use the maven-built assembly to trigger the issue, - * since that's the only artifact where Guava classes have been relocated. - */ - @Test - public void testGuavaOptional() { - // Stop the context created in setUp() and start a local-cluster one, to force usage of the - // assembly. - sc.stop(); - JavaSparkContext localCluster = new JavaSparkContext("local-cluster[1,1,512]", "JavaAPISuite"); - try { - JavaRDD rdd1 = localCluster.parallelize(Arrays.asList(1, 2, null), 3); - JavaRDD> rdd2 = rdd1.map( - new Function>() { - @Override - public Optional call(Integer i) { - return Optional.fromNullable(i); - } - }); - rdd2.collect(); - } finally { - localCluster.stop(); - } - } - - static class Class1 {} - static class Class2 {} - - @Test - public void testRegisterKryoClasses() { - SparkConf conf = new SparkConf(); - conf.registerKryoClasses(new Class[]{ Class1.class, Class2.class }); - Assert.assertEquals( - Class1.class.getName() + "," + Class2.class.getName(), - conf.get("spark.kryo.classesToRegister")); - } - } From 5d968861bc8a973289b2a9e3737dabf4ed174285 Mon Sep 17 00:00:00 2001 From: Ilya Ganelin Date: Tue, 10 Mar 2015 16:57:45 -0700 Subject: [PATCH 10/25] Tests pass. 
Need to add check for file creation --- .../org/apache/spark/rdd/PairRDDFunctions.scala | 2 +- .../src/test/java/org/apache/spark/JavaAPISuite.java | 12 ++++++++++++ .../org/apache/spark/rdd/PairRDDFunctionsSuite.scala | 7 ++++++- 3 files changed, 19 insertions(+), 2 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala index 65a27bd8d0729..9d95136b16739 100644 --- a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala +++ b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala @@ -60,7 +60,7 @@ class RDDMultipleTextOutputFormat[K,V]() extends MultipleTextOutputFormat[K, V]( override def generateFileNameForKeyValue(key: K, value: V, name: String): String = { - key.asInstanceOf[String] + key.toString() } } diff --git a/core/src/test/java/org/apache/spark/JavaAPISuite.java b/core/src/test/java/org/apache/spark/JavaAPISuite.java index 098f446fc65e6..3c705301ad266 100644 --- a/core/src/test/java/org/apache/spark/JavaAPISuite.java +++ b/core/src/test/java/org/apache/spark/JavaAPISuite.java @@ -1581,4 +1581,16 @@ public void testAsyncActionErrorWrapping() throws Exception { Assert.assertTrue(future.isDone()); } + static class Class1 {} + static class Class2 {} + + @Test + public void testRegisterKryoClasses() { + SparkConf conf = new SparkConf(); + conf.registerKryoClasses(new Class[]{ Class1.class, Class2.class }); + Assert.assertEquals( + Class1.class.getName() + "," + Class2.class.getName(), + conf.get("spark.kryo.classesToRegister")); + } + } diff --git a/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala b/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala index 0568c475a9171..bf5306f304a2c 100644 --- a/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala +++ b/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala @@ -35,12 +35,17 @@ import org.scalatest.FunSuite class PairRDDFunctionsSuite extends FunSuite with SharedSparkContext { test("saveAsHadoopFileByKey should generate a text file per key") { - val pairs = sc.parallelize((1 to 20).zipWithIndex) + val keys = 1 to 20 + val pairs = sc.parallelize(keys.zipWithIndex) val fs = FileSystem.get(new Configuration()) val basePath = sc.conf.get("spark.local.dir", "/tmp") val fullPath = basePath + "testPath" fs.delete(new Path(fullPath), true) pairs.saveAsHadoopFileByKey(fullPath) + + // Test that a file was created for each key + + fs.delete(new Path(fullPath), true) } From fa07ad0f7c6385e11121c15025f602d1d3eb36c1 Mon Sep 17 00:00:00 2001 From: Ilya Ganelin Date: Tue, 10 Mar 2015 17:12:30 -0700 Subject: [PATCH 11/25] Removing broken test --- .../java/org/apache/spark/JavaAPISuite.java | 26 +++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/core/src/test/java/org/apache/spark/JavaAPISuite.java b/core/src/test/java/org/apache/spark/JavaAPISuite.java index 3c705301ad266..debc44d6aaa79 100644 --- a/core/src/test/java/org/apache/spark/JavaAPISuite.java +++ b/core/src/test/java/org/apache/spark/JavaAPISuite.java @@ -1581,6 +1581,32 @@ public void testAsyncActionErrorWrapping() throws Exception { Assert.assertTrue(future.isDone()); } +// +// /** +// * Test for SPARK-3647. This test needs to use the maven-built assembly to trigger the issue, +// * since that's the only artifact where Guava classes have been relocated. 
+// */ +// @Test +// public void testGuavaOptional() { +// // Stop the context created in setUp() and start a local-cluster one, to force usage of the +// // assembly. +// sc.stop(); +// JavaSparkContext localCluster = new JavaSparkContext("local-cluster[1,1,512]", "JavaAPISuite"); +// try { +// JavaRDD rdd1 = localCluster.parallelize(Arrays.asList(1, 2, null), 3); +// JavaRDD> rdd2 = rdd1.map( +// new Function>() { +// @Override +// public Optional call(Integer i) { +// return Optional.fromNullable(i); +// } +// }); +// rdd2.collect(); +// } finally { +// localCluster.stop(); +// } +// } + static class Class1 {} static class Class2 {} From 5438079df922b690226db44e1277d0c0e2e45454 Mon Sep 17 00:00:00 2001 From: Ilya Ganelin Date: Wed, 11 Mar 2015 12:03:19 -0700 Subject: [PATCH 12/25] Added test to validate file creation --- .../spark/rdd/PairRDDFunctionsSuite.scala | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala b/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala index bf5306f304a2c..b2e93697c1f2b 100644 --- a/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala +++ b/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala @@ -17,12 +17,14 @@ package org.apache.spark.rdd +import java.io.{BufferedReader, InputStreamReader} + import scala.collection.mutable.{ArrayBuffer, HashSet} import scala.sys.process._ import scala.util.Random import org.apache.hadoop.conf.{Configurable, Configuration} -import org.apache.hadoop.fs.{Path, FileSystem} +import org.apache.hadoop.fs.{FSDataInputStream, Path, FileSystem} import org.apache.hadoop.mapred._ import org.apache.hadoop.mapreduce.{JobContext => NewJobContext, OutputCommitter => NewOutputCommitter, OutputFormat => NewOutputFormat, RecordWriter => NewRecordWriter, @@ -36,15 +38,23 @@ import org.scalatest.FunSuite class PairRDDFunctionsSuite extends FunSuite with SharedSparkContext { test("saveAsHadoopFileByKey should generate a text file per key") { val keys = 1 to 20 - val pairs = sc.parallelize(keys.zipWithIndex) + val pairs = sc.parallelize(keys).map(s => (s, s*s)) val fs = FileSystem.get(new Configuration()) val basePath = sc.conf.get("spark.local.dir", "/tmp") - val fullPath = basePath + "testPath" + val fullPath = basePath + "/testPath" fs.delete(new Path(fullPath), true) pairs.saveAsHadoopFileByKey(fullPath) // Test that a file was created for each key - + keys.foreach(key => { + val testPath = new Path(fullPath + "/" + key) + assert(fs.exists(testPath)) + + // Read the file and test that the contents are the value + val input = fs.open(testPath) + val firstLine = new BufferedReader(new InputStreamReader(input)).readLine() + assert(firstLine.toInt.equals(key*key)) + }) fs.delete(new Path(fullPath), true) } From e68386cce31c4d3707bbae21a83e28f4d82fcc2e Mon Sep 17 00:00:00 2001 From: Ilya Ganelin Date: Wed, 11 Mar 2015 12:10:26 -0700 Subject: [PATCH 13/25] Moved text formatter to its own class --- .../apache/spark/rdd/PairRDDFunctions.scala | 12 ------- .../rdd/RDDMultipleTextOutputFormat.scala | 36 +++++++++++++++++++ 2 files changed, 36 insertions(+), 12 deletions(-) create mode 100644 core/src/main/scala/org/apache/spark/rdd/RDDMultipleTextOutputFormat.scala diff --git a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala index 9d95136b16739..0a064e4d86f83 100644 --- 
a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala +++ b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala @@ -52,18 +52,6 @@ import org.apache.spark.util.Utils import org.apache.spark.util.collection.CompactBuffer import org.apache.spark.util.random.StratifiedSamplingUtils -class RDDMultipleTextOutputFormat[K,V]() extends MultipleTextOutputFormat[K, V]() { - override def generateActualKey(key: K, value: V): K = - { - NullWritable.get().asInstanceOf[K] - } - - override def generateFileNameForKeyValue(key: K, value: V, name: String): String = - { - key.toString() - } -} - /** * Extra functions available on RDDs of (key, value) pairs through an implicit conversion. */ diff --git a/core/src/main/scala/org/apache/spark/rdd/RDDMultipleTextOutputFormat.scala b/core/src/main/scala/org/apache/spark/rdd/RDDMultipleTextOutputFormat.scala new file mode 100644 index 0000000000000..ebe0eae226aee --- /dev/null +++ b/core/src/main/scala/org/apache/spark/rdd/RDDMultipleTextOutputFormat.scala @@ -0,0 +1,36 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.rdd + +import org.apache.hadoop.io.NullWritable +import org.apache.hadoop.mapred.lib.MultipleTextOutputFormat + +/** + * This class is used by the PairRDDFunctions class to facilitate writing out a key-value RDD + * to multiple files, organized by the keys. 
+ */ +class RDDMultipleTextOutputFormat[K,V]() extends MultipleTextOutputFormat[K, V]() { + override def generateActualKey(key: K, value: V): K = + { + NullWritable.get().asInstanceOf[K] + } + + override def generateFileNameForKeyValue(key: K, value: V, name: String): String = + { + key.toString() + } +} From 21247a799851fa4a744335555df2a103935107ac Mon Sep 17 00:00:00 2001 From: Ilya Ganelin Date: Wed, 11 Mar 2015 12:32:53 -0700 Subject: [PATCH 14/25] Extended tests to write multiple values to file and ensure that they're split by line by key --- .../spark/rdd/PairRDDFunctionsSuite.scala | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala b/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala index b2e93697c1f2b..ec857bef862fa 100644 --- a/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala +++ b/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala @@ -38,7 +38,13 @@ import org.scalatest.FunSuite class PairRDDFunctionsSuite extends FunSuite with SharedSparkContext { test("saveAsHadoopFileByKey should generate a text file per key") { val keys = 1 to 20 - val pairs = sc.parallelize(keys).map(s => (s, s*s)) + val testValues = 1 to 5 + // Generate the cartesian product of keys by test values + val pairsLocal = keys.map(k => (k, testValues)).flatMap(kv => { + kv._2.map(v => (kv._1, v*kv._1)) + }) + + val pairs = sc.parallelize(pairsLocal) val fs = FileSystem.get(new Configuration()) val basePath = sc.conf.get("spark.local.dir", "/tmp") val fullPath = basePath + "/testPath" @@ -50,10 +56,14 @@ class PairRDDFunctionsSuite extends FunSuite with SharedSparkContext { val testPath = new Path(fullPath + "/" + key) assert(fs.exists(testPath)) - // Read the file and test that the contents are the value + // Read the file and test that the contents are the values matching that key split by line val input = fs.open(testPath) - val firstLine = new BufferedReader(new InputStreamReader(input)).readLine() - assert(firstLine.toInt.equals(key*key)) + val reader = new BufferedReader(new InputStreamReader(input)) + val values = new HashSet[Int] + val lines = Stream.continually(reader.readLine()).takeWhile(_ != null) + lines.foreach(s => values += s.toInt) + + testValues.foreach(v => assert(values.contains(v*key))) }) fs.delete(new Path(fullPath), true) From 4ba633996fdd8b63bfdbf3debc083b1ff03af419 Mon Sep 17 00:00:00 2001 From: Ilya Ganelin Date: Thu, 12 Mar 2015 10:19:46 -0700 Subject: [PATCH 15/25] Restored code in Java API suite/ --- .../java/org/apache/spark/JavaAPISuite.java | 50 +++++++++---------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/core/src/test/java/org/apache/spark/JavaAPISuite.java b/core/src/test/java/org/apache/spark/JavaAPISuite.java index debc44d6aaa79..74e88c767ee07 100644 --- a/core/src/test/java/org/apache/spark/JavaAPISuite.java +++ b/core/src/test/java/org/apache/spark/JavaAPISuite.java @@ -1581,31 +1581,31 @@ public void testAsyncActionErrorWrapping() throws Exception { Assert.assertTrue(future.isDone()); } -// -// /** -// * Test for SPARK-3647. This test needs to use the maven-built assembly to trigger the issue, -// * since that's the only artifact where Guava classes have been relocated. -// */ -// @Test -// public void testGuavaOptional() { -// // Stop the context created in setUp() and start a local-cluster one, to force usage of the -// // assembly. 
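Taken together, the formatter above and the PairRDDFunctionsSuite test amount to the following usage pattern. A minimal sketch, assuming a SparkContext `sc`, the pair-RDD implicits from `org.apache.spark.SparkContext._`, and an illustrative output path:

  val pairs = sc.parallelize(
    Seq(("N", "Nick"), ("N", "Nancy"), ("B", "Bob"), ("B", "Ben"), ("F", "Frankie")))
  // Each key becomes a sub-directory under the given path, e.g. /path/prefix/N/part-00000;
  // generateActualKey returns NullWritable, so only the values are written to the part files.
  pairs.saveAsHadoopFileByKey("/path/prefix")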
-// sc.stop(); -// JavaSparkContext localCluster = new JavaSparkContext("local-cluster[1,1,512]", "JavaAPISuite"); -// try { -// JavaRDD rdd1 = localCluster.parallelize(Arrays.asList(1, 2, null), 3); -// JavaRDD> rdd2 = rdd1.map( -// new Function>() { -// @Override -// public Optional call(Integer i) { -// return Optional.fromNullable(i); -// } -// }); -// rdd2.collect(); -// } finally { -// localCluster.stop(); -// } -// } + + /** + * Test for SPARK-3647. This test needs to use the maven-built assembly to trigger the issue, + * since that's the only artifact where Guava classes have been relocated. + */ + @Test + public void testGuavaOptional() { + // Stop the context created in setUp() and start a local-cluster one, to force usage of the + // assembly. + sc.stop(); + JavaSparkContext localCluster = new JavaSparkContext("local-cluster[1,1,512]", "JavaAPISuite"); + try { + JavaRDD rdd1 = localCluster.parallelize(Arrays.asList(1, 2, null), 3); + JavaRDD> rdd2 = rdd1.map( + new Function>() { + @Override + public Optional call(Integer i) { + return Optional.fromNullable(i); + } + }); + rdd2.collect(); + } finally { + localCluster.stop(); + } + } static class Class1 {} static class Class2 {} From bd79fd9d4bccb7ef9de07dba194ba7fa49137c87 Mon Sep 17 00:00:00 2001 From: Ilya Ganelin Date: Mon, 16 Mar 2015 22:57:55 -0700 Subject: [PATCH 16/25] Added python RDD saveByKey and test for Python SaveByKey --- .../apache/spark/api/java/JavaPairRDD.scala | 488 +++++++++--------- .../apache/spark/api/python/PythonRDD.scala | 44 +- .../apache/spark/rdd/PairRDDFunctions.scala | 3 + .../spark/api/python/PythonRDDSuite.scala | 54 +- 4 files changed, 353 insertions(+), 236 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/api/java/JavaPairRDD.scala b/core/src/main/scala/org/apache/spark/api/java/JavaPairRDD.scala index 7af3538262fd6..ae54b94c4855c 100644 --- a/core/src/main/scala/org/apache/spark/api/java/JavaPairRDD.scala +++ b/core/src/main/scala/org/apache/spark/api/java/JavaPairRDD.scala @@ -742,270 +742,296 @@ class JavaPairRDD[K, V](val rdd: RDD[(K, V)]) */ def lookup(key: K): JList[V] = seqAsJavaList(rdd.lookup(key)) - /** Output the RDD to any Hadoop-supported file system. */ - def saveAsHadoopFile[F <: OutputFormat[_, _]]( - path: String, - keyClass: Class[_], - valueClass: Class[_], - outputFormatClass: Class[F], - conf: JobConf) { - rdd.saveAsHadoopFile(path, keyClass, valueClass, outputFormatClass, conf) + /* + * Output the RDD to multiple files by key on any Hadoop-supported file system, using a Hadoop + * `OutputFormat` class supporting the key and value types K and V in this RDD. + * + * Example: + * [('N', 'Nick'), ('N', 'Nancy'), ('B', 'Bob'), ('B', 'Ben'), ('F', 'Frankie')] + * /path/prefix/B [/part-1, /part-2, etc] + * /path/prefix/F [/part-1, /part-2, etc] + * /path/prefix/N [/part-1, /part-2, etc] + * + * @param path - The path for the parent directory + * @param numPartitions - The number of partitions to partition to + */ + def saveAsHadoopFileByKey[F <: OutputFormat[K, V]](path: String, numPartitions : Int) + (implicit fm: ClassTag[F]) { + rdd.saveAsHadoopFileByKey(path, numPartitions) } - /** Output the RDD to any Hadoop-supported file system. 
*/ - def saveAsHadoopFile[F <: OutputFormat[_, _]]( - path: String, - keyClass: Class[_], - valueClass: Class[_], - outputFormatClass: Class[F]) { - rdd.saveAsHadoopFile(path, keyClass, valueClass, outputFormatClass) + /* + * Output the RDD to multiple files by key on any Hadoop-supported file system, using a Hadoop + * `OutputFormat` class supporting the key and value types K and V in this RDD. + */ + def saveAsHadoopFileByKey[F <: OutputFormat[K, V]](path: String)(implicit fm: ClassTag[F]) { + rdd.saveAsHadoopFileByKey(path) } - /** Output the RDD to any Hadoop-supported file system, compressing with the supplied codec. */ - def saveAsHadoopFile[F <: OutputFormat[_, _]]( - path: String, - keyClass: Class[_], - valueClass: Class[_], - outputFormatClass: Class[F], - codec: Class[_ <: CompressionCodec]) { - rdd.saveAsHadoopFile(path, keyClass, valueClass, outputFormatClass, codec) - } +/** Output the RDD to any Hadoop-supported file system. */ +def saveAsHadoopFile[F <: OutputFormat[_, _]]( + path: String, + keyClass: Class[_], + valueClass: Class[_], + outputFormatClass: Class[F], + conf: JobConf) { + rdd.saveAsHadoopFile(path, keyClass, valueClass, outputFormatClass, conf) +} - /** Output the RDD to any Hadoop-supported file system. */ - def saveAsNewAPIHadoopFile[F <: NewOutputFormat[_, _]]( - path: String, - keyClass: Class[_], - valueClass: Class[_], - outputFormatClass: Class[F], - conf: Configuration) { - rdd.saveAsNewAPIHadoopFile(path, keyClass, valueClass, outputFormatClass, conf) - } +/** Output the RDD to any Hadoop-supported file system. */ +def saveAsHadoopFile[F <: OutputFormat[_, _]]( + path: String, + keyClass: Class[_], + valueClass: Class[_], + outputFormatClass: Class[F]) { + rdd.saveAsHadoopFile(path, keyClass, valueClass, outputFormatClass) +} - /** - * Output the RDD to any Hadoop-supported storage system, using - * a Configuration object for that storage system. - */ - def saveAsNewAPIHadoopDataset(conf: Configuration) { - rdd.saveAsNewAPIHadoopDataset(conf) - } +/** Output the RDD to any Hadoop-supported file system, compressing with the supplied codec. */ +def saveAsHadoopFile[F <: OutputFormat[_, _]]( + path: String, + keyClass: Class[_], + valueClass: Class[_], + outputFormatClass: Class[F], + codec: Class[_ <: CompressionCodec]) { + rdd.saveAsHadoopFile(path, keyClass, valueClass, outputFormatClass, codec) +} - /** Output the RDD to any Hadoop-supported file system. */ - def saveAsNewAPIHadoopFile[F <: NewOutputFormat[_, _]]( - path: String, - keyClass: Class[_], - valueClass: Class[_], - outputFormatClass: Class[F]) { - rdd.saveAsNewAPIHadoopFile(path, keyClass, valueClass, outputFormatClass) - } +/** Output the RDD to any Hadoop-supported file system. */ +def saveAsNewAPIHadoopFile[F <: NewOutputFormat[_, _]]( + path: String, + keyClass: Class[_], + valueClass: Class[_], + outputFormatClass: Class[F], + conf: Configuration) { + rdd.saveAsNewAPIHadoopFile(path, keyClass, valueClass, outputFormatClass, conf) +} - /** - * Output the RDD to any Hadoop-supported storage system, using a Hadoop JobConf object for - * that storage system. The JobConf should set an OutputFormat and any output paths required - * (e.g. a table name to write to) in the same way as it would be configured for a Hadoop - * MapReduce job. - */ - def saveAsHadoopDataset(conf: JobConf) { - rdd.saveAsHadoopDataset(conf) - } +/** + * Output the RDD to any Hadoop-supported storage system, using + * a Configuration object for that storage system. 
+ */ +def saveAsNewAPIHadoopDataset(conf: Configuration) { + rdd.saveAsNewAPIHadoopDataset(conf) +} - /** - * Repartition the RDD according to the given partitioner and, within each resulting partition, - * sort records by their keys. - * - * This is more efficient than calling `repartition` and then sorting within each partition - * because it can push the sorting down into the shuffle machinery. - */ - def repartitionAndSortWithinPartitions(partitioner: Partitioner): JavaPairRDD[K, V] = { - val comp = com.google.common.collect.Ordering.natural().asInstanceOf[Comparator[K]] - repartitionAndSortWithinPartitions(partitioner, comp) - } +/** Output the RDD to any Hadoop-supported file system. */ +def saveAsNewAPIHadoopFile[F <: NewOutputFormat[_, _]]( + path: String, + keyClass: Class[_], + valueClass: Class[_], + outputFormatClass: Class[F]) { + rdd.saveAsNewAPIHadoopFile(path, keyClass, valueClass, outputFormatClass) +} - /** - * Repartition the RDD according to the given partitioner and, within each resulting partition, - * sort records by their keys. - * - * This is more efficient than calling `repartition` and then sorting within each partition - * because it can push the sorting down into the shuffle machinery. - */ - def repartitionAndSortWithinPartitions(partitioner: Partitioner, comp: Comparator[K]) - : JavaPairRDD[K, V] = { - implicit val ordering = comp // Allow implicit conversion of Comparator to Ordering. - fromRDD( - new OrderedRDDFunctions[K, V, (K, V)](rdd).repartitionAndSortWithinPartitions(partitioner)) - } +/** + * Output the RDD to any Hadoop-supported storage system, using a Hadoop JobConf object for + * that storage system. The JobConf should set an OutputFormat and any output paths required + * (e.g. a table name to write to) in the same way as it would be configured for a Hadoop + * MapReduce job. + */ +def saveAsHadoopDataset(conf: JobConf) { + rdd.saveAsHadoopDataset(conf) +} - /** - * Sort the RDD by key, so that each partition contains a sorted range of the elements in - * ascending order. Calling `collect` or `save` on the resulting RDD will return or output an - * ordered list of records (in the `save` case, they will be written to multiple `part-X` files - * in the filesystem, in order of the keys). - */ - def sortByKey(): JavaPairRDD[K, V] = sortByKey(true) +/** + * Repartition the RDD according to the given partitioner and, within each resulting partition, + * sort records by their keys. + * + * This is more efficient than calling `repartition` and then sorting within each partition + * because it can push the sorting down into the shuffle machinery. + */ +def repartitionAndSortWithinPartitions(partitioner: Partitioner): JavaPairRDD[K, V] = { + val comp = com.google.common.collect.Ordering.natural().asInstanceOf[Comparator[K]] + repartitionAndSortWithinPartitions(partitioner, comp) +} - /** - * Sort the RDD by key, so that each partition contains a sorted range of the elements. Calling - * `collect` or `save` on the resulting RDD will return or output an ordered list of records - * (in the `save` case, they will be written to multiple `part-X` files in the filesystem, in - * order of the keys). - */ - def sortByKey(ascending: Boolean): JavaPairRDD[K, V] = { - val comp = com.google.common.collect.Ordering.natural().asInstanceOf[Comparator[K]] - sortByKey(comp, ascending) - } +/** + * Repartition the RDD according to the given partitioner and, within each resulting partition, + * sort records by their keys. 
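As a side note, the repartition-and-sort behaviour described here can be exercised from the Scala side as well; a minimal sketch, assuming a SparkContext `sc`, the implicits from `org.apache.spark.SparkContext._`, and an arbitrary choice of two partitions:

  import org.apache.spark.HashPartitioner
  val data = sc.parallelize(Seq((3, "c"), (1, "a"), (2, "b"), (1, "d")))
  // Shuffles into two partitions and sorts by key inside each partition in a single pass,
  // rather than repartitioning first and sorting afterwards.
  val sorted = data.repartitionAndSortWithinPartitions(new HashPartitioner(2))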
+ * + * This is more efficient than calling `repartition` and then sorting within each partition + * because it can push the sorting down into the shuffle machinery. + */ +def repartitionAndSortWithinPartitions(partitioner: Partitioner, comp: Comparator[K]) + : JavaPairRDD[K, V] = { + implicit val ordering = comp // Allow implicit conversion of Comparator to Ordering. + fromRDD( + new OrderedRDDFunctions[K, V, (K, V)](rdd).repartitionAndSortWithinPartitions(partitioner)) +} - /** - * Sort the RDD by key, so that each partition contains a sorted range of the elements. Calling - * `collect` or `save` on the resulting RDD will return or output an ordered list of records - * (in the `save` case, they will be written to multiple `part-X` files in the filesystem, in - * order of the keys). - */ - def sortByKey(ascending: Boolean, numPartitions: Int): JavaPairRDD[K, V] = { - val comp = com.google.common.collect.Ordering.natural().asInstanceOf[Comparator[K]] - sortByKey(comp, ascending, numPartitions) - } +/** + * Sort the RDD by key, so that each partition contains a sorted range of the elements in + * ascending order. Calling `collect` or `save` on the resulting RDD will return or output an + * ordered list of records (in the `save` case, they will be written to multiple `part-X` files + * in the filesystem, in order of the keys). + */ +def sortByKey(): JavaPairRDD[K, V] = sortByKey(true) - /** - * Sort the RDD by key, so that each partition contains a sorted range of the elements. Calling - * `collect` or `save` on the resulting RDD will return or output an ordered list of records - * (in the `save` case, they will be written to multiple `part-X` files in the filesystem, in - * order of the keys). - */ - def sortByKey(comp: Comparator[K]): JavaPairRDD[K, V] = sortByKey(comp, true) +/** + * Sort the RDD by key, so that each partition contains a sorted range of the elements. Calling + * `collect` or `save` on the resulting RDD will return or output an ordered list of records + * (in the `save` case, they will be written to multiple `part-X` files in the filesystem, in + * order of the keys). + */ +def sortByKey(ascending: Boolean): JavaPairRDD[K, V] = { + val comp = com.google.common.collect.Ordering.natural().asInstanceOf[Comparator[K]] + sortByKey(comp, ascending) +} - /** - * Sort the RDD by key, so that each partition contains a sorted range of the elements. Calling - * `collect` or `save` on the resulting RDD will return or output an ordered list of records - * (in the `save` case, they will be written to multiple `part-X` files in the filesystem, in - * order of the keys). - */ - def sortByKey(comp: Comparator[K], ascending: Boolean): JavaPairRDD[K, V] = { - implicit val ordering = comp // Allow implicit conversion of Comparator to Ordering. - fromRDD(new OrderedRDDFunctions[K, V, (K, V)](rdd).sortByKey(ascending)) - } +/** + * Sort the RDD by key, so that each partition contains a sorted range of the elements. Calling + * `collect` or `save` on the resulting RDD will return or output an ordered list of records + * (in the `save` case, they will be written to multiple `part-X` files in the filesystem, in + * order of the keys). + */ +def sortByKey(ascending: Boolean, numPartitions: Int): JavaPairRDD[K, V] = { + val comp = com.google.common.collect.Ordering.natural().asInstanceOf[Comparator[K]] + sortByKey(comp, ascending, numPartitions) +} - /** - * Sort the RDD by key, so that each partition contains a sorted range of the elements. 
Calling - * `collect` or `save` on the resulting RDD will return or output an ordered list of records - * (in the `save` case, they will be written to multiple `part-X` files in the filesystem, in - * order of the keys). - */ - def sortByKey(comp: Comparator[K], ascending: Boolean, numPartitions: Int): JavaPairRDD[K, V] = { - implicit val ordering = comp // Allow implicit conversion of Comparator to Ordering. - fromRDD(new OrderedRDDFunctions[K, V, (K, V)](rdd).sortByKey(ascending, numPartitions)) - } +/** + * Sort the RDD by key, so that each partition contains a sorted range of the elements. Calling + * `collect` or `save` on the resulting RDD will return or output an ordered list of records + * (in the `save` case, they will be written to multiple `part-X` files in the filesystem, in + * order of the keys). + */ +def sortByKey(comp: Comparator[K]): JavaPairRDD[K, V] = sortByKey(comp, true) - /** - * Return an RDD with the keys of each tuple. - */ - def keys(): JavaRDD[K] = JavaRDD.fromRDD[K](rdd.map(_._1)) +/** + * Sort the RDD by key, so that each partition contains a sorted range of the elements. Calling + * `collect` or `save` on the resulting RDD will return or output an ordered list of records + * (in the `save` case, they will be written to multiple `part-X` files in the filesystem, in + * order of the keys). + */ +def sortByKey(comp: Comparator[K], ascending: Boolean): JavaPairRDD[K, V] = { + implicit val ordering = comp // Allow implicit conversion of Comparator to Ordering. + fromRDD(new OrderedRDDFunctions[K, V, (K, V)](rdd).sortByKey(ascending)) +} - /** - * Return an RDD with the values of each tuple. - */ - def values(): JavaRDD[V] = JavaRDD.fromRDD[V](rdd.map(_._2)) +/** + * Sort the RDD by key, so that each partition contains a sorted range of the elements. Calling + * `collect` or `save` on the resulting RDD will return or output an ordered list of records + * (in the `save` case, they will be written to multiple `part-X` files in the filesystem, in + * order of the keys). + */ +def sortByKey(comp: Comparator[K], ascending: Boolean, numPartitions: Int): JavaPairRDD[K, V] = { + implicit val ordering = comp // Allow implicit conversion of Comparator to Ordering. + fromRDD(new OrderedRDDFunctions[K, V, (K, V)](rdd).sortByKey(ascending, numPartitions)) +} - /** - * Return approximate number of distinct values for each key in this RDD. - * - * The algorithm used is based on streamlib's implementation of "HyperLogLog in Practice: - * Algorithmic Engineering of a State of The Art Cardinality Estimation Algorithm", available - * here. - * - * @param relativeSD Relative accuracy. Smaller values create counters that require more space. - * It must be greater than 0.000017. - * @param partitioner partitioner of the resulting RDD. - */ - def countApproxDistinctByKey(relativeSD: Double, partitioner: Partitioner): JavaPairRDD[K, Long] = - { - fromRDD(rdd.countApproxDistinctByKey(relativeSD, partitioner)) - } +/** + * Return an RDD with the keys of each tuple. + */ +def keys(): JavaRDD[K] = JavaRDD.fromRDD[K](rdd.map(_._1)) - /** - * Return approximate number of distinct values for each key in this RDD. - * - * The algorithm used is based on streamlib's implementation of "HyperLogLog in Practice: - * Algorithmic Engineering of a State of The Art Cardinality Estimation Algorithm", available - * here. - * - * @param relativeSD Relative accuracy. Smaller values create counters that require more space. - * It must be greater than 0.000017. 
- * @param numPartitions number of partitions of the resulting RDD. - */ - def countApproxDistinctByKey(relativeSD: Double, numPartitions: Int): JavaPairRDD[K, Long] = { - fromRDD(rdd.countApproxDistinctByKey(relativeSD, numPartitions)) - } +/** + * Return an RDD with the values of each tuple. + */ +def values(): JavaRDD[V] = JavaRDD.fromRDD[V](rdd.map(_._2)) - /** - * Return approximate number of distinct values for each key in this RDD. - * - * The algorithm used is based on streamlib's implementation of "HyperLogLog in Practice: - * Algorithmic Engineering of a State of The Art Cardinality Estimation Algorithm", available - * here. - * - * @param relativeSD Relative accuracy. Smaller values create counters that require more space. - * It must be greater than 0.000017. - */ - def countApproxDistinctByKey(relativeSD: Double): JavaPairRDD[K, Long] = { - fromRDD(rdd.countApproxDistinctByKey(relativeSD)) - } +/** + * Return approximate number of distinct values for each key in this RDD. + * + * The algorithm used is based on streamlib's implementation of "HyperLogLog in Practice: + * Algorithmic Engineering of a State of The Art Cardinality Estimation Algorithm", available + * here. + * + * @param relativeSD Relative accuracy. Smaller values create counters that require more space. + * It must be greater than 0.000017. + * @param partitioner partitioner of the resulting RDD. + */ +def countApproxDistinctByKey(relativeSD: Double, partitioner: Partitioner): JavaPairRDD[K, Long] = +{ + fromRDD(rdd.countApproxDistinctByKey(relativeSD, partitioner)) +} - /** Assign a name to this RDD */ - def setName(name: String): JavaPairRDD[K, V] = { - rdd.setName(name) - this - } +/** + * Return approximate number of distinct values for each key in this RDD. + * + * The algorithm used is based on streamlib's implementation of "HyperLogLog in Practice: + * Algorithmic Engineering of a State of The Art Cardinality Estimation Algorithm", available + * here. + * + * @param relativeSD Relative accuracy. Smaller values create counters that require more space. + * It must be greater than 0.000017. + * @param numPartitions number of partitions of the resulting RDD. + */ +def countApproxDistinctByKey(relativeSD: Double, numPartitions: Int): JavaPairRDD[K, Long] = { + fromRDD(rdd.countApproxDistinctByKey(relativeSD, numPartitions)) +} + +/** + * Return approximate number of distinct values for each key in this RDD. + * + * The algorithm used is based on streamlib's implementation of "HyperLogLog in Practice: + * Algorithmic Engineering of a State of The Art Cardinality Estimation Algorithm", available + * here. + * + * @param relativeSD Relative accuracy. Smaller values create counters that require more space. + * It must be greater than 0.000017. 
+ */ +def countApproxDistinctByKey(relativeSD: Double): JavaPairRDD[K, Long] = { + fromRDD(rdd.countApproxDistinctByKey(relativeSD)) +} + +/** Assign a name to this RDD */ +def setName(name: String): JavaPairRDD[K, V] = { + rdd.setName(name) + this +} } object JavaPairRDD { - private[spark] - def groupByResultToJava[K: ClassTag, T](rdd: RDD[(K, Iterable[T])]): RDD[(K, JIterable[T])] = { - rddToPairRDDFunctions(rdd).mapValues(asJavaIterable) - } +private[spark] +def groupByResultToJava[K: ClassTag, T](rdd: RDD[(K, Iterable[T])]): RDD[(K, JIterable[T])] = { + rddToPairRDDFunctions(rdd).mapValues(asJavaIterable) +} - private[spark] - def cogroupResultToJava[K: ClassTag, V, W]( - rdd: RDD[(K, (Iterable[V], Iterable[W]))]): RDD[(K, (JIterable[V], JIterable[W]))] = { - rddToPairRDDFunctions(rdd).mapValues(x => (asJavaIterable(x._1), asJavaIterable(x._2))) - } +private[spark] +def cogroupResultToJava[K: ClassTag, V, W]( + rdd: RDD[(K, (Iterable[V], Iterable[W]))]): RDD[(K, (JIterable[V], JIterable[W]))] = { + rddToPairRDDFunctions(rdd).mapValues(x => (asJavaIterable(x._1), asJavaIterable(x._2))) +} - private[spark] - def cogroupResult2ToJava[K: ClassTag, V, W1, W2]( - rdd: RDD[(K, (Iterable[V], Iterable[W1], Iterable[W2]))]) - : RDD[(K, (JIterable[V], JIterable[W1], JIterable[W2]))] = { - rddToPairRDDFunctions(rdd) - .mapValues(x => (asJavaIterable(x._1), asJavaIterable(x._2), asJavaIterable(x._3))) - } +private[spark] +def cogroupResult2ToJava[K: ClassTag, V, W1, W2]( + rdd: RDD[(K, (Iterable[V], Iterable[W1], Iterable[W2]))]) + : RDD[(K, (JIterable[V], JIterable[W1], JIterable[W2]))] = { + rddToPairRDDFunctions(rdd) + .mapValues(x => (asJavaIterable(x._1), asJavaIterable(x._2), asJavaIterable(x._3))) +} - private[spark] - def cogroupResult3ToJava[K: ClassTag, V, W1, W2, W3]( - rdd: RDD[(K, (Iterable[V], Iterable[W1], Iterable[W2], Iterable[W3]))]) - : RDD[(K, (JIterable[V], JIterable[W1], JIterable[W2], JIterable[W3]))] = { - rddToPairRDDFunctions(rdd) - .mapValues(x => - (asJavaIterable(x._1), asJavaIterable(x._2), asJavaIterable(x._3), asJavaIterable(x._4))) - } +private[spark] +def cogroupResult3ToJava[K: ClassTag, V, W1, W2, W3]( + rdd: RDD[(K, (Iterable[V], Iterable[W1], Iterable[W2], Iterable[W3]))]) +: RDD[(K, (JIterable[V], JIterable[W1], JIterable[W2], JIterable[W3]))] = { + rddToPairRDDFunctions(rdd) + .mapValues(x => + (asJavaIterable(x._1), asJavaIterable(x._2), asJavaIterable(x._3), asJavaIterable(x._4))) +} - def fromRDD[K: ClassTag, V: ClassTag](rdd: RDD[(K, V)]): JavaPairRDD[K, V] = { - new JavaPairRDD[K, V](rdd) - } +def fromRDD[K: ClassTag, V: ClassTag](rdd: RDD[(K, V)]): JavaPairRDD[K, V] = { + new JavaPairRDD[K, V](rdd) +} - implicit def toRDD[K, V](rdd: JavaPairRDD[K, V]): RDD[(K, V)] = rdd.rdd +implicit def toRDD[K, V](rdd: JavaPairRDD[K, V]): RDD[(K, V)] = rdd.rdd - private[spark] - implicit def toScalaFunction2[T1, T2, R](fun: JFunction2[T1, T2, R]): Function2[T1, T2, R] = { - (x: T1, x1: T2) => fun.call(x, x1) - } +private[spark] +implicit def toScalaFunction2[T1, T2, R](fun: JFunction2[T1, T2, R]): Function2[T1, T2, R] = { + (x: T1, x1: T2) => fun.call(x, x1) +} - private[spark] implicit def toScalaFunction[T, R](fun: JFunction[T, R]): T => R = x => fun.call(x) +private[spark] implicit def toScalaFunction[T, R](fun: JFunction[T, R]): T => R = x => fun.call(x) - private[spark] - implicit def pairFunToScalaFun[A, B, C](x: PairFunction[A, B, C]): A => (B, C) = y => x.call(y) +private[spark] +implicit def pairFunToScalaFun[A, B, C](x: PairFunction[A, B, C]): A => (B, C) = 
y => x.call(y) - /** Convert a JavaRDD of key-value pairs to JavaPairRDD. */ - def fromJavaRDD[K, V](rdd: JavaRDD[(K, V)]): JavaPairRDD[K, V] = { - implicit val ctagK: ClassTag[K] = fakeClassTag - implicit val ctagV: ClassTag[V] = fakeClassTag - new JavaPairRDD[K, V](rdd.rdd) - } +/** Convert a JavaRDD of key-value pairs to JavaPairRDD. */ +def fromJavaRDD[K, V](rdd: JavaRDD[(K, V)]): JavaPairRDD[K, V] = { + implicit val ctagK: ClassTag[K] = fakeClassTag + implicit val ctagV: ClassTag[V] = fakeClassTag + new JavaPairRDD[K, V](rdd.rdd) +} } diff --git a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala index b1cec0f6472b0..acd9b25c4e328 100644 --- a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala +++ b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala @@ -21,6 +21,7 @@ import java.io._ import java.net._ import java.util.{List => JList, ArrayList => JArrayList, Map => JMap, UUID, Collections} +import org.apache.commons.lang.ClassUtils import org.apache.spark.input.PortableDataStream import scala.collection.JavaConversions._ @@ -36,7 +37,7 @@ import org.apache.hadoop.mapreduce.{InputFormat => NewInputFormat, OutputFormat import org.apache.spark._ import org.apache.spark.api.java.{JavaSparkContext, JavaPairRDD, JavaRDD} import org.apache.spark.broadcast.Broadcast -import org.apache.spark.rdd.RDD +import org.apache.spark.rdd.{RDDMultipleTextOutputFormat, RDD} import org.apache.spark.util.Utils private[spark] class PythonRDD( @@ -678,6 +679,43 @@ private[spark] object PythonRDD extends Logging { converted.saveAsHadoopFile(path, kc, vc, fc, new JobConf(mergedConf), codec=codec) } + /** + * Output a Python RDD of key-value pairs to any Hadoop file system such that the values within + * the rdd are written to sub-directories organized by the associated key. + * + * Keys and values are converted to suitable output types using either user specified converters + * or, if not specified, [[org.apache.spark.api.python.JavaToWritableConverter]]. Post-conversion + * types `keyClass` and `valueClass` are automatically inferred if not specified. The passed-in + * `confAsMap` is merged with the default Hadoop conf associated with the SparkContext of + * this RDD. + */ + def saveAsHadoopFileByKey[K, V, C <: CompressionCodec]( + pyRDD: JavaRDD[Array[Byte]], + batchSerialized: Boolean, + path: String, + outputFormatClass: String, + keyClass: String, + valueClass: String, + keyConverterClass: String, + valueConverterClass: String, + confAsMap: java.util.HashMap[String, String], + compressionCodecClass: String) = { + val rdd = SerDeUtil.pythonToPairRDD(pyRDD, batchSerialized) + val (kc, vc) = getKeyValueTypes(keyClass, valueClass).getOrElse( + inferKeyValueTypes(rdd, keyConverterClass, valueConverterClass)) + val mergedConf = getMergedConf(confAsMap, pyRDD.context.hadoopConfiguration) + val codec = Option(compressionCodecClass).map(Utils.classForName(_).asInstanceOf[Class[C]]) + val converted = convertRDD(rdd, keyConverterClass, valueConverterClass, + new JavaToWritableConverter) + + converted.saveAsHadoopFile(path, + ClassUtils.primitiveToWrapper(kc), + ClassUtils.primitiveToWrapper(vc), + classOf[RDDMultipleTextOutputFormat[K,V]], + new JobConf(mergedConf), + codec=codec) + } + /** * Output a Python RDD of key-value pairs to any Hadoop file system, using new Hadoop * `OutputFormat` in mapreduce package. 
Keys and values are converted to suitable output @@ -749,10 +787,10 @@ private class PythonAccumulatorParam(@transient serverHost: String, serverPort: val bufferSize = SparkEnv.get.conf.getInt("spark.buffer.size", 65536) - /** + /** * We try to reuse a single Socket to transfer accumulator updates, as they are all added * by the DAGScheduler's single-threaded actor anyway. - */ + */ @transient var socket: Socket = _ def openSocket(): Socket = synchronized { diff --git a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala index 0a064e4d86f83..be70f4031aa99 100644 --- a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala +++ b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala @@ -876,6 +876,9 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)]) * /path/prefix/B [/part-1, /part-2, etc] * /path/prefix/F [/part-1, /part-2, etc] * /path/prefix/N [/part-1, /part-2, etc] + * + * @param path - The path for the parent directory + * @param numPartitions - The number of partitions to partition to */ def saveAsHadoopFileByKey[F <: OutputFormat[K, V]](path: String, numPartitions : Int) (implicit fm: ClassTag[F]) { diff --git a/core/src/test/scala/org/apache/spark/api/python/PythonRDDSuite.scala b/core/src/test/scala/org/apache/spark/api/python/PythonRDDSuite.scala index c63d834f9048b..551438b5b395b 100644 --- a/core/src/test/scala/org/apache/spark/api/python/PythonRDDSuite.scala +++ b/core/src/test/scala/org/apache/spark/api/python/PythonRDDSuite.scala @@ -17,11 +17,18 @@ package org.apache.spark.api.python -import java.io.{ByteArrayOutputStream, DataOutputStream} +import java.io.{BufferedReader, ByteArrayOutputStream, DataOutputStream, InputStreamReader} +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.{FileSystem, Path} +import org.apache.spark.SharedSparkContext +import org.apache.spark.api.java.JavaRDD +import org.apache.spark.rdd.RDDMultipleTextOutputFormat import org.scalatest.FunSuite -class PythonRDDSuite extends FunSuite { +import scala.collection.mutable.HashSet + +class PythonRDDSuite extends FunSuite with SharedSparkContext{ test("Writing large strings to the worker") { val input: List[String] = List("a"*100000) @@ -41,4 +48,47 @@ class PythonRDDSuite extends FunSuite { PythonRDD.writeIteratorToStream( Iterator((null, null), ("a".getBytes, null), (null, "b".getBytes)), buffer) } + + test("saveAsHadoopFileByKey should generate a text file per key") { + val testPairs : JavaRDD[Array[Byte]] = sc.parallelize( + Seq( + Array(1.toByte,1.toByte), + Array(2.toByte,4.toByte), + Array(3.toByte,9.toByte), + Array(4.toByte,16.toByte), + Array(5.toByte,25.toByte)) + ).toJavaRDD() + + val fs = FileSystem.get(new Configuration()) + val basePath = sc.conf.get("spark.local.dir", "/tmp") + val fullPath = basePath + "/testPath" + fs.delete(new Path(fullPath), true) + + PythonRDD.saveAsHadoopFileByKey( + testPairs, + false, fullPath, + classOf[RDDMultipleTextOutputFormat].toString, + classOf[Int].toString, + classOf[Int].toString, + null, + null, + new java.util.HashMap(), "") + + // Test that a file was created for each key + (1 to 5).foreach(key => { + val testPath = new Path(fullPath + "/" + key) + assert(fs.exists(testPath)) + + // Read the file and test that the contents are the values matching that key split by line + val input = fs.open(testPath) + val reader = new BufferedReader(new InputStreamReader(input)) + val values = new HashSet[Int] + val lines = 
Stream.continually(reader.readLine()).takeWhile(_ != null) + lines.foreach(s => values += s.toInt) + + assert(values.contains(key*key)) + }) + + fs.delete(new Path(fullPath), true) + } } From 00e419ce58c45cba3cb5c08c604e4e236ab5a620 Mon Sep 17 00:00:00 2001 From: Ilya Ganelin Date: Tue, 17 Mar 2015 12:57:10 -0700 Subject: [PATCH 17/25] [SPARK-3533] Added test for JavaPairRDD API --- .../spark/rdd/PairRDDFunctionsSuite.scala | 36 +++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala b/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala index ec857bef862fa..84a1f8dffb5af 100644 --- a/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala +++ b/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala @@ -19,6 +19,8 @@ package org.apache.spark.rdd import java.io.{BufferedReader, InputStreamReader} +import org.apache.spark.api.java.JavaPairRDD + import scala.collection.mutable.{ArrayBuffer, HashSet} import scala.sys.process._ import scala.util.Random @@ -69,6 +71,40 @@ class PairRDDFunctionsSuite extends FunSuite with SharedSparkContext { fs.delete(new Path(fullPath), true) } + test("JavaPairRDD.saveAsHadoopFileByKey should generate a text file per key") { + val keys = 1 to 20 + val testValues = 1 to 5 + // Generate the cartesian product of keys by test values + val pairsLocal = keys.map(k => (k, testValues)).flatMap(kv => { + kv._2.map(v => (kv._1, v*kv._1)) + }) + + val pairs = JavaPairRDD.fromRDD(sc.parallelize(pairsLocal)) + + val fs = FileSystem.get(new Configuration()) + val basePath = sc.conf.get("spark.local.dir", "/tmp") + val fullPath = basePath + "/testPath" + fs.delete(new Path(fullPath), true) + pairs.saveAsHadoopFileByKey(fullPath) + + // Test that a file was created for each key + keys.foreach(key => { + val testPath = new Path(fullPath + "/" + key) + assert(fs.exists(testPath)) + + // Read the file and test that the contents are the values matching that key split by line + val input = fs.open(testPath) + val reader = new BufferedReader(new InputStreamReader(input)) + val values = new HashSet[Int] + val lines = Stream.continually(reader.readLine()).takeWhile(_ != null) + lines.foreach(s => values += s.toInt) + + testValues.foreach(v => assert(values.contains(v*key))) + }) + + fs.delete(new Path(fullPath), true) + } + test("aggregateByKey") { val pairs = sc.parallelize(Array((1, 1), (1, 1), (3, 2), (5, 1), (5, 3)), 2) From 2ade1184722caf4a99e9ab1e09881e5c5cb68a39 Mon Sep 17 00:00:00 2001 From: Ilya Ganelin Date: Tue, 17 Mar 2015 14:29:55 -0700 Subject: [PATCH 18/25] reordering to simplify diff --- .../apache/spark/api/java/JavaPairRDD.scala | 398 +++++++++--------- .../spark/api/python/PythonRDDSuite.scala | 3 +- 2 files changed, 201 insertions(+), 200 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/api/java/JavaPairRDD.scala b/core/src/main/scala/org/apache/spark/api/java/JavaPairRDD.scala index ae54b94c4855c..3ec4b3ad2134a 100644 --- a/core/src/main/scala/org/apache/spark/api/java/JavaPairRDD.scala +++ b/core/src/main/scala/org/apache/spark/api/java/JavaPairRDD.scala @@ -742,6 +742,72 @@ class JavaPairRDD[K, V](val rdd: RDD[(K, V)]) */ def lookup(key: K): JList[V] = seqAsJavaList(rdd.lookup(key)) + /** Output the RDD to any Hadoop-supported file system. 
*/ + def saveAsHadoopFile[F <: OutputFormat[_, _]]( + path: String, + keyClass: Class[_], + valueClass: Class[_], + outputFormatClass: Class[F], + conf: JobConf) { + rdd.saveAsHadoopFile(path, keyClass, valueClass, outputFormatClass, conf) + } + + /** Output the RDD to any Hadoop-supported file system. */ + def saveAsHadoopFile[F <: OutputFormat[_, _]]( + path: String, + keyClass: Class[_], + valueClass: Class[_], + outputFormatClass: Class[F]) { + rdd.saveAsHadoopFile(path, keyClass, valueClass, outputFormatClass) + } + + /** Output the RDD to any Hadoop-supported file system, compressing with the supplied codec. */ + def saveAsHadoopFile[F <: OutputFormat[_, _]]( + path: String, + keyClass: Class[_], + valueClass: Class[_], + outputFormatClass: Class[F], + codec: Class[_ <: CompressionCodec]) { + rdd.saveAsHadoopFile(path, keyClass, valueClass, outputFormatClass, codec) + } + + /** Output the RDD to any Hadoop-supported file system. */ + def saveAsNewAPIHadoopFile[F <: NewOutputFormat[_, _]]( + path: String, + keyClass: Class[_], + valueClass: Class[_], + outputFormatClass: Class[F], + conf: Configuration) { + rdd.saveAsNewAPIHadoopFile(path, keyClass, valueClass, outputFormatClass, conf) + } + + /** + * Output the RDD to any Hadoop-supported storage system, using + * a Configuration object for that storage system. + */ + def saveAsNewAPIHadoopDataset(conf: Configuration) { + rdd.saveAsNewAPIHadoopDataset(conf) + } + + /** Output the RDD to any Hadoop-supported file system. */ + def saveAsNewAPIHadoopFile[F <: NewOutputFormat[_, _]]( + path: String, + keyClass: Class[_], + valueClass: Class[_], + outputFormatClass: Class[F]) { + rdd.saveAsNewAPIHadoopFile(path, keyClass, valueClass, outputFormatClass) + } + + /** + * Output the RDD to any Hadoop-supported storage system, using a Hadoop JobConf object for + * that storage system. The JobConf should set an OutputFormat and any output paths required + * (e.g. a table name to write to) in the same way as it would be configured for a Hadoop + * MapReduce job. + */ + def saveAsHadoopDataset(conf: JobConf) { + rdd.saveAsHadoopDataset(conf) + } + /* * Output the RDD to multiple files by key on any Hadoop-supported file system, using a Hadoop * `OutputFormat` class supporting the key and value types K and V in this RDD. @@ -768,218 +834,152 @@ class JavaPairRDD[K, V](val rdd: RDD[(K, V)]) rdd.saveAsHadoopFileByKey(path) } -/** Output the RDD to any Hadoop-supported file system. */ -def saveAsHadoopFile[F <: OutputFormat[_, _]]( - path: String, - keyClass: Class[_], - valueClass: Class[_], - outputFormatClass: Class[F], - conf: JobConf) { - rdd.saveAsHadoopFile(path, keyClass, valueClass, outputFormatClass, conf) -} - -/** Output the RDD to any Hadoop-supported file system. */ -def saveAsHadoopFile[F <: OutputFormat[_, _]]( - path: String, - keyClass: Class[_], - valueClass: Class[_], - outputFormatClass: Class[F]) { - rdd.saveAsHadoopFile(path, keyClass, valueClass, outputFormatClass) -} - -/** Output the RDD to any Hadoop-supported file system, compressing with the supplied codec. */ -def saveAsHadoopFile[F <: OutputFormat[_, _]]( - path: String, - keyClass: Class[_], - valueClass: Class[_], - outputFormatClass: Class[F], - codec: Class[_ <: CompressionCodec]) { - rdd.saveAsHadoopFile(path, keyClass, valueClass, outputFormatClass, codec) -} - -/** Output the RDD to any Hadoop-supported file system. 
*/ -def saveAsNewAPIHadoopFile[F <: NewOutputFormat[_, _]]( - path: String, - keyClass: Class[_], - valueClass: Class[_], - outputFormatClass: Class[F], - conf: Configuration) { - rdd.saveAsNewAPIHadoopFile(path, keyClass, valueClass, outputFormatClass, conf) -} - -/** - * Output the RDD to any Hadoop-supported storage system, using - * a Configuration object for that storage system. - */ -def saveAsNewAPIHadoopDataset(conf: Configuration) { - rdd.saveAsNewAPIHadoopDataset(conf) -} - -/** Output the RDD to any Hadoop-supported file system. */ -def saveAsNewAPIHadoopFile[F <: NewOutputFormat[_, _]]( - path: String, - keyClass: Class[_], - valueClass: Class[_], - outputFormatClass: Class[F]) { - rdd.saveAsNewAPIHadoopFile(path, keyClass, valueClass, outputFormatClass) -} - -/** - * Output the RDD to any Hadoop-supported storage system, using a Hadoop JobConf object for - * that storage system. The JobConf should set an OutputFormat and any output paths required - * (e.g. a table name to write to) in the same way as it would be configured for a Hadoop - * MapReduce job. - */ -def saveAsHadoopDataset(conf: JobConf) { - rdd.saveAsHadoopDataset(conf) -} - -/** - * Repartition the RDD according to the given partitioner and, within each resulting partition, - * sort records by their keys. - * - * This is more efficient than calling `repartition` and then sorting within each partition - * because it can push the sorting down into the shuffle machinery. - */ -def repartitionAndSortWithinPartitions(partitioner: Partitioner): JavaPairRDD[K, V] = { - val comp = com.google.common.collect.Ordering.natural().asInstanceOf[Comparator[K]] - repartitionAndSortWithinPartitions(partitioner, comp) -} + /** + * Repartition the RDD according to the given partitioner and, within each resulting partition, + * sort records by their keys. + * + * This is more efficient than calling `repartition` and then sorting within each partition + * because it can push the sorting down into the shuffle machinery. + */ + def repartitionAndSortWithinPartitions(partitioner: Partitioner): JavaPairRDD[K, V] = { + val comp = com.google.common.collect.Ordering.natural().asInstanceOf[Comparator[K]] + repartitionAndSortWithinPartitions(partitioner, comp) + } -/** - * Repartition the RDD according to the given partitioner and, within each resulting partition, - * sort records by their keys. - * - * This is more efficient than calling `repartition` and then sorting within each partition - * because it can push the sorting down into the shuffle machinery. - */ -def repartitionAndSortWithinPartitions(partitioner: Partitioner, comp: Comparator[K]) - : JavaPairRDD[K, V] = { - implicit val ordering = comp // Allow implicit conversion of Comparator to Ordering. - fromRDD( - new OrderedRDDFunctions[K, V, (K, V)](rdd).repartitionAndSortWithinPartitions(partitioner)) -} + /** + * Repartition the RDD according to the given partitioner and, within each resulting partition, + * sort records by their keys. + * + * This is more efficient than calling `repartition` and then sorting within each partition + * because it can push the sorting down into the shuffle machinery. + */ + def repartitionAndSortWithinPartitions(partitioner: Partitioner, comp: Comparator[K]) + : JavaPairRDD[K, V] = { + implicit val ordering = comp // Allow implicit conversion of Comparator to Ordering. 
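[Editor's aside, not part of the patch series: a minimal sketch of how the underlying Scala `repartitionAndSortWithinPartitions` that these Java wrappers forward to is typically driven. It assumes an already-running SparkContext named `sc`; the `events` data set is hypothetical.]

    import org.apache.spark.HashPartitioner

    // Hypothetical keyed data; any RDD[(K, V)] with an Ordering on K works.
    val events = sc.parallelize(Seq(("b", 2), ("a", 1), ("a", 3)))

    // Shuffle into 4 partitions and sort records by key inside each partition
    // during the shuffle itself, instead of repartitioning and then sorting.
    val sorted = events.repartitionAndSortWithinPartitions(new HashPartitioner(4))
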
+ fromRDD( + new OrderedRDDFunctions[K, V, (K, V)](rdd).repartitionAndSortWithinPartitions(partitioner)) + } -/** - * Sort the RDD by key, so that each partition contains a sorted range of the elements in - * ascending order. Calling `collect` or `save` on the resulting RDD will return or output an - * ordered list of records (in the `save` case, they will be written to multiple `part-X` files - * in the filesystem, in order of the keys). - */ -def sortByKey(): JavaPairRDD[K, V] = sortByKey(true) + /** + * Sort the RDD by key, so that each partition contains a sorted range of the elements in + * ascending order. Calling `collect` or `save` on the resulting RDD will return or output an + * ordered list of records (in the `save` case, they will be written to multiple `part-X` files + * in the filesystem, in order of the keys). + */ + def sortByKey(): JavaPairRDD[K, V] = sortByKey(true) -/** - * Sort the RDD by key, so that each partition contains a sorted range of the elements. Calling - * `collect` or `save` on the resulting RDD will return or output an ordered list of records - * (in the `save` case, they will be written to multiple `part-X` files in the filesystem, in - * order of the keys). - */ -def sortByKey(ascending: Boolean): JavaPairRDD[K, V] = { - val comp = com.google.common.collect.Ordering.natural().asInstanceOf[Comparator[K]] - sortByKey(comp, ascending) -} + /** + * Sort the RDD by key, so that each partition contains a sorted range of the elements. Calling + * `collect` or `save` on the resulting RDD will return or output an ordered list of records + * (in the `save` case, they will be written to multiple `part-X` files in the filesystem, in + * order of the keys). + */ + def sortByKey(ascending: Boolean): JavaPairRDD[K, V] = { + val comp = com.google.common.collect.Ordering.natural().asInstanceOf[Comparator[K]] + sortByKey(comp, ascending) + } -/** - * Sort the RDD by key, so that each partition contains a sorted range of the elements. Calling - * `collect` or `save` on the resulting RDD will return or output an ordered list of records - * (in the `save` case, they will be written to multiple `part-X` files in the filesystem, in - * order of the keys). - */ -def sortByKey(ascending: Boolean, numPartitions: Int): JavaPairRDD[K, V] = { - val comp = com.google.common.collect.Ordering.natural().asInstanceOf[Comparator[K]] - sortByKey(comp, ascending, numPartitions) -} + /** + * Sort the RDD by key, so that each partition contains a sorted range of the elements. Calling + * `collect` or `save` on the resulting RDD will return or output an ordered list of records + * (in the `save` case, they will be written to multiple `part-X` files in the filesystem, in + * order of the keys). + */ + def sortByKey(ascending: Boolean, numPartitions: Int): JavaPairRDD[K, V] = { + val comp = com.google.common.collect.Ordering.natural().asInstanceOf[Comparator[K]] + sortByKey(comp, ascending, numPartitions) + } -/** - * Sort the RDD by key, so that each partition contains a sorted range of the elements. Calling - * `collect` or `save` on the resulting RDD will return or output an ordered list of records - * (in the `save` case, they will be written to multiple `part-X` files in the filesystem, in - * order of the keys). - */ -def sortByKey(comp: Comparator[K]): JavaPairRDD[K, V] = sortByKey(comp, true) + /** + * Sort the RDD by key, so that each partition contains a sorted range of the elements. 
Calling + * `collect` or `save` on the resulting RDD will return or output an ordered list of records + * (in the `save` case, they will be written to multiple `part-X` files in the filesystem, in + * order of the keys). + */ + def sortByKey(comp: Comparator[K]): JavaPairRDD[K, V] = sortByKey(comp, true) -/** - * Sort the RDD by key, so that each partition contains a sorted range of the elements. Calling - * `collect` or `save` on the resulting RDD will return or output an ordered list of records - * (in the `save` case, they will be written to multiple `part-X` files in the filesystem, in - * order of the keys). - */ -def sortByKey(comp: Comparator[K], ascending: Boolean): JavaPairRDD[K, V] = { - implicit val ordering = comp // Allow implicit conversion of Comparator to Ordering. - fromRDD(new OrderedRDDFunctions[K, V, (K, V)](rdd).sortByKey(ascending)) -} + /** + * Sort the RDD by key, so that each partition contains a sorted range of the elements. Calling + * `collect` or `save` on the resulting RDD will return or output an ordered list of records + * (in the `save` case, they will be written to multiple `part-X` files in the filesystem, in + * order of the keys). + */ + def sortByKey(comp: Comparator[K], ascending: Boolean): JavaPairRDD[K, V] = { + implicit val ordering = comp // Allow implicit conversion of Comparator to Ordering. + fromRDD(new OrderedRDDFunctions[K, V, (K, V)](rdd).sortByKey(ascending)) + } -/** - * Sort the RDD by key, so that each partition contains a sorted range of the elements. Calling - * `collect` or `save` on the resulting RDD will return or output an ordered list of records - * (in the `save` case, they will be written to multiple `part-X` files in the filesystem, in - * order of the keys). - */ -def sortByKey(comp: Comparator[K], ascending: Boolean, numPartitions: Int): JavaPairRDD[K, V] = { - implicit val ordering = comp // Allow implicit conversion of Comparator to Ordering. - fromRDD(new OrderedRDDFunctions[K, V, (K, V)](rdd).sortByKey(ascending, numPartitions)) -} + /** + * Sort the RDD by key, so that each partition contains a sorted range of the elements. Calling + * `collect` or `save` on the resulting RDD will return or output an ordered list of records + * (in the `save` case, they will be written to multiple `part-X` files in the filesystem, in + * order of the keys). + */ + def sortByKey(comp: Comparator[K], ascending: Boolean, numPartitions: Int): JavaPairRDD[K, V] = { + implicit val ordering = comp // Allow implicit conversion of Comparator to Ordering. + fromRDD(new OrderedRDDFunctions[K, V, (K, V)](rdd).sortByKey(ascending, numPartitions)) + } -/** - * Return an RDD with the keys of each tuple. - */ -def keys(): JavaRDD[K] = JavaRDD.fromRDD[K](rdd.map(_._1)) + /** + * Return an RDD with the keys of each tuple. + */ + def keys(): JavaRDD[K] = JavaRDD.fromRDD[K](rdd.map(_._1)) -/** - * Return an RDD with the values of each tuple. - */ -def values(): JavaRDD[V] = JavaRDD.fromRDD[V](rdd.map(_._2)) + /** + * Return an RDD with the values of each tuple. + */ + def values(): JavaRDD[V] = JavaRDD.fromRDD[V](rdd.map(_._2)) -/** - * Return approximate number of distinct values for each key in this RDD. - * - * The algorithm used is based on streamlib's implementation of "HyperLogLog in Practice: - * Algorithmic Engineering of a State of The Art Cardinality Estimation Algorithm", available - * here. - * - * @param relativeSD Relative accuracy. Smaller values create counters that require more space. - * It must be greater than 0.000017. 
- * @param partitioner partitioner of the resulting RDD. - */ -def countApproxDistinctByKey(relativeSD: Double, partitioner: Partitioner): JavaPairRDD[K, Long] = -{ - fromRDD(rdd.countApproxDistinctByKey(relativeSD, partitioner)) -} + /** + * Return approximate number of distinct values for each key in this RDD. + * + * The algorithm used is based on streamlib's implementation of "HyperLogLog in Practice: + * Algorithmic Engineering of a State of The Art Cardinality Estimation Algorithm", available + * here. + * + * @param relativeSD Relative accuracy. Smaller values create counters that require more space. + * It must be greater than 0.000017. + * @param partitioner partitioner of the resulting RDD. + */ + def countApproxDistinctByKey(relativeSD: Double, partitioner: Partitioner): JavaPairRDD[K, Long] = + { + fromRDD(rdd.countApproxDistinctByKey(relativeSD, partitioner)) + } -/** - * Return approximate number of distinct values for each key in this RDD. - * - * The algorithm used is based on streamlib's implementation of "HyperLogLog in Practice: - * Algorithmic Engineering of a State of The Art Cardinality Estimation Algorithm", available - * here. - * - * @param relativeSD Relative accuracy. Smaller values create counters that require more space. - * It must be greater than 0.000017. - * @param numPartitions number of partitions of the resulting RDD. - */ -def countApproxDistinctByKey(relativeSD: Double, numPartitions: Int): JavaPairRDD[K, Long] = { - fromRDD(rdd.countApproxDistinctByKey(relativeSD, numPartitions)) -} + /** + * Return approximate number of distinct values for each key in this RDD. + * + * The algorithm used is based on streamlib's implementation of "HyperLogLog in Practice: + * Algorithmic Engineering of a State of The Art Cardinality Estimation Algorithm", available + * here. + * + * @param relativeSD Relative accuracy. Smaller values create counters that require more space. + * It must be greater than 0.000017. + * @param numPartitions number of partitions of the resulting RDD. + */ + def countApproxDistinctByKey(relativeSD: Double, numPartitions: Int): JavaPairRDD[K, Long] = { + fromRDD(rdd.countApproxDistinctByKey(relativeSD, numPartitions)) + } -/** - * Return approximate number of distinct values for each key in this RDD. - * - * The algorithm used is based on streamlib's implementation of "HyperLogLog in Practice: - * Algorithmic Engineering of a State of The Art Cardinality Estimation Algorithm", available - * here. - * - * @param relativeSD Relative accuracy. Smaller values create counters that require more space. - * It must be greater than 0.000017. - */ -def countApproxDistinctByKey(relativeSD: Double): JavaPairRDD[K, Long] = { - fromRDD(rdd.countApproxDistinctByKey(relativeSD)) -} + /** + * Return approximate number of distinct values for each key in this RDD. + * + * The algorithm used is based on streamlib's implementation of "HyperLogLog in Practice: + * Algorithmic Engineering of a State of The Art Cardinality Estimation Algorithm", available + * here. + * + * @param relativeSD Relative accuracy. Smaller values create counters that require more space. + * It must be greater than 0.000017. 
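[Editor's aside, not part of the patch series: a sketch of the HyperLogLog-based counter from the Scala RDD that these wrappers delegate to, assuming a running SparkContext `sc` and a hypothetical visit log; 0.05 is the relative accuracy traded against sketch size and must stay above 0.000017.]

    // Hypothetical visit log keyed by user id.
    val visits = sc.parallelize(Seq(("u1", "home"), ("u1", "home"), ("u1", "cart"), ("u2", "home")))

    // Approximate count of distinct pages per user; a larger relativeSD means
    // smaller counters but a coarser estimate.
    val distinctPages = visits.countApproxDistinctByKey(relativeSD = 0.05)
    // collectAsMap() would yield roughly Map("u1" -> 2, "u2" -> 1)
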
+ */ + def countApproxDistinctByKey(relativeSD: Double): JavaPairRDD[K, Long] = { + fromRDD(rdd.countApproxDistinctByKey(relativeSD)) + } -/** Assign a name to this RDD */ -def setName(name: String): JavaPairRDD[K, V] = { - rdd.setName(name) - this -} + /** Assign a name to this RDD */ + def setName(name: String): JavaPairRDD[K, V] = { + rdd.setName(name) + this + } } object JavaPairRDD { diff --git a/core/src/test/scala/org/apache/spark/api/python/PythonRDDSuite.scala b/core/src/test/scala/org/apache/spark/api/python/PythonRDDSuite.scala index 551438b5b395b..f536c7f1ca70c 100644 --- a/core/src/test/scala/org/apache/spark/api/python/PythonRDDSuite.scala +++ b/core/src/test/scala/org/apache/spark/api/python/PythonRDDSuite.scala @@ -66,7 +66,8 @@ class PythonRDDSuite extends FunSuite with SharedSparkContext{ PythonRDD.saveAsHadoopFileByKey( testPairs, - false, fullPath, + false, + fullPath, classOf[RDDMultipleTextOutputFormat].toString, classOf[Int].toString, classOf[Int].toString, From 89215625eba2970da213efa9c6d98891f9b890c8 Mon Sep 17 00:00:00 2001 From: Ilya Ganelin Date: Tue, 17 Mar 2015 14:31:10 -0700 Subject: [PATCH 19/25] spacing fix --- .../apache/spark/api/java/JavaPairRDD.scala | 82 +++++++++---------- 1 file changed, 41 insertions(+), 41 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/api/java/JavaPairRDD.scala b/core/src/main/scala/org/apache/spark/api/java/JavaPairRDD.scala index 3ec4b3ad2134a..e8bd18c57d2c5 100644 --- a/core/src/main/scala/org/apache/spark/api/java/JavaPairRDD.scala +++ b/core/src/main/scala/org/apache/spark/api/java/JavaPairRDD.scala @@ -983,55 +983,55 @@ class JavaPairRDD[K, V](val rdd: RDD[(K, V)]) } object JavaPairRDD { -private[spark] -def groupByResultToJava[K: ClassTag, T](rdd: RDD[(K, Iterable[T])]): RDD[(K, JIterable[T])] = { - rddToPairRDDFunctions(rdd).mapValues(asJavaIterable) -} + private[spark] + def groupByResultToJava[K: ClassTag, T](rdd: RDD[(K, Iterable[T])]): RDD[(K, JIterable[T])] = { + rddToPairRDDFunctions(rdd).mapValues(asJavaIterable) + } -private[spark] -def cogroupResultToJava[K: ClassTag, V, W]( - rdd: RDD[(K, (Iterable[V], Iterable[W]))]): RDD[(K, (JIterable[V], JIterable[W]))] = { - rddToPairRDDFunctions(rdd).mapValues(x => (asJavaIterable(x._1), asJavaIterable(x._2))) -} + private[spark] + def cogroupResultToJava[K: ClassTag, V, W]( + rdd: RDD[(K, (Iterable[V], Iterable[W]))]): RDD[(K, (JIterable[V], JIterable[W]))] = { + rddToPairRDDFunctions(rdd).mapValues(x => (asJavaIterable(x._1), asJavaIterable(x._2))) + } -private[spark] -def cogroupResult2ToJava[K: ClassTag, V, W1, W2]( - rdd: RDD[(K, (Iterable[V], Iterable[W1], Iterable[W2]))]) - : RDD[(K, (JIterable[V], JIterable[W1], JIterable[W2]))] = { - rddToPairRDDFunctions(rdd) - .mapValues(x => (asJavaIterable(x._1), asJavaIterable(x._2), asJavaIterable(x._3))) -} + private[spark] + def cogroupResult2ToJava[K: ClassTag, V, W1, W2]( + rdd: RDD[(K, (Iterable[V], Iterable[W1], Iterable[W2]))]) + : RDD[(K, (JIterable[V], JIterable[W1], JIterable[W2]))] = { + rddToPairRDDFunctions(rdd) + .mapValues(x => (asJavaIterable(x._1), asJavaIterable(x._2), asJavaIterable(x._3))) + } -private[spark] -def cogroupResult3ToJava[K: ClassTag, V, W1, W2, W3]( - rdd: RDD[(K, (Iterable[V], Iterable[W1], Iterable[W2], Iterable[W3]))]) -: RDD[(K, (JIterable[V], JIterable[W1], JIterable[W2], JIterable[W3]))] = { - rddToPairRDDFunctions(rdd) - .mapValues(x => - (asJavaIterable(x._1), asJavaIterable(x._2), asJavaIterable(x._3), asJavaIterable(x._4))) -} + private[spark] + def 
cogroupResult3ToJava[K: ClassTag, V, W1, W2, W3]( + rdd: RDD[(K, (Iterable[V], Iterable[W1], Iterable[W2], Iterable[W3]))]) + : RDD[(K, (JIterable[V], JIterable[W1], JIterable[W2], JIterable[W3]))] = { + rddToPairRDDFunctions(rdd) + .mapValues(x => + (asJavaIterable(x._1), asJavaIterable(x._2), asJavaIterable(x._3), asJavaIterable(x._4))) + } -def fromRDD[K: ClassTag, V: ClassTag](rdd: RDD[(K, V)]): JavaPairRDD[K, V] = { - new JavaPairRDD[K, V](rdd) -} + def fromRDD[K: ClassTag, V: ClassTag](rdd: RDD[(K, V)]): JavaPairRDD[K, V] = { + new JavaPairRDD[K, V](rdd) + } -implicit def toRDD[K, V](rdd: JavaPairRDD[K, V]): RDD[(K, V)] = rdd.rdd + implicit def toRDD[K, V](rdd: JavaPairRDD[K, V]): RDD[(K, V)] = rdd.rdd -private[spark] -implicit def toScalaFunction2[T1, T2, R](fun: JFunction2[T1, T2, R]): Function2[T1, T2, R] = { - (x: T1, x1: T2) => fun.call(x, x1) -} + private[spark] + implicit def toScalaFunction2[T1, T2, R](fun: JFunction2[T1, T2, R]): Function2[T1, T2, R] = { + (x: T1, x1: T2) => fun.call(x, x1) + } -private[spark] implicit def toScalaFunction[T, R](fun: JFunction[T, R]): T => R = x => fun.call(x) + private[spark] implicit def toScalaFunction[T, R](fun: JFunction[T, R]): T => R = x => fun.call(x) -private[spark] -implicit def pairFunToScalaFun[A, B, C](x: PairFunction[A, B, C]): A => (B, C) = y => x.call(y) + private[spark] + implicit def pairFunToScalaFun[A, B, C](x: PairFunction[A, B, C]): A => (B, C) = y => x.call(y) -/** Convert a JavaRDD of key-value pairs to JavaPairRDD. */ -def fromJavaRDD[K, V](rdd: JavaRDD[(K, V)]): JavaPairRDD[K, V] = { - implicit val ctagK: ClassTag[K] = fakeClassTag - implicit val ctagV: ClassTag[V] = fakeClassTag - new JavaPairRDD[K, V](rdd.rdd) -} + /** Convert a JavaRDD of key-value pairs to JavaPairRDD. */ + def fromJavaRDD[K, V](rdd: JavaRDD[(K, V)]): JavaPairRDD[K, V] = { + implicit val ctagK: ClassTag[K] = fakeClassTag + implicit val ctagV: ClassTag[V] = fakeClassTag + new JavaPairRDD[K, V](rdd.rdd) + } } From 1ed32184e03188292c0d0587dad0669b61eeba51 Mon Sep 17 00:00:00 2001 From: Ilya Ganelin Date: Tue, 17 Mar 2015 14:35:22 -0700 Subject: [PATCH 20/25] [SPARK-3533]Input ordering --- .../scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala b/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala index 84a1f8dffb5af..f2035e0ad4bca 100644 --- a/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala +++ b/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala @@ -19,8 +19,6 @@ package org.apache.spark.rdd import java.io.{BufferedReader, InputStreamReader} -import org.apache.spark.api.java.JavaPairRDD - import scala.collection.mutable.{ArrayBuffer, HashSet} import scala.sys.process._ import scala.util.Random @@ -31,9 +29,11 @@ import org.apache.hadoop.mapred._ import org.apache.hadoop.mapreduce.{JobContext => NewJobContext, OutputCommitter => NewOutputCommitter, OutputFormat => NewOutputFormat, RecordWriter => NewRecordWriter, TaskAttemptContext => NewTaskAttempContext} +import org.apache.hadoop.util.Progressable + +import org.apache.spark.api.java.JavaPairRDD import org.apache.spark.{Partitioner, SharedSparkContext} import org.apache.spark.util.Utils -import org.apache.hadoop.util.Progressable import org.scalatest.FunSuite From df6d89c746d947367bb2bc68471cf41fc5021096 Mon Sep 17 00:00:00 2001 From: Ilya Ganelin Date: Wed, 18 Mar 2015 06:40:18 -0700 Subject: [PATCH 
21/25] Attempting to fix build --- .../apache/spark/api/java/JavaPairRDD.scala | 24 ++--- .../apache/spark/api/python/PythonRDD.scala | 54 +++++------ .../spark/api/python/PythonRDDSuite.scala | 90 +++++++++---------- .../spark/rdd/PairRDDFunctionsSuite.scala | 66 +++++++------- 4 files changed, 117 insertions(+), 117 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/api/java/JavaPairRDD.scala b/core/src/main/scala/org/apache/spark/api/java/JavaPairRDD.scala index 65637a63bfd3f..6d37fd5ab8061 100644 --- a/core/src/main/scala/org/apache/spark/api/java/JavaPairRDD.scala +++ b/core/src/main/scala/org/apache/spark/api/java/JavaPairRDD.scala @@ -821,18 +821,18 @@ class JavaPairRDD[K, V](val rdd: RDD[(K, V)]) * @param path - The path for the parent directory * @param numPartitions - The number of partitions to partition to */ - def saveAsHadoopFileByKey[F <: OutputFormat[K, V]](path: String, numPartitions : Int) - (implicit fm: ClassTag[F]) { - rdd.saveAsHadoopFileByKey(path, numPartitions) - } - - /* - * Output the RDD to multiple files by key on any Hadoop-supported file system, using a Hadoop - * `OutputFormat` class supporting the key and value types K and V in this RDD. - */ - def saveAsHadoopFileByKey[F <: OutputFormat[K, V]](path: String)(implicit fm: ClassTag[F]) { - rdd.saveAsHadoopFileByKey(path) - } +// def saveAsHadoopFileByKey[F <: OutputFormat[K, V]](path: String, numPartitions : Int) +// (implicit fm: ClassTag[F]) { +// rdd.saveAsHadoopFileByKey(path, numPartitions) +// } +// +// /* +// * Output the RDD to multiple files by key on any Hadoop-supported file system, using a Hadoop +// * `OutputFormat` class supporting the key and value types K and V in this RDD. +// */ +// def saveAsHadoopFileByKey[F <: OutputFormat[K, V]](path: String)(implicit fm: ClassTag[F]) { +// rdd.saveAsHadoopFileByKey(path) +// } /** * Repartition the RDD according to the given partitioner and, within each resulting partition, diff --git a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala index ea3faba240321..8a153d49f9600 100644 --- a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala +++ b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala @@ -19,7 +19,7 @@ package org.apache.spark.api.python import java.io._ import java.net._ -import java.util.{Collections, ArrayList => JArrayList, List => JList, Map => JMap, UUID} +import java.util.{Collections, ArrayList => JArrayList, List => JList, Map => JMap} import scala.collection.JavaConversions._ import scala.collection.mutable @@ -734,32 +734,32 @@ private[spark] object PythonRDD extends Logging { * `confAsMap` is merged with the default Hadoop conf associated with the SparkContext of * this RDD. 
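[Editor's aside, not part of the patch series: for orientation while these wrappers are temporarily commented out, this is roughly what the API proposed in this series looks like from the Scala side, modeled on the suite's test data. The SparkContext `sc`, the output path, and the chosen OutputFormat type argument are assumptions for illustration only.]

    // Hypothetical keyed records; the proposed call writes one output per key
    // under the given parent path.
    val squares = sc.parallelize(Seq(("a", 1), ("b", 4), ("c", 9)))

    // Proposed API from this patch series: hash-partition into 2 partitions,
    // then write each key's values under /tmp/squares-by-key/<key>.
    squares.saveAsHadoopFileByKey[org.apache.hadoop.mapred.TextOutputFormat[String, Int]](
      "/tmp/squares-by-key", numPartitions = 2)
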
*/ - def saveAsHadoopFileByKey[K, V, C <: CompressionCodec]( - pyRDD: JavaRDD[Array[Byte]], - batchSerialized: Boolean, - path: String, - outputFormatClass: String, - keyClass: String, - valueClass: String, - keyConverterClass: String, - valueConverterClass: String, - confAsMap: java.util.HashMap[String, String], - compressionCodecClass: String) = { - val rdd = SerDeUtil.pythonToPairRDD(pyRDD, batchSerialized) - val (kc, vc) = getKeyValueTypes(keyClass, valueClass).getOrElse( - inferKeyValueTypes(rdd, keyConverterClass, valueConverterClass)) - val mergedConf = getMergedConf(confAsMap, pyRDD.context.hadoopConfiguration) - val codec = Option(compressionCodecClass).map(Utils.classForName(_).asInstanceOf[Class[C]]) - val converted = convertRDD(rdd, keyConverterClass, valueConverterClass, - new JavaToWritableConverter) - - converted.saveAsHadoopFile(path, - ClassUtils.primitiveToWrapper(kc), - ClassUtils.primitiveToWrapper(vc), - classOf[RDDMultipleTextOutputFormat[K,V]], - new JobConf(mergedConf), - codec=codec) - } +// def saveAsHadoopFileByKey[K, V, C <: CompressionCodec]( +// pyRDD: JavaRDD[Array[Byte]], +// batchSerialized: Boolean, +// path: String, +// outputFormatClass: String, +// keyClass: String, +// valueClass: String, +// keyConverterClass: String, +// valueConverterClass: String, +// confAsMap: java.util.HashMap[String, String], +// compressionCodecClass: String) = { +// val rdd = SerDeUtil.pythonToPairRDD(pyRDD, batchSerialized) +// val (kc, vc) = getKeyValueTypes(keyClass, valueClass).getOrElse( +// inferKeyValueTypes(rdd, keyConverterClass, valueConverterClass)) +// val mergedConf = getMergedConf(confAsMap, pyRDD.context.hadoopConfiguration) +// val codec = Option(compressionCodecClass).map(Utils.classForName(_).asInstanceOf[Class[C]]) +// val converted = convertRDD(rdd, keyConverterClass, valueConverterClass, +// new JavaToWritableConverter) +// +// converted.saveAsHadoopFile(path, +// ClassUtils.primitiveToWrapper(kc), +// ClassUtils.primitiveToWrapper(vc), +// classOf[RDDMultipleTextOutputFormat[K,V]], +// new JobConf(mergedConf), +// codec=codec) +// } /** * Output a Python RDD of key-value pairs to any Hadoop file system, using new Hadoop diff --git a/core/src/test/scala/org/apache/spark/api/python/PythonRDDSuite.scala b/core/src/test/scala/org/apache/spark/api/python/PythonRDDSuite.scala index f536c7f1ca70c..14f5f00f22f2e 100644 --- a/core/src/test/scala/org/apache/spark/api/python/PythonRDDSuite.scala +++ b/core/src/test/scala/org/apache/spark/api/python/PythonRDDSuite.scala @@ -28,7 +28,7 @@ import org.scalatest.FunSuite import scala.collection.mutable.HashSet -class PythonRDDSuite extends FunSuite with SharedSparkContext{ +class PythonRDDSuite extends FunSuite { //with SharedSparkContext{ test("Writing large strings to the worker") { val input: List[String] = List("a"*100000) @@ -48,48 +48,48 @@ class PythonRDDSuite extends FunSuite with SharedSparkContext{ PythonRDD.writeIteratorToStream( Iterator((null, null), ("a".getBytes, null), (null, "b".getBytes)), buffer) } - - test("saveAsHadoopFileByKey should generate a text file per key") { - val testPairs : JavaRDD[Array[Byte]] = sc.parallelize( - Seq( - Array(1.toByte,1.toByte), - Array(2.toByte,4.toByte), - Array(3.toByte,9.toByte), - Array(4.toByte,16.toByte), - Array(5.toByte,25.toByte)) - ).toJavaRDD() - - val fs = FileSystem.get(new Configuration()) - val basePath = sc.conf.get("spark.local.dir", "/tmp") - val fullPath = basePath + "/testPath" - fs.delete(new Path(fullPath), true) - - 
PythonRDD.saveAsHadoopFileByKey( - testPairs, - false, - fullPath, - classOf[RDDMultipleTextOutputFormat].toString, - classOf[Int].toString, - classOf[Int].toString, - null, - null, - new java.util.HashMap(), "") - - // Test that a file was created for each key - (1 to 5).foreach(key => { - val testPath = new Path(fullPath + "/" + key) - assert(fs.exists(testPath)) - - // Read the file and test that the contents are the values matching that key split by line - val input = fs.open(testPath) - val reader = new BufferedReader(new InputStreamReader(input)) - val values = new HashSet[Int] - val lines = Stream.continually(reader.readLine()).takeWhile(_ != null) - lines.foreach(s => values += s.toInt) - - assert(values.contains(key*key)) - }) - - fs.delete(new Path(fullPath), true) - } +// +// test("saveAsHadoopFileByKey should generate a text file per key") { +// val testPairs : JavaRDD[Array[Byte]] = sc.parallelize( +// Seq( +// Array(1.toByte,1.toByte), +// Array(2.toByte,4.toByte), +// Array(3.toByte,9.toByte), +// Array(4.toByte,16.toByte), +// Array(5.toByte,25.toByte)) +// ).toJavaRDD() +// +// val fs = FileSystem.get(new Configuration()) +// val basePath = sc.conf.get("spark.local.dir", "/tmp") +// val fullPath = basePath + "/testPath" +// fs.delete(new Path(fullPath), true) +// +// PythonRDD.saveAsHadoopFileByKey( +// testPairs, +// false, +// fullPath, +// classOf[RDDMultipleTextOutputFormat].toString, +// classOf[Int].toString, +// classOf[Int].toString, +// null, +// null, +// new java.util.HashMap(), "") +// +// // Test that a file was created for each key +// (1 to 5).foreach(key => { +// val testPath = new Path(fullPath + "/" + key) +// assert(fs.exists(testPath)) +// +// // Read the file and test that the contents are the values matching that key split by line +// val input = fs.open(testPath) +// val reader = new BufferedReader(new InputStreamReader(input)) +// val values = new HashSet[Int] +// val lines = Stream.continually(reader.readLine()).takeWhile(_ != null) +// lines.foreach(s => values += s.toInt) +// +// assert(values.contains(key*key)) +// }) +// +// fs.delete(new Path(fullPath), true) +// } } diff --git a/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala b/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala index f2035e0ad4bca..e4d00b2ea3181 100644 --- a/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala +++ b/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala @@ -71,39 +71,39 @@ class PairRDDFunctionsSuite extends FunSuite with SharedSparkContext { fs.delete(new Path(fullPath), true) } - test("JavaPairRDD.saveAsHadoopFileByKey should generate a text file per key") { - val keys = 1 to 20 - val testValues = 1 to 5 - // Generate the cartesian product of keys by test values - val pairsLocal = keys.map(k => (k, testValues)).flatMap(kv => { - kv._2.map(v => (kv._1, v*kv._1)) - }) - - val pairs = JavaPairRDD.fromRDD(sc.parallelize(pairsLocal)) - - val fs = FileSystem.get(new Configuration()) - val basePath = sc.conf.get("spark.local.dir", "/tmp") - val fullPath = basePath + "/testPath" - fs.delete(new Path(fullPath), true) - pairs.saveAsHadoopFileByKey(fullPath) - - // Test that a file was created for each key - keys.foreach(key => { - val testPath = new Path(fullPath + "/" + key) - assert(fs.exists(testPath)) - - // Read the file and test that the contents are the values matching that key split by line - val input = fs.open(testPath) - val reader = new BufferedReader(new InputStreamReader(input)) - val 
values = new HashSet[Int] - val lines = Stream.continually(reader.readLine()).takeWhile(_ != null) - lines.foreach(s => values += s.toInt) - - testValues.foreach(v => assert(values.contains(v*key))) - }) - - fs.delete(new Path(fullPath), true) - } +// test("JavaPairRDD.saveAsHadoopFileByKey should generate a text file per key") { +// val keys = 1 to 20 +// val testValues = 1 to 5 +// // Generate the cartesian product of keys by test values +// val pairsLocal = keys.map(k => (k, testValues)).flatMap(kv => { +// kv._2.map(v => (kv._1, v*kv._1)) +// }) +// +// val pairs = JavaPairRDD.fromRDD(sc.parallelize(pairsLocal)) +// +// val fs = FileSystem.get(new Configuration()) +// val basePath = sc.conf.get("spark.local.dir", "/tmp") +// val fullPath = basePath + "/testPath" +// fs.delete(new Path(fullPath), true) +// pairs.saveAsHadoopFileByKey(fullPath) +// +// // Test that a file was created for each key +// keys.foreach(key => { +// val testPath = new Path(fullPath + "/" + key) +// assert(fs.exists(testPath)) +// +// // Read the file and test that the contents are the values matching that key split by line +// val input = fs.open(testPath) +// val reader = new BufferedReader(new InputStreamReader(input)) +// val values = new HashSet[Int] +// val lines = Stream.continually(reader.readLine()).takeWhile(_ != null) +// lines.foreach(s => values += s.toInt) +// +// testValues.foreach(v => assert(values.contains(v*key))) +// }) +// +// fs.delete(new Path(fullPath), true) +// } test("aggregateByKey") { val pairs = sc.parallelize(Array((1, 1), (1, 1), (3, 2), (5, 1), (5, 3)), 2) From f185ec3b305308f967795161cf89813e12ddc723 Mon Sep 17 00:00:00 2001 From: Ilya Ganelin Date: Wed, 18 Mar 2015 06:52:48 -0700 Subject: [PATCH 22/25] Revert "Attempting to fix build" This reverts commit df6d89c746d947367bb2bc68471cf41fc5021096. --- .../apache/spark/api/java/JavaPairRDD.scala | 24 ++--- .../apache/spark/api/python/PythonRDD.scala | 54 +++++------ .../spark/api/python/PythonRDDSuite.scala | 90 +++++++++---------- .../spark/rdd/PairRDDFunctionsSuite.scala | 66 +++++++------- 4 files changed, 117 insertions(+), 117 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/api/java/JavaPairRDD.scala b/core/src/main/scala/org/apache/spark/api/java/JavaPairRDD.scala index 6d37fd5ab8061..65637a63bfd3f 100644 --- a/core/src/main/scala/org/apache/spark/api/java/JavaPairRDD.scala +++ b/core/src/main/scala/org/apache/spark/api/java/JavaPairRDD.scala @@ -821,18 +821,18 @@ class JavaPairRDD[K, V](val rdd: RDD[(K, V)]) * @param path - The path for the parent directory * @param numPartitions - The number of partitions to partition to */ -// def saveAsHadoopFileByKey[F <: OutputFormat[K, V]](path: String, numPartitions : Int) -// (implicit fm: ClassTag[F]) { -// rdd.saveAsHadoopFileByKey(path, numPartitions) -// } -// -// /* -// * Output the RDD to multiple files by key on any Hadoop-supported file system, using a Hadoop -// * `OutputFormat` class supporting the key and value types K and V in this RDD. -// */ -// def saveAsHadoopFileByKey[F <: OutputFormat[K, V]](path: String)(implicit fm: ClassTag[F]) { -// rdd.saveAsHadoopFileByKey(path) -// } + def saveAsHadoopFileByKey[F <: OutputFormat[K, V]](path: String, numPartitions : Int) + (implicit fm: ClassTag[F]) { + rdd.saveAsHadoopFileByKey(path, numPartitions) + } + + /* + * Output the RDD to multiple files by key on any Hadoop-supported file system, using a Hadoop + * `OutputFormat` class supporting the key and value types K and V in this RDD. 
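[Editor's aside, not part of the patch series: a compressed sketch of how the Java wrapper being restored here is exercised from Scala in the suite further down in this same commit; it assumes a running SparkContext `sc` and a hypothetical output path. `JavaPairRDD.fromRDD` wraps a Scala pair RDD so the Java-facing overload can be called directly.]

    import org.apache.spark.api.java.JavaPairRDD

    // Wrap a hypothetical Scala pair RDD in the Java API and write per-key output.
    val pairs = JavaPairRDD.fromRDD(sc.parallelize(Seq((1, 1), (2, 4), (3, 9))))
    pairs.saveAsHadoopFileByKey("/tmp/java-by-key")
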
+ */ + def saveAsHadoopFileByKey[F <: OutputFormat[K, V]](path: String)(implicit fm: ClassTag[F]) { + rdd.saveAsHadoopFileByKey(path) + } /** * Repartition the RDD according to the given partitioner and, within each resulting partition, diff --git a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala index 8a153d49f9600..ea3faba240321 100644 --- a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala +++ b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala @@ -19,7 +19,7 @@ package org.apache.spark.api.python import java.io._ import java.net._ -import java.util.{Collections, ArrayList => JArrayList, List => JList, Map => JMap} +import java.util.{Collections, ArrayList => JArrayList, List => JList, Map => JMap, UUID} import scala.collection.JavaConversions._ import scala.collection.mutable @@ -734,32 +734,32 @@ private[spark] object PythonRDD extends Logging { * `confAsMap` is merged with the default Hadoop conf associated with the SparkContext of * this RDD. */ -// def saveAsHadoopFileByKey[K, V, C <: CompressionCodec]( -// pyRDD: JavaRDD[Array[Byte]], -// batchSerialized: Boolean, -// path: String, -// outputFormatClass: String, -// keyClass: String, -// valueClass: String, -// keyConverterClass: String, -// valueConverterClass: String, -// confAsMap: java.util.HashMap[String, String], -// compressionCodecClass: String) = { -// val rdd = SerDeUtil.pythonToPairRDD(pyRDD, batchSerialized) -// val (kc, vc) = getKeyValueTypes(keyClass, valueClass).getOrElse( -// inferKeyValueTypes(rdd, keyConverterClass, valueConverterClass)) -// val mergedConf = getMergedConf(confAsMap, pyRDD.context.hadoopConfiguration) -// val codec = Option(compressionCodecClass).map(Utils.classForName(_).asInstanceOf[Class[C]]) -// val converted = convertRDD(rdd, keyConverterClass, valueConverterClass, -// new JavaToWritableConverter) -// -// converted.saveAsHadoopFile(path, -// ClassUtils.primitiveToWrapper(kc), -// ClassUtils.primitiveToWrapper(vc), -// classOf[RDDMultipleTextOutputFormat[K,V]], -// new JobConf(mergedConf), -// codec=codec) -// } + def saveAsHadoopFileByKey[K, V, C <: CompressionCodec]( + pyRDD: JavaRDD[Array[Byte]], + batchSerialized: Boolean, + path: String, + outputFormatClass: String, + keyClass: String, + valueClass: String, + keyConverterClass: String, + valueConverterClass: String, + confAsMap: java.util.HashMap[String, String], + compressionCodecClass: String) = { + val rdd = SerDeUtil.pythonToPairRDD(pyRDD, batchSerialized) + val (kc, vc) = getKeyValueTypes(keyClass, valueClass).getOrElse( + inferKeyValueTypes(rdd, keyConverterClass, valueConverterClass)) + val mergedConf = getMergedConf(confAsMap, pyRDD.context.hadoopConfiguration) + val codec = Option(compressionCodecClass).map(Utils.classForName(_).asInstanceOf[Class[C]]) + val converted = convertRDD(rdd, keyConverterClass, valueConverterClass, + new JavaToWritableConverter) + + converted.saveAsHadoopFile(path, + ClassUtils.primitiveToWrapper(kc), + ClassUtils.primitiveToWrapper(vc), + classOf[RDDMultipleTextOutputFormat[K,V]], + new JobConf(mergedConf), + codec=codec) + } /** * Output a Python RDD of key-value pairs to any Hadoop file system, using new Hadoop diff --git a/core/src/test/scala/org/apache/spark/api/python/PythonRDDSuite.scala b/core/src/test/scala/org/apache/spark/api/python/PythonRDDSuite.scala index 14f5f00f22f2e..f536c7f1ca70c 100644 --- a/core/src/test/scala/org/apache/spark/api/python/PythonRDDSuite.scala +++ 
b/core/src/test/scala/org/apache/spark/api/python/PythonRDDSuite.scala @@ -28,7 +28,7 @@ import org.scalatest.FunSuite import scala.collection.mutable.HashSet -class PythonRDDSuite extends FunSuite { //with SharedSparkContext{ +class PythonRDDSuite extends FunSuite with SharedSparkContext{ test("Writing large strings to the worker") { val input: List[String] = List("a"*100000) @@ -48,48 +48,48 @@ class PythonRDDSuite extends FunSuite { //with SharedSparkContext{ PythonRDD.writeIteratorToStream( Iterator((null, null), ("a".getBytes, null), (null, "b".getBytes)), buffer) } -// -// test("saveAsHadoopFileByKey should generate a text file per key") { -// val testPairs : JavaRDD[Array[Byte]] = sc.parallelize( -// Seq( -// Array(1.toByte,1.toByte), -// Array(2.toByte,4.toByte), -// Array(3.toByte,9.toByte), -// Array(4.toByte,16.toByte), -// Array(5.toByte,25.toByte)) -// ).toJavaRDD() -// -// val fs = FileSystem.get(new Configuration()) -// val basePath = sc.conf.get("spark.local.dir", "/tmp") -// val fullPath = basePath + "/testPath" -// fs.delete(new Path(fullPath), true) -// -// PythonRDD.saveAsHadoopFileByKey( -// testPairs, -// false, -// fullPath, -// classOf[RDDMultipleTextOutputFormat].toString, -// classOf[Int].toString, -// classOf[Int].toString, -// null, -// null, -// new java.util.HashMap(), "") -// -// // Test that a file was created for each key -// (1 to 5).foreach(key => { -// val testPath = new Path(fullPath + "/" + key) -// assert(fs.exists(testPath)) -// -// // Read the file and test that the contents are the values matching that key split by line -// val input = fs.open(testPath) -// val reader = new BufferedReader(new InputStreamReader(input)) -// val values = new HashSet[Int] -// val lines = Stream.continually(reader.readLine()).takeWhile(_ != null) -// lines.foreach(s => values += s.toInt) -// -// assert(values.contains(key*key)) -// }) -// -// fs.delete(new Path(fullPath), true) -// } + + test("saveAsHadoopFileByKey should generate a text file per key") { + val testPairs : JavaRDD[Array[Byte]] = sc.parallelize( + Seq( + Array(1.toByte,1.toByte), + Array(2.toByte,4.toByte), + Array(3.toByte,9.toByte), + Array(4.toByte,16.toByte), + Array(5.toByte,25.toByte)) + ).toJavaRDD() + + val fs = FileSystem.get(new Configuration()) + val basePath = sc.conf.get("spark.local.dir", "/tmp") + val fullPath = basePath + "/testPath" + fs.delete(new Path(fullPath), true) + + PythonRDD.saveAsHadoopFileByKey( + testPairs, + false, + fullPath, + classOf[RDDMultipleTextOutputFormat].toString, + classOf[Int].toString, + classOf[Int].toString, + null, + null, + new java.util.HashMap(), "") + + // Test that a file was created for each key + (1 to 5).foreach(key => { + val testPath = new Path(fullPath + "/" + key) + assert(fs.exists(testPath)) + + // Read the file and test that the contents are the values matching that key split by line + val input = fs.open(testPath) + val reader = new BufferedReader(new InputStreamReader(input)) + val values = new HashSet[Int] + val lines = Stream.continually(reader.readLine()).takeWhile(_ != null) + lines.foreach(s => values += s.toInt) + + assert(values.contains(key*key)) + }) + + fs.delete(new Path(fullPath), true) + } } diff --git a/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala b/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala index e4d00b2ea3181..f2035e0ad4bca 100644 --- a/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala +++ b/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala 
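[Editor's aside, not part of the patch series: the verification pattern the restored test below relies on, pulled out as a sketch with a hypothetical helper name and path. Each key's output is opened through the Hadoop FileSystem API and read line by line, so the assertion reduces to checking that every expected value appears under that key's path.]

    import java.io.{BufferedReader, InputStreamReader}
    import org.apache.hadoop.conf.Configuration
    import org.apache.hadoop.fs.{FileSystem, Path}

    // Hypothetical helper: read the lines written for one key back into a Set[Int].
    def valuesForKey(parent: String, key: Int): Set[Int] = {
      val fs = FileSystem.get(new Configuration())
      val reader = new BufferedReader(new InputStreamReader(fs.open(new Path(s"$parent/$key"))))
      try Stream.continually(reader.readLine()).takeWhile(_ != null).map(_.toInt).toSet
      finally reader.close()
    }
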
@@ -71,39 +71,39 @@ class PairRDDFunctionsSuite extends FunSuite with SharedSparkContext { fs.delete(new Path(fullPath), true) } -// test("JavaPairRDD.saveAsHadoopFileByKey should generate a text file per key") { -// val keys = 1 to 20 -// val testValues = 1 to 5 -// // Generate the cartesian product of keys by test values -// val pairsLocal = keys.map(k => (k, testValues)).flatMap(kv => { -// kv._2.map(v => (kv._1, v*kv._1)) -// }) -// -// val pairs = JavaPairRDD.fromRDD(sc.parallelize(pairsLocal)) -// -// val fs = FileSystem.get(new Configuration()) -// val basePath = sc.conf.get("spark.local.dir", "/tmp") -// val fullPath = basePath + "/testPath" -// fs.delete(new Path(fullPath), true) -// pairs.saveAsHadoopFileByKey(fullPath) -// -// // Test that a file was created for each key -// keys.foreach(key => { -// val testPath = new Path(fullPath + "/" + key) -// assert(fs.exists(testPath)) -// -// // Read the file and test that the contents are the values matching that key split by line -// val input = fs.open(testPath) -// val reader = new BufferedReader(new InputStreamReader(input)) -// val values = new HashSet[Int] -// val lines = Stream.continually(reader.readLine()).takeWhile(_ != null) -// lines.foreach(s => values += s.toInt) -// -// testValues.foreach(v => assert(values.contains(v*key))) -// }) -// -// fs.delete(new Path(fullPath), true) -// } + test("JavaPairRDD.saveAsHadoopFileByKey should generate a text file per key") { + val keys = 1 to 20 + val testValues = 1 to 5 + // Generate the cartesian product of keys by test values + val pairsLocal = keys.map(k => (k, testValues)).flatMap(kv => { + kv._2.map(v => (kv._1, v*kv._1)) + }) + + val pairs = JavaPairRDD.fromRDD(sc.parallelize(pairsLocal)) + + val fs = FileSystem.get(new Configuration()) + val basePath = sc.conf.get("spark.local.dir", "/tmp") + val fullPath = basePath + "/testPath" + fs.delete(new Path(fullPath), true) + pairs.saveAsHadoopFileByKey(fullPath) + + // Test that a file was created for each key + keys.foreach(key => { + val testPath = new Path(fullPath + "/" + key) + assert(fs.exists(testPath)) + + // Read the file and test that the contents are the values matching that key split by line + val input = fs.open(testPath) + val reader = new BufferedReader(new InputStreamReader(input)) + val values = new HashSet[Int] + val lines = Stream.continually(reader.readLine()).takeWhile(_ != null) + lines.foreach(s => values += s.toInt) + + testValues.foreach(v => assert(values.contains(v*key))) + }) + + fs.delete(new Path(fullPath), true) + } test("aggregateByKey") { val pairs = sc.parallelize(Array((1, 1), (1, 1), (3, 2), (5, 1), (5, 3)), 2) From c04efeae2728f663a3848be4a526de480e121185 Mon Sep 17 00:00:00 2001 From: Ilya Ganelin Date: Wed, 18 Mar 2015 06:55:12 -0700 Subject: [PATCH 23/25] Fixing compilation error --- .../test/scala/org/apache/spark/api/python/PythonRDDSuite.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/test/scala/org/apache/spark/api/python/PythonRDDSuite.scala b/core/src/test/scala/org/apache/spark/api/python/PythonRDDSuite.scala index f536c7f1ca70c..7024bbc89d803 100644 --- a/core/src/test/scala/org/apache/spark/api/python/PythonRDDSuite.scala +++ b/core/src/test/scala/org/apache/spark/api/python/PythonRDDSuite.scala @@ -68,7 +68,7 @@ class PythonRDDSuite extends FunSuite with SharedSparkContext{ testPairs, false, fullPath, - classOf[RDDMultipleTextOutputFormat].toString, + classOf[RDDMultipleTextOutputFormat[Int, Int]].toString, classOf[Int].toString, 
classOf[Int].toString, null, From 56c2d39c97e86b691758b0490f7a019b0e396983 Mon Sep 17 00:00:00 2001 From: Ilya Ganelin Date: Wed, 18 Mar 2015 10:32:51 -0700 Subject: [PATCH 24/25] Removing python test and wrapper --- .../apache/spark/api/python/PythonRDD.scala | 42 +-------------- .../spark/api/python/PythonRDDSuite.scala | 52 +------------------ 2 files changed, 3 insertions(+), 91 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala index ea3faba240321..f43778498a805 100644 --- a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala +++ b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala @@ -19,14 +19,13 @@ package org.apache.spark.api.python import java.io._ import java.net._ -import java.util.{Collections, ArrayList => JArrayList, List => JList, Map => JMap, UUID} +import java.util.{Collections, ArrayList => JArrayList, List => JList, Map => JMap} import scala.collection.JavaConversions._ import scala.collection.mutable import scala.language.existentials import com.google.common.base.Charsets.UTF_8 -import org.apache.commons.lang.ClassUtils import org.apache.hadoop.conf.Configuration import org.apache.hadoop.io.compress.CompressionCodec import org.apache.hadoop.mapred.{InputFormat, JobConf, OutputFormat} @@ -36,7 +35,7 @@ import org.apache.spark._ import org.apache.spark.api.java.{JavaPairRDD, JavaRDD, JavaSparkContext} import org.apache.spark.broadcast.Broadcast import org.apache.spark.input.PortableDataStream -import org.apache.spark.rdd.{RDD, RDDMultipleTextOutputFormat} +import org.apache.spark.rdd.RDD import org.apache.spark.util.Utils import scala.util.control.NonFatal @@ -724,43 +723,6 @@ private[spark] object PythonRDD extends Logging { converted.saveAsHadoopFile(path, kc, vc, fc, new JobConf(mergedConf), codec=codec) } - /** - * Output a Python RDD of key-value pairs to any Hadoop file system such that the values within - * the rdd are written to sub-directories organized by the associated key. - * - * Keys and values are converted to suitable output types using either user specified converters - * or, if not specified, [[org.apache.spark.api.python.JavaToWritableConverter]]. Post-conversion - * types `keyClass` and `valueClass` are automatically inferred if not specified. The passed-in - * `confAsMap` is merged with the default Hadoop conf associated with the SparkContext of - * this RDD. 
- */ - def saveAsHadoopFileByKey[K, V, C <: CompressionCodec]( - pyRDD: JavaRDD[Array[Byte]], - batchSerialized: Boolean, - path: String, - outputFormatClass: String, - keyClass: String, - valueClass: String, - keyConverterClass: String, - valueConverterClass: String, - confAsMap: java.util.HashMap[String, String], - compressionCodecClass: String) = { - val rdd = SerDeUtil.pythonToPairRDD(pyRDD, batchSerialized) - val (kc, vc) = getKeyValueTypes(keyClass, valueClass).getOrElse( - inferKeyValueTypes(rdd, keyConverterClass, valueConverterClass)) - val mergedConf = getMergedConf(confAsMap, pyRDD.context.hadoopConfiguration) - val codec = Option(compressionCodecClass).map(Utils.classForName(_).asInstanceOf[Class[C]]) - val converted = convertRDD(rdd, keyConverterClass, valueConverterClass, - new JavaToWritableConverter) - - converted.saveAsHadoopFile(path, - ClassUtils.primitiveToWrapper(kc), - ClassUtils.primitiveToWrapper(vc), - classOf[RDDMultipleTextOutputFormat[K,V]], - new JobConf(mergedConf), - codec=codec) - } - /** * Output a Python RDD of key-value pairs to any Hadoop file system, using new Hadoop * `OutputFormat` in mapreduce package. Keys and values are converted to suitable output diff --git a/core/src/test/scala/org/apache/spark/api/python/PythonRDDSuite.scala b/core/src/test/scala/org/apache/spark/api/python/PythonRDDSuite.scala index 7024bbc89d803..0b9ffd58cb5b9 100644 --- a/core/src/test/scala/org/apache/spark/api/python/PythonRDDSuite.scala +++ b/core/src/test/scala/org/apache/spark/api/python/PythonRDDSuite.scala @@ -17,17 +17,11 @@ package org.apache.spark.api.python -import java.io.{BufferedReader, ByteArrayOutputStream, DataOutputStream, InputStreamReader} +import java.io.{ByteArrayOutputStream, DataOutputStream} -import org.apache.hadoop.conf.Configuration -import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.spark.SharedSparkContext -import org.apache.spark.api.java.JavaRDD -import org.apache.spark.rdd.RDDMultipleTextOutputFormat import org.scalatest.FunSuite -import scala.collection.mutable.HashSet - class PythonRDDSuite extends FunSuite with SharedSparkContext{ test("Writing large strings to the worker") { @@ -48,48 +42,4 @@ class PythonRDDSuite extends FunSuite with SharedSparkContext{ PythonRDD.writeIteratorToStream( Iterator((null, null), ("a".getBytes, null), (null, "b".getBytes)), buffer) } - - test("saveAsHadoopFileByKey should generate a text file per key") { - val testPairs : JavaRDD[Array[Byte]] = sc.parallelize( - Seq( - Array(1.toByte,1.toByte), - Array(2.toByte,4.toByte), - Array(3.toByte,9.toByte), - Array(4.toByte,16.toByte), - Array(5.toByte,25.toByte)) - ).toJavaRDD() - - val fs = FileSystem.get(new Configuration()) - val basePath = sc.conf.get("spark.local.dir", "/tmp") - val fullPath = basePath + "/testPath" - fs.delete(new Path(fullPath), true) - - PythonRDD.saveAsHadoopFileByKey( - testPairs, - false, - fullPath, - classOf[RDDMultipleTextOutputFormat[Int, Int]].toString, - classOf[Int].toString, - classOf[Int].toString, - null, - null, - new java.util.HashMap(), "") - - // Test that a file was created for each key - (1 to 5).foreach(key => { - val testPath = new Path(fullPath + "/" + key) - assert(fs.exists(testPath)) - - // Read the file and test that the contents are the values matching that key split by line - val input = fs.open(testPath) - val reader = new BufferedReader(new InputStreamReader(input)) - val values = new HashSet[Int] - val lines = Stream.continually(reader.readLine()).takeWhile(_ != null) - lines.foreach(s => values 
+= s.toInt) - - assert(values.contains(key*key)) - }) - - fs.delete(new Path(fullPath), true) - } } From c442665a9ea62de0c96594c3af3d5d56dd025f2e Mon Sep 17 00:00:00 2001 From: Ilya Ganelin Date: Wed, 18 Mar 2015 10:36:01 -0700 Subject: [PATCH 25/25] Removing more python changes --- core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala | 2 +- .../test/scala/org/apache/spark/api/python/PythonRDDSuite.scala | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala index f43778498a805..b885bd42916bb 100644 --- a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala +++ b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala @@ -797,7 +797,7 @@ private class PythonAccumulatorParam(@transient serverHost: String, serverPort: /** * We try to reuse a single Socket to transfer accumulator updates, as they are all added * by the DAGScheduler's single-threaded actor anyway. - */ + */ @transient var socket: Socket = _ def openSocket(): Socket = synchronized { diff --git a/core/src/test/scala/org/apache/spark/api/python/PythonRDDSuite.scala b/core/src/test/scala/org/apache/spark/api/python/PythonRDDSuite.scala index 0b9ffd58cb5b9..b7c7195e0ca9f 100644 --- a/core/src/test/scala/org/apache/spark/api/python/PythonRDDSuite.scala +++ b/core/src/test/scala/org/apache/spark/api/python/PythonRDDSuite.scala @@ -22,7 +22,7 @@ import java.io.{ByteArrayOutputStream, DataOutputStream} import org.apache.spark.SharedSparkContext import org.scalatest.FunSuite -class PythonRDDSuite extends FunSuite with SharedSparkContext{ +class PythonRDDSuite extends FunSuite { test("Writing large strings to the worker") { val input: List[String] = List("a"*100000)