From b56b950f6b4e88e590706f48ec32b01d00bf29d7 Mon Sep 17 00:00:00 2001
From: Ilya Ganelin
Date: Mon, 15 Dec 2014 10:50:01 -0800
Subject: [PATCH 01/25] Initial stub

---
 .../apache/spark/rdd/PairRDDFunctions.scala | 26 +++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala
index c43e1f2fe135e..6f3a18f4089c0 100644
--- a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala
+++ b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala
@@ -21,6 +21,8 @@ import java.nio.ByteBuffer
 import java.text.SimpleDateFormat
 import java.util.{Date, HashMap => JHashMap}
 
+import org.apache.hadoop.mapred.lib.MultipleTextOutputFormat
+
 import scala.collection.{Map, mutable}
 import scala.collection.JavaConversions._
 import scala.collection.mutable.ArrayBuffer
@@ -853,6 +855,30 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)])
     saveAsHadoopFile(path, keyClass, valueClass, fm.runtimeClass.asInstanceOf[Class[F]])
   }
 
+  class RDDMultipleTextOutputFormat extends MultipleTextOutputFormat[Any, Any] {
+    override def generateActualKey(key: Any, value: Any): Any =
+      NullWritable.get()
+
+    override def generateFileNameForKeyValue(key: Any, value: Any, name: String): String =
+      key.asInstanceOf[String]
+  }
+
+  /**
+   * Output the RDD to multiple files by key on any Hadoop-supported file system, using a Hadoop
+   * `OutputFormat` class supporting the key and value types K and V in this RDD.
+   *
+   * Example:
+   * [('N', 'Nick'), ('N', 'Nancy'), ('B', 'Bob'), ('B', 'Ben'), ('F', 'Frankie')]
+   * /path/prefix/B [/part-1, /part-2, etc]
+   * /path/prefix/F [/part-1, /part-2, etc]
+   * /path/prefix/N [/part-1, /part-2, etc]
+   */
+  def saveAsHadoopFileByKey[F <: OutputFormat[K, V]](path: String)(implicit fm: ClassTag[F]) {
+    val paths = this.keys.map("path/" + "key_" + _.toString)
+
+    saveAsHadoopFile(path, keyClass, valueClass, fm.runtimeClass.asInstanceOf[Class[F]])
+  }
+
   /**
    * Output the RDD to any Hadoop-supported file system, using a Hadoop `OutputFormat` class
    * supporting the key and value types K and V in this RDD.
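For context on the Hadoop API the stub above builds on: MultipleTextOutputFormat routes each record to an output file derived from its key. The standalone sketch below exercises that pattern directly through saveAsHadoopFile, independent of the proposed wrapper; the class, object, and path names are illustrative, and the key is assumed to already be a String.

    import org.apache.hadoop.io.NullWritable
    import org.apache.hadoop.mapred.lib.MultipleTextOutputFormat
    import org.apache.spark.{SparkConf, SparkContext}
    import org.apache.spark.SparkContext._

    // Route each record to <outputPath>/<key>/<part-file>, and drop the key from
    // the record body since it is already encoded in the directory name.
    class KeyBasedOutputFormat extends MultipleTextOutputFormat[Any, Any] {
      override def generateActualKey(key: Any, value: Any): Any = NullWritable.get()
      override def generateFileNameForKeyValue(key: Any, value: Any, name: String): String =
        key.toString + "/" + name
    }

    object SplitByKeyExample {
      def main(args: Array[String]): Unit = {
        val sc = new SparkContext(new SparkConf().setAppName("split-by-key").setMaster("local[2]"))
        val pairs = sc.parallelize(Seq(
          ("N", "Nick"), ("N", "Nancy"), ("B", "Bob"), ("B", "Ben"), ("F", "Frankie")))
        pairs.saveAsHadoopFile(
          "/tmp/prefix", classOf[String], classOf[String], classOf[KeyBasedOutputFormat])
        sc.stop()
      }
    }

Unlike the stub, this sketch appends the original part-file name after the key so that two tasks holding the same key do not write to the same leaf file.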
Compress the result with the From 757ab3ded6892a840e925c4c59c5061be091023d Mon Sep 17 00:00:00 2001 From: Ilya Ganelin Date: Sun, 28 Dec 2014 11:35:33 -0500 Subject: [PATCH 02/25] Updating tests --- .../org/apache/spark/rdd/PairRDDFunctions.scala | 12 ++++++++---- .../apache/spark/rdd/PairRDDFunctionsSuite.scala | 15 +++++++++++---- 2 files changed, 19 insertions(+), 8 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala index 6f3a18f4089c0..b62c8df3882e5 100644 --- a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala +++ b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala @@ -21,6 +21,7 @@ import java.nio.ByteBuffer import java.text.SimpleDateFormat import java.util.{Date, HashMap => JHashMap} +import org.apache.hadoop.io.NullWritable import org.apache.hadoop.mapred.lib.MultipleTextOutputFormat import scala.collection.{Map, mutable} @@ -873,12 +874,15 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)]) * /path/prefix/F [/part-1, /part-2, etc] * /path/prefix/N [/part-1, /part-2, etc] */ - def saveAsHadoopFileByKey[F <: OutputFormat[K, V]](path: String)(implicit fm: ClassTag[F]) { - val paths = this.keys.map("path/" + "key_" + _.toString) - - saveAsHadoopFile(path, keyClass, valueClass, fm.runtimeClass.asInstanceOf[Class[F]]) + def saveAsHadoopFileByKey[F <: OutputFormat[K, V]](path: String, numPartitions : Int) + (implicit fm: ClassTag[F]) { + partitionBy(new HashPartitioner(numPartitions)). + saveAsHadoopFileByKey(path) } + def saveAsHadoopFileByKey[F <: OutputFormat[K, V]](path: String)(implicit fm: ClassTag[F]) { + saveAsHadoopFile(path, keyClass, valueClass, classOf[RDDMultipleTextOutputFormat]) + } /** * Output the RDD to any Hadoop-supported file system, using a Hadoop `OutputFormat` class * supporting the key and value types K and V in this RDD. 
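The (path, numPartitions) overload introduced above only hash-partitions by key before delegating to the single-argument form, so that all records sharing a key are written by one task. A hedged sketch of the equivalent call sequence against stock Spark, reusing the illustrative KeyBasedOutputFormat from the previous note:

    import org.apache.spark.HashPartitioner
    import org.apache.spark.SparkContext._
    import org.apache.spark.rdd.RDD

    // Colocate each key on a single partition, then write with the key-splitting
    // output format; `pairs`, `path`, and `numPartitions` come from the caller.
    def saveGroupedByKey(pairs: RDD[(String, String)], path: String, numPartitions: Int): Unit = {
      pairs
        .partitionBy(new HashPartitioner(numPartitions))
        .saveAsHadoopFile(path, classOf[String], classOf[String], classOf[KeyBasedOutputFormat])
    }

Because generateFileNameForKeyValue in the patch returns only the key, records for one key must not be spread across tasks; pre-partitioning appears to be how this overload avoids two tasks racing on the same output file.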
Compress the result with the diff --git a/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala b/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala index 108f70af43f37..dfbca76f490e4 100644 --- a/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala +++ b/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala @@ -17,23 +17,30 @@ package org.apache.spark.rdd -import org.apache.hadoop.fs.FileSystem -import org.apache.hadoop.mapred._ -import org.apache.hadoop.util.Progressable - import scala.collection.mutable.{ArrayBuffer, HashSet} +import scala.sys.process._ import scala.util.Random import org.apache.hadoop.conf.{Configurable, Configuration} +import org.apache.hadoop.fs.{Path, FileSystem} +import org.apache.hadoop.mapred._ import org.apache.hadoop.mapreduce.{JobContext => NewJobContext, OutputCommitter => NewOutputCommitter, OutputFormat => NewOutputFormat, RecordWriter => NewRecordWriter, TaskAttemptContext => NewTaskAttempContext} import org.apache.spark.{Partitioner, SharedSparkContext} import org.apache.spark.util.Utils +import org.apache.hadoop.util.Progressable import org.scalatest.FunSuite class PairRDDFunctionsSuite extends FunSuite with SharedSparkContext { + test("saveAsHadoopFileByKey should generate a text file per key") { + val pairs = sc.parallelize((1 to 20).zipWithIndex) + val conf = new JobConf() + + pairs.saveAsHadoopFileByKey("testPath") + } + test("aggregateByKey") { val pairs = sc.parallelize(Array((1, 1), (1, 1), (3, 2), (5, 1), (5, 3)), 2) From 37132457688935c2791c9c9701a7b5af4ff91dac Mon Sep 17 00:00:00 2001 From: Ilya Ganelin Date: Mon, 5 Jan 2015 16:17:31 -0500 Subject: [PATCH 03/25] Test still failing during reflection processing in hadoop utils --- .../org/apache/spark/rdd/PairRDDFunctions.scala | 17 ++++++++++++++++- .../spark/rdd/PairRDDFunctionsSuite.scala | 9 ++++++--- 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala index 6ae6d38b51048..399cccad2440e 100644 --- a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala +++ b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala @@ -21,6 +21,7 @@ import java.nio.ByteBuffer import java.text.SimpleDateFormat import java.util.{Date, HashMap => JHashMap} +import org.apache.commons.lang.ClassUtils import org.apache.hadoop.io.NullWritable import org.apache.hadoop.mapred.lib.MultipleTextOutputFormat @@ -860,14 +861,24 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)]) } class RDDMultipleTextOutputFormat extends MultipleTextOutputFormat[Any, Any] { + def init() = { + println("Initializing multiple text output format saver") + } + override def generateActualKey(key: Any, value: Any): Any = + { NullWritable.get() + } override def generateFileNameForKeyValue(key: Any, value: Any, name: String): String = + { key.asInstanceOf[String] + } } /** + * TODO: This only works if the key is a java Object (can't work with primitive types) + * * Output the RDD to multiple files by key on any Hadoop-supported file system, using a Hadoop * `OutputFormat` class supporting the key and value types K and V in this RDD. * @@ -879,13 +890,17 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)]) */ def saveAsHadoopFileByKey[F <: OutputFormat[K, V]](path: String, numPartitions : Int) (implicit fm: ClassTag[F]) { + partitionBy(new HashPartitioner(numPartitions)). 
saveAsHadoopFileByKey(path) } def saveAsHadoopFileByKey[F <: OutputFormat[K, V]](path: String)(implicit fm: ClassTag[F]) { - saveAsHadoopFile(path, keyClass, valueClass, classOf[RDDMultipleTextOutputFormat]) + saveAsHadoopFile(path, ClassUtils.primitiveToWrapper(keyClass), + ClassUtils.primitiveToWrapper(valueClass), + classOf[RDDMultipleTextOutputFormat]) } + /** * Output the RDD to any Hadoop-supported file system, using a Hadoop `OutputFormat` class * supporting the key and value types K and V in this RDD. Compress the result with the diff --git a/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala b/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala index dfbca76f490e4..0568c475a9171 100644 --- a/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala +++ b/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala @@ -36,9 +36,12 @@ import org.scalatest.FunSuite class PairRDDFunctionsSuite extends FunSuite with SharedSparkContext { test("saveAsHadoopFileByKey should generate a text file per key") { val pairs = sc.parallelize((1 to 20).zipWithIndex) - val conf = new JobConf() - - pairs.saveAsHadoopFileByKey("testPath") + val fs = FileSystem.get(new Configuration()) + val basePath = sc.conf.get("spark.local.dir", "/tmp") + val fullPath = basePath + "testPath" + fs.delete(new Path(fullPath), true) + pairs.saveAsHadoopFileByKey(fullPath) + fs.delete(new Path(fullPath), true) } test("aggregateByKey") { From 5e615a2b734abbe74bac6a8ef83335c0c392ab41 Mon Sep 17 00:00:00 2001 From: Ilya Ganelin Date: Tue, 6 Jan 2015 11:09:35 -0500 Subject: [PATCH 04/25] Added init function to try to resolve reflection error --- .../src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala index 399cccad2440e..9da1ff4d12f09 100644 --- a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala +++ b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala @@ -861,7 +861,7 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)]) } class RDDMultipleTextOutputFormat extends MultipleTextOutputFormat[Any, Any] { - def init() = { + def init() : Unit = { println("Initializing multiple text output format saver") } @@ -890,7 +890,6 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)]) */ def saveAsHadoopFileByKey[F <: OutputFormat[K, V]](path: String, numPartitions : Int) (implicit fm: ClassTag[F]) { - partitionBy(new HashPartitioner(numPartitions)). 
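Patch 03 also routes keyClass and valueClass through ClassUtils.primitiveToWrapper before handing them to Hadoop. The small demo below (object name is illustrative) shows what that call does with commons-lang 2.2+: primitive classes are boxed, and reference types pass through unchanged.

    import org.apache.commons.lang.ClassUtils

    object PrimitiveToWrapperDemo {
      def main(args: Array[String]): Unit = {
        // classOf[Int] is the primitive class `int`, which boxes to java.lang.Integer.
        println(ClassUtils.primitiveToWrapper(classOf[Int]))     // class java.lang.Integer
        // Reference types are returned as-is.
        println(ClassUtils.primitiveToWrapper(classOf[String]))  // class java.lang.String
      }
    }

This is relevant to the test above, whose RDD[(Int, Int)] yields a primitive runtime keyClass; boxing it looks like an attempt to get past the reflection failure named in the commit message.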
saveAsHadoopFileByKey(path) } From 3defe515ff7e337945f7687010e56246498c7ada Mon Sep 17 00:00:00 2001 From: Ilya Ganelin Date: Thu, 15 Jan 2015 14:59:59 -0800 Subject: [PATCH 05/25] Attempting fix --- .../main/scala/org/apache/spark/rdd/PairRDDFunctions.scala | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala index 9da1ff4d12f09..2411fa1bbbdf9 100644 --- a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala +++ b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala @@ -860,11 +860,7 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)]) saveAsHadoopFile(path, keyClass, valueClass, fm.runtimeClass.asInstanceOf[Class[F]]) } - class RDDMultipleTextOutputFormat extends MultipleTextOutputFormat[Any, Any] { - def init() : Unit = { - println("Initializing multiple text output format saver") - } - + class RDDMultipleTextOutputFormat extends MultipleTextOutputFormat[Any, Any]() { override def generateActualKey(key: Any, value: Any): Any = { NullWritable.get() From a8199f6dd4a7518816d9be32963647d27aa282b2 Mon Sep 17 00:00:00 2001 From: Ilya Ganelin Date: Wed, 4 Mar 2015 13:24:39 -0800 Subject: [PATCH 06/25] Updated to fix init bug --- .../apache/spark/rdd/PairRDDFunctions.scala | 17 +- .../apache/spark/sql/UDFRegistration.scala | 571 ------------------ 2 files changed, 9 insertions(+), 579 deletions(-) delete mode 100644 sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala diff --git a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala index 0db868a9d417d..6d0eeb24b605d 100644 --- a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala +++ b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala @@ -323,7 +323,7 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)]) @deprecated("Use reduceByKeyLocally", "1.0.0") def reduceByKeyToDriver(func: (V, V) => V): Map[K, V] = reduceByKeyLocally(func) - /** + /** * Count the number of elements for each key, collecting the results to a local Map. * * Note that this method should only be used if the resulting map is expected to be small, as @@ -867,7 +867,8 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)]) saveAsHadoopFile(path, keyClass, valueClass, fm.runtimeClass.asInstanceOf[Class[F]]) } - class RDDMultipleTextOutputFormat extends MultipleTextOutputFormat[Any, Any]() { + class RDDMultipleTextOutputFormat() extends MultipleTextOutputFormat[Any, Any]() { + override def generateActualKey(key: Any, value: Any): Any = { NullWritable.get() @@ -881,10 +882,10 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)]) /** * TODO: This only works if the key is a java Object (can't work with primitive types) - * - * Output the RDD to multiple files by key on any Hadoop-supported file system, using a Hadoop + * + * Output the RDD to multiple files by key on any Hadoop-supported file system, using a Hadoop * `OutputFormat` class supporting the key and value types K and V in this RDD. 
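Patches 04 through 06 chase a reflection failure around instantiating the output format. One constraint that is easy to verify, and plausibly the one at play here: JobConf.getOutputFormat builds the configured class via ReflectionUtils.newInstance, which needs a no-argument constructor, while a class nested inside PairRDDFunctions keeps a hidden reference to its enclosing instance. The sketch below (illustrative names) shows the instantiation succeeding for a top-level format class.

    import org.apache.hadoop.conf.Configuration
    import org.apache.hadoop.io.NullWritable
    import org.apache.hadoop.mapred.lib.MultipleTextOutputFormat
    import org.apache.hadoop.util.ReflectionUtils

    // A top-level class: its no-arg constructor is visible to ReflectionUtils.
    class StandaloneKeyedOutputFormat extends MultipleTextOutputFormat[Any, Any] {
      override def generateActualKey(key: Any, value: Any): Any = NullWritable.get()
      override def generateFileNameForKeyValue(key: Any, value: Any, name: String): String =
        key.toString
    }

    object ReflectionCheck {
      def main(args: Array[String]): Unit = {
        // Mirrors how Hadoop constructs the configured OutputFormat at write time.
        val format = ReflectionUtils.newInstance(classOf[StandaloneKeyedOutputFormat], new Configuration())
        println(format.getClass.getName)
      }
    }

Running the same newInstance call against an inner class of PairRDDFunctions would fail with NoSuchMethodException, because its compiled constructor takes the enclosing instance as an extra argument.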
- * + * * Example: * [('N', 'Nick'), ('N', 'Nancy'), ('B', 'Bob'), ('B', 'Ben'), ('F', 'Frankie')] * /path/prefix/B [/part-1, /part-2, etc] @@ -898,11 +899,11 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)]) } def saveAsHadoopFileByKey[F <: OutputFormat[K, V]](path: String)(implicit fm: ClassTag[F]) { - saveAsHadoopFile(path, ClassUtils.primitiveToWrapper(keyClass), - ClassUtils.primitiveToWrapper(valueClass), + saveAsHadoopFile(path, ClassUtils.primitiveToWrapper(keyClass), + ClassUtils.primitiveToWrapper(valueClass), classOf[RDDMultipleTextOutputFormat]) } - + /** * Output the RDD to any Hadoop-supported file system, using a Hadoop `OutputFormat` class * supporting the key and value types K and V in this RDD. Compress the result with the diff --git a/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala b/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala deleted file mode 100644 index 8051df299252c..0000000000000 --- a/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala +++ /dev/null @@ -1,571 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.sql - -import java.util.{List => JList, Map => JMap} - -import scala.reflect.runtime.universe.TypeTag - -import org.apache.spark.{Accumulator, Logging} -import org.apache.spark.api.python.PythonBroadcast -import org.apache.spark.broadcast.Broadcast -import org.apache.spark.sql.api.java._ -import org.apache.spark.sql.catalyst.ScalaReflection -import org.apache.spark.sql.catalyst.expressions.{Expression, ScalaUdf} -import org.apache.spark.sql.execution.PythonUDF -import org.apache.spark.sql.types.DataType - - -/** - * Functions for registering user-defined functions. Use [[SQLContext.udf]] to access this. 
- */ -class UDFRegistration private[sql] (sqlContext: SQLContext) extends Logging { - - private val functionRegistry = sqlContext.functionRegistry - - protected[sql] def registerPython( - name: String, - command: Array[Byte], - envVars: JMap[String, String], - pythonIncludes: JList[String], - pythonExec: String, - broadcastVars: JList[Broadcast[PythonBroadcast]], - accumulator: Accumulator[JList[Array[Byte]]], - stringDataType: String): Unit = { - log.debug( - s""" - | Registering new PythonUDF: - | name: $name - | command: ${command.toSeq} - | envVars: $envVars - | pythonIncludes: $pythonIncludes - | pythonExec: $pythonExec - | dataType: $stringDataType - """.stripMargin) - - - val dataType = sqlContext.parseDataType(stringDataType) - - def builder(e: Seq[Expression]) = - PythonUDF( - name, - command, - envVars, - pythonIncludes, - pythonExec, - broadcastVars, - accumulator, - dataType, - e) - - functionRegistry.registerFunction(name, builder) - } - - // scalastyle:off - - /* register 0-22 were generated by this script - - (0 to 22).map { x => - val types = (1 to x).foldRight("RT")((i, s) => {s"A$i, $s"}) - val typeTags = (1 to x).map(i => s"A${i}: TypeTag").foldLeft("RT: TypeTag")(_ + ", " + _) - println(s""" - /** - * Register a Scala closure of ${x} arguments as user-defined function (UDF). - * @tparam RT return type of UDF. - */ - def register[$typeTags](name: String, func: Function$x[$types]): UserDefinedFunction = { - val dataType = ScalaReflection.schemaFor[RT].dataType - def builder(e: Seq[Expression]) = ScalaUdf(func, dataType, e) - functionRegistry.registerFunction(name, builder) - UserDefinedFunction(func, dataType) - }""") - } - - (1 to 22).foreach { i => - val extTypeArgs = (1 to i).map(_ => "_").mkString(", ") - val anyTypeArgs = (1 to i).map(_ => "Any").mkString(", ") - val anyCast = s".asInstanceOf[UDF$i[$anyTypeArgs, Any]]" - val anyParams = (1 to i).map(_ => "_: Any").mkString(", ") - println(s""" - |/** - | * Register a user-defined function with ${i} arguments. - | */ - |def register(name: String, f: UDF$i[$extTypeArgs, _], returnType: DataType) = { - | functionRegistry.registerFunction( - | name, - | (e: Seq[Expression]) => ScalaUdf(f$anyCast.call($anyParams), returnType, e)) - |}""".stripMargin) - } - */ - - /** - * Register a Scala closure of 0 arguments as user-defined function (UDF). - * @tparam RT return type of UDF. - */ - def register[RT: TypeTag](name: String, func: Function0[RT]): UserDefinedFunction = { - val dataType = ScalaReflection.schemaFor[RT].dataType - def builder(e: Seq[Expression]) = ScalaUdf(func, dataType, e) - functionRegistry.registerFunction(name, builder) - UserDefinedFunction(func, dataType) - } - - /** - * Register a Scala closure of 1 arguments as user-defined function (UDF). - * @tparam RT return type of UDF. - */ - def register[RT: TypeTag, A1: TypeTag](name: String, func: Function1[A1, RT]): UserDefinedFunction = { - val dataType = ScalaReflection.schemaFor[RT].dataType - def builder(e: Seq[Expression]) = ScalaUdf(func, dataType, e) - functionRegistry.registerFunction(name, builder) - UserDefinedFunction(func, dataType) - } - - /** - * Register a Scala closure of 2 arguments as user-defined function (UDF). - * @tparam RT return type of UDF. 
- */ - def register[RT: TypeTag, A1: TypeTag, A2: TypeTag](name: String, func: Function2[A1, A2, RT]): UserDefinedFunction = { - val dataType = ScalaReflection.schemaFor[RT].dataType - def builder(e: Seq[Expression]) = ScalaUdf(func, dataType, e) - functionRegistry.registerFunction(name, builder) - UserDefinedFunction(func, dataType) - } - - /** - * Register a Scala closure of 3 arguments as user-defined function (UDF). - * @tparam RT return type of UDF. - */ - def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag](name: String, func: Function3[A1, A2, A3, RT]): UserDefinedFunction = { - val dataType = ScalaReflection.schemaFor[RT].dataType - def builder(e: Seq[Expression]) = ScalaUdf(func, dataType, e) - functionRegistry.registerFunction(name, builder) - UserDefinedFunction(func, dataType) - } - - /** - * Register a Scala closure of 4 arguments as user-defined function (UDF). - * @tparam RT return type of UDF. - */ - def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag](name: String, func: Function4[A1, A2, A3, A4, RT]): UserDefinedFunction = { - val dataType = ScalaReflection.schemaFor[RT].dataType - def builder(e: Seq[Expression]) = ScalaUdf(func, dataType, e) - functionRegistry.registerFunction(name, builder) - UserDefinedFunction(func, dataType) - } - - /** - * Register a Scala closure of 5 arguments as user-defined function (UDF). - * @tparam RT return type of UDF. - */ - def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag](name: String, func: Function5[A1, A2, A3, A4, A5, RT]): UserDefinedFunction = { - val dataType = ScalaReflection.schemaFor[RT].dataType - def builder(e: Seq[Expression]) = ScalaUdf(func, dataType, e) - functionRegistry.registerFunction(name, builder) - UserDefinedFunction(func, dataType) - } - - /** - * Register a Scala closure of 6 arguments as user-defined function (UDF). - * @tparam RT return type of UDF. - */ - def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag](name: String, func: Function6[A1, A2, A3, A4, A5, A6, RT]): UserDefinedFunction = { - val dataType = ScalaReflection.schemaFor[RT].dataType - def builder(e: Seq[Expression]) = ScalaUdf(func, dataType, e) - functionRegistry.registerFunction(name, builder) - UserDefinedFunction(func, dataType) - } - - /** - * Register a Scala closure of 7 arguments as user-defined function (UDF). - * @tparam RT return type of UDF. - */ - def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag](name: String, func: Function7[A1, A2, A3, A4, A5, A6, A7, RT]): UserDefinedFunction = { - val dataType = ScalaReflection.schemaFor[RT].dataType - def builder(e: Seq[Expression]) = ScalaUdf(func, dataType, e) - functionRegistry.registerFunction(name, builder) - UserDefinedFunction(func, dataType) - } - - /** - * Register a Scala closure of 8 arguments as user-defined function (UDF). - * @tparam RT return type of UDF. - */ - def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag, A8: TypeTag](name: String, func: Function8[A1, A2, A3, A4, A5, A6, A7, A8, RT]): UserDefinedFunction = { - val dataType = ScalaReflection.schemaFor[RT].dataType - def builder(e: Seq[Expression]) = ScalaUdf(func, dataType, e) - functionRegistry.registerFunction(name, builder) - UserDefinedFunction(func, dataType) - } - - /** - * Register a Scala closure of 9 arguments as user-defined function (UDF). 
- * @tparam RT return type of UDF. - */ - def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag, A8: TypeTag, A9: TypeTag](name: String, func: Function9[A1, A2, A3, A4, A5, A6, A7, A8, A9, RT]): UserDefinedFunction = { - val dataType = ScalaReflection.schemaFor[RT].dataType - def builder(e: Seq[Expression]) = ScalaUdf(func, dataType, e) - functionRegistry.registerFunction(name, builder) - UserDefinedFunction(func, dataType) - } - - /** - * Register a Scala closure of 10 arguments as user-defined function (UDF). - * @tparam RT return type of UDF. - */ - def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag, A8: TypeTag, A9: TypeTag, A10: TypeTag](name: String, func: Function10[A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, RT]): UserDefinedFunction = { - val dataType = ScalaReflection.schemaFor[RT].dataType - def builder(e: Seq[Expression]) = ScalaUdf(func, dataType, e) - functionRegistry.registerFunction(name, builder) - UserDefinedFunction(func, dataType) - } - - /** - * Register a Scala closure of 11 arguments as user-defined function (UDF). - * @tparam RT return type of UDF. - */ - def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag, A8: TypeTag, A9: TypeTag, A10: TypeTag, A11: TypeTag](name: String, func: Function11[A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, A11, RT]): UserDefinedFunction = { - val dataType = ScalaReflection.schemaFor[RT].dataType - def builder(e: Seq[Expression]) = ScalaUdf(func, dataType, e) - functionRegistry.registerFunction(name, builder) - UserDefinedFunction(func, dataType) - } - - /** - * Register a Scala closure of 12 arguments as user-defined function (UDF). - * @tparam RT return type of UDF. - */ - def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag, A8: TypeTag, A9: TypeTag, A10: TypeTag, A11: TypeTag, A12: TypeTag](name: String, func: Function12[A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, A11, A12, RT]): UserDefinedFunction = { - val dataType = ScalaReflection.schemaFor[RT].dataType - def builder(e: Seq[Expression]) = ScalaUdf(func, dataType, e) - functionRegistry.registerFunction(name, builder) - UserDefinedFunction(func, dataType) - } - - /** - * Register a Scala closure of 13 arguments as user-defined function (UDF). - * @tparam RT return type of UDF. - */ - def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag, A8: TypeTag, A9: TypeTag, A10: TypeTag, A11: TypeTag, A12: TypeTag, A13: TypeTag](name: String, func: Function13[A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, A11, A12, A13, RT]): UserDefinedFunction = { - val dataType = ScalaReflection.schemaFor[RT].dataType - def builder(e: Seq[Expression]) = ScalaUdf(func, dataType, e) - functionRegistry.registerFunction(name, builder) - UserDefinedFunction(func, dataType) - } - - /** - * Register a Scala closure of 14 arguments as user-defined function (UDF). - * @tparam RT return type of UDF. 
- */ - def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag, A8: TypeTag, A9: TypeTag, A10: TypeTag, A11: TypeTag, A12: TypeTag, A13: TypeTag, A14: TypeTag](name: String, func: Function14[A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, A11, A12, A13, A14, RT]): UserDefinedFunction = { - val dataType = ScalaReflection.schemaFor[RT].dataType - def builder(e: Seq[Expression]) = ScalaUdf(func, dataType, e) - functionRegistry.registerFunction(name, builder) - UserDefinedFunction(func, dataType) - } - - /** - * Register a Scala closure of 15 arguments as user-defined function (UDF). - * @tparam RT return type of UDF. - */ - def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag, A8: TypeTag, A9: TypeTag, A10: TypeTag, A11: TypeTag, A12: TypeTag, A13: TypeTag, A14: TypeTag, A15: TypeTag](name: String, func: Function15[A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, A11, A12, A13, A14, A15, RT]): UserDefinedFunction = { - val dataType = ScalaReflection.schemaFor[RT].dataType - def builder(e: Seq[Expression]) = ScalaUdf(func, dataType, e) - functionRegistry.registerFunction(name, builder) - UserDefinedFunction(func, dataType) - } - - /** - * Register a Scala closure of 16 arguments as user-defined function (UDF). - * @tparam RT return type of UDF. - */ - def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag, A8: TypeTag, A9: TypeTag, A10: TypeTag, A11: TypeTag, A12: TypeTag, A13: TypeTag, A14: TypeTag, A15: TypeTag, A16: TypeTag](name: String, func: Function16[A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, A11, A12, A13, A14, A15, A16, RT]): UserDefinedFunction = { - val dataType = ScalaReflection.schemaFor[RT].dataType - def builder(e: Seq[Expression]) = ScalaUdf(func, dataType, e) - functionRegistry.registerFunction(name, builder) - UserDefinedFunction(func, dataType) - } - - /** - * Register a Scala closure of 17 arguments as user-defined function (UDF). - * @tparam RT return type of UDF. - */ - def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag, A8: TypeTag, A9: TypeTag, A10: TypeTag, A11: TypeTag, A12: TypeTag, A13: TypeTag, A14: TypeTag, A15: TypeTag, A16: TypeTag, A17: TypeTag](name: String, func: Function17[A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, A11, A12, A13, A14, A15, A16, A17, RT]): UserDefinedFunction = { - val dataType = ScalaReflection.schemaFor[RT].dataType - def builder(e: Seq[Expression]) = ScalaUdf(func, dataType, e) - functionRegistry.registerFunction(name, builder) - UserDefinedFunction(func, dataType) - } - - /** - * Register a Scala closure of 18 arguments as user-defined function (UDF). - * @tparam RT return type of UDF. - */ - def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag, A8: TypeTag, A9: TypeTag, A10: TypeTag, A11: TypeTag, A12: TypeTag, A13: TypeTag, A14: TypeTag, A15: TypeTag, A16: TypeTag, A17: TypeTag, A18: TypeTag](name: String, func: Function18[A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, A11, A12, A13, A14, A15, A16, A17, A18, RT]): UserDefinedFunction = { - val dataType = ScalaReflection.schemaFor[RT].dataType - def builder(e: Seq[Expression]) = ScalaUdf(func, dataType, e) - functionRegistry.registerFunction(name, builder) - UserDefinedFunction(func, dataType) - } - - /** - * Register a Scala closure of 19 arguments as user-defined function (UDF). 
- * @tparam RT return type of UDF. - */ - def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag, A8: TypeTag, A9: TypeTag, A10: TypeTag, A11: TypeTag, A12: TypeTag, A13: TypeTag, A14: TypeTag, A15: TypeTag, A16: TypeTag, A17: TypeTag, A18: TypeTag, A19: TypeTag](name: String, func: Function19[A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, A11, A12, A13, A14, A15, A16, A17, A18, A19, RT]): UserDefinedFunction = { - val dataType = ScalaReflection.schemaFor[RT].dataType - def builder(e: Seq[Expression]) = ScalaUdf(func, dataType, e) - functionRegistry.registerFunction(name, builder) - UserDefinedFunction(func, dataType) - } - - /** - * Register a Scala closure of 20 arguments as user-defined function (UDF). - * @tparam RT return type of UDF. - */ - def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag, A8: TypeTag, A9: TypeTag, A10: TypeTag, A11: TypeTag, A12: TypeTag, A13: TypeTag, A14: TypeTag, A15: TypeTag, A16: TypeTag, A17: TypeTag, A18: TypeTag, A19: TypeTag, A20: TypeTag](name: String, func: Function20[A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, A11, A12, A13, A14, A15, A16, A17, A18, A19, A20, RT]): UserDefinedFunction = { - val dataType = ScalaReflection.schemaFor[RT].dataType - def builder(e: Seq[Expression]) = ScalaUdf(func, dataType, e) - functionRegistry.registerFunction(name, builder) - UserDefinedFunction(func, dataType) - } - - /** - * Register a Scala closure of 21 arguments as user-defined function (UDF). - * @tparam RT return type of UDF. - */ - def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag, A8: TypeTag, A9: TypeTag, A10: TypeTag, A11: TypeTag, A12: TypeTag, A13: TypeTag, A14: TypeTag, A15: TypeTag, A16: TypeTag, A17: TypeTag, A18: TypeTag, A19: TypeTag, A20: TypeTag, A21: TypeTag](name: String, func: Function21[A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, A11, A12, A13, A14, A15, A16, A17, A18, A19, A20, A21, RT]): UserDefinedFunction = { - val dataType = ScalaReflection.schemaFor[RT].dataType - def builder(e: Seq[Expression]) = ScalaUdf(func, dataType, e) - functionRegistry.registerFunction(name, builder) - UserDefinedFunction(func, dataType) - } - - /** - * Register a Scala closure of 22 arguments as user-defined function (UDF). - * @tparam RT return type of UDF. - */ - def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag, A8: TypeTag, A9: TypeTag, A10: TypeTag, A11: TypeTag, A12: TypeTag, A13: TypeTag, A14: TypeTag, A15: TypeTag, A16: TypeTag, A17: TypeTag, A18: TypeTag, A19: TypeTag, A20: TypeTag, A21: TypeTag, A22: TypeTag](name: String, func: Function22[A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, A11, A12, A13, A14, A15, A16, A17, A18, A19, A20, A21, A22, RT]): UserDefinedFunction = { - val dataType = ScalaReflection.schemaFor[RT].dataType - def builder(e: Seq[Expression]) = ScalaUdf(func, dataType, e) - functionRegistry.registerFunction(name, builder) - UserDefinedFunction(func, dataType) - } - - ////////////////////////////////////////////////////////////////////////////////////////////// - ////////////////////////////////////////////////////////////////////////////////////////////// - - /** - * Register a user-defined function with 1 arguments. 
- */ - def register(name: String, f: UDF1[_, _], returnType: DataType) = { - functionRegistry.registerFunction( - name, - (e: Seq[Expression]) => ScalaUdf(f.asInstanceOf[UDF1[Any, Any]].call(_: Any), returnType, e)) - } - - /** - * Register a user-defined function with 2 arguments. - */ - def register(name: String, f: UDF2[_, _, _], returnType: DataType) = { - functionRegistry.registerFunction( - name, - (e: Seq[Expression]) => ScalaUdf(f.asInstanceOf[UDF2[Any, Any, Any]].call(_: Any, _: Any), returnType, e)) - } - - /** - * Register a user-defined function with 3 arguments. - */ - def register(name: String, f: UDF3[_, _, _, _], returnType: DataType) = { - functionRegistry.registerFunction( - name, - (e: Seq[Expression]) => ScalaUdf(f.asInstanceOf[UDF3[Any, Any, Any, Any]].call(_: Any, _: Any, _: Any), returnType, e)) - } - - /** - * Register a user-defined function with 4 arguments. - */ - def register(name: String, f: UDF4[_, _, _, _, _], returnType: DataType) = { - functionRegistry.registerFunction( - name, - (e: Seq[Expression]) => ScalaUdf(f.asInstanceOf[UDF4[Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any), returnType, e)) - } - - /** - * Register a user-defined function with 5 arguments. - */ - def register(name: String, f: UDF5[_, _, _, _, _, _], returnType: DataType) = { - functionRegistry.registerFunction( - name, - (e: Seq[Expression]) => ScalaUdf(f.asInstanceOf[UDF5[Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any), returnType, e)) - } - - /** - * Register a user-defined function with 6 arguments. - */ - def register(name: String, f: UDF6[_, _, _, _, _, _, _], returnType: DataType) = { - functionRegistry.registerFunction( - name, - (e: Seq[Expression]) => ScalaUdf(f.asInstanceOf[UDF6[Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any), returnType, e)) - } - - /** - * Register a user-defined function with 7 arguments. - */ - def register(name: String, f: UDF7[_, _, _, _, _, _, _, _], returnType: DataType) = { - functionRegistry.registerFunction( - name, - (e: Seq[Expression]) => ScalaUdf(f.asInstanceOf[UDF7[Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any), returnType, e)) - } - - /** - * Register a user-defined function with 8 arguments. - */ - def register(name: String, f: UDF8[_, _, _, _, _, _, _, _, _], returnType: DataType) = { - functionRegistry.registerFunction( - name, - (e: Seq[Expression]) => ScalaUdf(f.asInstanceOf[UDF8[Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any), returnType, e)) - } - - /** - * Register a user-defined function with 9 arguments. - */ - def register(name: String, f: UDF9[_, _, _, _, _, _, _, _, _, _], returnType: DataType) = { - functionRegistry.registerFunction( - name, - (e: Seq[Expression]) => ScalaUdf(f.asInstanceOf[UDF9[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any), returnType, e)) - } - - /** - * Register a user-defined function with 10 arguments. - */ - def register(name: String, f: UDF10[_, _, _, _, _, _, _, _, _, _, _], returnType: DataType) = { - functionRegistry.registerFunction( - name, - (e: Seq[Expression]) => ScalaUdf(f.asInstanceOf[UDF10[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any), returnType, e)) - } - - /** - * Register a user-defined function with 11 arguments. 
- */ - def register(name: String, f: UDF11[_, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType) = { - functionRegistry.registerFunction( - name, - (e: Seq[Expression]) => ScalaUdf(f.asInstanceOf[UDF11[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any), returnType, e)) - } - - /** - * Register a user-defined function with 12 arguments. - */ - def register(name: String, f: UDF12[_, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType) = { - functionRegistry.registerFunction( - name, - (e: Seq[Expression]) => ScalaUdf(f.asInstanceOf[UDF12[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any), returnType, e)) - } - - /** - * Register a user-defined function with 13 arguments. - */ - def register(name: String, f: UDF13[_, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType) = { - functionRegistry.registerFunction( - name, - (e: Seq[Expression]) => ScalaUdf(f.asInstanceOf[UDF13[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any), returnType, e)) - } - - /** - * Register a user-defined function with 14 arguments. - */ - def register(name: String, f: UDF14[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType) = { - functionRegistry.registerFunction( - name, - (e: Seq[Expression]) => ScalaUdf(f.asInstanceOf[UDF14[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any), returnType, e)) - } - - /** - * Register a user-defined function with 15 arguments. - */ - def register(name: String, f: UDF15[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType) = { - functionRegistry.registerFunction( - name, - (e: Seq[Expression]) => ScalaUdf(f.asInstanceOf[UDF15[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any), returnType, e)) - } - - /** - * Register a user-defined function with 16 arguments. - */ - def register(name: String, f: UDF16[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType) = { - functionRegistry.registerFunction( - name, - (e: Seq[Expression]) => ScalaUdf(f.asInstanceOf[UDF16[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any), returnType, e)) - } - - /** - * Register a user-defined function with 17 arguments. - */ - def register(name: String, f: UDF17[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType) = { - functionRegistry.registerFunction( - name, - (e: Seq[Expression]) => ScalaUdf(f.asInstanceOf[UDF17[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any), returnType, e)) - } - - /** - * Register a user-defined function with 18 arguments. 
- */ - def register(name: String, f: UDF18[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType) = { - functionRegistry.registerFunction( - name, - (e: Seq[Expression]) => ScalaUdf(f.asInstanceOf[UDF18[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any), returnType, e)) - } - - /** - * Register a user-defined function with 19 arguments. - */ - def register(name: String, f: UDF19[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType) = { - functionRegistry.registerFunction( - name, - (e: Seq[Expression]) => ScalaUdf(f.asInstanceOf[UDF19[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any), returnType, e)) - } - - /** - * Register a user-defined function with 20 arguments. - */ - def register(name: String, f: UDF20[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType) = { - functionRegistry.registerFunction( - name, - (e: Seq[Expression]) => ScalaUdf(f.asInstanceOf[UDF20[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any), returnType, e)) - } - - /** - * Register a user-defined function with 21 arguments. - */ - def register(name: String, f: UDF21[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType) = { - functionRegistry.registerFunction( - name, - (e: Seq[Expression]) => ScalaUdf(f.asInstanceOf[UDF21[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any), returnType, e)) - } - - /** - * Register a user-defined function with 22 arguments. 
- */ - def register(name: String, f: UDF22[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType) = { - functionRegistry.registerFunction( - name, - (e: Seq[Expression]) => ScalaUdf(f.asInstanceOf[UDF22[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any), returnType, e)) - } - - // scalastyle:on -} From aa1e6dcb8f9711a08850648fbc7bbe3caf760723 Mon Sep 17 00:00:00 2001 From: Ilya Ganelin Date: Wed, 4 Mar 2015 14:03:07 -0800 Subject: [PATCH 07/25] Restored lost UDF Reg --- .../apache/spark/sql/UDFRegistration.scala | 571 ++++++++++++++++++ 1 file changed, 571 insertions(+) create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala diff --git a/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala b/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala new file mode 100644 index 0000000000000..8051df299252c --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala @@ -0,0 +1,571 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql + +import java.util.{List => JList, Map => JMap} + +import scala.reflect.runtime.universe.TypeTag + +import org.apache.spark.{Accumulator, Logging} +import org.apache.spark.api.python.PythonBroadcast +import org.apache.spark.broadcast.Broadcast +import org.apache.spark.sql.api.java._ +import org.apache.spark.sql.catalyst.ScalaReflection +import org.apache.spark.sql.catalyst.expressions.{Expression, ScalaUdf} +import org.apache.spark.sql.execution.PythonUDF +import org.apache.spark.sql.types.DataType + + +/** + * Functions for registering user-defined functions. Use [[SQLContext.udf]] to access this. 
+ */ +class UDFRegistration private[sql] (sqlContext: SQLContext) extends Logging { + + private val functionRegistry = sqlContext.functionRegistry + + protected[sql] def registerPython( + name: String, + command: Array[Byte], + envVars: JMap[String, String], + pythonIncludes: JList[String], + pythonExec: String, + broadcastVars: JList[Broadcast[PythonBroadcast]], + accumulator: Accumulator[JList[Array[Byte]]], + stringDataType: String): Unit = { + log.debug( + s""" + | Registering new PythonUDF: + | name: $name + | command: ${command.toSeq} + | envVars: $envVars + | pythonIncludes: $pythonIncludes + | pythonExec: $pythonExec + | dataType: $stringDataType + """.stripMargin) + + + val dataType = sqlContext.parseDataType(stringDataType) + + def builder(e: Seq[Expression]) = + PythonUDF( + name, + command, + envVars, + pythonIncludes, + pythonExec, + broadcastVars, + accumulator, + dataType, + e) + + functionRegistry.registerFunction(name, builder) + } + + // scalastyle:off + + /* register 0-22 were generated by this script + + (0 to 22).map { x => + val types = (1 to x).foldRight("RT")((i, s) => {s"A$i, $s"}) + val typeTags = (1 to x).map(i => s"A${i}: TypeTag").foldLeft("RT: TypeTag")(_ + ", " + _) + println(s""" + /** + * Register a Scala closure of ${x} arguments as user-defined function (UDF). + * @tparam RT return type of UDF. + */ + def register[$typeTags](name: String, func: Function$x[$types]): UserDefinedFunction = { + val dataType = ScalaReflection.schemaFor[RT].dataType + def builder(e: Seq[Expression]) = ScalaUdf(func, dataType, e) + functionRegistry.registerFunction(name, builder) + UserDefinedFunction(func, dataType) + }""") + } + + (1 to 22).foreach { i => + val extTypeArgs = (1 to i).map(_ => "_").mkString(", ") + val anyTypeArgs = (1 to i).map(_ => "Any").mkString(", ") + val anyCast = s".asInstanceOf[UDF$i[$anyTypeArgs, Any]]" + val anyParams = (1 to i).map(_ => "_: Any").mkString(", ") + println(s""" + |/** + | * Register a user-defined function with ${i} arguments. + | */ + |def register(name: String, f: UDF$i[$extTypeArgs, _], returnType: DataType) = { + | functionRegistry.registerFunction( + | name, + | (e: Seq[Expression]) => ScalaUdf(f$anyCast.call($anyParams), returnType, e)) + |}""".stripMargin) + } + */ + + /** + * Register a Scala closure of 0 arguments as user-defined function (UDF). + * @tparam RT return type of UDF. + */ + def register[RT: TypeTag](name: String, func: Function0[RT]): UserDefinedFunction = { + val dataType = ScalaReflection.schemaFor[RT].dataType + def builder(e: Seq[Expression]) = ScalaUdf(func, dataType, e) + functionRegistry.registerFunction(name, builder) + UserDefinedFunction(func, dataType) + } + + /** + * Register a Scala closure of 1 arguments as user-defined function (UDF). + * @tparam RT return type of UDF. + */ + def register[RT: TypeTag, A1: TypeTag](name: String, func: Function1[A1, RT]): UserDefinedFunction = { + val dataType = ScalaReflection.schemaFor[RT].dataType + def builder(e: Seq[Expression]) = ScalaUdf(func, dataType, e) + functionRegistry.registerFunction(name, builder) + UserDefinedFunction(func, dataType) + } + + /** + * Register a Scala closure of 2 arguments as user-defined function (UDF). + * @tparam RT return type of UDF. 
+ */ + def register[RT: TypeTag, A1: TypeTag, A2: TypeTag](name: String, func: Function2[A1, A2, RT]): UserDefinedFunction = { + val dataType = ScalaReflection.schemaFor[RT].dataType + def builder(e: Seq[Expression]) = ScalaUdf(func, dataType, e) + functionRegistry.registerFunction(name, builder) + UserDefinedFunction(func, dataType) + } + + /** + * Register a Scala closure of 3 arguments as user-defined function (UDF). + * @tparam RT return type of UDF. + */ + def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag](name: String, func: Function3[A1, A2, A3, RT]): UserDefinedFunction = { + val dataType = ScalaReflection.schemaFor[RT].dataType + def builder(e: Seq[Expression]) = ScalaUdf(func, dataType, e) + functionRegistry.registerFunction(name, builder) + UserDefinedFunction(func, dataType) + } + + /** + * Register a Scala closure of 4 arguments as user-defined function (UDF). + * @tparam RT return type of UDF. + */ + def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag](name: String, func: Function4[A1, A2, A3, A4, RT]): UserDefinedFunction = { + val dataType = ScalaReflection.schemaFor[RT].dataType + def builder(e: Seq[Expression]) = ScalaUdf(func, dataType, e) + functionRegistry.registerFunction(name, builder) + UserDefinedFunction(func, dataType) + } + + /** + * Register a Scala closure of 5 arguments as user-defined function (UDF). + * @tparam RT return type of UDF. + */ + def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag](name: String, func: Function5[A1, A2, A3, A4, A5, RT]): UserDefinedFunction = { + val dataType = ScalaReflection.schemaFor[RT].dataType + def builder(e: Seq[Expression]) = ScalaUdf(func, dataType, e) + functionRegistry.registerFunction(name, builder) + UserDefinedFunction(func, dataType) + } + + /** + * Register a Scala closure of 6 arguments as user-defined function (UDF). + * @tparam RT return type of UDF. + */ + def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag](name: String, func: Function6[A1, A2, A3, A4, A5, A6, RT]): UserDefinedFunction = { + val dataType = ScalaReflection.schemaFor[RT].dataType + def builder(e: Seq[Expression]) = ScalaUdf(func, dataType, e) + functionRegistry.registerFunction(name, builder) + UserDefinedFunction(func, dataType) + } + + /** + * Register a Scala closure of 7 arguments as user-defined function (UDF). + * @tparam RT return type of UDF. + */ + def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag](name: String, func: Function7[A1, A2, A3, A4, A5, A6, A7, RT]): UserDefinedFunction = { + val dataType = ScalaReflection.schemaFor[RT].dataType + def builder(e: Seq[Expression]) = ScalaUdf(func, dataType, e) + functionRegistry.registerFunction(name, builder) + UserDefinedFunction(func, dataType) + } + + /** + * Register a Scala closure of 8 arguments as user-defined function (UDF). + * @tparam RT return type of UDF. + */ + def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag, A8: TypeTag](name: String, func: Function8[A1, A2, A3, A4, A5, A6, A7, A8, RT]): UserDefinedFunction = { + val dataType = ScalaReflection.schemaFor[RT].dataType + def builder(e: Seq[Expression]) = ScalaUdf(func, dataType, e) + functionRegistry.registerFunction(name, builder) + UserDefinedFunction(func, dataType) + } + + /** + * Register a Scala closure of 9 arguments as user-defined function (UDF). 
+ * @tparam RT return type of UDF. + */ + def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag, A8: TypeTag, A9: TypeTag](name: String, func: Function9[A1, A2, A3, A4, A5, A6, A7, A8, A9, RT]): UserDefinedFunction = { + val dataType = ScalaReflection.schemaFor[RT].dataType + def builder(e: Seq[Expression]) = ScalaUdf(func, dataType, e) + functionRegistry.registerFunction(name, builder) + UserDefinedFunction(func, dataType) + } + + /** + * Register a Scala closure of 10 arguments as user-defined function (UDF). + * @tparam RT return type of UDF. + */ + def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag, A8: TypeTag, A9: TypeTag, A10: TypeTag](name: String, func: Function10[A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, RT]): UserDefinedFunction = { + val dataType = ScalaReflection.schemaFor[RT].dataType + def builder(e: Seq[Expression]) = ScalaUdf(func, dataType, e) + functionRegistry.registerFunction(name, builder) + UserDefinedFunction(func, dataType) + } + + /** + * Register a Scala closure of 11 arguments as user-defined function (UDF). + * @tparam RT return type of UDF. + */ + def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag, A8: TypeTag, A9: TypeTag, A10: TypeTag, A11: TypeTag](name: String, func: Function11[A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, A11, RT]): UserDefinedFunction = { + val dataType = ScalaReflection.schemaFor[RT].dataType + def builder(e: Seq[Expression]) = ScalaUdf(func, dataType, e) + functionRegistry.registerFunction(name, builder) + UserDefinedFunction(func, dataType) + } + + /** + * Register a Scala closure of 12 arguments as user-defined function (UDF). + * @tparam RT return type of UDF. + */ + def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag, A8: TypeTag, A9: TypeTag, A10: TypeTag, A11: TypeTag, A12: TypeTag](name: String, func: Function12[A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, A11, A12, RT]): UserDefinedFunction = { + val dataType = ScalaReflection.schemaFor[RT].dataType + def builder(e: Seq[Expression]) = ScalaUdf(func, dataType, e) + functionRegistry.registerFunction(name, builder) + UserDefinedFunction(func, dataType) + } + + /** + * Register a Scala closure of 13 arguments as user-defined function (UDF). + * @tparam RT return type of UDF. + */ + def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag, A8: TypeTag, A9: TypeTag, A10: TypeTag, A11: TypeTag, A12: TypeTag, A13: TypeTag](name: String, func: Function13[A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, A11, A12, A13, RT]): UserDefinedFunction = { + val dataType = ScalaReflection.schemaFor[RT].dataType + def builder(e: Seq[Expression]) = ScalaUdf(func, dataType, e) + functionRegistry.registerFunction(name, builder) + UserDefinedFunction(func, dataType) + } + + /** + * Register a Scala closure of 14 arguments as user-defined function (UDF). + * @tparam RT return type of UDF. 
+ */ + def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag, A8: TypeTag, A9: TypeTag, A10: TypeTag, A11: TypeTag, A12: TypeTag, A13: TypeTag, A14: TypeTag](name: String, func: Function14[A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, A11, A12, A13, A14, RT]): UserDefinedFunction = { + val dataType = ScalaReflection.schemaFor[RT].dataType + def builder(e: Seq[Expression]) = ScalaUdf(func, dataType, e) + functionRegistry.registerFunction(name, builder) + UserDefinedFunction(func, dataType) + } + + /** + * Register a Scala closure of 15 arguments as user-defined function (UDF). + * @tparam RT return type of UDF. + */ + def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag, A8: TypeTag, A9: TypeTag, A10: TypeTag, A11: TypeTag, A12: TypeTag, A13: TypeTag, A14: TypeTag, A15: TypeTag](name: String, func: Function15[A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, A11, A12, A13, A14, A15, RT]): UserDefinedFunction = { + val dataType = ScalaReflection.schemaFor[RT].dataType + def builder(e: Seq[Expression]) = ScalaUdf(func, dataType, e) + functionRegistry.registerFunction(name, builder) + UserDefinedFunction(func, dataType) + } + + /** + * Register a Scala closure of 16 arguments as user-defined function (UDF). + * @tparam RT return type of UDF. + */ + def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag, A8: TypeTag, A9: TypeTag, A10: TypeTag, A11: TypeTag, A12: TypeTag, A13: TypeTag, A14: TypeTag, A15: TypeTag, A16: TypeTag](name: String, func: Function16[A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, A11, A12, A13, A14, A15, A16, RT]): UserDefinedFunction = { + val dataType = ScalaReflection.schemaFor[RT].dataType + def builder(e: Seq[Expression]) = ScalaUdf(func, dataType, e) + functionRegistry.registerFunction(name, builder) + UserDefinedFunction(func, dataType) + } + + /** + * Register a Scala closure of 17 arguments as user-defined function (UDF). + * @tparam RT return type of UDF. + */ + def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag, A8: TypeTag, A9: TypeTag, A10: TypeTag, A11: TypeTag, A12: TypeTag, A13: TypeTag, A14: TypeTag, A15: TypeTag, A16: TypeTag, A17: TypeTag](name: String, func: Function17[A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, A11, A12, A13, A14, A15, A16, A17, RT]): UserDefinedFunction = { + val dataType = ScalaReflection.schemaFor[RT].dataType + def builder(e: Seq[Expression]) = ScalaUdf(func, dataType, e) + functionRegistry.registerFunction(name, builder) + UserDefinedFunction(func, dataType) + } + + /** + * Register a Scala closure of 18 arguments as user-defined function (UDF). + * @tparam RT return type of UDF. + */ + def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag, A8: TypeTag, A9: TypeTag, A10: TypeTag, A11: TypeTag, A12: TypeTag, A13: TypeTag, A14: TypeTag, A15: TypeTag, A16: TypeTag, A17: TypeTag, A18: TypeTag](name: String, func: Function18[A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, A11, A12, A13, A14, A15, A16, A17, A18, RT]): UserDefinedFunction = { + val dataType = ScalaReflection.schemaFor[RT].dataType + def builder(e: Seq[Expression]) = ScalaUdf(func, dataType, e) + functionRegistry.registerFunction(name, builder) + UserDefinedFunction(func, dataType) + } + + /** + * Register a Scala closure of 19 arguments as user-defined function (UDF). 
+ * @tparam RT return type of UDF. + */ + def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag, A8: TypeTag, A9: TypeTag, A10: TypeTag, A11: TypeTag, A12: TypeTag, A13: TypeTag, A14: TypeTag, A15: TypeTag, A16: TypeTag, A17: TypeTag, A18: TypeTag, A19: TypeTag](name: String, func: Function19[A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, A11, A12, A13, A14, A15, A16, A17, A18, A19, RT]): UserDefinedFunction = { + val dataType = ScalaReflection.schemaFor[RT].dataType + def builder(e: Seq[Expression]) = ScalaUdf(func, dataType, e) + functionRegistry.registerFunction(name, builder) + UserDefinedFunction(func, dataType) + } + + /** + * Register a Scala closure of 20 arguments as user-defined function (UDF). + * @tparam RT return type of UDF. + */ + def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag, A8: TypeTag, A9: TypeTag, A10: TypeTag, A11: TypeTag, A12: TypeTag, A13: TypeTag, A14: TypeTag, A15: TypeTag, A16: TypeTag, A17: TypeTag, A18: TypeTag, A19: TypeTag, A20: TypeTag](name: String, func: Function20[A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, A11, A12, A13, A14, A15, A16, A17, A18, A19, A20, RT]): UserDefinedFunction = { + val dataType = ScalaReflection.schemaFor[RT].dataType + def builder(e: Seq[Expression]) = ScalaUdf(func, dataType, e) + functionRegistry.registerFunction(name, builder) + UserDefinedFunction(func, dataType) + } + + /** + * Register a Scala closure of 21 arguments as user-defined function (UDF). + * @tparam RT return type of UDF. + */ + def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag, A8: TypeTag, A9: TypeTag, A10: TypeTag, A11: TypeTag, A12: TypeTag, A13: TypeTag, A14: TypeTag, A15: TypeTag, A16: TypeTag, A17: TypeTag, A18: TypeTag, A19: TypeTag, A20: TypeTag, A21: TypeTag](name: String, func: Function21[A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, A11, A12, A13, A14, A15, A16, A17, A18, A19, A20, A21, RT]): UserDefinedFunction = { + val dataType = ScalaReflection.schemaFor[RT].dataType + def builder(e: Seq[Expression]) = ScalaUdf(func, dataType, e) + functionRegistry.registerFunction(name, builder) + UserDefinedFunction(func, dataType) + } + + /** + * Register a Scala closure of 22 arguments as user-defined function (UDF). + * @tparam RT return type of UDF. + */ + def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag, A8: TypeTag, A9: TypeTag, A10: TypeTag, A11: TypeTag, A12: TypeTag, A13: TypeTag, A14: TypeTag, A15: TypeTag, A16: TypeTag, A17: TypeTag, A18: TypeTag, A19: TypeTag, A20: TypeTag, A21: TypeTag, A22: TypeTag](name: String, func: Function22[A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, A11, A12, A13, A14, A15, A16, A17, A18, A19, A20, A21, A22, RT]): UserDefinedFunction = { + val dataType = ScalaReflection.schemaFor[RT].dataType + def builder(e: Seq[Expression]) = ScalaUdf(func, dataType, e) + functionRegistry.registerFunction(name, builder) + UserDefinedFunction(func, dataType) + } + + ////////////////////////////////////////////////////////////////////////////////////////////// + ////////////////////////////////////////////////////////////////////////////////////////////// + + /** + * Register a user-defined function with 1 arguments. 
+ */ + def register(name: String, f: UDF1[_, _], returnType: DataType) = { + functionRegistry.registerFunction( + name, + (e: Seq[Expression]) => ScalaUdf(f.asInstanceOf[UDF1[Any, Any]].call(_: Any), returnType, e)) + } + + /** + * Register a user-defined function with 2 arguments. + */ + def register(name: String, f: UDF2[_, _, _], returnType: DataType) = { + functionRegistry.registerFunction( + name, + (e: Seq[Expression]) => ScalaUdf(f.asInstanceOf[UDF2[Any, Any, Any]].call(_: Any, _: Any), returnType, e)) + } + + /** + * Register a user-defined function with 3 arguments. + */ + def register(name: String, f: UDF3[_, _, _, _], returnType: DataType) = { + functionRegistry.registerFunction( + name, + (e: Seq[Expression]) => ScalaUdf(f.asInstanceOf[UDF3[Any, Any, Any, Any]].call(_: Any, _: Any, _: Any), returnType, e)) + } + + /** + * Register a user-defined function with 4 arguments. + */ + def register(name: String, f: UDF4[_, _, _, _, _], returnType: DataType) = { + functionRegistry.registerFunction( + name, + (e: Seq[Expression]) => ScalaUdf(f.asInstanceOf[UDF4[Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any), returnType, e)) + } + + /** + * Register a user-defined function with 5 arguments. + */ + def register(name: String, f: UDF5[_, _, _, _, _, _], returnType: DataType) = { + functionRegistry.registerFunction( + name, + (e: Seq[Expression]) => ScalaUdf(f.asInstanceOf[UDF5[Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any), returnType, e)) + } + + /** + * Register a user-defined function with 6 arguments. + */ + def register(name: String, f: UDF6[_, _, _, _, _, _, _], returnType: DataType) = { + functionRegistry.registerFunction( + name, + (e: Seq[Expression]) => ScalaUdf(f.asInstanceOf[UDF6[Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any), returnType, e)) + } + + /** + * Register a user-defined function with 7 arguments. + */ + def register(name: String, f: UDF7[_, _, _, _, _, _, _, _], returnType: DataType) = { + functionRegistry.registerFunction( + name, + (e: Seq[Expression]) => ScalaUdf(f.asInstanceOf[UDF7[Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any), returnType, e)) + } + + /** + * Register a user-defined function with 8 arguments. + */ + def register(name: String, f: UDF8[_, _, _, _, _, _, _, _, _], returnType: DataType) = { + functionRegistry.registerFunction( + name, + (e: Seq[Expression]) => ScalaUdf(f.asInstanceOf[UDF8[Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any), returnType, e)) + } + + /** + * Register a user-defined function with 9 arguments. + */ + def register(name: String, f: UDF9[_, _, _, _, _, _, _, _, _, _], returnType: DataType) = { + functionRegistry.registerFunction( + name, + (e: Seq[Expression]) => ScalaUdf(f.asInstanceOf[UDF9[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any), returnType, e)) + } + + /** + * Register a user-defined function with 10 arguments. + */ + def register(name: String, f: UDF10[_, _, _, _, _, _, _, _, _, _, _], returnType: DataType) = { + functionRegistry.registerFunction( + name, + (e: Seq[Expression]) => ScalaUdf(f.asInstanceOf[UDF10[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any), returnType, e)) + } + + /** + * Register a user-defined function with 11 arguments. 
+ */ + def register(name: String, f: UDF11[_, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType) = { + functionRegistry.registerFunction( + name, + (e: Seq[Expression]) => ScalaUdf(f.asInstanceOf[UDF11[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any), returnType, e)) + } + + /** + * Register a user-defined function with 12 arguments. + */ + def register(name: String, f: UDF12[_, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType) = { + functionRegistry.registerFunction( + name, + (e: Seq[Expression]) => ScalaUdf(f.asInstanceOf[UDF12[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any), returnType, e)) + } + + /** + * Register a user-defined function with 13 arguments. + */ + def register(name: String, f: UDF13[_, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType) = { + functionRegistry.registerFunction( + name, + (e: Seq[Expression]) => ScalaUdf(f.asInstanceOf[UDF13[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any), returnType, e)) + } + + /** + * Register a user-defined function with 14 arguments. + */ + def register(name: String, f: UDF14[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType) = { + functionRegistry.registerFunction( + name, + (e: Seq[Expression]) => ScalaUdf(f.asInstanceOf[UDF14[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any), returnType, e)) + } + + /** + * Register a user-defined function with 15 arguments. + */ + def register(name: String, f: UDF15[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType) = { + functionRegistry.registerFunction( + name, + (e: Seq[Expression]) => ScalaUdf(f.asInstanceOf[UDF15[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any), returnType, e)) + } + + /** + * Register a user-defined function with 16 arguments. + */ + def register(name: String, f: UDF16[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType) = { + functionRegistry.registerFunction( + name, + (e: Seq[Expression]) => ScalaUdf(f.asInstanceOf[UDF16[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any), returnType, e)) + } + + /** + * Register a user-defined function with 17 arguments. + */ + def register(name: String, f: UDF17[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType) = { + functionRegistry.registerFunction( + name, + (e: Seq[Expression]) => ScalaUdf(f.asInstanceOf[UDF17[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any), returnType, e)) + } + + /** + * Register a user-defined function with 18 arguments. 
+ */ + def register(name: String, f: UDF18[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType) = { + functionRegistry.registerFunction( + name, + (e: Seq[Expression]) => ScalaUdf(f.asInstanceOf[UDF18[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any), returnType, e)) + } + + /** + * Register a user-defined function with 19 arguments. + */ + def register(name: String, f: UDF19[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType) = { + functionRegistry.registerFunction( + name, + (e: Seq[Expression]) => ScalaUdf(f.asInstanceOf[UDF19[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any), returnType, e)) + } + + /** + * Register a user-defined function with 20 arguments. + */ + def register(name: String, f: UDF20[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType) = { + functionRegistry.registerFunction( + name, + (e: Seq[Expression]) => ScalaUdf(f.asInstanceOf[UDF20[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any), returnType, e)) + } + + /** + * Register a user-defined function with 21 arguments. + */ + def register(name: String, f: UDF21[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType) = { + functionRegistry.registerFunction( + name, + (e: Seq[Expression]) => ScalaUdf(f.asInstanceOf[UDF21[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any), returnType, e)) + } + + /** + * Register a user-defined function with 22 arguments. 
+ */ + def register(name: String, f: UDF22[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType) = { + functionRegistry.registerFunction( + name, + (e: Seq[Expression]) => ScalaUdf(f.asInstanceOf[UDF22[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any), returnType, e)) + } + + // scalastyle:on +} From ab6d8cb420e2b366665234ff5e3de114fc287261 Mon Sep 17 00:00:00 2001 From: Ilya Ganelin Date: Fri, 6 Mar 2015 09:54:35 -0800 Subject: [PATCH 08/25] Playing around with class --- .../scala/org/apache/spark/rdd/PairRDDFunctions.scala | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala index 062d88fddd7e9..80ea31d897db4 100644 --- a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala +++ b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala @@ -867,14 +867,14 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)]) saveAsHadoopFile(path, keyClass, valueClass, fm.runtimeClass.asInstanceOf[Class[F]]) } - class RDDMultipleTextOutputFormat() extends MultipleTextOutputFormat[Any, Any]() { + class RDDMultipleTextOutputFormat[K,V] extends MultipleTextOutputFormat[K, V]() { - override def generateActualKey(key: Any, value: Any): Any = + override def generateActualKey(key: K, value: V): K = { - NullWritable.get() + NullWritable.get().asInstanceOf[K] } - override def generateFileNameForKeyValue(key: Any, value: Any, name: String): String = + override def generateFileNameForKeyValue(key: K, value: V, name: String): String = { key.asInstanceOf[String] } @@ -901,7 +901,7 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)]) def saveAsHadoopFileByKey[F <: OutputFormat[K, V]](path: String)(implicit fm: ClassTag[F]) { saveAsHadoopFile(path, ClassUtils.primitiveToWrapper(keyClass), ClassUtils.primitiveToWrapper(valueClass), - classOf[RDDMultipleTextOutputFormat]) + classOf[RDDMultipleTextOutputFormat[K,V]]) } /** From 97e4a630e604b7c79ef1dd5cfdc673be5ac2407f Mon Sep 17 00:00:00 2001 From: Ilya Ganelin Date: Tue, 10 Mar 2015 14:40:34 -0700 Subject: [PATCH 09/25] Got around .() error by moving RDDMultipleTextOutputFormat out of PairRDDFunctions class (so it's no longer an inner class --- .../apache/spark/rdd/PairRDDFunctions.scala | 33 ++++++++-------- .../java/org/apache/spark/JavaAPISuite.java | 38 ------------------- 2 files changed, 15 insertions(+), 56 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala index 80ea31d897db4..65a27bd8d0729 100644 --- a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala +++ b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala @@ -52,6 +52,18 @@ import org.apache.spark.util.Utils import org.apache.spark.util.collection.CompactBuffer import org.apache.spark.util.random.StratifiedSamplingUtils +class RDDMultipleTextOutputFormat[K,V]() extends MultipleTextOutputFormat[K, V]() { + override def generateActualKey(key: K, value: V): K = + { + NullWritable.get().asInstanceOf[K] + } + + override def generateFileNameForKeyValue(key: K, value: V, name: String): String = + { + key.asInstanceOf[String] + } +} + /** * Extra 
functions available on RDDs of (key, value) pairs through an implicit conversion. */ @@ -867,22 +879,7 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)]) saveAsHadoopFile(path, keyClass, valueClass, fm.runtimeClass.asInstanceOf[Class[F]]) } - class RDDMultipleTextOutputFormat[K,V] extends MultipleTextOutputFormat[K, V]() { - - override def generateActualKey(key: K, value: V): K = - { - NullWritable.get().asInstanceOf[K] - } - - override def generateFileNameForKeyValue(key: K, value: V, name: String): String = - { - key.asInstanceOf[String] - } - } - - /** - * TODO: This only works if the key is a java Object (can't work with primitive types) - * + /* * Output the RDD to multiple files by key on any Hadoop-supported file system, using a Hadoop * `OutputFormat` class supporting the key and value types K and V in this RDD. * @@ -894,8 +891,8 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)]) */ def saveAsHadoopFileByKey[F <: OutputFormat[K, V]](path: String, numPartitions : Int) (implicit fm: ClassTag[F]) { - partitionBy(new HashPartitioner(numPartitions)). - saveAsHadoopFileByKey(path) + partitionBy(new HashPartitioner(numPartitions)) + .saveAsHadoopFileByKey(path) } def saveAsHadoopFileByKey[F <: OutputFormat[K, V]](path: String)(implicit fm: ClassTag[F]) { diff --git a/core/src/test/java/org/apache/spark/JavaAPISuite.java b/core/src/test/java/org/apache/spark/JavaAPISuite.java index 74e88c767ee07..098f446fc65e6 100644 --- a/core/src/test/java/org/apache/spark/JavaAPISuite.java +++ b/core/src/test/java/org/apache/spark/JavaAPISuite.java @@ -1581,42 +1581,4 @@ public void testAsyncActionErrorWrapping() throws Exception { Assert.assertTrue(future.isDone()); } - - /** - * Test for SPARK-3647. This test needs to use the maven-built assembly to trigger the issue, - * since that's the only artifact where Guava classes have been relocated. - */ - @Test - public void testGuavaOptional() { - // Stop the context created in setUp() and start a local-cluster one, to force usage of the - // assembly. - sc.stop(); - JavaSparkContext localCluster = new JavaSparkContext("local-cluster[1,1,512]", "JavaAPISuite"); - try { - JavaRDD rdd1 = localCluster.parallelize(Arrays.asList(1, 2, null), 3); - JavaRDD> rdd2 = rdd1.map( - new Function>() { - @Override - public Optional call(Integer i) { - return Optional.fromNullable(i); - } - }); - rdd2.collect(); - } finally { - localCluster.stop(); - } - } - - static class Class1 {} - static class Class2 {} - - @Test - public void testRegisterKryoClasses() { - SparkConf conf = new SparkConf(); - conf.registerKryoClasses(new Class[]{ Class1.class, Class2.class }); - Assert.assertEquals( - Class1.class.getName() + "," + Class2.class.getName(), - conf.get("spark.kryo.classesToRegister")); - } - } From 5d968861bc8a973289b2a9e3737dabf4ed174285 Mon Sep 17 00:00:00 2001 From: Ilya Ganelin Date: Tue, 10 Mar 2015 16:57:45 -0700 Subject: [PATCH 10/25] Tests pass. 
Need to add check for file creation --- .../org/apache/spark/rdd/PairRDDFunctions.scala | 2 +- .../src/test/java/org/apache/spark/JavaAPISuite.java | 12 ++++++++++++ .../org/apache/spark/rdd/PairRDDFunctionsSuite.scala | 7 ++++++- 3 files changed, 19 insertions(+), 2 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala index 65a27bd8d0729..9d95136b16739 100644 --- a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala +++ b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala @@ -60,7 +60,7 @@ class RDDMultipleTextOutputFormat[K,V]() extends MultipleTextOutputFormat[K, V]( override def generateFileNameForKeyValue(key: K, value: V, name: String): String = { - key.asInstanceOf[String] + key.toString() } } diff --git a/core/src/test/java/org/apache/spark/JavaAPISuite.java b/core/src/test/java/org/apache/spark/JavaAPISuite.java index 098f446fc65e6..3c705301ad266 100644 --- a/core/src/test/java/org/apache/spark/JavaAPISuite.java +++ b/core/src/test/java/org/apache/spark/JavaAPISuite.java @@ -1581,4 +1581,16 @@ public void testAsyncActionErrorWrapping() throws Exception { Assert.assertTrue(future.isDone()); } + static class Class1 {} + static class Class2 {} + + @Test + public void testRegisterKryoClasses() { + SparkConf conf = new SparkConf(); + conf.registerKryoClasses(new Class[]{ Class1.class, Class2.class }); + Assert.assertEquals( + Class1.class.getName() + "," + Class2.class.getName(), + conf.get("spark.kryo.classesToRegister")); + } + } diff --git a/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala b/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala index 0568c475a9171..bf5306f304a2c 100644 --- a/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala +++ b/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala @@ -35,12 +35,17 @@ import org.scalatest.FunSuite class PairRDDFunctionsSuite extends FunSuite with SharedSparkContext { test("saveAsHadoopFileByKey should generate a text file per key") { - val pairs = sc.parallelize((1 to 20).zipWithIndex) + val keys = 1 to 20 + val pairs = sc.parallelize(keys.zipWithIndex) val fs = FileSystem.get(new Configuration()) val basePath = sc.conf.get("spark.local.dir", "/tmp") val fullPath = basePath + "testPath" fs.delete(new Path(fullPath), true) pairs.saveAsHadoopFileByKey(fullPath) + + // Test that a file was created for each key + + fs.delete(new Path(fullPath), true) } From fa07ad0f7c6385e11121c15025f602d1d3eb36c1 Mon Sep 17 00:00:00 2001 From: Ilya Ganelin Date: Tue, 10 Mar 2015 17:12:30 -0700 Subject: [PATCH 11/25] Removing broken test --- .../java/org/apache/spark/JavaAPISuite.java | 26 +++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/core/src/test/java/org/apache/spark/JavaAPISuite.java b/core/src/test/java/org/apache/spark/JavaAPISuite.java index 3c705301ad266..debc44d6aaa79 100644 --- a/core/src/test/java/org/apache/spark/JavaAPISuite.java +++ b/core/src/test/java/org/apache/spark/JavaAPISuite.java @@ -1581,6 +1581,32 @@ public void testAsyncActionErrorWrapping() throws Exception { Assert.assertTrue(future.isDone()); } +// +// /** +// * Test for SPARK-3647. This test needs to use the maven-built assembly to trigger the issue, +// * since that's the only artifact where Guava classes have been relocated. 
+// */ +// @Test +// public void testGuavaOptional() { +// // Stop the context created in setUp() and start a local-cluster one, to force usage of the +// // assembly. +// sc.stop(); +// JavaSparkContext localCluster = new JavaSparkContext("local-cluster[1,1,512]", "JavaAPISuite"); +// try { +// JavaRDD rdd1 = localCluster.parallelize(Arrays.asList(1, 2, null), 3); +// JavaRDD> rdd2 = rdd1.map( +// new Function>() { +// @Override +// public Optional call(Integer i) { +// return Optional.fromNullable(i); +// } +// }); +// rdd2.collect(); +// } finally { +// localCluster.stop(); +// } +// } + static class Class1 {} static class Class2 {} From 5438079df922b690226db44e1277d0c0e2e45454 Mon Sep 17 00:00:00 2001 From: Ilya Ganelin Date: Wed, 11 Mar 2015 12:03:19 -0700 Subject: [PATCH 12/25] Added test to validate file creation --- .../spark/rdd/PairRDDFunctionsSuite.scala | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala b/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala index bf5306f304a2c..b2e93697c1f2b 100644 --- a/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala +++ b/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala @@ -17,12 +17,14 @@ package org.apache.spark.rdd +import java.io.{BufferedReader, InputStreamReader} + import scala.collection.mutable.{ArrayBuffer, HashSet} import scala.sys.process._ import scala.util.Random import org.apache.hadoop.conf.{Configurable, Configuration} -import org.apache.hadoop.fs.{Path, FileSystem} +import org.apache.hadoop.fs.{FSDataInputStream, Path, FileSystem} import org.apache.hadoop.mapred._ import org.apache.hadoop.mapreduce.{JobContext => NewJobContext, OutputCommitter => NewOutputCommitter, OutputFormat => NewOutputFormat, RecordWriter => NewRecordWriter, @@ -36,15 +38,23 @@ import org.scalatest.FunSuite class PairRDDFunctionsSuite extends FunSuite with SharedSparkContext { test("saveAsHadoopFileByKey should generate a text file per key") { val keys = 1 to 20 - val pairs = sc.parallelize(keys.zipWithIndex) + val pairs = sc.parallelize(keys).map(s => (s, s*s)) val fs = FileSystem.get(new Configuration()) val basePath = sc.conf.get("spark.local.dir", "/tmp") - val fullPath = basePath + "testPath" + val fullPath = basePath + "/testPath" fs.delete(new Path(fullPath), true) pairs.saveAsHadoopFileByKey(fullPath) // Test that a file was created for each key - + keys.foreach(key => { + val testPath = new Path(fullPath + "/" + key) + assert(fs.exists(testPath)) + + // Read the file and test that the contents are the value + val input = fs.open(testPath) + val firstLine = new BufferedReader(new InputStreamReader(input)).readLine() + assert(firstLine.toInt.equals(key*key)) + }) fs.delete(new Path(fullPath), true) } From e68386cce31c4d3707bbae21a83e28f4d82fcc2e Mon Sep 17 00:00:00 2001 From: Ilya Ganelin Date: Wed, 11 Mar 2015 12:10:26 -0700 Subject: [PATCH 13/25] Moved text formatter to its own class --- .../apache/spark/rdd/PairRDDFunctions.scala | 12 ------- .../rdd/RDDMultipleTextOutputFormat.scala | 36 +++++++++++++++++++ 2 files changed, 36 insertions(+), 12 deletions(-) create mode 100644 core/src/main/scala/org/apache/spark/rdd/RDDMultipleTextOutputFormat.scala diff --git a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala index 9d95136b16739..0a064e4d86f83 100644 --- 
a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala +++ b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala @@ -52,18 +52,6 @@ import org.apache.spark.util.Utils import org.apache.spark.util.collection.CompactBuffer import org.apache.spark.util.random.StratifiedSamplingUtils -class RDDMultipleTextOutputFormat[K,V]() extends MultipleTextOutputFormat[K, V]() { - override def generateActualKey(key: K, value: V): K = - { - NullWritable.get().asInstanceOf[K] - } - - override def generateFileNameForKeyValue(key: K, value: V, name: String): String = - { - key.toString() - } -} - /** * Extra functions available on RDDs of (key, value) pairs through an implicit conversion. */ diff --git a/core/src/main/scala/org/apache/spark/rdd/RDDMultipleTextOutputFormat.scala b/core/src/main/scala/org/apache/spark/rdd/RDDMultipleTextOutputFormat.scala new file mode 100644 index 0000000000000..ebe0eae226aee --- /dev/null +++ b/core/src/main/scala/org/apache/spark/rdd/RDDMultipleTextOutputFormat.scala @@ -0,0 +1,36 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.rdd + +import org.apache.hadoop.io.NullWritable +import org.apache.hadoop.mapred.lib.MultipleTextOutputFormat + +/** + * This class is used by the PairRDDFunctions class to facilitate writing out a key-value RDD + * to multiple files, organized by the keys. 
+ */ +class RDDMultipleTextOutputFormat[K,V]() extends MultipleTextOutputFormat[K, V]() { + override def generateActualKey(key: K, value: V): K = + { + NullWritable.get().asInstanceOf[K] + } + + override def generateFileNameForKeyValue(key: K, value: V, name: String): String = + { + key.toString() + } +} From 21247a799851fa4a744335555df2a103935107ac Mon Sep 17 00:00:00 2001 From: Ilya Ganelin Date: Wed, 11 Mar 2015 12:32:53 -0700 Subject: [PATCH 14/25] Extended tests to write multiple values to file and ensure that they're split by line by key --- .../spark/rdd/PairRDDFunctionsSuite.scala | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala b/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala index b2e93697c1f2b..ec857bef862fa 100644 --- a/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala +++ b/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala @@ -38,7 +38,13 @@ import org.scalatest.FunSuite class PairRDDFunctionsSuite extends FunSuite with SharedSparkContext { test("saveAsHadoopFileByKey should generate a text file per key") { val keys = 1 to 20 - val pairs = sc.parallelize(keys).map(s => (s, s*s)) + val testValues = 1 to 5 + // Generate the cartesian product of keys by test values + val pairsLocal = keys.map(k => (k, testValues)).flatMap(kv => { + kv._2.map(v => (kv._1, v*kv._1)) + }) + + val pairs = sc.parallelize(pairsLocal) val fs = FileSystem.get(new Configuration()) val basePath = sc.conf.get("spark.local.dir", "/tmp") val fullPath = basePath + "/testPath" @@ -50,10 +56,14 @@ class PairRDDFunctionsSuite extends FunSuite with SharedSparkContext { val testPath = new Path(fullPath + "/" + key) assert(fs.exists(testPath)) - // Read the file and test that the contents are the value + // Read the file and test that the contents are the values matching that key split by line val input = fs.open(testPath) - val firstLine = new BufferedReader(new InputStreamReader(input)).readLine() - assert(firstLine.toInt.equals(key*key)) + val reader = new BufferedReader(new InputStreamReader(input)) + val values = new HashSet[Int] + val lines = Stream.continually(reader.readLine()).takeWhile(_ != null) + lines.foreach(s => values += s.toInt) + + testValues.foreach(v => assert(values.contains(v*key))) }) fs.delete(new Path(fullPath), true) From 4ba633996fdd8b63bfdbf3debc083b1ff03af419 Mon Sep 17 00:00:00 2001 From: Ilya Ganelin Date: Thu, 12 Mar 2015 10:19:46 -0700 Subject: [PATCH 15/25] Restored code in Java API suite/ --- .../java/org/apache/spark/JavaAPISuite.java | 50 +++++++++---------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/core/src/test/java/org/apache/spark/JavaAPISuite.java b/core/src/test/java/org/apache/spark/JavaAPISuite.java index debc44d6aaa79..74e88c767ee07 100644 --- a/core/src/test/java/org/apache/spark/JavaAPISuite.java +++ b/core/src/test/java/org/apache/spark/JavaAPISuite.java @@ -1581,31 +1581,31 @@ public void testAsyncActionErrorWrapping() throws Exception { Assert.assertTrue(future.isDone()); } -// -// /** -// * Test for SPARK-3647. This test needs to use the maven-built assembly to trigger the issue, -// * since that's the only artifact where Guava classes have been relocated. -// */ -// @Test -// public void testGuavaOptional() { -// // Stop the context created in setUp() and start a local-cluster one, to force usage of the -// // assembly. 
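Taken together, the formatter above and the PairRDDFunctionsSuite test amount to the following usage pattern. A minimal sketch, assuming a SparkContext `sc`, the pair-RDD implicits from `org.apache.spark.SparkContext._`, and an illustrative output path:

  val pairs = sc.parallelize(
    Seq(("N", "Nick"), ("N", "Nancy"), ("B", "Bob"), ("B", "Ben"), ("F", "Frankie")))
  // Each key becomes a sub-directory under the given path, e.g. /path/prefix/N/part-00000;
  // generateActualKey returns NullWritable, so only the values are written to the part files.
  pairs.saveAsHadoopFileByKey("/path/prefix")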
-// sc.stop(); -// JavaSparkContext localCluster = new JavaSparkContext("local-cluster[1,1,512]", "JavaAPISuite"); -// try { -// JavaRDD rdd1 = localCluster.parallelize(Arrays.asList(1, 2, null), 3); -// JavaRDD> rdd2 = rdd1.map( -// new Function>() { -// @Override -// public Optional call(Integer i) { -// return Optional.fromNullable(i); -// } -// }); -// rdd2.collect(); -// } finally { -// localCluster.stop(); -// } -// } + + /** + * Test for SPARK-3647. This test needs to use the maven-built assembly to trigger the issue, + * since that's the only artifact where Guava classes have been relocated. + */ + @Test + public void testGuavaOptional() { + // Stop the context created in setUp() and start a local-cluster one, to force usage of the + // assembly. + sc.stop(); + JavaSparkContext localCluster = new JavaSparkContext("local-cluster[1,1,512]", "JavaAPISuite"); + try { + JavaRDD rdd1 = localCluster.parallelize(Arrays.asList(1, 2, null), 3); + JavaRDD> rdd2 = rdd1.map( + new Function>() { + @Override + public Optional call(Integer i) { + return Optional.fromNullable(i); + } + }); + rdd2.collect(); + } finally { + localCluster.stop(); + } + } static class Class1 {} static class Class2 {} From bd79fd9d4bccb7ef9de07dba194ba7fa49137c87 Mon Sep 17 00:00:00 2001 From: Ilya Ganelin Date: Mon, 16 Mar 2015 22:57:55 -0700 Subject: [PATCH 16/25] Added python RDD saveByKey and test for Python SaveByKey --- .../apache/spark/api/java/JavaPairRDD.scala | 488 +++++++++--------- .../apache/spark/api/python/PythonRDD.scala | 44 +- .../apache/spark/rdd/PairRDDFunctions.scala | 3 + .../spark/api/python/PythonRDDSuite.scala | 54 +- 4 files changed, 353 insertions(+), 236 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/api/java/JavaPairRDD.scala b/core/src/main/scala/org/apache/spark/api/java/JavaPairRDD.scala index 7af3538262fd6..ae54b94c4855c 100644 --- a/core/src/main/scala/org/apache/spark/api/java/JavaPairRDD.scala +++ b/core/src/main/scala/org/apache/spark/api/java/JavaPairRDD.scala @@ -742,270 +742,296 @@ class JavaPairRDD[K, V](val rdd: RDD[(K, V)]) */ def lookup(key: K): JList[V] = seqAsJavaList(rdd.lookup(key)) - /** Output the RDD to any Hadoop-supported file system. */ - def saveAsHadoopFile[F <: OutputFormat[_, _]]( - path: String, - keyClass: Class[_], - valueClass: Class[_], - outputFormatClass: Class[F], - conf: JobConf) { - rdd.saveAsHadoopFile(path, keyClass, valueClass, outputFormatClass, conf) + /* + * Output the RDD to multiple files by key on any Hadoop-supported file system, using a Hadoop + * `OutputFormat` class supporting the key and value types K and V in this RDD. + * + * Example: + * [('N', 'Nick'), ('N', 'Nancy'), ('B', 'Bob'), ('B', 'Ben'), ('F', 'Frankie')] + * /path/prefix/B [/part-1, /part-2, etc] + * /path/prefix/F [/part-1, /part-2, etc] + * /path/prefix/N [/part-1, /part-2, etc] + * + * @param path - The path for the parent directory + * @param numPartitions - The number of partitions to partition to + */ + def saveAsHadoopFileByKey[F <: OutputFormat[K, V]](path: String, numPartitions : Int) + (implicit fm: ClassTag[F]) { + rdd.saveAsHadoopFileByKey(path, numPartitions) } - /** Output the RDD to any Hadoop-supported file system. 
*/ - def saveAsHadoopFile[F <: OutputFormat[_, _]]( - path: String, - keyClass: Class[_], - valueClass: Class[_], - outputFormatClass: Class[F]) { - rdd.saveAsHadoopFile(path, keyClass, valueClass, outputFormatClass) + /* + * Output the RDD to multiple files by key on any Hadoop-supported file system, using a Hadoop + * `OutputFormat` class supporting the key and value types K and V in this RDD. + */ + def saveAsHadoopFileByKey[F <: OutputFormat[K, V]](path: String)(implicit fm: ClassTag[F]) { + rdd.saveAsHadoopFileByKey(path) } - /** Output the RDD to any Hadoop-supported file system, compressing with the supplied codec. */ - def saveAsHadoopFile[F <: OutputFormat[_, _]]( - path: String, - keyClass: Class[_], - valueClass: Class[_], - outputFormatClass: Class[F], - codec: Class[_ <: CompressionCodec]) { - rdd.saveAsHadoopFile(path, keyClass, valueClass, outputFormatClass, codec) - } +/** Output the RDD to any Hadoop-supported file system. */ +def saveAsHadoopFile[F <: OutputFormat[_, _]]( + path: String, + keyClass: Class[_], + valueClass: Class[_], + outputFormatClass: Class[F], + conf: JobConf) { + rdd.saveAsHadoopFile(path, keyClass, valueClass, outputFormatClass, conf) +} - /** Output the RDD to any Hadoop-supported file system. */ - def saveAsNewAPIHadoopFile[F <: NewOutputFormat[_, _]]( - path: String, - keyClass: Class[_], - valueClass: Class[_], - outputFormatClass: Class[F], - conf: Configuration) { - rdd.saveAsNewAPIHadoopFile(path, keyClass, valueClass, outputFormatClass, conf) - } +/** Output the RDD to any Hadoop-supported file system. */ +def saveAsHadoopFile[F <: OutputFormat[_, _]]( + path: String, + keyClass: Class[_], + valueClass: Class[_], + outputFormatClass: Class[F]) { + rdd.saveAsHadoopFile(path, keyClass, valueClass, outputFormatClass) +} - /** - * Output the RDD to any Hadoop-supported storage system, using - * a Configuration object for that storage system. - */ - def saveAsNewAPIHadoopDataset(conf: Configuration) { - rdd.saveAsNewAPIHadoopDataset(conf) - } +/** Output the RDD to any Hadoop-supported file system, compressing with the supplied codec. */ +def saveAsHadoopFile[F <: OutputFormat[_, _]]( + path: String, + keyClass: Class[_], + valueClass: Class[_], + outputFormatClass: Class[F], + codec: Class[_ <: CompressionCodec]) { + rdd.saveAsHadoopFile(path, keyClass, valueClass, outputFormatClass, codec) +} - /** Output the RDD to any Hadoop-supported file system. */ - def saveAsNewAPIHadoopFile[F <: NewOutputFormat[_, _]]( - path: String, - keyClass: Class[_], - valueClass: Class[_], - outputFormatClass: Class[F]) { - rdd.saveAsNewAPIHadoopFile(path, keyClass, valueClass, outputFormatClass) - } +/** Output the RDD to any Hadoop-supported file system. */ +def saveAsNewAPIHadoopFile[F <: NewOutputFormat[_, _]]( + path: String, + keyClass: Class[_], + valueClass: Class[_], + outputFormatClass: Class[F], + conf: Configuration) { + rdd.saveAsNewAPIHadoopFile(path, keyClass, valueClass, outputFormatClass, conf) +} - /** - * Output the RDD to any Hadoop-supported storage system, using a Hadoop JobConf object for - * that storage system. The JobConf should set an OutputFormat and any output paths required - * (e.g. a table name to write to) in the same way as it would be configured for a Hadoop - * MapReduce job. - */ - def saveAsHadoopDataset(conf: JobConf) { - rdd.saveAsHadoopDataset(conf) - } +/** + * Output the RDD to any Hadoop-supported storage system, using + * a Configuration object for that storage system. 
+ */ +def saveAsNewAPIHadoopDataset(conf: Configuration) { + rdd.saveAsNewAPIHadoopDataset(conf) +} - /** - * Repartition the RDD according to the given partitioner and, within each resulting partition, - * sort records by their keys. - * - * This is more efficient than calling `repartition` and then sorting within each partition - * because it can push the sorting down into the shuffle machinery. - */ - def repartitionAndSortWithinPartitions(partitioner: Partitioner): JavaPairRDD[K, V] = { - val comp = com.google.common.collect.Ordering.natural().asInstanceOf[Comparator[K]] - repartitionAndSortWithinPartitions(partitioner, comp) - } +/** Output the RDD to any Hadoop-supported file system. */ +def saveAsNewAPIHadoopFile[F <: NewOutputFormat[_, _]]( + path: String, + keyClass: Class[_], + valueClass: Class[_], + outputFormatClass: Class[F]) { + rdd.saveAsNewAPIHadoopFile(path, keyClass, valueClass, outputFormatClass) +} - /** - * Repartition the RDD according to the given partitioner and, within each resulting partition, - * sort records by their keys. - * - * This is more efficient than calling `repartition` and then sorting within each partition - * because it can push the sorting down into the shuffle machinery. - */ - def repartitionAndSortWithinPartitions(partitioner: Partitioner, comp: Comparator[K]) - : JavaPairRDD[K, V] = { - implicit val ordering = comp // Allow implicit conversion of Comparator to Ordering. - fromRDD( - new OrderedRDDFunctions[K, V, (K, V)](rdd).repartitionAndSortWithinPartitions(partitioner)) - } +/** + * Output the RDD to any Hadoop-supported storage system, using a Hadoop JobConf object for + * that storage system. The JobConf should set an OutputFormat and any output paths required + * (e.g. a table name to write to) in the same way as it would be configured for a Hadoop + * MapReduce job. + */ +def saveAsHadoopDataset(conf: JobConf) { + rdd.saveAsHadoopDataset(conf) +} - /** - * Sort the RDD by key, so that each partition contains a sorted range of the elements in - * ascending order. Calling `collect` or `save` on the resulting RDD will return or output an - * ordered list of records (in the `save` case, they will be written to multiple `part-X` files - * in the filesystem, in order of the keys). - */ - def sortByKey(): JavaPairRDD[K, V] = sortByKey(true) +/** + * Repartition the RDD according to the given partitioner and, within each resulting partition, + * sort records by their keys. + * + * This is more efficient than calling `repartition` and then sorting within each partition + * because it can push the sorting down into the shuffle machinery. + */ +def repartitionAndSortWithinPartitions(partitioner: Partitioner): JavaPairRDD[K, V] = { + val comp = com.google.common.collect.Ordering.natural().asInstanceOf[Comparator[K]] + repartitionAndSortWithinPartitions(partitioner, comp) +} - /** - * Sort the RDD by key, so that each partition contains a sorted range of the elements. Calling - * `collect` or `save` on the resulting RDD will return or output an ordered list of records - * (in the `save` case, they will be written to multiple `part-X` files in the filesystem, in - * order of the keys). - */ - def sortByKey(ascending: Boolean): JavaPairRDD[K, V] = { - val comp = com.google.common.collect.Ordering.natural().asInstanceOf[Comparator[K]] - sortByKey(comp, ascending) - } +/** + * Repartition the RDD according to the given partitioner and, within each resulting partition, + * sort records by their keys. 
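As a side note, the repartition-and-sort behaviour described here can be exercised from the Scala side as well; a minimal sketch, assuming a SparkContext `sc`, the implicits from `org.apache.spark.SparkContext._`, and an arbitrary choice of two partitions:

  import org.apache.spark.HashPartitioner
  val data = sc.parallelize(Seq((3, "c"), (1, "a"), (2, "b"), (1, "d")))
  // Shuffles into two partitions and sorts by key inside each partition in a single pass,
  // rather than repartitioning first and sorting afterwards.
  val sorted = data.repartitionAndSortWithinPartitions(new HashPartitioner(2))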
+ * + * This is more efficient than calling `repartition` and then sorting within each partition + * because it can push the sorting down into the shuffle machinery. + */ +def repartitionAndSortWithinPartitions(partitioner: Partitioner, comp: Comparator[K]) + : JavaPairRDD[K, V] = { + implicit val ordering = comp // Allow implicit conversion of Comparator to Ordering. + fromRDD( + new OrderedRDDFunctions[K, V, (K, V)](rdd).repartitionAndSortWithinPartitions(partitioner)) +} - /** - * Sort the RDD by key, so that each partition contains a sorted range of the elements. Calling - * `collect` or `save` on the resulting RDD will return or output an ordered list of records - * (in the `save` case, they will be written to multiple `part-X` files in the filesystem, in - * order of the keys). - */ - def sortByKey(ascending: Boolean, numPartitions: Int): JavaPairRDD[K, V] = { - val comp = com.google.common.collect.Ordering.natural().asInstanceOf[Comparator[K]] - sortByKey(comp, ascending, numPartitions) - } +/** + * Sort the RDD by key, so that each partition contains a sorted range of the elements in + * ascending order. Calling `collect` or `save` on the resulting RDD will return or output an + * ordered list of records (in the `save` case, they will be written to multiple `part-X` files + * in the filesystem, in order of the keys). + */ +def sortByKey(): JavaPairRDD[K, V] = sortByKey(true) - /** - * Sort the RDD by key, so that each partition contains a sorted range of the elements. Calling - * `collect` or `save` on the resulting RDD will return or output an ordered list of records - * (in the `save` case, they will be written to multiple `part-X` files in the filesystem, in - * order of the keys). - */ - def sortByKey(comp: Comparator[K]): JavaPairRDD[K, V] = sortByKey(comp, true) +/** + * Sort the RDD by key, so that each partition contains a sorted range of the elements. Calling + * `collect` or `save` on the resulting RDD will return or output an ordered list of records + * (in the `save` case, they will be written to multiple `part-X` files in the filesystem, in + * order of the keys). + */ +def sortByKey(ascending: Boolean): JavaPairRDD[K, V] = { + val comp = com.google.common.collect.Ordering.natural().asInstanceOf[Comparator[K]] + sortByKey(comp, ascending) +} - /** - * Sort the RDD by key, so that each partition contains a sorted range of the elements. Calling - * `collect` or `save` on the resulting RDD will return or output an ordered list of records - * (in the `save` case, they will be written to multiple `part-X` files in the filesystem, in - * order of the keys). - */ - def sortByKey(comp: Comparator[K], ascending: Boolean): JavaPairRDD[K, V] = { - implicit val ordering = comp // Allow implicit conversion of Comparator to Ordering. - fromRDD(new OrderedRDDFunctions[K, V, (K, V)](rdd).sortByKey(ascending)) - } +/** + * Sort the RDD by key, so that each partition contains a sorted range of the elements. Calling + * `collect` or `save` on the resulting RDD will return or output an ordered list of records + * (in the `save` case, they will be written to multiple `part-X` files in the filesystem, in + * order of the keys). + */ +def sortByKey(ascending: Boolean, numPartitions: Int): JavaPairRDD[K, V] = { + val comp = com.google.common.collect.Ordering.natural().asInstanceOf[Comparator[K]] + sortByKey(comp, ascending, numPartitions) +} - /** - * Sort the RDD by key, so that each partition contains a sorted range of the elements. 
Calling - * `collect` or `save` on the resulting RDD will return or output an ordered list of records - * (in the `save` case, they will be written to multiple `part-X` files in the filesystem, in - * order of the keys). - */ - def sortByKey(comp: Comparator[K], ascending: Boolean, numPartitions: Int): JavaPairRDD[K, V] = { - implicit val ordering = comp // Allow implicit conversion of Comparator to Ordering. - fromRDD(new OrderedRDDFunctions[K, V, (K, V)](rdd).sortByKey(ascending, numPartitions)) - } +/** + * Sort the RDD by key, so that each partition contains a sorted range of the elements. Calling + * `collect` or `save` on the resulting RDD will return or output an ordered list of records + * (in the `save` case, they will be written to multiple `part-X` files in the filesystem, in + * order of the keys). + */ +def sortByKey(comp: Comparator[K]): JavaPairRDD[K, V] = sortByKey(comp, true) - /** - * Return an RDD with the keys of each tuple. - */ - def keys(): JavaRDD[K] = JavaRDD.fromRDD[K](rdd.map(_._1)) +/** + * Sort the RDD by key, so that each partition contains a sorted range of the elements. Calling + * `collect` or `save` on the resulting RDD will return or output an ordered list of records + * (in the `save` case, they will be written to multiple `part-X` files in the filesystem, in + * order of the keys). + */ +def sortByKey(comp: Comparator[K], ascending: Boolean): JavaPairRDD[K, V] = { + implicit val ordering = comp // Allow implicit conversion of Comparator to Ordering. + fromRDD(new OrderedRDDFunctions[K, V, (K, V)](rdd).sortByKey(ascending)) +} - /** - * Return an RDD with the values of each tuple. - */ - def values(): JavaRDD[V] = JavaRDD.fromRDD[V](rdd.map(_._2)) +/** + * Sort the RDD by key, so that each partition contains a sorted range of the elements. Calling + * `collect` or `save` on the resulting RDD will return or output an ordered list of records + * (in the `save` case, they will be written to multiple `part-X` files in the filesystem, in + * order of the keys). + */ +def sortByKey(comp: Comparator[K], ascending: Boolean, numPartitions: Int): JavaPairRDD[K, V] = { + implicit val ordering = comp // Allow implicit conversion of Comparator to Ordering. + fromRDD(new OrderedRDDFunctions[K, V, (K, V)](rdd).sortByKey(ascending, numPartitions)) +} - /** - * Return approximate number of distinct values for each key in this RDD. - * - * The algorithm used is based on streamlib's implementation of "HyperLogLog in Practice: - * Algorithmic Engineering of a State of The Art Cardinality Estimation Algorithm", available - * here. - * - * @param relativeSD Relative accuracy. Smaller values create counters that require more space. - * It must be greater than 0.000017. - * @param partitioner partitioner of the resulting RDD. - */ - def countApproxDistinctByKey(relativeSD: Double, partitioner: Partitioner): JavaPairRDD[K, Long] = - { - fromRDD(rdd.countApproxDistinctByKey(relativeSD, partitioner)) - } +/** + * Return an RDD with the keys of each tuple. + */ +def keys(): JavaRDD[K] = JavaRDD.fromRDD[K](rdd.map(_._1)) - /** - * Return approximate number of distinct values for each key in this RDD. - * - * The algorithm used is based on streamlib's implementation of "HyperLogLog in Practice: - * Algorithmic Engineering of a State of The Art Cardinality Estimation Algorithm", available - * here. - * - * @param relativeSD Relative accuracy. Smaller values create counters that require more space. - * It must be greater than 0.000017. 
- * @param numPartitions number of partitions of the resulting RDD. - */ - def countApproxDistinctByKey(relativeSD: Double, numPartitions: Int): JavaPairRDD[K, Long] = { - fromRDD(rdd.countApproxDistinctByKey(relativeSD, numPartitions)) - } +/** + * Return an RDD with the values of each tuple. + */ +def values(): JavaRDD[V] = JavaRDD.fromRDD[V](rdd.map(_._2)) - /** - * Return approximate number of distinct values for each key in this RDD. - * - * The algorithm used is based on streamlib's implementation of "HyperLogLog in Practice: - * Algorithmic Engineering of a State of The Art Cardinality Estimation Algorithm", available - * here. - * - * @param relativeSD Relative accuracy. Smaller values create counters that require more space. - * It must be greater than 0.000017. - */ - def countApproxDistinctByKey(relativeSD: Double): JavaPairRDD[K, Long] = { - fromRDD(rdd.countApproxDistinctByKey(relativeSD)) - } +/** + * Return approximate number of distinct values for each key in this RDD. + * + * The algorithm used is based on streamlib's implementation of "HyperLogLog in Practice: + * Algorithmic Engineering of a State of The Art Cardinality Estimation Algorithm", available + * here. + * + * @param relativeSD Relative accuracy. Smaller values create counters that require more space. + * It must be greater than 0.000017. + * @param partitioner partitioner of the resulting RDD. + */ +def countApproxDistinctByKey(relativeSD: Double, partitioner: Partitioner): JavaPairRDD[K, Long] = +{ + fromRDD(rdd.countApproxDistinctByKey(relativeSD, partitioner)) +} - /** Assign a name to this RDD */ - def setName(name: String): JavaPairRDD[K, V] = { - rdd.setName(name) - this - } +/** + * Return approximate number of distinct values for each key in this RDD. + * + * The algorithm used is based on streamlib's implementation of "HyperLogLog in Practice: + * Algorithmic Engineering of a State of The Art Cardinality Estimation Algorithm", available + * here. + * + * @param relativeSD Relative accuracy. Smaller values create counters that require more space. + * It must be greater than 0.000017. + * @param numPartitions number of partitions of the resulting RDD. + */ +def countApproxDistinctByKey(relativeSD: Double, numPartitions: Int): JavaPairRDD[K, Long] = { + fromRDD(rdd.countApproxDistinctByKey(relativeSD, numPartitions)) +} + +/** + * Return approximate number of distinct values for each key in this RDD. + * + * The algorithm used is based on streamlib's implementation of "HyperLogLog in Practice: + * Algorithmic Engineering of a State of The Art Cardinality Estimation Algorithm", available + * here. + * + * @param relativeSD Relative accuracy. Smaller values create counters that require more space. + * It must be greater than 0.000017. 
+ */ +def countApproxDistinctByKey(relativeSD: Double): JavaPairRDD[K, Long] = { + fromRDD(rdd.countApproxDistinctByKey(relativeSD)) +} + +/** Assign a name to this RDD */ +def setName(name: String): JavaPairRDD[K, V] = { + rdd.setName(name) + this +} } object JavaPairRDD { - private[spark] - def groupByResultToJava[K: ClassTag, T](rdd: RDD[(K, Iterable[T])]): RDD[(K, JIterable[T])] = { - rddToPairRDDFunctions(rdd).mapValues(asJavaIterable) - } +private[spark] +def groupByResultToJava[K: ClassTag, T](rdd: RDD[(K, Iterable[T])]): RDD[(K, JIterable[T])] = { + rddToPairRDDFunctions(rdd).mapValues(asJavaIterable) +} - private[spark] - def cogroupResultToJava[K: ClassTag, V, W]( - rdd: RDD[(K, (Iterable[V], Iterable[W]))]): RDD[(K, (JIterable[V], JIterable[W]))] = { - rddToPairRDDFunctions(rdd).mapValues(x => (asJavaIterable(x._1), asJavaIterable(x._2))) - } +private[spark] +def cogroupResultToJava[K: ClassTag, V, W]( + rdd: RDD[(K, (Iterable[V], Iterable[W]))]): RDD[(K, (JIterable[V], JIterable[W]))] = { + rddToPairRDDFunctions(rdd).mapValues(x => (asJavaIterable(x._1), asJavaIterable(x._2))) +} - private[spark] - def cogroupResult2ToJava[K: ClassTag, V, W1, W2]( - rdd: RDD[(K, (Iterable[V], Iterable[W1], Iterable[W2]))]) - : RDD[(K, (JIterable[V], JIterable[W1], JIterable[W2]))] = { - rddToPairRDDFunctions(rdd) - .mapValues(x => (asJavaIterable(x._1), asJavaIterable(x._2), asJavaIterable(x._3))) - } +private[spark] +def cogroupResult2ToJava[K: ClassTag, V, W1, W2]( + rdd: RDD[(K, (Iterable[V], Iterable[W1], Iterable[W2]))]) + : RDD[(K, (JIterable[V], JIterable[W1], JIterable[W2]))] = { + rddToPairRDDFunctions(rdd) + .mapValues(x => (asJavaIterable(x._1), asJavaIterable(x._2), asJavaIterable(x._3))) +} - private[spark] - def cogroupResult3ToJava[K: ClassTag, V, W1, W2, W3]( - rdd: RDD[(K, (Iterable[V], Iterable[W1], Iterable[W2], Iterable[W3]))]) - : RDD[(K, (JIterable[V], JIterable[W1], JIterable[W2], JIterable[W3]))] = { - rddToPairRDDFunctions(rdd) - .mapValues(x => - (asJavaIterable(x._1), asJavaIterable(x._2), asJavaIterable(x._3), asJavaIterable(x._4))) - } +private[spark] +def cogroupResult3ToJava[K: ClassTag, V, W1, W2, W3]( + rdd: RDD[(K, (Iterable[V], Iterable[W1], Iterable[W2], Iterable[W3]))]) +: RDD[(K, (JIterable[V], JIterable[W1], JIterable[W2], JIterable[W3]))] = { + rddToPairRDDFunctions(rdd) + .mapValues(x => + (asJavaIterable(x._1), asJavaIterable(x._2), asJavaIterable(x._3), asJavaIterable(x._4))) +} - def fromRDD[K: ClassTag, V: ClassTag](rdd: RDD[(K, V)]): JavaPairRDD[K, V] = { - new JavaPairRDD[K, V](rdd) - } +def fromRDD[K: ClassTag, V: ClassTag](rdd: RDD[(K, V)]): JavaPairRDD[K, V] = { + new JavaPairRDD[K, V](rdd) +} - implicit def toRDD[K, V](rdd: JavaPairRDD[K, V]): RDD[(K, V)] = rdd.rdd +implicit def toRDD[K, V](rdd: JavaPairRDD[K, V]): RDD[(K, V)] = rdd.rdd - private[spark] - implicit def toScalaFunction2[T1, T2, R](fun: JFunction2[T1, T2, R]): Function2[T1, T2, R] = { - (x: T1, x1: T2) => fun.call(x, x1) - } +private[spark] +implicit def toScalaFunction2[T1, T2, R](fun: JFunction2[T1, T2, R]): Function2[T1, T2, R] = { + (x: T1, x1: T2) => fun.call(x, x1) +} - private[spark] implicit def toScalaFunction[T, R](fun: JFunction[T, R]): T => R = x => fun.call(x) +private[spark] implicit def toScalaFunction[T, R](fun: JFunction[T, R]): T => R = x => fun.call(x) - private[spark] - implicit def pairFunToScalaFun[A, B, C](x: PairFunction[A, B, C]): A => (B, C) = y => x.call(y) +private[spark] +implicit def pairFunToScalaFun[A, B, C](x: PairFunction[A, B, C]): A => (B, C) = 
y => x.call(y) - /** Convert a JavaRDD of key-value pairs to JavaPairRDD. */ - def fromJavaRDD[K, V](rdd: JavaRDD[(K, V)]): JavaPairRDD[K, V] = { - implicit val ctagK: ClassTag[K] = fakeClassTag - implicit val ctagV: ClassTag[V] = fakeClassTag - new JavaPairRDD[K, V](rdd.rdd) - } +/** Convert a JavaRDD of key-value pairs to JavaPairRDD. */ +def fromJavaRDD[K, V](rdd: JavaRDD[(K, V)]): JavaPairRDD[K, V] = { + implicit val ctagK: ClassTag[K] = fakeClassTag + implicit val ctagV: ClassTag[V] = fakeClassTag + new JavaPairRDD[K, V](rdd.rdd) +} } diff --git a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala index b1cec0f6472b0..acd9b25c4e328 100644 --- a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala +++ b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala @@ -21,6 +21,7 @@ import java.io._ import java.net._ import java.util.{List => JList, ArrayList => JArrayList, Map => JMap, UUID, Collections} +import org.apache.commons.lang.ClassUtils import org.apache.spark.input.PortableDataStream import scala.collection.JavaConversions._ @@ -36,7 +37,7 @@ import org.apache.hadoop.mapreduce.{InputFormat => NewInputFormat, OutputFormat import org.apache.spark._ import org.apache.spark.api.java.{JavaSparkContext, JavaPairRDD, JavaRDD} import org.apache.spark.broadcast.Broadcast -import org.apache.spark.rdd.RDD +import org.apache.spark.rdd.{RDDMultipleTextOutputFormat, RDD} import org.apache.spark.util.Utils private[spark] class PythonRDD( @@ -678,6 +679,43 @@ private[spark] object PythonRDD extends Logging { converted.saveAsHadoopFile(path, kc, vc, fc, new JobConf(mergedConf), codec=codec) } + /** + * Output a Python RDD of key-value pairs to any Hadoop file system such that the values within + * the rdd are written to sub-directories organized by the associated key. + * + * Keys and values are converted to suitable output types using either user specified converters + * or, if not specified, [[org.apache.spark.api.python.JavaToWritableConverter]]. Post-conversion + * types `keyClass` and `valueClass` are automatically inferred if not specified. The passed-in + * `confAsMap` is merged with the default Hadoop conf associated with the SparkContext of + * this RDD. + */ + def saveAsHadoopFileByKey[K, V, C <: CompressionCodec]( + pyRDD: JavaRDD[Array[Byte]], + batchSerialized: Boolean, + path: String, + outputFormatClass: String, + keyClass: String, + valueClass: String, + keyConverterClass: String, + valueConverterClass: String, + confAsMap: java.util.HashMap[String, String], + compressionCodecClass: String) = { + val rdd = SerDeUtil.pythonToPairRDD(pyRDD, batchSerialized) + val (kc, vc) = getKeyValueTypes(keyClass, valueClass).getOrElse( + inferKeyValueTypes(rdd, keyConverterClass, valueConverterClass)) + val mergedConf = getMergedConf(confAsMap, pyRDD.context.hadoopConfiguration) + val codec = Option(compressionCodecClass).map(Utils.classForName(_).asInstanceOf[Class[C]]) + val converted = convertRDD(rdd, keyConverterClass, valueConverterClass, + new JavaToWritableConverter) + + converted.saveAsHadoopFile(path, + ClassUtils.primitiveToWrapper(kc), + ClassUtils.primitiveToWrapper(vc), + classOf[RDDMultipleTextOutputFormat[K,V]], + new JobConf(mergedConf), + codec=codec) + } + /** * Output a Python RDD of key-value pairs to any Hadoop file system, using new Hadoop * `OutputFormat` in mapreduce package. 
Keys and values are converted to suitable output @@ -749,10 +787,10 @@ private class PythonAccumulatorParam(@transient serverHost: String, serverPort: val bufferSize = SparkEnv.get.conf.getInt("spark.buffer.size", 65536) - /** + /** * We try to reuse a single Socket to transfer accumulator updates, as they are all added * by the DAGScheduler's single-threaded actor anyway. - */ + */ @transient var socket: Socket = _ def openSocket(): Socket = synchronized { diff --git a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala index 0a064e4d86f83..be70f4031aa99 100644 --- a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala +++ b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala @@ -876,6 +876,9 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)]) * /path/prefix/B [/part-1, /part-2, etc] * /path/prefix/F [/part-1, /part-2, etc] * /path/prefix/N [/part-1, /part-2, etc] + * + * @param path - The path for the parent directory + * @param numPartitions - The number of partitions to partition to */ def saveAsHadoopFileByKey[F <: OutputFormat[K, V]](path: String, numPartitions : Int) (implicit fm: ClassTag[F]) { diff --git a/core/src/test/scala/org/apache/spark/api/python/PythonRDDSuite.scala b/core/src/test/scala/org/apache/spark/api/python/PythonRDDSuite.scala index c63d834f9048b..551438b5b395b 100644 --- a/core/src/test/scala/org/apache/spark/api/python/PythonRDDSuite.scala +++ b/core/src/test/scala/org/apache/spark/api/python/PythonRDDSuite.scala @@ -17,11 +17,18 @@ package org.apache.spark.api.python -import java.io.{ByteArrayOutputStream, DataOutputStream} +import java.io.{BufferedReader, ByteArrayOutputStream, DataOutputStream, InputStreamReader} +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.{FileSystem, Path} +import org.apache.spark.SharedSparkContext +import org.apache.spark.api.java.JavaRDD +import org.apache.spark.rdd.RDDMultipleTextOutputFormat import org.scalatest.FunSuite -class PythonRDDSuite extends FunSuite { +import scala.collection.mutable.HashSet + +class PythonRDDSuite extends FunSuite with SharedSparkContext{ test("Writing large strings to the worker") { val input: List[String] = List("a"*100000) @@ -41,4 +48,47 @@ class PythonRDDSuite extends FunSuite { PythonRDD.writeIteratorToStream( Iterator((null, null), ("a".getBytes, null), (null, "b".getBytes)), buffer) } + + test("saveAsHadoopFileByKey should generate a text file per key") { + val testPairs : JavaRDD[Array[Byte]] = sc.parallelize( + Seq( + Array(1.toByte,1.toByte), + Array(2.toByte,4.toByte), + Array(3.toByte,9.toByte), + Array(4.toByte,16.toByte), + Array(5.toByte,25.toByte)) + ).toJavaRDD() + + val fs = FileSystem.get(new Configuration()) + val basePath = sc.conf.get("spark.local.dir", "/tmp") + val fullPath = basePath + "/testPath" + fs.delete(new Path(fullPath), true) + + PythonRDD.saveAsHadoopFileByKey( + testPairs, + false, fullPath, + classOf[RDDMultipleTextOutputFormat].toString, + classOf[Int].toString, + classOf[Int].toString, + null, + null, + new java.util.HashMap(), "") + + // Test that a file was created for each key + (1 to 5).foreach(key => { + val testPath = new Path(fullPath + "/" + key) + assert(fs.exists(testPath)) + + // Read the file and test that the contents are the values matching that key split by line + val input = fs.open(testPath) + val reader = new BufferedReader(new InputStreamReader(input)) + val values = new HashSet[Int] + val lines = 
Stream.continually(reader.readLine()).takeWhile(_ != null) + lines.foreach(s => values += s.toInt) + + assert(values.contains(key*key)) + }) + + fs.delete(new Path(fullPath), true) + } } From 00e419ce58c45cba3cb5c08c604e4e236ab5a620 Mon Sep 17 00:00:00 2001 From: Ilya Ganelin Date: Tue, 17 Mar 2015 12:57:10 -0700 Subject: [PATCH 17/25] [SPARK-3533] Added test for JavaPairRDD API --- .../spark/rdd/PairRDDFunctionsSuite.scala | 36 +++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala b/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala index ec857bef862fa..84a1f8dffb5af 100644 --- a/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala +++ b/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala @@ -19,6 +19,8 @@ package org.apache.spark.rdd import java.io.{BufferedReader, InputStreamReader} +import org.apache.spark.api.java.JavaPairRDD + import scala.collection.mutable.{ArrayBuffer, HashSet} import scala.sys.process._ import scala.util.Random @@ -69,6 +71,40 @@ class PairRDDFunctionsSuite extends FunSuite with SharedSparkContext { fs.delete(new Path(fullPath), true) } + test("JavaPairRDD.saveAsHadoopFileByKey should generate a text file per key") { + val keys = 1 to 20 + val testValues = 1 to 5 + // Generate the cartesian product of keys by test values + val pairsLocal = keys.map(k => (k, testValues)).flatMap(kv => { + kv._2.map(v => (kv._1, v*kv._1)) + }) + + val pairs = JavaPairRDD.fromRDD(sc.parallelize(pairsLocal)) + + val fs = FileSystem.get(new Configuration()) + val basePath = sc.conf.get("spark.local.dir", "/tmp") + val fullPath = basePath + "/testPath" + fs.delete(new Path(fullPath), true) + pairs.saveAsHadoopFileByKey(fullPath) + + // Test that a file was created for each key + keys.foreach(key => { + val testPath = new Path(fullPath + "/" + key) + assert(fs.exists(testPath)) + + // Read the file and test that the contents are the values matching that key split by line + val input = fs.open(testPath) + val reader = new BufferedReader(new InputStreamReader(input)) + val values = new HashSet[Int] + val lines = Stream.continually(reader.readLine()).takeWhile(_ != null) + lines.foreach(s => values += s.toInt) + + testValues.foreach(v => assert(values.contains(v*key))) + }) + + fs.delete(new Path(fullPath), true) + } + test("aggregateByKey") { val pairs = sc.parallelize(Array((1, 1), (1, 1), (3, 2), (5, 1), (5, 3)), 2) From 2ade1184722caf4a99e9ab1e09881e5c5cb68a39 Mon Sep 17 00:00:00 2001 From: Ilya Ganelin Date: Tue, 17 Mar 2015 14:29:55 -0700 Subject: [PATCH 18/25] reordering to simplify diff --- .../apache/spark/api/java/JavaPairRDD.scala | 398 +++++++++--------- .../spark/api/python/PythonRDDSuite.scala | 3 +- 2 files changed, 201 insertions(+), 200 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/api/java/JavaPairRDD.scala b/core/src/main/scala/org/apache/spark/api/java/JavaPairRDD.scala index ae54b94c4855c..3ec4b3ad2134a 100644 --- a/core/src/main/scala/org/apache/spark/api/java/JavaPairRDD.scala +++ b/core/src/main/scala/org/apache/spark/api/java/JavaPairRDD.scala @@ -742,6 +742,72 @@ class JavaPairRDD[K, V](val rdd: RDD[(K, V)]) */ def lookup(key: K): JList[V] = seqAsJavaList(rdd.lookup(key)) + /** Output the RDD to any Hadoop-supported file system. 
*/ + def saveAsHadoopFile[F <: OutputFormat[_, _]]( + path: String, + keyClass: Class[_], + valueClass: Class[_], + outputFormatClass: Class[F], + conf: JobConf) { + rdd.saveAsHadoopFile(path, keyClass, valueClass, outputFormatClass, conf) + } + + /** Output the RDD to any Hadoop-supported file system. */ + def saveAsHadoopFile[F <: OutputFormat[_, _]]( + path: String, + keyClass: Class[_], + valueClass: Class[_], + outputFormatClass: Class[F]) { + rdd.saveAsHadoopFile(path, keyClass, valueClass, outputFormatClass) + } + + /** Output the RDD to any Hadoop-supported file system, compressing with the supplied codec. */ + def saveAsHadoopFile[F <: OutputFormat[_, _]]( + path: String, + keyClass: Class[_], + valueClass: Class[_], + outputFormatClass: Class[F], + codec: Class[_ <: CompressionCodec]) { + rdd.saveAsHadoopFile(path, keyClass, valueClass, outputFormatClass, codec) + } + + /** Output the RDD to any Hadoop-supported file system. */ + def saveAsNewAPIHadoopFile[F <: NewOutputFormat[_, _]]( + path: String, + keyClass: Class[_], + valueClass: Class[_], + outputFormatClass: Class[F], + conf: Configuration) { + rdd.saveAsNewAPIHadoopFile(path, keyClass, valueClass, outputFormatClass, conf) + } + + /** + * Output the RDD to any Hadoop-supported storage system, using + * a Configuration object for that storage system. + */ + def saveAsNewAPIHadoopDataset(conf: Configuration) { + rdd.saveAsNewAPIHadoopDataset(conf) + } + + /** Output the RDD to any Hadoop-supported file system. */ + def saveAsNewAPIHadoopFile[F <: NewOutputFormat[_, _]]( + path: String, + keyClass: Class[_], + valueClass: Class[_], + outputFormatClass: Class[F]) { + rdd.saveAsNewAPIHadoopFile(path, keyClass, valueClass, outputFormatClass) + } + + /** + * Output the RDD to any Hadoop-supported storage system, using a Hadoop JobConf object for + * that storage system. The JobConf should set an OutputFormat and any output paths required + * (e.g. a table name to write to) in the same way as it would be configured for a Hadoop + * MapReduce job. + */ + def saveAsHadoopDataset(conf: JobConf) { + rdd.saveAsHadoopDataset(conf) + } + /* * Output the RDD to multiple files by key on any Hadoop-supported file system, using a Hadoop * `OutputFormat` class supporting the key and value types K and V in this RDD. @@ -768,218 +834,152 @@ class JavaPairRDD[K, V](val rdd: RDD[(K, V)]) rdd.saveAsHadoopFileByKey(path) } -/** Output the RDD to any Hadoop-supported file system. */ -def saveAsHadoopFile[F <: OutputFormat[_, _]]( - path: String, - keyClass: Class[_], - valueClass: Class[_], - outputFormatClass: Class[F], - conf: JobConf) { - rdd.saveAsHadoopFile(path, keyClass, valueClass, outputFormatClass, conf) -} - -/** Output the RDD to any Hadoop-supported file system. */ -def saveAsHadoopFile[F <: OutputFormat[_, _]]( - path: String, - keyClass: Class[_], - valueClass: Class[_], - outputFormatClass: Class[F]) { - rdd.saveAsHadoopFile(path, keyClass, valueClass, outputFormatClass) -} - -/** Output the RDD to any Hadoop-supported file system, compressing with the supplied codec. */ -def saveAsHadoopFile[F <: OutputFormat[_, _]]( - path: String, - keyClass: Class[_], - valueClass: Class[_], - outputFormatClass: Class[F], - codec: Class[_ <: CompressionCodec]) { - rdd.saveAsHadoopFile(path, keyClass, valueClass, outputFormatClass, codec) -} - -/** Output the RDD to any Hadoop-supported file system. 
*/ -def saveAsNewAPIHadoopFile[F <: NewOutputFormat[_, _]]( - path: String, - keyClass: Class[_], - valueClass: Class[_], - outputFormatClass: Class[F], - conf: Configuration) { - rdd.saveAsNewAPIHadoopFile(path, keyClass, valueClass, outputFormatClass, conf) -} - -/** - * Output the RDD to any Hadoop-supported storage system, using - * a Configuration object for that storage system. - */ -def saveAsNewAPIHadoopDataset(conf: Configuration) { - rdd.saveAsNewAPIHadoopDataset(conf) -} - -/** Output the RDD to any Hadoop-supported file system. */ -def saveAsNewAPIHadoopFile[F <: NewOutputFormat[_, _]]( - path: String, - keyClass: Class[_], - valueClass: Class[_], - outputFormatClass: Class[F]) { - rdd.saveAsNewAPIHadoopFile(path, keyClass, valueClass, outputFormatClass) -} - -/** - * Output the RDD to any Hadoop-supported storage system, using a Hadoop JobConf object for - * that storage system. The JobConf should set an OutputFormat and any output paths required - * (e.g. a table name to write to) in the same way as it would be configured for a Hadoop - * MapReduce job. - */ -def saveAsHadoopDataset(conf: JobConf) { - rdd.saveAsHadoopDataset(conf) -} - -/** - * Repartition the RDD according to the given partitioner and, within each resulting partition, - * sort records by their keys. - * - * This is more efficient than calling `repartition` and then sorting within each partition - * because it can push the sorting down into the shuffle machinery. - */ -def repartitionAndSortWithinPartitions(partitioner: Partitioner): JavaPairRDD[K, V] = { - val comp = com.google.common.collect.Ordering.natural().asInstanceOf[Comparator[K]] - repartitionAndSortWithinPartitions(partitioner, comp) -} + /** + * Repartition the RDD according to the given partitioner and, within each resulting partition, + * sort records by their keys. + * + * This is more efficient than calling `repartition` and then sorting within each partition + * because it can push the sorting down into the shuffle machinery. + */ + def repartitionAndSortWithinPartitions(partitioner: Partitioner): JavaPairRDD[K, V] = { + val comp = com.google.common.collect.Ordering.natural().asInstanceOf[Comparator[K]] + repartitionAndSortWithinPartitions(partitioner, comp) + } -/** - * Repartition the RDD according to the given partitioner and, within each resulting partition, - * sort records by their keys. - * - * This is more efficient than calling `repartition` and then sorting within each partition - * because it can push the sorting down into the shuffle machinery. - */ -def repartitionAndSortWithinPartitions(partitioner: Partitioner, comp: Comparator[K]) - : JavaPairRDD[K, V] = { - implicit val ordering = comp // Allow implicit conversion of Comparator to Ordering. - fromRDD( - new OrderedRDDFunctions[K, V, (K, V)](rdd).repartitionAndSortWithinPartitions(partitioner)) -} + /** + * Repartition the RDD according to the given partitioner and, within each resulting partition, + * sort records by their keys. + * + * This is more efficient than calling `repartition` and then sorting within each partition + * because it can push the sorting down into the shuffle machinery. + */ + def repartitionAndSortWithinPartitions(partitioner: Partitioner, comp: Comparator[K]) + : JavaPairRDD[K, V] = { + implicit val ordering = comp // Allow implicit conversion of Comparator to Ordering. 
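[Editor's aside, not part of the patch series: a minimal sketch of how the underlying Scala `repartitionAndSortWithinPartitions` that these Java wrappers forward to is typically driven. It assumes an already-running SparkContext named `sc`; the `events` data set is hypothetical.]

    import org.apache.spark.HashPartitioner

    // Hypothetical keyed data; any RDD[(K, V)] with an Ordering on K works.
    val events = sc.parallelize(Seq(("b", 2), ("a", 1), ("a", 3)))

    // Shuffle into 4 partitions and sort records by key inside each partition
    // during the shuffle itself, instead of repartitioning and then sorting.
    val sorted = events.repartitionAndSortWithinPartitions(new HashPartitioner(4))
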
+ fromRDD( + new OrderedRDDFunctions[K, V, (K, V)](rdd).repartitionAndSortWithinPartitions(partitioner)) + } -/** - * Sort the RDD by key, so that each partition contains a sorted range of the elements in - * ascending order. Calling `collect` or `save` on the resulting RDD will return or output an - * ordered list of records (in the `save` case, they will be written to multiple `part-X` files - * in the filesystem, in order of the keys). - */ -def sortByKey(): JavaPairRDD[K, V] = sortByKey(true) + /** + * Sort the RDD by key, so that each partition contains a sorted range of the elements in + * ascending order. Calling `collect` or `save` on the resulting RDD will return or output an + * ordered list of records (in the `save` case, they will be written to multiple `part-X` files + * in the filesystem, in order of the keys). + */ + def sortByKey(): JavaPairRDD[K, V] = sortByKey(true) -/** - * Sort the RDD by key, so that each partition contains a sorted range of the elements. Calling - * `collect` or `save` on the resulting RDD will return or output an ordered list of records - * (in the `save` case, they will be written to multiple `part-X` files in the filesystem, in - * order of the keys). - */ -def sortByKey(ascending: Boolean): JavaPairRDD[K, V] = { - val comp = com.google.common.collect.Ordering.natural().asInstanceOf[Comparator[K]] - sortByKey(comp, ascending) -} + /** + * Sort the RDD by key, so that each partition contains a sorted range of the elements. Calling + * `collect` or `save` on the resulting RDD will return or output an ordered list of records + * (in the `save` case, they will be written to multiple `part-X` files in the filesystem, in + * order of the keys). + */ + def sortByKey(ascending: Boolean): JavaPairRDD[K, V] = { + val comp = com.google.common.collect.Ordering.natural().asInstanceOf[Comparator[K]] + sortByKey(comp, ascending) + } -/** - * Sort the RDD by key, so that each partition contains a sorted range of the elements. Calling - * `collect` or `save` on the resulting RDD will return or output an ordered list of records - * (in the `save` case, they will be written to multiple `part-X` files in the filesystem, in - * order of the keys). - */ -def sortByKey(ascending: Boolean, numPartitions: Int): JavaPairRDD[K, V] = { - val comp = com.google.common.collect.Ordering.natural().asInstanceOf[Comparator[K]] - sortByKey(comp, ascending, numPartitions) -} + /** + * Sort the RDD by key, so that each partition contains a sorted range of the elements. Calling + * `collect` or `save` on the resulting RDD will return or output an ordered list of records + * (in the `save` case, they will be written to multiple `part-X` files in the filesystem, in + * order of the keys). + */ + def sortByKey(ascending: Boolean, numPartitions: Int): JavaPairRDD[K, V] = { + val comp = com.google.common.collect.Ordering.natural().asInstanceOf[Comparator[K]] + sortByKey(comp, ascending, numPartitions) + } -/** - * Sort the RDD by key, so that each partition contains a sorted range of the elements. Calling - * `collect` or `save` on the resulting RDD will return or output an ordered list of records - * (in the `save` case, they will be written to multiple `part-X` files in the filesystem, in - * order of the keys). - */ -def sortByKey(comp: Comparator[K]): JavaPairRDD[K, V] = sortByKey(comp, true) + /** + * Sort the RDD by key, so that each partition contains a sorted range of the elements. 
Calling + * `collect` or `save` on the resulting RDD will return or output an ordered list of records + * (in the `save` case, they will be written to multiple `part-X` files in the filesystem, in + * order of the keys). + */ + def sortByKey(comp: Comparator[K]): JavaPairRDD[K, V] = sortByKey(comp, true) -/** - * Sort the RDD by key, so that each partition contains a sorted range of the elements. Calling - * `collect` or `save` on the resulting RDD will return or output an ordered list of records - * (in the `save` case, they will be written to multiple `part-X` files in the filesystem, in - * order of the keys). - */ -def sortByKey(comp: Comparator[K], ascending: Boolean): JavaPairRDD[K, V] = { - implicit val ordering = comp // Allow implicit conversion of Comparator to Ordering. - fromRDD(new OrderedRDDFunctions[K, V, (K, V)](rdd).sortByKey(ascending)) -} + /** + * Sort the RDD by key, so that each partition contains a sorted range of the elements. Calling + * `collect` or `save` on the resulting RDD will return or output an ordered list of records + * (in the `save` case, they will be written to multiple `part-X` files in the filesystem, in + * order of the keys). + */ + def sortByKey(comp: Comparator[K], ascending: Boolean): JavaPairRDD[K, V] = { + implicit val ordering = comp // Allow implicit conversion of Comparator to Ordering. + fromRDD(new OrderedRDDFunctions[K, V, (K, V)](rdd).sortByKey(ascending)) + } -/** - * Sort the RDD by key, so that each partition contains a sorted range of the elements. Calling - * `collect` or `save` on the resulting RDD will return or output an ordered list of records - * (in the `save` case, they will be written to multiple `part-X` files in the filesystem, in - * order of the keys). - */ -def sortByKey(comp: Comparator[K], ascending: Boolean, numPartitions: Int): JavaPairRDD[K, V] = { - implicit val ordering = comp // Allow implicit conversion of Comparator to Ordering. - fromRDD(new OrderedRDDFunctions[K, V, (K, V)](rdd).sortByKey(ascending, numPartitions)) -} + /** + * Sort the RDD by key, so that each partition contains a sorted range of the elements. Calling + * `collect` or `save` on the resulting RDD will return or output an ordered list of records + * (in the `save` case, they will be written to multiple `part-X` files in the filesystem, in + * order of the keys). + */ + def sortByKey(comp: Comparator[K], ascending: Boolean, numPartitions: Int): JavaPairRDD[K, V] = { + implicit val ordering = comp // Allow implicit conversion of Comparator to Ordering. + fromRDD(new OrderedRDDFunctions[K, V, (K, V)](rdd).sortByKey(ascending, numPartitions)) + } -/** - * Return an RDD with the keys of each tuple. - */ -def keys(): JavaRDD[K] = JavaRDD.fromRDD[K](rdd.map(_._1)) + /** + * Return an RDD with the keys of each tuple. + */ + def keys(): JavaRDD[K] = JavaRDD.fromRDD[K](rdd.map(_._1)) -/** - * Return an RDD with the values of each tuple. - */ -def values(): JavaRDD[V] = JavaRDD.fromRDD[V](rdd.map(_._2)) + /** + * Return an RDD with the values of each tuple. + */ + def values(): JavaRDD[V] = JavaRDD.fromRDD[V](rdd.map(_._2)) -/** - * Return approximate number of distinct values for each key in this RDD. - * - * The algorithm used is based on streamlib's implementation of "HyperLogLog in Practice: - * Algorithmic Engineering of a State of The Art Cardinality Estimation Algorithm", available - * here. - * - * @param relativeSD Relative accuracy. Smaller values create counters that require more space. - * It must be greater than 0.000017. 
- * @param partitioner partitioner of the resulting RDD. - */ -def countApproxDistinctByKey(relativeSD: Double, partitioner: Partitioner): JavaPairRDD[K, Long] = -{ - fromRDD(rdd.countApproxDistinctByKey(relativeSD, partitioner)) -} + /** + * Return approximate number of distinct values for each key in this RDD. + * + * The algorithm used is based on streamlib's implementation of "HyperLogLog in Practice: + * Algorithmic Engineering of a State of The Art Cardinality Estimation Algorithm", available + * here. + * + * @param relativeSD Relative accuracy. Smaller values create counters that require more space. + * It must be greater than 0.000017. + * @param partitioner partitioner of the resulting RDD. + */ + def countApproxDistinctByKey(relativeSD: Double, partitioner: Partitioner): JavaPairRDD[K, Long] = + { + fromRDD(rdd.countApproxDistinctByKey(relativeSD, partitioner)) + } -/** - * Return approximate number of distinct values for each key in this RDD. - * - * The algorithm used is based on streamlib's implementation of "HyperLogLog in Practice: - * Algorithmic Engineering of a State of The Art Cardinality Estimation Algorithm", available - * here. - * - * @param relativeSD Relative accuracy. Smaller values create counters that require more space. - * It must be greater than 0.000017. - * @param numPartitions number of partitions of the resulting RDD. - */ -def countApproxDistinctByKey(relativeSD: Double, numPartitions: Int): JavaPairRDD[K, Long] = { - fromRDD(rdd.countApproxDistinctByKey(relativeSD, numPartitions)) -} + /** + * Return approximate number of distinct values for each key in this RDD. + * + * The algorithm used is based on streamlib's implementation of "HyperLogLog in Practice: + * Algorithmic Engineering of a State of The Art Cardinality Estimation Algorithm", available + * here. + * + * @param relativeSD Relative accuracy. Smaller values create counters that require more space. + * It must be greater than 0.000017. + * @param numPartitions number of partitions of the resulting RDD. + */ + def countApproxDistinctByKey(relativeSD: Double, numPartitions: Int): JavaPairRDD[K, Long] = { + fromRDD(rdd.countApproxDistinctByKey(relativeSD, numPartitions)) + } -/** - * Return approximate number of distinct values for each key in this RDD. - * - * The algorithm used is based on streamlib's implementation of "HyperLogLog in Practice: - * Algorithmic Engineering of a State of The Art Cardinality Estimation Algorithm", available - * here. - * - * @param relativeSD Relative accuracy. Smaller values create counters that require more space. - * It must be greater than 0.000017. - */ -def countApproxDistinctByKey(relativeSD: Double): JavaPairRDD[K, Long] = { - fromRDD(rdd.countApproxDistinctByKey(relativeSD)) -} + /** + * Return approximate number of distinct values for each key in this RDD. + * + * The algorithm used is based on streamlib's implementation of "HyperLogLog in Practice: + * Algorithmic Engineering of a State of The Art Cardinality Estimation Algorithm", available + * here. + * + * @param relativeSD Relative accuracy. Smaller values create counters that require more space. + * It must be greater than 0.000017. 
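[Editor's aside, not part of the patch series: a sketch of the HyperLogLog-based counter from the Scala RDD that these wrappers delegate to, assuming a running SparkContext `sc` and a hypothetical visit log; 0.05 is the relative accuracy traded against sketch size and must stay above 0.000017.]

    // Hypothetical visit log keyed by user id.
    val visits = sc.parallelize(Seq(("u1", "home"), ("u1", "home"), ("u1", "cart"), ("u2", "home")))

    // Approximate count of distinct pages per user; a larger relativeSD means
    // smaller counters but a coarser estimate.
    val distinctPages = visits.countApproxDistinctByKey(relativeSD = 0.05)
    // collectAsMap() would yield roughly Map("u1" -> 2, "u2" -> 1)
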
+ */ + def countApproxDistinctByKey(relativeSD: Double): JavaPairRDD[K, Long] = { + fromRDD(rdd.countApproxDistinctByKey(relativeSD)) + } -/** Assign a name to this RDD */ -def setName(name: String): JavaPairRDD[K, V] = { - rdd.setName(name) - this -} + /** Assign a name to this RDD */ + def setName(name: String): JavaPairRDD[K, V] = { + rdd.setName(name) + this + } } object JavaPairRDD { diff --git a/core/src/test/scala/org/apache/spark/api/python/PythonRDDSuite.scala b/core/src/test/scala/org/apache/spark/api/python/PythonRDDSuite.scala index 551438b5b395b..f536c7f1ca70c 100644 --- a/core/src/test/scala/org/apache/spark/api/python/PythonRDDSuite.scala +++ b/core/src/test/scala/org/apache/spark/api/python/PythonRDDSuite.scala @@ -66,7 +66,8 @@ class PythonRDDSuite extends FunSuite with SharedSparkContext{ PythonRDD.saveAsHadoopFileByKey( testPairs, - false, fullPath, + false, + fullPath, classOf[RDDMultipleTextOutputFormat].toString, classOf[Int].toString, classOf[Int].toString, From 89215625eba2970da213efa9c6d98891f9b890c8 Mon Sep 17 00:00:00 2001 From: Ilya Ganelin Date: Tue, 17 Mar 2015 14:31:10 -0700 Subject: [PATCH 19/25] spacing fix --- .../apache/spark/api/java/JavaPairRDD.scala | 82 +++++++++---------- 1 file changed, 41 insertions(+), 41 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/api/java/JavaPairRDD.scala b/core/src/main/scala/org/apache/spark/api/java/JavaPairRDD.scala index 3ec4b3ad2134a..e8bd18c57d2c5 100644 --- a/core/src/main/scala/org/apache/spark/api/java/JavaPairRDD.scala +++ b/core/src/main/scala/org/apache/spark/api/java/JavaPairRDD.scala @@ -983,55 +983,55 @@ class JavaPairRDD[K, V](val rdd: RDD[(K, V)]) } object JavaPairRDD { -private[spark] -def groupByResultToJava[K: ClassTag, T](rdd: RDD[(K, Iterable[T])]): RDD[(K, JIterable[T])] = { - rddToPairRDDFunctions(rdd).mapValues(asJavaIterable) -} + private[spark] + def groupByResultToJava[K: ClassTag, T](rdd: RDD[(K, Iterable[T])]): RDD[(K, JIterable[T])] = { + rddToPairRDDFunctions(rdd).mapValues(asJavaIterable) + } -private[spark] -def cogroupResultToJava[K: ClassTag, V, W]( - rdd: RDD[(K, (Iterable[V], Iterable[W]))]): RDD[(K, (JIterable[V], JIterable[W]))] = { - rddToPairRDDFunctions(rdd).mapValues(x => (asJavaIterable(x._1), asJavaIterable(x._2))) -} + private[spark] + def cogroupResultToJava[K: ClassTag, V, W]( + rdd: RDD[(K, (Iterable[V], Iterable[W]))]): RDD[(K, (JIterable[V], JIterable[W]))] = { + rddToPairRDDFunctions(rdd).mapValues(x => (asJavaIterable(x._1), asJavaIterable(x._2))) + } -private[spark] -def cogroupResult2ToJava[K: ClassTag, V, W1, W2]( - rdd: RDD[(K, (Iterable[V], Iterable[W1], Iterable[W2]))]) - : RDD[(K, (JIterable[V], JIterable[W1], JIterable[W2]))] = { - rddToPairRDDFunctions(rdd) - .mapValues(x => (asJavaIterable(x._1), asJavaIterable(x._2), asJavaIterable(x._3))) -} + private[spark] + def cogroupResult2ToJava[K: ClassTag, V, W1, W2]( + rdd: RDD[(K, (Iterable[V], Iterable[W1], Iterable[W2]))]) + : RDD[(K, (JIterable[V], JIterable[W1], JIterable[W2]))] = { + rddToPairRDDFunctions(rdd) + .mapValues(x => (asJavaIterable(x._1), asJavaIterable(x._2), asJavaIterable(x._3))) + } -private[spark] -def cogroupResult3ToJava[K: ClassTag, V, W1, W2, W3]( - rdd: RDD[(K, (Iterable[V], Iterable[W1], Iterable[W2], Iterable[W3]))]) -: RDD[(K, (JIterable[V], JIterable[W1], JIterable[W2], JIterable[W3]))] = { - rddToPairRDDFunctions(rdd) - .mapValues(x => - (asJavaIterable(x._1), asJavaIterable(x._2), asJavaIterable(x._3), asJavaIterable(x._4))) -} + private[spark] + def 
cogroupResult3ToJava[K: ClassTag, V, W1, W2, W3]( + rdd: RDD[(K, (Iterable[V], Iterable[W1], Iterable[W2], Iterable[W3]))]) + : RDD[(K, (JIterable[V], JIterable[W1], JIterable[W2], JIterable[W3]))] = { + rddToPairRDDFunctions(rdd) + .mapValues(x => + (asJavaIterable(x._1), asJavaIterable(x._2), asJavaIterable(x._3), asJavaIterable(x._4))) + } -def fromRDD[K: ClassTag, V: ClassTag](rdd: RDD[(K, V)]): JavaPairRDD[K, V] = { - new JavaPairRDD[K, V](rdd) -} + def fromRDD[K: ClassTag, V: ClassTag](rdd: RDD[(K, V)]): JavaPairRDD[K, V] = { + new JavaPairRDD[K, V](rdd) + } -implicit def toRDD[K, V](rdd: JavaPairRDD[K, V]): RDD[(K, V)] = rdd.rdd + implicit def toRDD[K, V](rdd: JavaPairRDD[K, V]): RDD[(K, V)] = rdd.rdd -private[spark] -implicit def toScalaFunction2[T1, T2, R](fun: JFunction2[T1, T2, R]): Function2[T1, T2, R] = { - (x: T1, x1: T2) => fun.call(x, x1) -} + private[spark] + implicit def toScalaFunction2[T1, T2, R](fun: JFunction2[T1, T2, R]): Function2[T1, T2, R] = { + (x: T1, x1: T2) => fun.call(x, x1) + } -private[spark] implicit def toScalaFunction[T, R](fun: JFunction[T, R]): T => R = x => fun.call(x) + private[spark] implicit def toScalaFunction[T, R](fun: JFunction[T, R]): T => R = x => fun.call(x) -private[spark] -implicit def pairFunToScalaFun[A, B, C](x: PairFunction[A, B, C]): A => (B, C) = y => x.call(y) + private[spark] + implicit def pairFunToScalaFun[A, B, C](x: PairFunction[A, B, C]): A => (B, C) = y => x.call(y) -/** Convert a JavaRDD of key-value pairs to JavaPairRDD. */ -def fromJavaRDD[K, V](rdd: JavaRDD[(K, V)]): JavaPairRDD[K, V] = { - implicit val ctagK: ClassTag[K] = fakeClassTag - implicit val ctagV: ClassTag[V] = fakeClassTag - new JavaPairRDD[K, V](rdd.rdd) -} + /** Convert a JavaRDD of key-value pairs to JavaPairRDD. */ + def fromJavaRDD[K, V](rdd: JavaRDD[(K, V)]): JavaPairRDD[K, V] = { + implicit val ctagK: ClassTag[K] = fakeClassTag + implicit val ctagV: ClassTag[V] = fakeClassTag + new JavaPairRDD[K, V](rdd.rdd) + } } From 1ed32184e03188292c0d0587dad0669b61eeba51 Mon Sep 17 00:00:00 2001 From: Ilya Ganelin Date: Tue, 17 Mar 2015 14:35:22 -0700 Subject: [PATCH 20/25] [SPARK-3533]Input ordering --- .../scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala b/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala index 84a1f8dffb5af..f2035e0ad4bca 100644 --- a/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala +++ b/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala @@ -19,8 +19,6 @@ package org.apache.spark.rdd import java.io.{BufferedReader, InputStreamReader} -import org.apache.spark.api.java.JavaPairRDD - import scala.collection.mutable.{ArrayBuffer, HashSet} import scala.sys.process._ import scala.util.Random @@ -31,9 +29,11 @@ import org.apache.hadoop.mapred._ import org.apache.hadoop.mapreduce.{JobContext => NewJobContext, OutputCommitter => NewOutputCommitter, OutputFormat => NewOutputFormat, RecordWriter => NewRecordWriter, TaskAttemptContext => NewTaskAttempContext} +import org.apache.hadoop.util.Progressable + +import org.apache.spark.api.java.JavaPairRDD import org.apache.spark.{Partitioner, SharedSparkContext} import org.apache.spark.util.Utils -import org.apache.hadoop.util.Progressable import org.scalatest.FunSuite From df6d89c746d947367bb2bc68471cf41fc5021096 Mon Sep 17 00:00:00 2001 From: Ilya Ganelin Date: Wed, 18 Mar 2015 06:40:18 -0700 Subject: [PATCH 
21/25] Attempting to fix build --- .../apache/spark/api/java/JavaPairRDD.scala | 24 ++--- .../apache/spark/api/python/PythonRDD.scala | 54 +++++------ .../spark/api/python/PythonRDDSuite.scala | 90 +++++++++---------- .../spark/rdd/PairRDDFunctionsSuite.scala | 66 +++++++------- 4 files changed, 117 insertions(+), 117 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/api/java/JavaPairRDD.scala b/core/src/main/scala/org/apache/spark/api/java/JavaPairRDD.scala index 65637a63bfd3f..6d37fd5ab8061 100644 --- a/core/src/main/scala/org/apache/spark/api/java/JavaPairRDD.scala +++ b/core/src/main/scala/org/apache/spark/api/java/JavaPairRDD.scala @@ -821,18 +821,18 @@ class JavaPairRDD[K, V](val rdd: RDD[(K, V)]) * @param path - The path for the parent directory * @param numPartitions - The number of partitions to partition to */ - def saveAsHadoopFileByKey[F <: OutputFormat[K, V]](path: String, numPartitions : Int) - (implicit fm: ClassTag[F]) { - rdd.saveAsHadoopFileByKey(path, numPartitions) - } - - /* - * Output the RDD to multiple files by key on any Hadoop-supported file system, using a Hadoop - * `OutputFormat` class supporting the key and value types K and V in this RDD. - */ - def saveAsHadoopFileByKey[F <: OutputFormat[K, V]](path: String)(implicit fm: ClassTag[F]) { - rdd.saveAsHadoopFileByKey(path) - } +// def saveAsHadoopFileByKey[F <: OutputFormat[K, V]](path: String, numPartitions : Int) +// (implicit fm: ClassTag[F]) { +// rdd.saveAsHadoopFileByKey(path, numPartitions) +// } +// +// /* +// * Output the RDD to multiple files by key on any Hadoop-supported file system, using a Hadoop +// * `OutputFormat` class supporting the key and value types K and V in this RDD. +// */ +// def saveAsHadoopFileByKey[F <: OutputFormat[K, V]](path: String)(implicit fm: ClassTag[F]) { +// rdd.saveAsHadoopFileByKey(path) +// } /** * Repartition the RDD according to the given partitioner and, within each resulting partition, diff --git a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala index ea3faba240321..8a153d49f9600 100644 --- a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala +++ b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala @@ -19,7 +19,7 @@ package org.apache.spark.api.python import java.io._ import java.net._ -import java.util.{Collections, ArrayList => JArrayList, List => JList, Map => JMap, UUID} +import java.util.{Collections, ArrayList => JArrayList, List => JList, Map => JMap} import scala.collection.JavaConversions._ import scala.collection.mutable @@ -734,32 +734,32 @@ private[spark] object PythonRDD extends Logging { * `confAsMap` is merged with the default Hadoop conf associated with the SparkContext of * this RDD. 
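[Editor's aside, not part of the patch series: for orientation while these wrappers are temporarily commented out, this is roughly what the API proposed in this series looks like from the Scala side, modeled on the suite's test data. The SparkContext `sc`, the output path, and the chosen OutputFormat type argument are assumptions for illustration only.]

    // Hypothetical keyed records; the proposed call writes one output per key
    // under the given parent path.
    val squares = sc.parallelize(Seq(("a", 1), ("b", 4), ("c", 9)))

    // Proposed API from this patch series: hash-partition into 2 partitions,
    // then write each key's values under /tmp/squares-by-key/<key>.
    squares.saveAsHadoopFileByKey[org.apache.hadoop.mapred.TextOutputFormat[String, Int]](
      "/tmp/squares-by-key", numPartitions = 2)
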
*/ - def saveAsHadoopFileByKey[K, V, C <: CompressionCodec]( - pyRDD: JavaRDD[Array[Byte]], - batchSerialized: Boolean, - path: String, - outputFormatClass: String, - keyClass: String, - valueClass: String, - keyConverterClass: String, - valueConverterClass: String, - confAsMap: java.util.HashMap[String, String], - compressionCodecClass: String) = { - val rdd = SerDeUtil.pythonToPairRDD(pyRDD, batchSerialized) - val (kc, vc) = getKeyValueTypes(keyClass, valueClass).getOrElse( - inferKeyValueTypes(rdd, keyConverterClass, valueConverterClass)) - val mergedConf = getMergedConf(confAsMap, pyRDD.context.hadoopConfiguration) - val codec = Option(compressionCodecClass).map(Utils.classForName(_).asInstanceOf[Class[C]]) - val converted = convertRDD(rdd, keyConverterClass, valueConverterClass, - new JavaToWritableConverter) - - converted.saveAsHadoopFile(path, - ClassUtils.primitiveToWrapper(kc), - ClassUtils.primitiveToWrapper(vc), - classOf[RDDMultipleTextOutputFormat[K,V]], - new JobConf(mergedConf), - codec=codec) - } +// def saveAsHadoopFileByKey[K, V, C <: CompressionCodec]( +// pyRDD: JavaRDD[Array[Byte]], +// batchSerialized: Boolean, +// path: String, +// outputFormatClass: String, +// keyClass: String, +// valueClass: String, +// keyConverterClass: String, +// valueConverterClass: String, +// confAsMap: java.util.HashMap[String, String], +// compressionCodecClass: String) = { +// val rdd = SerDeUtil.pythonToPairRDD(pyRDD, batchSerialized) +// val (kc, vc) = getKeyValueTypes(keyClass, valueClass).getOrElse( +// inferKeyValueTypes(rdd, keyConverterClass, valueConverterClass)) +// val mergedConf = getMergedConf(confAsMap, pyRDD.context.hadoopConfiguration) +// val codec = Option(compressionCodecClass).map(Utils.classForName(_).asInstanceOf[Class[C]]) +// val converted = convertRDD(rdd, keyConverterClass, valueConverterClass, +// new JavaToWritableConverter) +// +// converted.saveAsHadoopFile(path, +// ClassUtils.primitiveToWrapper(kc), +// ClassUtils.primitiveToWrapper(vc), +// classOf[RDDMultipleTextOutputFormat[K,V]], +// new JobConf(mergedConf), +// codec=codec) +// } /** * Output a Python RDD of key-value pairs to any Hadoop file system, using new Hadoop diff --git a/core/src/test/scala/org/apache/spark/api/python/PythonRDDSuite.scala b/core/src/test/scala/org/apache/spark/api/python/PythonRDDSuite.scala index f536c7f1ca70c..14f5f00f22f2e 100644 --- a/core/src/test/scala/org/apache/spark/api/python/PythonRDDSuite.scala +++ b/core/src/test/scala/org/apache/spark/api/python/PythonRDDSuite.scala @@ -28,7 +28,7 @@ import org.scalatest.FunSuite import scala.collection.mutable.HashSet -class PythonRDDSuite extends FunSuite with SharedSparkContext{ +class PythonRDDSuite extends FunSuite { //with SharedSparkContext{ test("Writing large strings to the worker") { val input: List[String] = List("a"*100000) @@ -48,48 +48,48 @@ class PythonRDDSuite extends FunSuite with SharedSparkContext{ PythonRDD.writeIteratorToStream( Iterator((null, null), ("a".getBytes, null), (null, "b".getBytes)), buffer) } - - test("saveAsHadoopFileByKey should generate a text file per key") { - val testPairs : JavaRDD[Array[Byte]] = sc.parallelize( - Seq( - Array(1.toByte,1.toByte), - Array(2.toByte,4.toByte), - Array(3.toByte,9.toByte), - Array(4.toByte,16.toByte), - Array(5.toByte,25.toByte)) - ).toJavaRDD() - - val fs = FileSystem.get(new Configuration()) - val basePath = sc.conf.get("spark.local.dir", "/tmp") - val fullPath = basePath + "/testPath" - fs.delete(new Path(fullPath), true) - - 
PythonRDD.saveAsHadoopFileByKey( - testPairs, - false, - fullPath, - classOf[RDDMultipleTextOutputFormat].toString, - classOf[Int].toString, - classOf[Int].toString, - null, - null, - new java.util.HashMap(), "") - - // Test that a file was created for each key - (1 to 5).foreach(key => { - val testPath = new Path(fullPath + "/" + key) - assert(fs.exists(testPath)) - - // Read the file and test that the contents are the values matching that key split by line - val input = fs.open(testPath) - val reader = new BufferedReader(new InputStreamReader(input)) - val values = new HashSet[Int] - val lines = Stream.continually(reader.readLine()).takeWhile(_ != null) - lines.foreach(s => values += s.toInt) - - assert(values.contains(key*key)) - }) - - fs.delete(new Path(fullPath), true) - } +// +// test("saveAsHadoopFileByKey should generate a text file per key") { +// val testPairs : JavaRDD[Array[Byte]] = sc.parallelize( +// Seq( +// Array(1.toByte,1.toByte), +// Array(2.toByte,4.toByte), +// Array(3.toByte,9.toByte), +// Array(4.toByte,16.toByte), +// Array(5.toByte,25.toByte)) +// ).toJavaRDD() +// +// val fs = FileSystem.get(new Configuration()) +// val basePath = sc.conf.get("spark.local.dir", "/tmp") +// val fullPath = basePath + "/testPath" +// fs.delete(new Path(fullPath), true) +// +// PythonRDD.saveAsHadoopFileByKey( +// testPairs, +// false, +// fullPath, +// classOf[RDDMultipleTextOutputFormat].toString, +// classOf[Int].toString, +// classOf[Int].toString, +// null, +// null, +// new java.util.HashMap(), "") +// +// // Test that a file was created for each key +// (1 to 5).foreach(key => { +// val testPath = new Path(fullPath + "/" + key) +// assert(fs.exists(testPath)) +// +// // Read the file and test that the contents are the values matching that key split by line +// val input = fs.open(testPath) +// val reader = new BufferedReader(new InputStreamReader(input)) +// val values = new HashSet[Int] +// val lines = Stream.continually(reader.readLine()).takeWhile(_ != null) +// lines.foreach(s => values += s.toInt) +// +// assert(values.contains(key*key)) +// }) +// +// fs.delete(new Path(fullPath), true) +// } } diff --git a/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala b/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala index f2035e0ad4bca..e4d00b2ea3181 100644 --- a/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala +++ b/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala @@ -71,39 +71,39 @@ class PairRDDFunctionsSuite extends FunSuite with SharedSparkContext { fs.delete(new Path(fullPath), true) } - test("JavaPairRDD.saveAsHadoopFileByKey should generate a text file per key") { - val keys = 1 to 20 - val testValues = 1 to 5 - // Generate the cartesian product of keys by test values - val pairsLocal = keys.map(k => (k, testValues)).flatMap(kv => { - kv._2.map(v => (kv._1, v*kv._1)) - }) - - val pairs = JavaPairRDD.fromRDD(sc.parallelize(pairsLocal)) - - val fs = FileSystem.get(new Configuration()) - val basePath = sc.conf.get("spark.local.dir", "/tmp") - val fullPath = basePath + "/testPath" - fs.delete(new Path(fullPath), true) - pairs.saveAsHadoopFileByKey(fullPath) - - // Test that a file was created for each key - keys.foreach(key => { - val testPath = new Path(fullPath + "/" + key) - assert(fs.exists(testPath)) - - // Read the file and test that the contents are the values matching that key split by line - val input = fs.open(testPath) - val reader = new BufferedReader(new InputStreamReader(input)) - val 
values = new HashSet[Int] - val lines = Stream.continually(reader.readLine()).takeWhile(_ != null) - lines.foreach(s => values += s.toInt) - - testValues.foreach(v => assert(values.contains(v*key))) - }) - - fs.delete(new Path(fullPath), true) - } +// test("JavaPairRDD.saveAsHadoopFileByKey should generate a text file per key") { +// val keys = 1 to 20 +// val testValues = 1 to 5 +// // Generate the cartesian product of keys by test values +// val pairsLocal = keys.map(k => (k, testValues)).flatMap(kv => { +// kv._2.map(v => (kv._1, v*kv._1)) +// }) +// +// val pairs = JavaPairRDD.fromRDD(sc.parallelize(pairsLocal)) +// +// val fs = FileSystem.get(new Configuration()) +// val basePath = sc.conf.get("spark.local.dir", "/tmp") +// val fullPath = basePath + "/testPath" +// fs.delete(new Path(fullPath), true) +// pairs.saveAsHadoopFileByKey(fullPath) +// +// // Test that a file was created for each key +// keys.foreach(key => { +// val testPath = new Path(fullPath + "/" + key) +// assert(fs.exists(testPath)) +// +// // Read the file and test that the contents are the values matching that key split by line +// val input = fs.open(testPath) +// val reader = new BufferedReader(new InputStreamReader(input)) +// val values = new HashSet[Int] +// val lines = Stream.continually(reader.readLine()).takeWhile(_ != null) +// lines.foreach(s => values += s.toInt) +// +// testValues.foreach(v => assert(values.contains(v*key))) +// }) +// +// fs.delete(new Path(fullPath), true) +// } test("aggregateByKey") { val pairs = sc.parallelize(Array((1, 1), (1, 1), (3, 2), (5, 1), (5, 3)), 2) From f185ec3b305308f967795161cf89813e12ddc723 Mon Sep 17 00:00:00 2001 From: Ilya Ganelin Date: Wed, 18 Mar 2015 06:52:48 -0700 Subject: [PATCH 22/25] Revert "Attempting to fix build" This reverts commit df6d89c746d947367bb2bc68471cf41fc5021096. --- .../apache/spark/api/java/JavaPairRDD.scala | 24 ++--- .../apache/spark/api/python/PythonRDD.scala | 54 +++++------ .../spark/api/python/PythonRDDSuite.scala | 90 +++++++++---------- .../spark/rdd/PairRDDFunctionsSuite.scala | 66 +++++++------- 4 files changed, 117 insertions(+), 117 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/api/java/JavaPairRDD.scala b/core/src/main/scala/org/apache/spark/api/java/JavaPairRDD.scala index 6d37fd5ab8061..65637a63bfd3f 100644 --- a/core/src/main/scala/org/apache/spark/api/java/JavaPairRDD.scala +++ b/core/src/main/scala/org/apache/spark/api/java/JavaPairRDD.scala @@ -821,18 +821,18 @@ class JavaPairRDD[K, V](val rdd: RDD[(K, V)]) * @param path - The path for the parent directory * @param numPartitions - The number of partitions to partition to */ -// def saveAsHadoopFileByKey[F <: OutputFormat[K, V]](path: String, numPartitions : Int) -// (implicit fm: ClassTag[F]) { -// rdd.saveAsHadoopFileByKey(path, numPartitions) -// } -// -// /* -// * Output the RDD to multiple files by key on any Hadoop-supported file system, using a Hadoop -// * `OutputFormat` class supporting the key and value types K and V in this RDD. -// */ -// def saveAsHadoopFileByKey[F <: OutputFormat[K, V]](path: String)(implicit fm: ClassTag[F]) { -// rdd.saveAsHadoopFileByKey(path) -// } + def saveAsHadoopFileByKey[F <: OutputFormat[K, V]](path: String, numPartitions : Int) + (implicit fm: ClassTag[F]) { + rdd.saveAsHadoopFileByKey(path, numPartitions) + } + + /* + * Output the RDD to multiple files by key on any Hadoop-supported file system, using a Hadoop + * `OutputFormat` class supporting the key and value types K and V in this RDD. 
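[Editor's aside, not part of the patch series: a compressed sketch of how the Java wrapper being restored here is exercised from Scala in the suite further down in this same commit; it assumes a running SparkContext `sc` and a hypothetical output path. `JavaPairRDD.fromRDD` wraps a Scala pair RDD so the Java-facing overload can be called directly.]

    import org.apache.spark.api.java.JavaPairRDD

    // Wrap a hypothetical Scala pair RDD in the Java API and write per-key output.
    val pairs = JavaPairRDD.fromRDD(sc.parallelize(Seq((1, 1), (2, 4), (3, 9))))
    pairs.saveAsHadoopFileByKey("/tmp/java-by-key")
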
+ */ + def saveAsHadoopFileByKey[F <: OutputFormat[K, V]](path: String)(implicit fm: ClassTag[F]) { + rdd.saveAsHadoopFileByKey(path) + } /** * Repartition the RDD according to the given partitioner and, within each resulting partition, diff --git a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala index 8a153d49f9600..ea3faba240321 100644 --- a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala +++ b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala @@ -19,7 +19,7 @@ package org.apache.spark.api.python import java.io._ import java.net._ -import java.util.{Collections, ArrayList => JArrayList, List => JList, Map => JMap} +import java.util.{Collections, ArrayList => JArrayList, List => JList, Map => JMap, UUID} import scala.collection.JavaConversions._ import scala.collection.mutable @@ -734,32 +734,32 @@ private[spark] object PythonRDD extends Logging { * `confAsMap` is merged with the default Hadoop conf associated with the SparkContext of * this RDD. */ -// def saveAsHadoopFileByKey[K, V, C <: CompressionCodec]( -// pyRDD: JavaRDD[Array[Byte]], -// batchSerialized: Boolean, -// path: String, -// outputFormatClass: String, -// keyClass: String, -// valueClass: String, -// keyConverterClass: String, -// valueConverterClass: String, -// confAsMap: java.util.HashMap[String, String], -// compressionCodecClass: String) = { -// val rdd = SerDeUtil.pythonToPairRDD(pyRDD, batchSerialized) -// val (kc, vc) = getKeyValueTypes(keyClass, valueClass).getOrElse( -// inferKeyValueTypes(rdd, keyConverterClass, valueConverterClass)) -// val mergedConf = getMergedConf(confAsMap, pyRDD.context.hadoopConfiguration) -// val codec = Option(compressionCodecClass).map(Utils.classForName(_).asInstanceOf[Class[C]]) -// val converted = convertRDD(rdd, keyConverterClass, valueConverterClass, -// new JavaToWritableConverter) -// -// converted.saveAsHadoopFile(path, -// ClassUtils.primitiveToWrapper(kc), -// ClassUtils.primitiveToWrapper(vc), -// classOf[RDDMultipleTextOutputFormat[K,V]], -// new JobConf(mergedConf), -// codec=codec) -// } + def saveAsHadoopFileByKey[K, V, C <: CompressionCodec]( + pyRDD: JavaRDD[Array[Byte]], + batchSerialized: Boolean, + path: String, + outputFormatClass: String, + keyClass: String, + valueClass: String, + keyConverterClass: String, + valueConverterClass: String, + confAsMap: java.util.HashMap[String, String], + compressionCodecClass: String) = { + val rdd = SerDeUtil.pythonToPairRDD(pyRDD, batchSerialized) + val (kc, vc) = getKeyValueTypes(keyClass, valueClass).getOrElse( + inferKeyValueTypes(rdd, keyConverterClass, valueConverterClass)) + val mergedConf = getMergedConf(confAsMap, pyRDD.context.hadoopConfiguration) + val codec = Option(compressionCodecClass).map(Utils.classForName(_).asInstanceOf[Class[C]]) + val converted = convertRDD(rdd, keyConverterClass, valueConverterClass, + new JavaToWritableConverter) + + converted.saveAsHadoopFile(path, + ClassUtils.primitiveToWrapper(kc), + ClassUtils.primitiveToWrapper(vc), + classOf[RDDMultipleTextOutputFormat[K,V]], + new JobConf(mergedConf), + codec=codec) + } /** * Output a Python RDD of key-value pairs to any Hadoop file system, using new Hadoop diff --git a/core/src/test/scala/org/apache/spark/api/python/PythonRDDSuite.scala b/core/src/test/scala/org/apache/spark/api/python/PythonRDDSuite.scala index 14f5f00f22f2e..f536c7f1ca70c 100644 --- a/core/src/test/scala/org/apache/spark/api/python/PythonRDDSuite.scala +++ 
b/core/src/test/scala/org/apache/spark/api/python/PythonRDDSuite.scala @@ -28,7 +28,7 @@ import org.scalatest.FunSuite import scala.collection.mutable.HashSet -class PythonRDDSuite extends FunSuite { //with SharedSparkContext{ +class PythonRDDSuite extends FunSuite with SharedSparkContext{ test("Writing large strings to the worker") { val input: List[String] = List("a"*100000) @@ -48,48 +48,48 @@ class PythonRDDSuite extends FunSuite { //with SharedSparkContext{ PythonRDD.writeIteratorToStream( Iterator((null, null), ("a".getBytes, null), (null, "b".getBytes)), buffer) } -// -// test("saveAsHadoopFileByKey should generate a text file per key") { -// val testPairs : JavaRDD[Array[Byte]] = sc.parallelize( -// Seq( -// Array(1.toByte,1.toByte), -// Array(2.toByte,4.toByte), -// Array(3.toByte,9.toByte), -// Array(4.toByte,16.toByte), -// Array(5.toByte,25.toByte)) -// ).toJavaRDD() -// -// val fs = FileSystem.get(new Configuration()) -// val basePath = sc.conf.get("spark.local.dir", "/tmp") -// val fullPath = basePath + "/testPath" -// fs.delete(new Path(fullPath), true) -// -// PythonRDD.saveAsHadoopFileByKey( -// testPairs, -// false, -// fullPath, -// classOf[RDDMultipleTextOutputFormat].toString, -// classOf[Int].toString, -// classOf[Int].toString, -// null, -// null, -// new java.util.HashMap(), "") -// -// // Test that a file was created for each key -// (1 to 5).foreach(key => { -// val testPath = new Path(fullPath + "/" + key) -// assert(fs.exists(testPath)) -// -// // Read the file and test that the contents are the values matching that key split by line -// val input = fs.open(testPath) -// val reader = new BufferedReader(new InputStreamReader(input)) -// val values = new HashSet[Int] -// val lines = Stream.continually(reader.readLine()).takeWhile(_ != null) -// lines.foreach(s => values += s.toInt) -// -// assert(values.contains(key*key)) -// }) -// -// fs.delete(new Path(fullPath), true) -// } + + test("saveAsHadoopFileByKey should generate a text file per key") { + val testPairs : JavaRDD[Array[Byte]] = sc.parallelize( + Seq( + Array(1.toByte,1.toByte), + Array(2.toByte,4.toByte), + Array(3.toByte,9.toByte), + Array(4.toByte,16.toByte), + Array(5.toByte,25.toByte)) + ).toJavaRDD() + + val fs = FileSystem.get(new Configuration()) + val basePath = sc.conf.get("spark.local.dir", "/tmp") + val fullPath = basePath + "/testPath" + fs.delete(new Path(fullPath), true) + + PythonRDD.saveAsHadoopFileByKey( + testPairs, + false, + fullPath, + classOf[RDDMultipleTextOutputFormat].toString, + classOf[Int].toString, + classOf[Int].toString, + null, + null, + new java.util.HashMap(), "") + + // Test that a file was created for each key + (1 to 5).foreach(key => { + val testPath = new Path(fullPath + "/" + key) + assert(fs.exists(testPath)) + + // Read the file and test that the contents are the values matching that key split by line + val input = fs.open(testPath) + val reader = new BufferedReader(new InputStreamReader(input)) + val values = new HashSet[Int] + val lines = Stream.continually(reader.readLine()).takeWhile(_ != null) + lines.foreach(s => values += s.toInt) + + assert(values.contains(key*key)) + }) + + fs.delete(new Path(fullPath), true) + } } diff --git a/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala b/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala index e4d00b2ea3181..f2035e0ad4bca 100644 --- a/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala +++ b/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala 
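[Editor's aside, not part of the patch series: the verification pattern the restored test below relies on, pulled out as a sketch with a hypothetical helper name and path. Each key's output is opened through the Hadoop FileSystem API and read line by line, so the assertion reduces to checking that every expected value appears under that key's path.]

    import java.io.{BufferedReader, InputStreamReader}
    import org.apache.hadoop.conf.Configuration
    import org.apache.hadoop.fs.{FileSystem, Path}

    // Hypothetical helper: read the lines written for one key back into a Set[Int].
    def valuesForKey(parent: String, key: Int): Set[Int] = {
      val fs = FileSystem.get(new Configuration())
      val reader = new BufferedReader(new InputStreamReader(fs.open(new Path(s"$parent/$key"))))
      try Stream.continually(reader.readLine()).takeWhile(_ != null).map(_.toInt).toSet
      finally reader.close()
    }
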
@@ -71,39 +71,39 @@ class PairRDDFunctionsSuite extends FunSuite with SharedSparkContext { fs.delete(new Path(fullPath), true) } -// test("JavaPairRDD.saveAsHadoopFileByKey should generate a text file per key") { -// val keys = 1 to 20 -// val testValues = 1 to 5 -// // Generate the cartesian product of keys by test values -// val pairsLocal = keys.map(k => (k, testValues)).flatMap(kv => { -// kv._2.map(v => (kv._1, v*kv._1)) -// }) -// -// val pairs = JavaPairRDD.fromRDD(sc.parallelize(pairsLocal)) -// -// val fs = FileSystem.get(new Configuration()) -// val basePath = sc.conf.get("spark.local.dir", "/tmp") -// val fullPath = basePath + "/testPath" -// fs.delete(new Path(fullPath), true) -// pairs.saveAsHadoopFileByKey(fullPath) -// -// // Test that a file was created for each key -// keys.foreach(key => { -// val testPath = new Path(fullPath + "/" + key) -// assert(fs.exists(testPath)) -// -// // Read the file and test that the contents are the values matching that key split by line -// val input = fs.open(testPath) -// val reader = new BufferedReader(new InputStreamReader(input)) -// val values = new HashSet[Int] -// val lines = Stream.continually(reader.readLine()).takeWhile(_ != null) -// lines.foreach(s => values += s.toInt) -// -// testValues.foreach(v => assert(values.contains(v*key))) -// }) -// -// fs.delete(new Path(fullPath), true) -// } + test("JavaPairRDD.saveAsHadoopFileByKey should generate a text file per key") { + val keys = 1 to 20 + val testValues = 1 to 5 + // Generate the cartesian product of keys by test values + val pairsLocal = keys.map(k => (k, testValues)).flatMap(kv => { + kv._2.map(v => (kv._1, v*kv._1)) + }) + + val pairs = JavaPairRDD.fromRDD(sc.parallelize(pairsLocal)) + + val fs = FileSystem.get(new Configuration()) + val basePath = sc.conf.get("spark.local.dir", "/tmp") + val fullPath = basePath + "/testPath" + fs.delete(new Path(fullPath), true) + pairs.saveAsHadoopFileByKey(fullPath) + + // Test that a file was created for each key + keys.foreach(key => { + val testPath = new Path(fullPath + "/" + key) + assert(fs.exists(testPath)) + + // Read the file and test that the contents are the values matching that key split by line + val input = fs.open(testPath) + val reader = new BufferedReader(new InputStreamReader(input)) + val values = new HashSet[Int] + val lines = Stream.continually(reader.readLine()).takeWhile(_ != null) + lines.foreach(s => values += s.toInt) + + testValues.foreach(v => assert(values.contains(v*key))) + }) + + fs.delete(new Path(fullPath), true) + } test("aggregateByKey") { val pairs = sc.parallelize(Array((1, 1), (1, 1), (3, 2), (5, 1), (5, 3)), 2) From c04efeae2728f663a3848be4a526de480e121185 Mon Sep 17 00:00:00 2001 From: Ilya Ganelin Date: Wed, 18 Mar 2015 06:55:12 -0700 Subject: [PATCH 23/25] Fixing compilation error --- .../test/scala/org/apache/spark/api/python/PythonRDDSuite.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/test/scala/org/apache/spark/api/python/PythonRDDSuite.scala b/core/src/test/scala/org/apache/spark/api/python/PythonRDDSuite.scala index f536c7f1ca70c..7024bbc89d803 100644 --- a/core/src/test/scala/org/apache/spark/api/python/PythonRDDSuite.scala +++ b/core/src/test/scala/org/apache/spark/api/python/PythonRDDSuite.scala @@ -68,7 +68,7 @@ class PythonRDDSuite extends FunSuite with SharedSparkContext{ testPairs, false, fullPath, - classOf[RDDMultipleTextOutputFormat].toString, + classOf[RDDMultipleTextOutputFormat[Int, Int]].toString, classOf[Int].toString, 
classOf[Int].toString, null, From 56c2d39c97e86b691758b0490f7a019b0e396983 Mon Sep 17 00:00:00 2001 From: Ilya Ganelin Date: Wed, 18 Mar 2015 10:32:51 -0700 Subject: [PATCH 24/25] Removing python test and wrapper --- .../apache/spark/api/python/PythonRDD.scala | 42 +-------------- .../spark/api/python/PythonRDDSuite.scala | 52 +------------------ 2 files changed, 3 insertions(+), 91 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala index ea3faba240321..f43778498a805 100644 --- a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala +++ b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala @@ -19,14 +19,13 @@ package org.apache.spark.api.python import java.io._ import java.net._ -import java.util.{Collections, ArrayList => JArrayList, List => JList, Map => JMap, UUID} +import java.util.{Collections, ArrayList => JArrayList, List => JList, Map => JMap} import scala.collection.JavaConversions._ import scala.collection.mutable import scala.language.existentials import com.google.common.base.Charsets.UTF_8 -import org.apache.commons.lang.ClassUtils import org.apache.hadoop.conf.Configuration import org.apache.hadoop.io.compress.CompressionCodec import org.apache.hadoop.mapred.{InputFormat, JobConf, OutputFormat} @@ -36,7 +35,7 @@ import org.apache.spark._ import org.apache.spark.api.java.{JavaPairRDD, JavaRDD, JavaSparkContext} import org.apache.spark.broadcast.Broadcast import org.apache.spark.input.PortableDataStream -import org.apache.spark.rdd.{RDD, RDDMultipleTextOutputFormat} +import org.apache.spark.rdd.RDD import org.apache.spark.util.Utils import scala.util.control.NonFatal @@ -724,43 +723,6 @@ private[spark] object PythonRDD extends Logging { converted.saveAsHadoopFile(path, kc, vc, fc, new JobConf(mergedConf), codec=codec) } - /** - * Output a Python RDD of key-value pairs to any Hadoop file system such that the values within - * the rdd are written to sub-directories organized by the associated key. - * - * Keys and values are converted to suitable output types using either user specified converters - * or, if not specified, [[org.apache.spark.api.python.JavaToWritableConverter]]. Post-conversion - * types `keyClass` and `valueClass` are automatically inferred if not specified. The passed-in - * `confAsMap` is merged with the default Hadoop conf associated with the SparkContext of - * this RDD. 
- */ - def saveAsHadoopFileByKey[K, V, C <: CompressionCodec]( - pyRDD: JavaRDD[Array[Byte]], - batchSerialized: Boolean, - path: String, - outputFormatClass: String, - keyClass: String, - valueClass: String, - keyConverterClass: String, - valueConverterClass: String, - confAsMap: java.util.HashMap[String, String], - compressionCodecClass: String) = { - val rdd = SerDeUtil.pythonToPairRDD(pyRDD, batchSerialized) - val (kc, vc) = getKeyValueTypes(keyClass, valueClass).getOrElse( - inferKeyValueTypes(rdd, keyConverterClass, valueConverterClass)) - val mergedConf = getMergedConf(confAsMap, pyRDD.context.hadoopConfiguration) - val codec = Option(compressionCodecClass).map(Utils.classForName(_).asInstanceOf[Class[C]]) - val converted = convertRDD(rdd, keyConverterClass, valueConverterClass, - new JavaToWritableConverter) - - converted.saveAsHadoopFile(path, - ClassUtils.primitiveToWrapper(kc), - ClassUtils.primitiveToWrapper(vc), - classOf[RDDMultipleTextOutputFormat[K,V]], - new JobConf(mergedConf), - codec=codec) - } - /** * Output a Python RDD of key-value pairs to any Hadoop file system, using new Hadoop * `OutputFormat` in mapreduce package. Keys and values are converted to suitable output diff --git a/core/src/test/scala/org/apache/spark/api/python/PythonRDDSuite.scala b/core/src/test/scala/org/apache/spark/api/python/PythonRDDSuite.scala index 7024bbc89d803..0b9ffd58cb5b9 100644 --- a/core/src/test/scala/org/apache/spark/api/python/PythonRDDSuite.scala +++ b/core/src/test/scala/org/apache/spark/api/python/PythonRDDSuite.scala @@ -17,17 +17,11 @@ package org.apache.spark.api.python -import java.io.{BufferedReader, ByteArrayOutputStream, DataOutputStream, InputStreamReader} +import java.io.{ByteArrayOutputStream, DataOutputStream} -import org.apache.hadoop.conf.Configuration -import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.spark.SharedSparkContext -import org.apache.spark.api.java.JavaRDD -import org.apache.spark.rdd.RDDMultipleTextOutputFormat import org.scalatest.FunSuite -import scala.collection.mutable.HashSet - class PythonRDDSuite extends FunSuite with SharedSparkContext{ test("Writing large strings to the worker") { @@ -48,48 +42,4 @@ class PythonRDDSuite extends FunSuite with SharedSparkContext{ PythonRDD.writeIteratorToStream( Iterator((null, null), ("a".getBytes, null), (null, "b".getBytes)), buffer) } - - test("saveAsHadoopFileByKey should generate a text file per key") { - val testPairs : JavaRDD[Array[Byte]] = sc.parallelize( - Seq( - Array(1.toByte,1.toByte), - Array(2.toByte,4.toByte), - Array(3.toByte,9.toByte), - Array(4.toByte,16.toByte), - Array(5.toByte,25.toByte)) - ).toJavaRDD() - - val fs = FileSystem.get(new Configuration()) - val basePath = sc.conf.get("spark.local.dir", "/tmp") - val fullPath = basePath + "/testPath" - fs.delete(new Path(fullPath), true) - - PythonRDD.saveAsHadoopFileByKey( - testPairs, - false, - fullPath, - classOf[RDDMultipleTextOutputFormat[Int, Int]].toString, - classOf[Int].toString, - classOf[Int].toString, - null, - null, - new java.util.HashMap(), "") - - // Test that a file was created for each key - (1 to 5).foreach(key => { - val testPath = new Path(fullPath + "/" + key) - assert(fs.exists(testPath)) - - // Read the file and test that the contents are the values matching that key split by line - val input = fs.open(testPath) - val reader = new BufferedReader(new InputStreamReader(input)) - val values = new HashSet[Int] - val lines = Stream.continually(reader.readLine()).takeWhile(_ != null) - lines.foreach(s => values 
+= s.toInt) - - assert(values.contains(key*key)) - }) - - fs.delete(new Path(fullPath), true) - } } From c442665a9ea62de0c96594c3af3d5d56dd025f2e Mon Sep 17 00:00:00 2001 From: Ilya Ganelin Date: Wed, 18 Mar 2015 10:36:01 -0700 Subject: [PATCH 25/25] Removing more python changes --- core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala | 2 +- .../test/scala/org/apache/spark/api/python/PythonRDDSuite.scala | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala index f43778498a805..b885bd42916bb 100644 --- a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala +++ b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala @@ -797,7 +797,7 @@ private class PythonAccumulatorParam(@transient serverHost: String, serverPort: /** * We try to reuse a single Socket to transfer accumulator updates, as they are all added * by the DAGScheduler's single-threaded actor anyway. - */ + */ @transient var socket: Socket = _ def openSocket(): Socket = synchronized { diff --git a/core/src/test/scala/org/apache/spark/api/python/PythonRDDSuite.scala b/core/src/test/scala/org/apache/spark/api/python/PythonRDDSuite.scala index 0b9ffd58cb5b9..b7c7195e0ca9f 100644 --- a/core/src/test/scala/org/apache/spark/api/python/PythonRDDSuite.scala +++ b/core/src/test/scala/org/apache/spark/api/python/PythonRDDSuite.scala @@ -22,7 +22,7 @@ import java.io.{ByteArrayOutputStream, DataOutputStream} import org.apache.spark.SharedSparkContext import org.scalatest.FunSuite -class PythonRDDSuite extends FunSuite with SharedSparkContext{ +class PythonRDDSuite extends FunSuite { test("Writing large strings to the worker") { val input: List[String] = List("a"*100000)