remove "sbt/" prefix in $JAR variable make the script pass if test #45

Closed
wants to merge 20 commits

Changes from all commits (20 commits)
e68150b
remove "sbt/" prefix in $JAR variable make the script pass if test
invkrh Feb 27, 2014
d61cf82
Include reference to twitter/chill in tuning docs
ash211 Feb 25, 2014
ac3ad39
Graph primitives2
semihsalihoglu Feb 25, 2014
eb836d1
For SPARK-1082, Use Curator for ZK interaction in standalone cluster
colorant Feb 25, 2014
f4032d0
SPARK-1053. Don't require SPARK_YARN_APP_JAR
sryza Feb 26, 2014
69c6f0f
SPARK-1078: Replace lift-json with json4s-jackson.
willb Feb 26, 2014
214e7e7
SPARK-1135: fix broken anchors in docs
mateiz Feb 26, 2014
b17a079
SPARK-1115: Catch depickling errors
bouk Feb 26, 2014
d95f63e
Removed reference to incubation in README.md.
rxin Feb 27, 2014
de86209
Deprecated and added a few java api methods for corresponding scala api.
ScrapCodes Feb 27, 2014
d7b949f
Updated link for pyspark examples in docs
jyotiska Feb 27, 2014
6c5275b
Remove references to ClusterScheduler (SPARK-1140)
kayousterhout Feb 27, 2014
7223128
SPARK-1129: use a predefined seed when seed is zero in XORShiftRandom
mengxr Feb 27, 2014
c3cec83
SPARK-1121 Only add avro if the build is for Hadoop 0.23.X and SPARK_…
ScrapCodes Feb 27, 2014
a10de71
[SPARK-1089] fix the regression problem on ADD_JARS in 0.9
CodingCat Feb 27, 2014
ae19dbf
Show Master status on UI page
colorant Feb 27, 2014
a8747d4
SPARK 1084.1 (resubmitted)
srowen Feb 27, 2014
af504c1
[HOTFIX] Patching maven build after #6 (SPARK-1121).
pwendell Feb 27, 2014
bd4df05
Removed reference to incubation in Spark user docs.
rxin Feb 28, 2014
4128c53
Remote BlockFetchTracker trait
kayousterhout Feb 28, 2014
17 changes: 3 additions & 14 deletions README.md
@@ -1,12 +1,12 @@
# Apache Spark

Lightning-Fast Cluster Computing - <http://spark.incubator.apache.org/>
Lightning-Fast Cluster Computing - <http://spark.apache.org/>


## Online Documentation

You can find the latest Spark documentation, including a programming
guide, on the project webpage at <http://spark.incubator.apache.org/documentation.html>.
guide, on the project webpage at <http://spark.apache.org/documentation.html>.
This README file only contains basic setup instructions.


@@ -92,21 +92,10 @@ If your project is built with Maven, add this to your POM file's `<dependencies>

## Configuration

Please refer to the [Configuration guide](http://spark.incubator.apache.org/docs/latest/configuration.html)
Please refer to the [Configuration guide](http://spark.apache.org/docs/latest/configuration.html)
in the online documentation for an overview on how to configure Spark.


## Apache Incubator Notice

Apache Spark is an effort undergoing incubation at The Apache Software
Foundation (ASF), sponsored by the Apache Incubator. Incubation is required of
all newly accepted projects until a further review indicates that the
infrastructure, communications, and decision making process have stabilized in
a manner consistent with other successful ASF projects. While incubation status
is not necessarily a reflection of the completeness or stability of the code,
it does indicate that the project has yet to be fully endorsed by the ASF.


## Contributing to Spark

Contributions via GitHub pull requests are gladly accepted from their original
14 changes: 7 additions & 7 deletions bagel/src/main/scala/org/apache/spark/bagel/Bagel.scala
@@ -27,7 +27,7 @@ object Bagel extends Logging {

/**
* Runs a Bagel program.
* @param sc [[org.apache.spark.SparkContext]] to use for the program.
* @param sc org.apache.spark.SparkContext to use for the program.
* @param vertices vertices of the graph represented as an RDD of (Key, Vertex) pairs. Often the
* Key will be the vertex id.
* @param messages initial set of messages represented as an RDD of (Key, Message) pairs. Often
@@ -38,10 +38,10 @@ object Bagel extends Logging {
* @param aggregator [[org.apache.spark.bagel.Aggregator]] performs a reduce across all vertices
* after each superstep and provides the result to each vertex in the next
* superstep.
* @param partitioner [[org.apache.spark.Partitioner]] partitions values by key
* @param partitioner org.apache.spark.Partitioner partitions values by key
* @param numPartitions number of partitions across which to split the graph.
* Default is the default parallelism of the SparkContext
* @param storageLevel [[org.apache.spark.storage.StorageLevel]] to use for caching of
* @param storageLevel org.apache.spark.storage.StorageLevel to use for caching of
* intermediate RDDs in each superstep. Defaults to caching in memory.
* @param compute function that takes a Vertex, optional set of (possibly combined) messages to
* the Vertex, optional Aggregator and the current superstep,
@@ -131,7 +131,7 @@ object Bagel extends Logging {

/**
* Runs a Bagel program with no [[org.apache.spark.bagel.Aggregator]], default
* [[org.apache.spark.HashPartitioner]] and default storage level
* org.apache.spark.HashPartitioner and default storage level
*/
def run[K: Manifest, V <: Vertex : Manifest, M <: Message[K] : Manifest, C: Manifest](
sc: SparkContext,
@@ -146,7 +146,7 @@ object Bagel extends Logging {

/**
* Runs a Bagel program with no [[org.apache.spark.bagel.Aggregator]] and the
* default [[org.apache.spark.HashPartitioner]]
* default org.apache.spark.HashPartitioner
*/
def run[K: Manifest, V <: Vertex : Manifest, M <: Message[K] : Manifest, C: Manifest](
sc: SparkContext,
@@ -166,7 +166,7 @@ object Bagel extends Logging {

/**
* Runs a Bagel program with no [[org.apache.spark.bagel.Aggregator]],
* default [[org.apache.spark.HashPartitioner]],
* default org.apache.spark.HashPartitioner,
* [[org.apache.spark.bagel.DefaultCombiner]] and the default storage level
*/
def run[K: Manifest, V <: Vertex : Manifest, M <: Message[K] : Manifest](
@@ -180,7 +180,7 @@ object Bagel extends Logging {

/**
* Runs a Bagel program with no [[org.apache.spark.bagel.Aggregator]],
* the default [[org.apache.spark.HashPartitioner]]
* the default org.apache.spark.HashPartitioner
* and [[org.apache.spark.bagel.DefaultCombiner]]
*/
def run[K: Manifest, V <: Vertex : Manifest, M <: Message[K] : Manifest](
21 changes: 7 additions & 14 deletions core/pom.xml
@@ -47,16 +47,8 @@
</exclusions>
</dependency>
<dependency>
<groupId>org.apache.avro</groupId>
<artifactId>avro</artifactId>
</dependency>
<dependency>
<groupId>org.apache.avro</groupId>
<artifactId>avro-ipc</artifactId>
</dependency>
<dependency>
<groupId>org.apache.zookeeper</groupId>
<artifactId>zookeeper</artifactId>
<groupId>org.apache.curator</groupId>
<artifactId>curator-recipes</artifactId>
</dependency>
<dependency>
<groupId>org.eclipse.jetty</groupId>
@@ -130,8 +122,9 @@
<artifactId>scala-library</artifactId>
</dependency>
<dependency>
<groupId>net.liftweb</groupId>
<artifactId>lift-json_${scala.binary.version}</artifactId>
<groupId>org.json4s</groupId>
<artifactId>json4s-jackson_${scala.binary.version}</artifactId>
<version>3.2.6</version>
</dependency>
<dependency>
<groupId>it.unimi.dsi</groupId>
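
An aside for sbt users (not part of this patch): the Maven change above corresponds roughly to the coordinate below; `%%` appends the Scala binary version, matching `json4s-jackson_${scala.binary.version}` in the POM.

```scala
// Hypothetical sbt equivalent of the Maven dependency swap above:
// drop lift-json, add json4s-jackson at the version pinned in the POM.
libraryDependencies += "org.json4s" %% "json4s-jackson" % "3.2.6"
```
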
@@ -224,7 +217,7 @@
</goals>
<configuration>
<exportAntProperties>true</exportAntProperties>
<tasks>
<target>
<property name="spark.classpath" refid="maven.test.classpath" />
<property environment="env" />
<fail message="Please set the SCALA_HOME (or SCALA_LIBRARY_PATH if scala is on the path) environment variables and retry.">
@@ -237,7 +230,7 @@
</not>
</condition>
</fail>
</tasks>
</target>
</configuration>
</execution>
</executions>
5 changes: 3 additions & 2 deletions core/src/main/scala/org/apache/spark/SparkContext.scala
@@ -240,6 +240,7 @@ class SparkContext(
localProperties.set(props)
}

@deprecated("Properties no longer need to be explicitly initialized.", "1.0.0")
def initLocalProperties() {
localProperties.set(new Properties())
}
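
Since `initLocalProperties()` is now deprecated, callers can simply set thread-local properties directly. A minimal sketch, assuming a local master and a made-up pool name:

```scala
import org.apache.spark.{SparkConf, SparkContext}

val sc = new SparkContext(new SparkConf().setMaster("local[2]").setAppName("local-props-sketch"))
// sc.initLocalProperties()  // deprecated in 1.0.0 and no longer required
sc.setLocalProperty("spark.scheduler.pool", "reporting")   // pool name is invented
println(sc.getLocalProperty("spark.scheduler.pool"))       // "reporting"
```
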
@@ -308,7 +309,7 @@ class SparkContext(
private val dagSchedulerSource = new DAGSchedulerSource(this.dagScheduler, this)
private val blockManagerSource = new BlockManagerSource(SparkEnv.get.blockManager, this)

def initDriverMetrics() {
private def initDriverMetrics() {
SparkEnv.get.metricsSystem.registerSource(dagSchedulerSource)
SparkEnv.get.metricsSystem.registerSource(blockManagerSource)
}
@@ -350,7 +351,7 @@ class SparkContext(
* using the older MapReduce API (`org.apache.hadoop.mapred`).
*
* @param conf JobConf for setting up the dataset
* @param inputFormatClass Class of the [[InputFormat]]
* @param inputFormatClass Class of the InputFormat
* @param keyClass Class of the keys
* @param valueClass Class of the values
* @param minSplits Minimum number of Hadoop Splits to generate.
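
For context, a minimal sketch of a `hadoopRDD` call with the parameters documented above; the cluster address, input path, and split count are placeholders:

```scala
import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.{LongWritable, Text}
import org.apache.hadoop.mapred.{FileInputFormat, JobConf, TextInputFormat}
import org.apache.spark.SparkContext

val sc = new SparkContext("local[2]", "hadoopRDD-sketch")
val jobConf = new JobConf()
FileInputFormat.setInputPaths(jobConf, new Path("hdfs://namenode:8020/data/input"))
// inputFormatClass, keyClass and valueClass select how records are read and typed.
val records = sc.hadoopRDD(jobConf, classOf[TextInputFormat],
  classOf[LongWritable], classOf[Text], minSplits = 4)
println(records.count())
```
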
2 changes: 2 additions & 0 deletions core/src/main/scala/org/apache/spark/api/java/JavaRDD.scala
@@ -126,6 +126,8 @@ class JavaRDD[T](val rdd: RDD[T])(implicit val classTag: ClassTag[T])
def subtract(other: JavaRDD[T], p: Partitioner): JavaRDD[T] =
wrapRDD(rdd.subtract(other, p))

def generator: String = rdd.generator

override def toString = rdd.toString

/** Assign a name to this RDD */
@@ -74,7 +74,7 @@ trait JavaRDDLike[T, This <: JavaRDDLike[T, This]] extends Serializable {
* of the original partition.
*/
def mapPartitionsWithIndex[R: ClassTag](
f: JFunction2[Int, java.util.Iterator[T], java.util.Iterator[R]],
f: JFunction2[java.lang.Integer, java.util.Iterator[T], java.util.Iterator[R]],
preservesPartitioning: Boolean = false): JavaRDD[R] =
new JavaRDD(rdd.mapPartitionsWithIndex(((a,b) => f(a,asJavaIterator(b))),
preservesPartitioning))
@@ -17,6 +17,7 @@

package org.apache.spark.api.java

import java.util
import java.util.{Map => JMap}

import scala.collection.JavaConversions
@@ -92,6 +93,24 @@ class JavaSparkContext(val sc: SparkContext) extends JavaSparkContextVarargsWork

private[spark] val env = sc.env

def isLocal: java.lang.Boolean = sc.isLocal

def sparkUser: String = sc.sparkUser

def master: String = sc.master

def appName: String = sc.appName

def jars: util.List[String] = sc.jars

def startTime: java.lang.Long = sc.startTime

/** Default level of parallelism to use when not given by user (e.g. parallelize and makeRDD). */
def defaultParallelism: java.lang.Integer = sc.defaultParallelism

/** Default min number of partitions for Hadoop RDDs when not given by user */
def defaultMinSplits: java.lang.Integer = sc.defaultMinSplits

/** Distribute a local Scala collection to form an RDD. */
def parallelize[T](list: java.util.List[T], numSlices: Int): JavaRDD[T] = {
implicit val ctag: ClassTag[T] = fakeClassTag
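
A small sketch (run from Scala, with an invented app name) exercising the Java-friendly accessors added above:

```scala
import org.apache.spark.api.java.JavaSparkContext

val jsc = new JavaSparkContext("local[2]", "accessor-sketch")
println(jsc.master)               // "local[2]"
println(jsc.appName)              // "accessor-sketch"
println(jsc.defaultParallelism)   // boxed java.lang.Integer
println(jsc.sparkUser)            // user running the sketch
jsc.stop()
```
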
@@ -27,7 +27,8 @@ import scala.concurrent.ExecutionContext.Implicits.global
import scala.concurrent.duration._
import scala.sys.process._

import net.liftweb.json.JsonParser
import org.json4s._
import org.json4s.jackson.JsonMethods

import org.apache.spark.{Logging, SparkContext}
import org.apache.spark.deploy.master.RecoveryState
@@ -311,7 +312,7 @@ private[spark] object FaultToleranceTest extends App with Logging {
private[spark] class TestMasterInfo(val ip: String, val dockerId: DockerId, val logFile: File)
extends Logging {

implicit val formats = net.liftweb.json.DefaultFormats
implicit val formats = org.json4s.DefaultFormats
var state: RecoveryState.Value = _
var liveWorkerIPs: List[String] = _
var numLiveApps = 0
@@ -321,7 +322,7 @@ private[spark] class TestMasterInfo(val ip: String, val dockerId: DockerId, val
def readState() {
try {
val masterStream = new InputStreamReader(new URL("http://%s:8080/json".format(ip)).openStream)
val json = JsonParser.parse(masterStream, closeAutomatically = true)
val json = JsonMethods.parse(masterStream)

val workers = json \ "workers"
val liveWorkers = workers.children.filter(w => (w \ "state").extract[String] == "ALIVE")
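
A self-contained sketch of the json4s calls used in `readState()` above, with an invented JSON payload standing in for the master's `/json` endpoint:

```scala
import org.json4s._
import org.json4s.jackson.JsonMethods

implicit val formats = DefaultFormats
val json = JsonMethods.parse("""{"workers": [{"state": "ALIVE"}, {"state": "DEAD"}]}""")
val liveWorkers = (json \ "workers").children.filter(w => (w \ "state").extract[String] == "ALIVE")
println(liveWorkers.size)  // 1
```
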
@@ -349,7 +350,7 @@ private[spark] class TestMasterInfo(val ip: String, val dockerId: DockerId, val
private[spark] class TestWorkerInfo(val ip: String, val dockerId: DockerId, val logFile: File)
extends Logging {

implicit val formats = net.liftweb.json.DefaultFormats
implicit val formats = org.json4s.DefaultFormats

logDebug("Created worker: " + this)

@@ -17,7 +17,7 @@

package org.apache.spark.deploy

import net.liftweb.json.JsonDSL._
import org.json4s.JsonDSL._

import org.apache.spark.deploy.DeployMessages.{MasterStateResponse, WorkerStateResponse}
import org.apache.spark.deploy.master.{ApplicationInfo, DriverInfo, WorkerInfo}
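
For reference, a tiny sketch (field names invented) of what the `JsonDSL` import provides: `~` chains key/value pairs into a JSON object.

```scala
import org.json4s.JsonDSL._
import org.json4s.jackson.JsonMethods

val workerJson = ("id" -> "worker-20140228-0001") ~ ("state" -> "ALIVE") ~ ("cores" -> 8)
println(JsonMethods.compact(JsonMethods.render(workerJson)))
// {"id":"worker-20140228-0001","state":"ALIVE","cores":8}
```
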
@@ -30,6 +30,7 @@ import org.apache.spark.deploy.master.MasterMessages.ElectedLeader
* [[org.apache.spark.deploy.master.MasterMessages.RevokedLeadership RevokedLeadership]]
*/
private[spark] trait LeaderElectionAgent extends Actor {
//TODO: LeaderElectionAgent does not necessary to be an Actor anymore, need refactoring.
val masterActor: ActorRef
}
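
A hypothetical no-op implementation sketch, assuming only the trait shape shown above: a single-master deployment can declare itself elected immediately and ignore every other message. The class name is invented.

```scala
package org.apache.spark.deploy.master

import akka.actor.ActorRef
import org.apache.spark.deploy.master.MasterMessages.ElectedLeader

private[spark] class SingleMasterLeaderAgent(val masterActor: ActorRef)
  extends LeaderElectionAgent {

  // With no ZooKeeper in the picture, leadership never changes hands.
  override def preStart() {
    masterActor ! ElectedLeader
  }

  override def receive = {
    case _ => // ignore everything else
  }
}
```
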

@@ -28,10 +28,6 @@ private[master] object MasterMessages {

case object RevokedLeadership

// Actor System to LeaderElectionAgent

case object CheckLeader

// Actor System to Master

case object CheckForWorkerTimeOut
@@ -0,0 +1,53 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.deploy.master

import org.apache.spark.{SparkConf, Logging}
import org.apache.curator.framework.{CuratorFramework, CuratorFrameworkFactory}
import org.apache.curator.retry.ExponentialBackoffRetry
import org.apache.zookeeper.KeeperException


object SparkCuratorUtil extends Logging {

val ZK_CONNECTION_TIMEOUT_MILLIS = 15000
val ZK_SESSION_TIMEOUT_MILLIS = 60000
val RETRY_WAIT_MILLIS = 5000
val MAX_RECONNECT_ATTEMPTS = 3

def newClient(conf: SparkConf): CuratorFramework = {
val ZK_URL = conf.get("spark.deploy.zookeeper.url")
val zk = CuratorFrameworkFactory.newClient(ZK_URL,
ZK_SESSION_TIMEOUT_MILLIS, ZK_CONNECTION_TIMEOUT_MILLIS,
new ExponentialBackoffRetry(RETRY_WAIT_MILLIS, MAX_RECONNECT_ATTEMPTS))
zk.start()
zk
}

def mkdir(zk: CuratorFramework, path: String) {
if (zk.checkExists().forPath(path) == null) {
try {
zk.create().creatingParentsIfNeeded().forPath(path)
} catch {
case nodeExist: KeeperException.NodeExistsException =>
// do nothing, ignore node existing exception.
case e: Exception => throw e
}
}
}
}
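
A hypothetical caller of the new helper; the ZooKeeper connection string and znode path are placeholders:

```scala
import org.apache.spark.SparkConf
import org.apache.spark.deploy.master.SparkCuratorUtil

val conf = new SparkConf().set("spark.deploy.zookeeper.url", "zk1:2181,zk2:2181")
val zk = SparkCuratorUtil.newClient(conf)       // connects with the retry policy defined above
SparkCuratorUtil.mkdir(zk, "/spark/master_status")
zk.close()
```
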