
Commit

Merge branch 'master' of github.com:apache/spark into configure-ports
andrewor14 committed Aug 6, 2014
2 parents 8a6b820 + 82624e2 commit 621267b
Showing 22 changed files with 377 additions and 174 deletions.
2 changes: 1 addition & 1 deletion core/pom.xml
@@ -150,7 +150,7 @@
<dependency>
<groupId>org.json4s</groupId>
<artifactId>json4s-jackson_${scala.binary.version}</artifactId>
- <version>3.2.6</version>
+ <version>3.2.10</version>
</dependency>
<dependency>
<groupId>colt</groupId>
13 changes: 10 additions & 3 deletions core/src/main/scala/org/apache/spark/rdd/JdbcRDD.scala
@@ -96,17 +96,23 @@ class JdbcRDD[T: ClassTag](

override def close() {
try {
- if (null != rs && ! rs.isClosed()) rs.close()
+ if (null != rs && ! rs.isClosed()) {
+   rs.close()
+ }
} catch {
case e: Exception => logWarning("Exception closing resultset", e)
}
try {
- if (null != stmt && ! stmt.isClosed()) stmt.close()
+ if (null != stmt && ! stmt.isClosed()) {
+   stmt.close()
+ }
} catch {
case e: Exception => logWarning("Exception closing statement", e)
}
try {
- if (null != conn && ! stmt.isClosed()) conn.close()
+ if (null != conn && ! conn.isClosed()) {
+   conn.close()
+ }
logInfo("closed connection")
} catch {
case e: Exception => logWarning("Exception closing connection", e)
@@ -120,3 +126,4 @@ object JdbcRDD {
Array.tabulate[Object](rs.getMetaData.getColumnCount)(i => rs.getObject(i + 1))
}
}
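The change above does more than add braces: the last guard previously re-checked stmt.isClosed() before closing the connection, so the connection could be left open (or closing it could fail) once the statement was closed. Below is a minimal standalone sketch of the corrected pattern in plain JDBC, not Spark code; the closeQuietly name and its error printing are illustrative only. Each resource is guarded by its own isClosed() check and failures are swallowed per resource, as in the fixed close() method.

import java.sql.{Connection, ResultSet, Statement}

// Illustrative helper (not part of JdbcRDD): close each JDBC resource
// independently, guarding on that resource's own isClosed() state, so a
// failure while closing one resource never leaks the others.
def closeQuietly(rs: ResultSet, stmt: Statement, conn: Connection): Unit = {
  try {
    if (rs != null && !rs.isClosed()) rs.close()
  } catch { case e: Exception => Console.err.println(s"Exception closing resultset: $e") }
  try {
    if (stmt != null && !stmt.isClosed()) stmt.close()
  } catch { case e: Exception => Console.err.println(s"Exception closing statement: $e") }
  try {
    // The pre-fix code tested stmt.isClosed() here; the connection must check itself.
    if (conn != null && !conn.isClosed()) conn.close()
  } catch { case e: Exception => Console.err.println(s"Exception closing connection: $e") }
}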

@@ -40,7 +40,7 @@ private[spark] class SortShuffleWriter[K, V, C](
private val ser = Serializer.getSerializer(dep.serializer.orNull)

private val conf = SparkEnv.get.conf
- private val fileBufferSize = conf.getInt("spark.shuffle.file.buffer.kb", 100) * 1024
+ private val fileBufferSize = conf.getInt("spark.shuffle.file.buffer.kb", 32) * 1024

private var sorter: ExternalSorter[K, V, _] = null
private var outputFile: File = null
@@ -73,7 +73,7 @@ class ShuffleBlockManager(blockManager: BlockManager) extends Logging {
val sortBasedShuffle =
conf.get("spark.shuffle.manager", "") == classOf[SortShuffleManager].getName

- private val bufferSize = conf.getInt("spark.shuffle.file.buffer.kb", 100) * 1024
+ private val bufferSize = conf.getInt("spark.shuffle.file.buffer.kb", 32) * 1024

/**
* Contains all the state related to a particular shuffle. This includes a pool of unused
@@ -101,7 +101,7 @@ class ExternalAppendOnlyMap[K, V, C](
private var _memoryBytesSpilled = 0L
private var _diskBytesSpilled = 0L

- private val fileBufferSize = sparkConf.getInt("spark.shuffle.file.buffer.kb", 100) * 1024
+ private val fileBufferSize = sparkConf.getInt("spark.shuffle.file.buffer.kb", 32) * 1024
private val keyComparator = new HashComparator[K]
private val ser = serializer.newInstance()

@@ -84,7 +84,7 @@ private[spark] class ExternalSorter[K, V, C](

private val conf = SparkEnv.get.conf
private val spillingEnabled = conf.getBoolean("spark.shuffle.spill", true)
- private val fileBufferSize = conf.getInt("spark.shuffle.file.buffer.kb", 100) * 1024
+ private val fileBufferSize = conf.getInt("spark.shuffle.file.buffer.kb", 32) * 1024

// Size of object batches when reading/writing from serializers.
//
2 changes: 1 addition & 1 deletion docs/configuration.md
@@ -266,7 +266,7 @@ Apart from these, the following properties are also available, and may be useful
</tr>
<tr>
<td><code>spark.shuffle.file.buffer.kb</code></td>
- <td>100</td>
+ <td>32</td>
<td>
Size of the in-memory buffer for each shuffle file output stream, in kilobytes. These buffers
reduce the number of disk seeks and system calls made in creating intermediate shuffle files.
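The documented default drops from 100 KB to 32 KB per shuffle file stream, matching the code changes above. A job that prefers the old behaviour can raise the buffer when building its SparkConf; here is a minimal sketch assuming a local Spark 1.x application (the app name and RDD contents are placeholders):

import org.apache.spark.{SparkConf, SparkContext}

// Hypothetical application: restore the pre-change 100 KB shuffle buffer.
val conf = new SparkConf()
  .setAppName("shuffle-buffer-example")        // placeholder app name
  .setMaster("local[2]")
  .set("spark.shuffle.file.buffer.kb", "100")
val sc = new SparkContext(conf)

// Any shuffle (reduceByKey here) now writes through 100 KB per-file buffers.
val counts = sc.parallelize(1 to 1000).map(i => (i % 10, 1)).reduceByKey(_ + _)
println(counts.collect().mkString(", "))
sc.stop()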
@@ -271,6 +271,7 @@ class PythonMLLibAPI extends Serializable {
.setNumIterations(numIterations)
.setRegParam(regParam)
.setStepSize(stepSize)
+ .setMiniBatchFraction(miniBatchFraction)
if (regType == "l2") {
lrAlg.optimizer.setUpdater(new SquaredL2Updater)
} else if (regType == "l1") {
@@ -341,16 +342,27 @@
stepSize: Double,
regParam: Double,
miniBatchFraction: Double,
- initialWeightsBA: Array[Byte]): java.util.List[java.lang.Object] = {
+ initialWeightsBA: Array[Byte],
+ regType: String,
+ intercept: Boolean): java.util.List[java.lang.Object] = {
+ val SVMAlg = new SVMWithSGD()
+ SVMAlg.setIntercept(intercept)
+ SVMAlg.optimizer
+   .setNumIterations(numIterations)
+   .setRegParam(regParam)
+   .setStepSize(stepSize)
+   .setMiniBatchFraction(miniBatchFraction)
+ if (regType == "l2") {
+   SVMAlg.optimizer.setUpdater(new SquaredL2Updater)
+ } else if (regType == "l1") {
+   SVMAlg.optimizer.setUpdater(new L1Updater)
+ } else if (regType != "none") {
+   throw new java.lang.IllegalArgumentException("Invalid value for 'regType' parameter."
+     + " Can only be initialized using the following string values: [l1, l2, none].")
+ }
trainRegressionModel(
(data, initialWeights) =>
- SVMWithSGD.train(
-   data,
-   numIterations,
-   stepSize,
-   regParam,
-   miniBatchFraction,
-   initialWeights),
+ SVMAlg.run(data, initialWeights),
dataBytesJRDD,
initialWeightsBA)
}
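The rewritten wrapper stops going through the static SVMWithSGD.train helper, which cannot express a regularization type or an intercept, and instead configures an algorithm instance directly. A rough standalone equivalent in Scala, assuming a Spark 1.x MLlib setup (the trainSvm name and the numeric parameter values are illustrative; the wrapper passes numIterations, stepSize, regParam, and miniBatchFraction through from Python):

import org.apache.spark.mllib.classification.{SVMModel, SVMWithSGD}
import org.apache.spark.mllib.optimization.{L1Updater, SquaredL2Updater}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD

// Illustrative sketch of the wrapper's logic: build the algorithm object,
// pick the updater from the regType string, then run on the training RDD.
def trainSvm(data: RDD[LabeledPoint], regType: String, intercept: Boolean): SVMModel = {
  val alg = new SVMWithSGD()
  alg.setIntercept(intercept)
  alg.optimizer
    .setNumIterations(100)   // illustrative values; in the wrapper these
    .setStepSize(1.0)        // come from the Python caller
    .setRegParam(0.01)
    .setMiniBatchFraction(1.0)
  regType match {
    case "l2"   => alg.optimizer.setUpdater(new SquaredL2Updater)
    case "l1"   => alg.optimizer.setUpdater(new L1Updater)
    case "none" =>           // keep the default (unregularized) updater
    case other  =>
      throw new IllegalArgumentException(s"Invalid value for 'regType': $other")
  }
  alg.run(data)
}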
@@ -363,15 +375,28 @@ class PythonMLLibAPI extends Serializable {
numIterations: Int,
stepSize: Double,
miniBatchFraction: Double,
- initialWeightsBA: Array[Byte]): java.util.List[java.lang.Object] = {
+ initialWeightsBA: Array[Byte],
+ regParam: Double,
+ regType: String,
+ intercept: Boolean): java.util.List[java.lang.Object] = {
+ val LogRegAlg = new LogisticRegressionWithSGD()
+ LogRegAlg.setIntercept(intercept)
+ LogRegAlg.optimizer
+   .setNumIterations(numIterations)
+   .setRegParam(regParam)
+   .setStepSize(stepSize)
+   .setMiniBatchFraction(miniBatchFraction)
+ if (regType == "l2") {
+   LogRegAlg.optimizer.setUpdater(new SquaredL2Updater)
+ } else if (regType == "l1") {
+   LogRegAlg.optimizer.setUpdater(new L1Updater)
+ } else if (regType != "none") {
+   throw new java.lang.IllegalArgumentException("Invalid value for 'regType' parameter."
+     + " Can only be initialized using the following string values: [l1, l2, none].")
+ }
trainRegressionModel(
(data, initialWeights) =>
- LogisticRegressionWithSGD.train(
-   data,
-   numIterations,
-   stepSize,
-   miniBatchFraction,
-   initialWeights),
+ LogRegAlg.run(data, initialWeights),
dataBytesJRDD,
initialWeightsBA)
}
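The same pattern applies to logistic regression: the companion object's train helpers fix the updater and intercept, which is why the wrapper now builds an instance and calls run. A brief sketch contrasting the two forms, with Spark 1.x MLlib assumed and the trainPlain/trainWithL1 names and numeric values used only for illustration:

import org.apache.spark.mllib.classification.{LogisticRegressionModel, LogisticRegressionWithSGD}
import org.apache.spark.mllib.optimization.L1Updater
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD

// Static helper: defaults only, no hook for the updater or intercept.
def trainPlain(data: RDD[LabeledPoint]): LogisticRegressionModel =
  LogisticRegressionWithSGD.train(data, 100)

// Instance form, as the wrapper now uses: intercept plus L1 regularization.
def trainWithL1(data: RDD[LabeledPoint]): LogisticRegressionModel = {
  val alg = new LogisticRegressionWithSGD()
  alg.setIntercept(true)
  alg.optimizer
    .setNumIterations(100)
    .setStepSize(1.0)
    .setRegParam(0.1)
    .setMiniBatchFraction(1.0)
    .setUpdater(new L1Updater)
  alg.run(data)
}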
