From 4ded9033f989c0b247a0b5d3b0606b2394d720ab Mon Sep 17 00:00:00 2001
From: Jacek Laskowski
Date: Mon, 18 Apr 2016 13:46:56 +0200
Subject: [PATCH 1/4] Minor typo fixes

---
 graphx/src/main/scala/org/apache/spark/graphx/Graph.scala     | 2 +-
 .../org/apache/spark/ml/regression/LinearRegression.scala     | 1 -
 .../main/scala/org/apache/spark/mllib/linalg/Vectors.scala    | 2 --
 sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala    | 2 +-
 sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala    | 4 ++--
 .../apache/spark/sql/execution/datasources/DataSource.scala   | 2 +-
 .../spark/sql/execution/streaming/HDFSMetadataLog.scala       | 3 +--
 .../org/apache/spark/sql/execution/streaming/Source.scala     | 4 ++--
 .../spark/sql/execution/streaming/StreamExecution.scala       | 2 +-
 .../spark/sql/execution/streaming/StreamingRelation.scala     | 2 +-
 .../src/main/scala/org/apache/spark/sql/hive/HiveUtils.scala  | 1 -
 .../apache/spark/streaming/dstream/PairDStreamFunctions.scala | 4 ++--
 12 files changed, 12 insertions(+), 17 deletions(-)

diff --git a/graphx/src/main/scala/org/apache/spark/graphx/Graph.scala b/graphx/src/main/scala/org/apache/spark/graphx/Graph.scala
index 5485e30f5a2c9..922ec7955fd6d 100644
--- a/graphx/src/main/scala/org/apache/spark/graphx/Graph.scala
+++ b/graphx/src/main/scala/org/apache/spark/graphx/Graph.scala
@@ -365,7 +365,7 @@ abstract class Graph[VD: ClassTag, ED: ClassTag] protected () extends Serializab
  *
  * @note By expressing computation at the edge level we achieve
  * maximum parallelism. This is one of the core functions in the
- * Graph API in that enables neighborhood level computation. For
+ * Graph API that enables neighborhood level computation. For
  * example this function can be used to count neighbors satisfying a
  * predicate or implement PageRank.
  *
diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
index 71e02730c7210..71ab2025fbc16 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
@@ -438,7 +438,6 @@ class LinearRegressionModel private[ml] (
     }
   }
 
-
   override protected def predict(features: Vector): Double = {
     dot(features, coefficients) + intercept
   }
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala
index 6e3da6b701cb0..92bf84af27998 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala
@@ -39,8 +39,6 @@ import org.apache.spark.sql.types._
 
 /**
  * Represents a numeric vector, whose index type is Int and value type is Double.
- *
- * Note: Users should not implement this interface.
  */
 @SQLUserDefinedType(udt = classOf[VectorUDT])
 @Since("1.0.0")
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala
index 1219d4d453e13..726291b96c29d 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala
@@ -338,7 +338,7 @@ trait Row extends Serializable {
    * Returns the index of a given field name.
    *
    * @throws UnsupportedOperationException when schema is not defined.
-   * @throws IllegalArgumentException when fieldName do not exist.
+   * @throws IllegalArgumentException when a field `name` does not exist.
   */
  def fieldIndex(name: String): Int = {
    throw new UnsupportedOperationException("fieldIndex on a Row without schema is undefined.")
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
index b3064fd531362..4c2bf12ac9ba8 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
@@ -458,8 +458,8 @@ class Dataset[T] protected[sql](
    * Returns true if this [[Dataset]] contains one or more sources that continuously
    * return data as it arrives. A [[Dataset]] that reads data from a streaming source
    * must be executed as a [[ContinuousQuery]] using the `startStream()` method in
-   * [[DataFrameWriter]]. Methods that return a single answer, (e.g., `count()` or
-   * `collect()`) will throw an [[AnalysisException]] when there is a streaming
+   * [[DataFrameWriter]]. Methods that return a single answer, e.g. `count()` or
+   * `collect()`, will throw an [[AnalysisException]] when there is a streaming
    * source present.
    *
    * @group basic
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala
index ef626ef5fcc7b..8e72e06b1f36f 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala
@@ -143,7 +143,7 @@ case class DataSource(
           caseInsensitiveOptions,
           fileCatalog.allFiles())
       }.getOrElse {
-        throw new AnalysisException("Unable to infer schema.  It must be specified manually.")
+        throw new AnalysisException("Unable to infer schema. It must be specified manually.")
       }
     }
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/HDFSMetadataLog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/HDFSMetadataLog.scala
index ddba3ccb1ba50..99bf20c746afe 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/HDFSMetadataLog.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/HDFSMetadataLog.scala
@@ -46,8 +46,7 @@ import org.apache.spark.sql.SparkSession
  * files in a directory always shows the latest files.
  */
 class HDFSMetadataLog[T: ClassTag](sparkSession: SparkSession, path: String)
-  extends MetadataLog[T]
-  with Logging {
+  extends MetadataLog[T] with Logging {
 
   import HDFSMetadataLog._
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/Source.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/Source.scala
index 6457f928ed887..1d2f7a87538f4 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/Source.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/Source.scala
@@ -34,8 +34,8 @@ trait Source {
   def getOffset: Option[Offset]
 
   /**
-   * Returns the data that is is between the offsets (`start`, `end`]. When `start` is `None` then
-   * the batch should begin with the first available record.  This method must always return the
+   * Returns the data that is between the offsets (`start`, `end`]. When `start` is `None` then
+   * the batch should begin with the first available record. This method must always return the
    * same data for a particular `start` and `end` pair.
   */
  def getBatch(start: Option[Offset], end: Offset): DataFrame
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala
index ea3c73d9840eb..fc18e5f065a04 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala
@@ -75,7 +75,7 @@ class StreamExecution(
   /** The current batchId or -1 if execution has not yet been initialized. */
   private var currentBatchId: Long = -1
 
-  /** All stream sources present the query plan. */
+  /** All stream sources present in the query plan. */
   private val sources =
     logicalPlan.collect { case s: StreamingExecutionRelation => s.source }
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingRelation.scala
index 3341580fc4c49..4d65d2f4f57fc 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingRelation.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingRelation.scala
@@ -33,7 +33,7 @@ object StreamingRelation {
  * [[org.apache.spark.sql.catalyst.plans.logical.LogicalPlan]]. This is only used for creating
  * a streaming [[org.apache.spark.sql.DataFrame]] from [[org.apache.spark.sql.DataFrameReader]].
  * It should be used to create [[Source]] and converted to [[StreamingExecutionRelation]] when
- * passing to [StreamExecution]] to run a query.
+ * passing to [[StreamExecution]] to run a query.
  */
 case class StreamingRelation(dataSource: DataSource, sourceName: String, output: Seq[Attribute])
   extends LeafNode {
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveUtils.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveUtils.scala
index a8561192ed849..39277fd855875 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveUtils.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveUtils.scala
@@ -44,7 +44,6 @@ import org.apache.spark.sql.internal.SQLConf._
 import org.apache.spark.sql.types._
 import org.apache.spark.util.Utils
 
-
 private[spark] object HiveUtils extends Logging {
 
   def withHiveExternalCatalog(sc: SparkContext): SparkContext = {
diff --git a/streaming/src/main/scala/org/apache/spark/streaming/dstream/PairDStreamFunctions.scala b/streaming/src/main/scala/org/apache/spark/streaming/dstream/PairDStreamFunctions.scala
index d6ff96e1fc696..b6394e36b5152 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/dstream/PairDStreamFunctions.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/dstream/PairDStreamFunctions.scala
@@ -418,7 +418,7 @@ class PairDStreamFunctions[K, V](self: DStream[(K, V)])
   /**
    * Return a new "state" DStream where the state for each key is updated by applying
    * the given function on the previous state of the key and the new values of the key.
-   * org.apache.spark.Partitioner is used to control the partitioning of each RDD.
+   * [[org.apache.spark.Partitioner]] is used to control the partitioning of each RDD.
    * @param updateFunc State update function. If `this` function returns None, then
    *                   corresponding state key-value pair will be eliminated.
   * @param partitioner Partitioner for controlling the partitioning of each RDD in the new
@@ -439,7 +439,7 @@ class PairDStreamFunctions[K, V](self: DStream[(K, V)])
   /**
    * Return a new "state" DStream where the state for each key is updated by applying
    * the given function on the previous state of the key and the new values of each key.
-   * org.apache.spark.Partitioner is used to control the partitioning of each RDD.
+   * [[org.apache.spark.Partitioner]] is used to control the partitioning of each RDD.
    * @param updateFunc State update function. Note, that this function may generate a different
    *                   tuple with a different key than the input key. Therefore keys may be removed
    *                   or added in this way. It is up to the developer to decide whether to

From 2a2896d8997fe7de3db400130c8fc8c1911825bd Mon Sep 17 00:00:00 2001
From: Jacek Laskowski
Date: Mon, 18 Apr 2016 22:02:20 +0200
Subject: [PATCH 2/4] After code review by @srowen

---
 .../scala/org/apache/spark/ml/regression/LinearRegression.scala | 1 +
 .../src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala  | 2 ++
 2 files changed, 3 insertions(+)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
index 71ab2025fbc16..71e02730c7210 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
@@ -438,6 +438,7 @@ class LinearRegressionModel private[ml] (
     }
   }
 
+
   override protected def predict(features: Vector): Double = {
     dot(features, coefficients) + intercept
   }
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala
index 92bf84af27998..6e3da6b701cb0 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala
@@ -39,6 +39,8 @@ import org.apache.spark.sql.types._
 
 /**
  * Represents a numeric vector, whose index type is Int and value type is Double.
+ *
+ * Note: Users should not implement this interface.
  */
 @SQLUserDefinedType(udt = classOf[VectorUDT])
 @Since("1.0.0")

From 7a402c21db9a165d941e2ae7feed6cfaf4124a28 Mon Sep 17 00:00:00 2001
From: Jacek Laskowski
Date: Thu, 21 Apr 2016 23:15:58 +0200
Subject: [PATCH 3/4] Fix the tests (= two spaces that were removed in the initial commit)

---
 .../apache/spark/sql/streaming/FileStreamSourceSuite.scala | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSourceSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSourceSuite.scala
index 8ff82a3c74636..6b1ecd08c13c3 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSourceSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSourceSuite.scala
@@ -204,7 +204,7 @@ class FileStreamSourceSuite extends FileStreamSourceTest with SharedSQLContext {
         createFileStreamSourceAndGetSchema(
           format = Some("parquet"), path = Some(new File(src, "1").getCanonicalPath), schema = None)
       }
-      assert("Unable to infer schema.  It must be specified manually.;" === e.getMessage)
+      assert("Unable to infer schema. It must be specified manually.;" === e.getMessage)
     }
   }
 
@@ -235,7 +235,7 @@ class FileStreamSourceSuite extends FileStreamSourceTest with SharedSQLContext {
         createFileStreamSourceAndGetSchema(
           format = Some("json"), path = Some(src.getCanonicalPath), schema = None)
       }
-      assert("Unable to infer schema.  It must be specified manually.;" === e.getMessage)
+      assert("Unable to infer schema. It must be specified manually.;" === e.getMessage)
     }
   }

From 77180c328dbde3cabd0231bcc347afb477420a00 Mon Sep 17 00:00:00 2001
From: Jacek Laskowski
Date: Tue, 26 Apr 2016 08:57:44 +0200
Subject: [PATCH 4/4] Revert line removal (per @srowen's request)

---
 .../src/main/scala/org/apache/spark/sql/hive/HiveUtils.scala | 1 +
 1 file changed, 1 insertion(+)

diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveUtils.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveUtils.scala
index 39277fd855875..a8561192ed849 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveUtils.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveUtils.scala
@@ -44,6 +44,7 @@ import org.apache.spark.sql.internal.SQLConf._
 import org.apache.spark.sql.types._
 import org.apache.spark.util.Utils
 
+
 private[spark] object HiveUtils extends Logging {
 
   def withHiveExternalCatalog(sc: SparkContext): SparkContext = {
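
A usage sketch of the `updateStateByKey` contract documented in the PairDStreamFunctions hunks above: returning `None` from the update function eliminates the key's state, returning `Some` keeps it, and the explicit `Partitioner` controls the partitioning of each state RDD. The `counts` stream and `numPartitions` below are hypothetical stand-ins, not part of this patch, and the owning StreamingContext must have a checkpoint directory set before running.

import org.apache.spark.HashPartitioner
import org.apache.spark.streaming.dstream.DStream

// Sketch only: `counts` (a hypothetical DStream of (word, count) pairs) and
// `numPartitions` are assumptions; ssc.checkpoint(...) must be called first.
def runningCounts(counts: DStream[(String, Int)], numPartitions: Int): DStream[(String, Int)] = {
  val updateFunc = (newValues: Seq[Int], state: Option[Int]) => {
    val sum = state.getOrElse(0) + newValues.sum
    // Returning None drops this key's state; Some(sum) keeps it.
    if (sum == 0) None else Some(sum)
  }
  // The HashPartitioner here plays the role of the [[org.apache.spark.Partitioner]]
  // parameter called out in the Scaladoc fix above.
  counts.updateStateByKey(updateFunc, new HashPartitioner(numPartitions))
}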