From 4fa23b4fb74e252bc9b8cd9cda0f1453752639bd Mon Sep 17 00:00:00 2001
From: Xusen Yin <yinxusen@gmail.com>
Date: Thu, 30 Jun 2016 12:02:43 -0700
Subject: [PATCH 01/10] add scala example and fix error prompt in
 include_example

---
 docs/_plugins/include_example.rb              |   8 +-
 docs/mllib-data-types.md                      |  96 +---------
 .../examples/mllib/DataTypesExamples.scala    | 179 ++++++++++++++++++
 3 files changed, 191 insertions(+), 92 deletions(-)
 create mode 100644 examples/src/main/scala/org/apache/spark/examples/mllib/DataTypesExamples.scala
diff --git a/docs/_plugins/include_example.rb b/docs/_plugins/include_example.rb
index 306888801df21..c90d73f024744 100644
--- a/docs/_plugins/include_example.rb
+++ b/docs/_plugins/include_example.rb
@@ -85,10 +85,10 @@ def select_lines(code)
         .select { |l, i| l.include? "$example off#{@snippet_label}$" }
         .map { |l, i| i }
 
-      raise "Start indices amount is not equal to end indices amount, see #{@file}." \
+      raise "Start indices amount is not equal to end indices amount, see #{@file}, #{@snippet_label}." \
         unless startIndices.size == endIndices.size
 
-      raise "No code is selected by include_example, see #{@file}." \
+      raise "No code is selected by include_example, see #{@file}, #{@snippet_label}." \
         if startIndices.size == 0
 
       # Select and join code blocks together, with a space line between each of two continuous
@@ -96,9 +96,9 @@ def select_lines(code)
       lastIndex = -1
       result = ""
       startIndices.zip(endIndices).each do |start, endline|
-        raise "Overlapping between two example code blocks are not allowed, see #{@file}." \
+        raise "Overlapping between two example code blocks are not allowed, see #{@file}, #{@snippet_label}." \
             if start <= lastIndex
-        raise "$example on$ should not be in the same line with $example off$, see #{@file}." \
+        raise "$example on$ should not be in the same line with $example off$, see #{@file}, #{@snippet_label}." \
             if start == endline
         lastIndex = endline
         range = Range.new(start + 1, endline - 1)
diff --git a/docs/mllib-data-types.md b/docs/mllib-data-types.md
index ef56aebbc3608..3f9c092fc814b 100644
--- a/docs/mllib-data-types.md
+++ b/docs/mllib-data-types.md
@@ -35,16 +35,7 @@ using the factory methods implemented in
 
 Refer to the [`Vector` Scala docs](api/scala/index.html#org.apache.spark.mllib.linalg.Vector) and [`Vectors` Scala docs](api/scala/index.html#org.apache.spark.mllib.linalg.Vectors$) for details on the API.
 
-{% highlight scala %}
-import org.apache.spark.mllib.linalg.{Vector, Vectors}
-
-// Create a dense vector (1.0, 0.0, 3.0).
-val dv: Vector = Vectors.dense(1.0, 0.0, 3.0)
-// Create a sparse vector (1.0, 0.0, 3.0) by specifying its indices and values corresponding to nonzero entries.
-val sv1: Vector = Vectors.sparse(3, Array(0, 2), Array(1.0, 3.0))
-// Create a sparse vector (1.0, 0.0, 3.0) by specifying its nonzero entries.
-val sv2: Vector = Vectors.sparse(3, Seq((0, 1.0), (2, 3.0)))
-{% endhighlight %}
+{% include_example local_vector scala/org/apache/spark/examples/mllib/DataTypesExamples.scala %}
 
 ***Note:***
 Scala imports `scala.collection.immutable.Vector` by default, so you have to import
@@ -127,16 +118,8 @@ A labeled point is represented by the case class
 
 Refer to the [`LabeledPoint` Scala docs](api/scala/index.html#org.apache.spark.mllib.regression.LabeledPoint) for details on the API.
 
-{% highlight scala %}
-import org.apache.spark.mllib.linalg.Vectors
-import org.apache.spark.mllib.regression.LabeledPoint
+{% include_example labeled_point scala/org/apache/spark/examples/mllib/DataTypesExamples.scala %}
 
-// Create a labeled point with a positive label and a dense feature vector.
-val pos = LabeledPoint(1.0, Vectors.dense(1.0, 0.0, 3.0))
-
-// Create a labeled point with a negative label and a sparse feature vector.
-val neg = LabeledPoint(0.0, Vectors.sparse(3, Array(0, 2), Array(1.0, 3.0)))
-{% endhighlight %}
 </div>
 
 <div data-lang="java" markdown="1">
@@ -201,13 +184,8 @@ examples stored in LIBSVM format.
 
 Refer to the [`MLUtils` Scala docs](api/scala/index.html#org.apache.spark.mllib.util.MLUtils$) for details on the API.
 
-{% highlight scala %}
-import org.apache.spark.mllib.regression.LabeledPoint
-import org.apache.spark.mllib.util.MLUtils
-import org.apache.spark.rdd.RDD
+{% include_example libsvm scala/org/apache/spark/examples/mllib/DataTypesExamples.scala %}
 
-val examples: RDD[LabeledPoint] = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
-{% endhighlight %}
 </div>
 
 <div data-lang="java" markdown="1">
@@ -266,15 +244,8 @@ matrices. Remember, local matrices in MLlib are stored in column-major order.
 
 Refer to the [`Matrix` Scala docs](api/scala/index.html#org.apache.spark.mllib.linalg.Matrix) and [`Matrices` Scala docs](api/scala/index.html#org.apache.spark.mllib.linalg.Matrices$) for details on the API.
 
-{% highlight scala %}
-import org.apache.spark.mllib.linalg.{Matrix, Matrices}
+{% include_example local_matrix scala/org/apache/spark/examples/mllib/DataTypesExamples.scala %}
 
-// Create a dense matrix ((1.0, 2.0), (3.0, 4.0), (5.0, 6.0))
-val dm: Matrix = Matrices.dense(3, 2, Array(1.0, 3.0, 5.0, 2.0, 4.0, 6.0))
-
-// Create a sparse matrix ((9.0, 0.0), (0.0, 8.0), (0.0, 6.0))
-val sm: Matrix = Matrices.sparse(3, 2, Array(0, 1, 3), Array(0, 2, 1), Array(9, 6, 8))
-{% endhighlight %}
 </div>
 
 <div data-lang="java" markdown="1">
@@ -369,21 +340,8 @@ For [singular value decomposition (SVD)](https://en.wikipedia.org/wiki/Singular_
 
 Refer to the [`RowMatrix` Scala docs](api/scala/index.html#org.apache.spark.mllib.linalg.distributed.RowMatrix) for details on the API.
 
-{% highlight scala %}
-import org.apache.spark.mllib.linalg.Vector
-import org.apache.spark.mllib.linalg.distributed.RowMatrix
-
-val rows: RDD[Vector] = ... // an RDD of local vectors
-// Create a RowMatrix from an RDD[Vector].
-val mat: RowMatrix = new RowMatrix(rows)
+{% include_example row_matrix scala/org/apache/spark/examples/mllib/DataTypesExamples.scala %}
 
-// Get its size.
-val m = mat.numRows()
-val n = mat.numCols()
-
-// QR decomposition 
-val qrResult = mat.tallSkinnyQR(true)
-{% endhighlight %}
 </div>
 
 <div data-lang="java" markdown="1">
@@ -456,20 +414,8 @@ its row indices.
 
 Refer to the [`IndexedRowMatrix` Scala docs](api/scala/index.html#org.apache.spark.mllib.linalg.distributed.IndexedRowMatrix) for details on the API.
 
-{% highlight scala %}
-import org.apache.spark.mllib.linalg.distributed.{IndexedRow, IndexedRowMatrix, RowMatrix}
+{% include_example indexed_row_matrix scala/org/apache/spark/examples/mllib/DataTypesExamples.scala %}
 
-val rows: RDD[IndexedRow] = ... // an RDD of indexed rows
-// Create an IndexedRowMatrix from an RDD[IndexedRow].
-val mat: IndexedRowMatrix = new IndexedRowMatrix(rows)
-
-// Get its size.
-val m = mat.numRows()
-val n = mat.numCols()
-
-// Drop its row indices.
-val rowMat: RowMatrix = mat.toRowMatrix()
-{% endhighlight %}
 </div>
 
 <div data-lang="java" markdown="1">
@@ -562,20 +508,8 @@ with sparse rows by calling `toIndexedRowMatrix`.  Other computations for
 
 Refer to the [`CoordinateMatrix` Scala docs](api/scala/index.html#org.apache.spark.mllib.linalg.distributed.CoordinateMatrix) for details on the API.
 
-{% highlight scala %}
-import org.apache.spark.mllib.linalg.distributed.{CoordinateMatrix, MatrixEntry}
+{% include_example coordinate_row_matrix scala/org/apache/spark/examples/mllib/DataTypesExamples.scala %}
 
-val entries: RDD[MatrixEntry] = ... // an RDD of matrix entries
-// Create a CoordinateMatrix from an RDD[MatrixEntry].
-val mat: CoordinateMatrix = new CoordinateMatrix(entries)
-
-// Get its size.
-val m = mat.numRows()
-val n = mat.numCols()
-
-// Convert it to an IndexRowMatrix whose rows are sparse vectors.
-val indexedRowMatrix = mat.toIndexedRowMatrix()
-{% endhighlight %}
 </div>
 
 <div data-lang="java" markdown="1">
@@ -670,22 +604,8 @@ Users may change the block size by supplying the values through `toBlockMatrix(r
 
 Refer to the [`BlockMatrix` Scala docs](api/scala/index.html#org.apache.spark.mllib.linalg.distributed.BlockMatrix) for details on the API.
 
-{% highlight scala %}
-import org.apache.spark.mllib.linalg.distributed.{BlockMatrix, CoordinateMatrix, MatrixEntry}
-
-val entries: RDD[MatrixEntry] = ... // an RDD of (i, j, v) matrix entries
-// Create a CoordinateMatrix from an RDD[MatrixEntry].
-val coordMat: CoordinateMatrix = new CoordinateMatrix(entries)
-// Transform the CoordinateMatrix to a BlockMatrix
-val matA: BlockMatrix = coordMat.toBlockMatrix().cache()
-
-// Validate whether the BlockMatrix is set up properly. Throws an Exception when it is not valid.
-// Nothing happens if it is valid.
-matA.validate()
+{% include_example block_matrix scala/org/apache/spark/examples/mllib/DataTypesExamples.scala %}
 
-// Calculate A^T A.
-val ata = matA.transpose.multiply(matA)
-{% endhighlight %}
 </div>
 
 <div data-lang="java" markdown="1">
diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/DataTypesExamples.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/DataTypesExamples.scala
new file mode 100644
index 0000000000000..223aa93b3f2c1
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/mllib/DataTypesExamples.scala
@@ -0,0 +1,179 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+package org.apache.spark.examples.mllib
+
+import org.apache.spark.{SparkConf, SparkContext}
+import org.apache.spark.mllib.linalg.{Matrices, Matrix}
+// $example on:local-vector$
+import org.apache.spark.mllib.linalg.{Vector, Vectors}
+// $example off:local-vector$
+import org.apache.spark.mllib.linalg.distributed.{BlockMatrix, CoordinateMatrix, MatrixEntry}
+import org.apache.spark.mllib.linalg.distributed.{IndexedRow, IndexedRowMatrix, RowMatrix}
+import org.apache.spark.mllib.regression.LabeledPoint
+import org.apache.spark.mllib.util.MLUtils
+import org.apache.spark.rdd.RDD
+
+
+object DataTypesExamples {
+
+  def localVectorExample(): Unit = {
+    // $example on:local-vector$
+    // Create a dense vector (1.0, 0.0, 3.0).
+    val dv: Vector = Vectors.dense(1.0, 0.0, 3.0)
+    // Create a sparse vector (1.0, 0.0, 3.0) by specifying its indices and values corresponding to
+    // nonzero entries.
+    val sv1: Vector = Vectors.sparse(3, Array(0, 2), Array(1.0, 3.0))
+    // Create a sparse vector (1.0, 0.0, 3.0) by specifying its nonzero entries.
+    val sv2: Vector = Vectors.sparse(3, Seq((0, 1.0), (2, 3.0)))
+    // $example off:local-vector$
+  }
+
+  def labeledPointExample(): Unit = {
+    // $example on:labeled-point$
+    // Create a labeled point with a positive label and a dense feature vector.
+    val pos = LabeledPoint(1.0, Vectors.dense(1.0, 0.0, 3.0))
+
+    // Create a labeled point with a negative label and a sparse feature vector.
+    val neg = LabeledPoint(0.0, Vectors.sparse(3, Array(0, 2), Array(1.0, 3.0)))
+    // $example off:labeled-point$
+  }
+
+  def libsvmExample(): Unit = {
+    val sc = SparkContext.getOrCreate()
+    // $example on:libsvm$
+    val examples: RDD[LabeledPoint] =
+      MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
+    // $example off:libsvm$
+  }
+
+  def localMatrixExample(): Unit = {
+    // $example on:local-matrix$
+    // Create a dense matrix ((1.0, 2.0), (3.0, 4.0), (5.0, 6.0))
+    val dm: Matrix = Matrices.dense(3, 2, Array(1.0, 3.0, 5.0, 2.0, 4.0, 6.0))
+
+    // Create a sparse matrix ((9.0, 0.0), (0.0, 8.0), (0.0, 6.0))
+    val sm: Matrix = Matrices.sparse(3, 2, Array(0, 1, 3), Array(0, 2, 1), Array(9, 6, 8))
+    // $example off:local-matrix$
+  }
+
+  def rowMatrixExample(): Unit = {
+    val sc = SparkContext.getOrCreate()
+    // $example on:row-matrix$
+    val v1 = Vectors.dense(1.0, 10.0, 100.0)
+    val v2 = Vectors.dense(2.0, 20.0, 200.0)
+    val v3 = Vectors.dense(3.0, 30.0, 300.0)
+
+    val rows: RDD[Vector] = sc.parallelize(Seq(v1, v2, v3)) // an RDD of local vectors
+    // Create a RowMatrix from an RDD[Vector].
+    val mat: RowMatrix = new RowMatrix(rows)
+
+    // Get its size.
+    val m = mat.numRows()
+    val n = mat.numCols()
+
+    // QR decomposition
+    val qrResult = mat.tallSkinnyQR(true)
+    // $example off:row-matrix$
+  }
+
+  def indexedRowMatrixExample(): Unit = {
+    val sc = SparkContext.getOrCreate()
+
+    // $example on:indexed-row-matrix$
+    val r0 = IndexedRow(0, Vectors.dense(1, 2, 3))
+    val r1 = IndexedRow(1, Vectors.dense(4, 5, 6))
+    val r2 = IndexedRow(2, Vectors.dense(7, 8, 9))
+    val r3 = IndexedRow(3, Vectors.dense(10, 11, 12))
+
+    val rows: RDD[IndexedRow] = sc.parallelize(Seq(r0, r1, r2, r3)) // an RDD of indexed rows
+    // Create an IndexedRowMatrix from an RDD[IndexedRow].
+    val mat: IndexedRowMatrix = new IndexedRowMatrix(rows)
+
+    // Get its size.
+    val m = mat.numRows()
+    val n = mat.numCols()
+
+    // Drop its row indices.
+    val rowMat: RowMatrix = mat.toRowMatrix()
+    // $example off:indexed-row-matrix$
+  }
+
+  def coordinateMatrixExample(): Unit = {
+    val sc = SparkContext.getOrCreate()
+
+    // $example on:coordinate-row-matrix$
+    val me1 = MatrixEntry(0, 0, 1.2)
+    val me2 = MatrixEntry(1, 0, 2.1)
+    val me3 = MatrixEntry(6, 1, 3.7)
+
+    val entries: RDD[MatrixEntry] = sc.parallelize(Seq(me1, me2, me3)) // an RDD of matrix entries
+    // Create a CoordinateMatrix from an RDD[MatrixEntry].
+    val mat: CoordinateMatrix = new CoordinateMatrix(entries)
+
+    // Get its size.
+    val m = mat.numRows()
+    val n = mat.numCols()
+
+    // Convert it to an IndexRowMatrix whose rows are sparse vectors.
+    val indexedRowMatrix = mat.toIndexedRowMatrix()
+    // $example off:coordinate-row-matrix$
+  }
+
+  def blockMatrixExample(): Unit = {
+    val sc = SparkContext.getOrCreate()
+
+    // $example on:block-matrix$
+    val me1 = MatrixEntry(0, 0, 1.2)
+    val me2 = MatrixEntry(1, 0, 2.1)
+    val me3 = MatrixEntry(6, 1, 3.7)
+
+    // an RDD of (i, j, v) matrix entries
+    val entries: RDD[MatrixEntry] = sc.parallelize(Seq(me1, me2, me3))
+    // Create a CoordinateMatrix from an RDD[MatrixEntry].
+    val coordMat: CoordinateMatrix = new CoordinateMatrix(entries)
+    // Transform the CoordinateMatrix to a BlockMatrix
+    val matA: BlockMatrix = coordMat.toBlockMatrix().cache()
+
+    // Validate whether the BlockMatrix is set up properly.
+    // Throws an Exception when it is not valid.
+    // Nothing happens if it is valid.
+    matA.validate()
+
+    // Calculate A^T A.
+    val ata = matA.transpose.multiply(matA)
+    // $example off:block-matrix$
+  }
+
+  def main(args: Array[String]): Unit = {
+    val conf = new SparkConf().setAppName("DataTypeExamples")
+    val sc = new SparkContext(conf)
+
+    localVectorExample()
+    labeledPointExample()
+    libsvmExample()
+    localMatrixExample()
+    rowMatrixExample()
+    indexedRowMatrixExample()
+    coordinateMatrixExample()
+    blockMatrixExample()
+
+    sc.stop()
+  }
+}
+// scalastyle:on println

From 5ce1ef1948434162961139222d23ff3bd7fa2b8b Mon Sep 17 00:00:00 2001
From: Xusen Yin <yinxusen@gmail.com>
Date: Thu, 30 Jun 2016 14:40:58 -0700
Subject: [PATCH 02/10] change md file to include Java and Python examples

---
 docs/mllib-data-types.md | 247 +++------------------------------------
 1 file changed, 17 insertions(+), 230 deletions(-)

diff --git a/docs/mllib-data-types.md b/docs/mllib-data-types.md
index 3f9c092fc814b..2ad38dc85ea40 100644
--- a/docs/mllib-data-types.md
+++ b/docs/mllib-data-types.md
@@ -54,15 +54,8 @@ using the factory methods implemented in
 
 Refer to the [`Vector` Java docs](api/java/org/apache/spark/mllib/linalg/Vector.html) and [`Vectors` Java docs](api/java/org/apache/spark/mllib/linalg/Vectors.html) for details on the API.
 
-{% highlight java %}
-import org.apache.spark.mllib.linalg.Vector;
-import org.apache.spark.mllib.linalg.Vectors;
+{% include_example local_vector java/org/apache/spark/examples/mllib/JavaDataTypesExamples.java %}
 
-// Create a dense vector (1.0, 0.0, 3.0).
-Vector dv = Vectors.dense(1.0, 0.0, 3.0);
-// Create a sparse vector (1.0, 0.0, 3.0) by specifying its indices and values corresponding to nonzero entries.
-Vector sv = Vectors.sparse(3, new int[] {0, 2}, new double[] {1.0, 3.0});
-{% endhighlight %}
 </div>
 
 <div data-lang="python" markdown="1">
@@ -83,20 +76,7 @@ in [`Vectors`](api/python/pyspark.mllib.html#pyspark.mllib.linalg.Vectors) to cr
 
 Refer to the [`Vectors` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.linalg.Vectors) for more details on the API.
 
-{% highlight python %}
-import numpy as np
-import scipy.sparse as sps
-from pyspark.mllib.linalg import Vectors
-
-# Use a NumPy array as a dense vector.
-dv1 = np.array([1.0, 0.0, 3.0])
-# Use a Python list as a dense vector.
-dv2 = [1.0, 0.0, 3.0]
-# Create a SparseVector.
-sv1 = Vectors.sparse(3, [0, 2], [1.0, 3.0])
-# Use a single-column SciPy csc_matrix as a sparse vector.
-sv2 = sps.csc_matrix((np.array([1.0, 3.0]), np.array([0, 2]), np.array([0, 2])), shape = (3, 1))
-{% endhighlight %}
+{% include_example local_vector python/mllib/datatypes_examples.py %}
 
 </div>
 </div>
@@ -129,16 +109,8 @@ A labeled point is represented by
 
 Refer to the [`LabeledPoint` Java docs](api/java/org/apache/spark/mllib/regression/LabeledPoint.html) for details on the API.
 
-{% highlight java %}
-import org.apache.spark.mllib.linalg.Vectors;
-import org.apache.spark.mllib.regression.LabeledPoint;
-
-// Create a labeled point with a positive label and a dense feature vector.
-LabeledPoint pos = new LabeledPoint(1.0, Vectors.dense(1.0, 0.0, 3.0));
+{% include_example labeled_point java/org/apache/spark/examples/mllib/JavaDataTypesExamples.java %}
 
-// Create a labeled point with a negative label and a sparse feature vector.
-LabeledPoint neg = new LabeledPoint(0.0, Vectors.sparse(3, new int[] {0, 2}, new double[] {1.0, 3.0}));
-{% endhighlight %}
 </div>
 
 <div data-lang="python" markdown="1">
@@ -148,16 +120,8 @@ A labeled point is represented by
 
 Refer to the [`LabeledPoint` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.regression.LabeledPoint) for more details on the API.
 
-{% highlight python %}
-from pyspark.mllib.linalg import SparseVector
-from pyspark.mllib.regression import LabeledPoint
-
-# Create a labeled point with a positive label and a dense feature vector.
-pos = LabeledPoint(1.0, [1.0, 0.0, 3.0])
+{% include_example labeled_point python/mllib/datatypes_examples.py %}
 
-# Create a labeled point with a negative label and a sparse feature vector.
-neg = LabeledPoint(0.0, SparseVector(3, [0, 2], [1.0, 3.0]))
-{% endhighlight %}
 </div>
 </div>
 
@@ -194,14 +158,8 @@ examples stored in LIBSVM format.
 
 Refer to the [`MLUtils` Java docs](api/java/org/apache/spark/mllib/util/MLUtils.html) for details on the API.
 
-{% highlight java %}
-import org.apache.spark.mllib.regression.LabeledPoint;
-import org.apache.spark.mllib.util.MLUtils;
-import org.apache.spark.api.java.JavaRDD;
+{% include_example libsvm java/org/apache/spark/examples/mllib/JavaDataTypesExamples.java %}
 
-JavaRDD<LabeledPoint> examples = 
-  MLUtils.loadLibSVMFile(jsc.sc(), "data/mllib/sample_libsvm_data.txt").toJavaRDD();
-{% endhighlight %}
 </div>
 
 <div data-lang="python" markdown="1">
@@ -210,11 +168,8 @@ examples stored in LIBSVM format.
 
 Refer to the [`MLUtils` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.util.MLUtils) for more details on the API.
 
-{% highlight python %}
-from pyspark.mllib.util import MLUtils
+{% include_example libsvm python/mllib/datatypes_examples.py %}
 
-examples = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
-{% endhighlight %}
 </div>
 </div>
 
@@ -260,16 +215,8 @@ matrices. Remember, local matrices in MLlib are stored in column-major order.
 
 Refer to the [`Matrix` Java docs](api/java/org/apache/spark/mllib/linalg/Matrix.html) and [`Matrices` Java docs](api/java/org/apache/spark/mllib/linalg/Matrices.html) for details on the API.
 
-{% highlight java %}
-import org.apache.spark.mllib.linalg.Matrix;
-import org.apache.spark.mllib.linalg.Matrices;
+{% include_example local_matrix java/org/apache/spark/examples/mllib/JavaDataTypesExamples.java %}
 
-// Create a dense matrix ((1.0, 2.0), (3.0, 4.0), (5.0, 6.0))
-Matrix dm = Matrices.dense(3, 2, new double[] {1.0, 3.0, 5.0, 2.0, 4.0, 6.0});
-
-// Create a sparse matrix ((9.0, 0.0), (0.0, 8.0), (0.0, 6.0))
-Matrix sm = Matrices.sparse(3, 2, new int[] {0, 1, 3}, new int[] {0, 2, 1}, new double[] {9, 6, 8});
-{% endhighlight %}
 </div>
 
 <div data-lang="python" markdown="1">
@@ -284,15 +231,8 @@ matrices. Remember, local matrices in MLlib are stored in column-major order.
 
 Refer to the [`Matrix` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.linalg.Matrix) and [`Matrices` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.linalg.Matrices) for more details on the API.
 
-{% highlight python %}
-from pyspark.mllib.linalg import Matrix, Matrices
-
-# Create a dense matrix ((1.0, 2.0), (3.0, 4.0), (5.0, 6.0))
-dm2 = Matrices.dense(3, 2, [1, 2, 3, 4, 5, 6])
+{% include_example local_matrix python/mllib/datatypes_examples.py %}
 
-# Create a sparse matrix ((9.0, 0.0), (0.0, 8.0), (0.0, 6.0))
-sm = Matrices.sparse(3, 2, [0, 1, 3], [0, 2, 1], [9, 6, 8])
-{% endhighlight %}
 </div>
 
 </div>
@@ -351,22 +291,8 @@ created from a `JavaRDD<Vector>` instance.  Then we can compute its column summa
 
 Refer to the [`RowMatrix` Java docs](api/java/org/apache/spark/mllib/linalg/distributed/RowMatrix.html) for details on the API.
 
-{% highlight java %}
-import org.apache.spark.api.java.JavaRDD;
-import org.apache.spark.mllib.linalg.Vector;
-import org.apache.spark.mllib.linalg.distributed.RowMatrix;
+{% include_example row_matrix java/org/apache/spark/examples/mllib/JavaDataTypesExamples.java %}
 
-JavaRDD<Vector> rows = ... // a JavaRDD of local vectors
-// Create a RowMatrix from an JavaRDD<Vector>.
-RowMatrix mat = new RowMatrix(rows.rdd());
-
-// Get its size.
-long m = mat.numRows();
-long n = mat.numCols();
-
-// QR decomposition 
-QRDecomposition<RowMatrix, Matrix> result = mat.tallSkinnyQR(true);
-{% endhighlight %}
 </div>
 
 <div data-lang="python" markdown="1">
@@ -376,24 +302,9 @@ created from an `RDD` of vectors.
 
 Refer to the [`RowMatrix` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.linalg.distributed.RowMatrix) for more details on the API.
 
-{% highlight python %}
-from pyspark.mllib.linalg.distributed import RowMatrix
-
-# Create an RDD of vectors.
-rows = sc.parallelize([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]])
-
-# Create a RowMatrix from an RDD of vectors.
-mat = RowMatrix(rows)
+{% include_example row_matrix python/mllib/datatypes_examples.py %}
 
-# Get its size.
-m = mat.numRows()  # 4
-n = mat.numCols()  # 3
-
-# Get the rows as an RDD of vectors again.
-rowsRDD = mat.rows
-{% endhighlight %}
 </div>
-
 </div>
 
 ### IndexedRowMatrix
@@ -429,23 +340,8 @@ its row indices.
 
 Refer to the [`IndexedRowMatrix` Java docs](api/java/org/apache/spark/mllib/linalg/distributed/IndexedRowMatrix.html) for details on the API.
 
-{% highlight java %}
-import org.apache.spark.api.java.JavaRDD;
-import org.apache.spark.mllib.linalg.distributed.IndexedRow;
-import org.apache.spark.mllib.linalg.distributed.IndexedRowMatrix;
-import org.apache.spark.mllib.linalg.distributed.RowMatrix;
-
-JavaRDD<IndexedRow> rows = ... // a JavaRDD of indexed rows
-// Create an IndexedRowMatrix from a JavaRDD<IndexedRow>.
-IndexedRowMatrix mat = new IndexedRowMatrix(rows.rdd());
+{% include_example indexed_row_matrix java/org/apache/spark/examples/mllib/JavaDataTypesExamples.java %}
 
-// Get its size.
-long m = mat.numRows();
-long n = mat.numCols();
-
-// Drop its row indices.
-RowMatrix rowMat = mat.toRowMatrix();
-{% endhighlight %}
 </div>
 
 <div data-lang="python" markdown="1">
@@ -458,34 +354,9 @@ its row indices.
 
 Refer to the [`IndexedRowMatrix` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.linalg.distributed.IndexedRowMatrix) for more details on the API.
 
-{% highlight python %}
-from pyspark.mllib.linalg.distributed import IndexedRow, IndexedRowMatrix
-
-# Create an RDD of indexed rows.
-#   - This can be done explicitly with the IndexedRow class:
-indexedRows = sc.parallelize([IndexedRow(0, [1, 2, 3]), 
-                              IndexedRow(1, [4, 5, 6]), 
-                              IndexedRow(2, [7, 8, 9]), 
-                              IndexedRow(3, [10, 11, 12])])
-#   - or by using (long, vector) tuples:
-indexedRows = sc.parallelize([(0, [1, 2, 3]), (1, [4, 5, 6]), 
-                              (2, [7, 8, 9]), (3, [10, 11, 12])])
+{% include_example indexed_row_matrix python/mllib/datatypes_examples.py %}
 
-# Create an IndexedRowMatrix from an RDD of IndexedRows.
-mat = IndexedRowMatrix(indexedRows)
-
-# Get its size.
-m = mat.numRows()  # 4
-n = mat.numCols()  # 3
-
-# Get the rows as an RDD of IndexedRows.
-rowsRDD = mat.rows
-
-# Convert to a RowMatrix by dropping the row indices.
-rowMat = mat.toRowMatrix()
-{% endhighlight %}
 </div>
-
 </div>
 
 ### CoordinateMatrix
@@ -508,7 +379,7 @@ with sparse rows by calling `toIndexedRowMatrix`.  Other computations for
 
 Refer to the [`CoordinateMatrix` Scala docs](api/scala/index.html#org.apache.spark.mllib.linalg.distributed.CoordinateMatrix) for details on the API.
 
-{% include_example coordinate_row_matrix scala/org/apache/spark/examples/mllib/DataTypesExamples.scala %}
+{% include_example coordinate_matrix scala/org/apache/spark/examples/mllib/DataTypesExamples.scala %}
 
 </div>
 
@@ -524,23 +395,8 @@ with sparse rows by calling `toIndexedRowMatrix`. Other computations for
 
 Refer to the [`CoordinateMatrix` Java docs](api/java/org/apache/spark/mllib/linalg/distributed/CoordinateMatrix.html) for details on the API.
 
-{% highlight java %}
-import org.apache.spark.api.java.JavaRDD;
-import org.apache.spark.mllib.linalg.distributed.CoordinateMatrix;
-import org.apache.spark.mllib.linalg.distributed.IndexedRowMatrix;
-import org.apache.spark.mllib.linalg.distributed.MatrixEntry;
+{% include_example coordinate_matrix java/org/apache/spark/examples/mllib/JavaDataTypesExamples.java %}
 
-JavaRDD<MatrixEntry> entries = ... // a JavaRDD of matrix entries
-// Create a CoordinateMatrix from a JavaRDD<MatrixEntry>.
-CoordinateMatrix mat = new CoordinateMatrix(entries.rdd());
-
-// Get its size.
-long m = mat.numRows();
-long n = mat.numCols();
-
-// Convert it to an IndexRowMatrix whose rows are sparse vectors.
-IndexedRowMatrix indexedRowMatrix = mat.toIndexedRowMatrix();
-{% endhighlight %}
 </div>
 
 <div data-lang="python" markdown="1">
@@ -553,36 +409,9 @@ calling `toRowMatrix`, or to an `IndexedRowMatrix` with sparse rows by calling `
 
 Refer to the [`CoordinateMatrix` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.linalg.distributed.CoordinateMatrix) for more details on the API.
 
-{% highlight python %}
-from pyspark.mllib.linalg.distributed import CoordinateMatrix, MatrixEntry
-
-# Create an RDD of coordinate entries.
-#   - This can be done explicitly with the MatrixEntry class:
-entries = sc.parallelize([MatrixEntry(0, 0, 1.2), MatrixEntry(1, 0, 2.1), MatrixEntry(6, 1, 3.7)])
-#   - or using (long, long, float) tuples:
-entries = sc.parallelize([(0, 0, 1.2), (1, 0, 2.1), (2, 1, 3.7)])
-
-# Create an CoordinateMatrix from an RDD of MatrixEntries.
-mat = CoordinateMatrix(entries)
-
-# Get its size.
-m = mat.numRows()  # 3
-n = mat.numCols()  # 2
+{% include_example coordinate_matrix python/mllib/datatypes_examples.py %}
 
-# Get the entries as an RDD of MatrixEntries.
-entriesRDD = mat.entries
-
-# Convert to a RowMatrix.
-rowMat = mat.toRowMatrix()
-
-# Convert to an IndexedRowMatrix.
-indexedRowMat = mat.toIndexedRowMatrix()
-
-# Convert to a BlockMatrix.
-blockMat = mat.toBlockMatrix()
-{% endhighlight %}
 </div>
-
 </div>
 
 ### BlockMatrix
@@ -617,25 +446,8 @@ Users may change the block size by supplying the values through `toBlockMatrix(r
 
 Refer to the [`BlockMatrix` Java docs](api/java/org/apache/spark/mllib/linalg/distributed/BlockMatrix.html) for details on the API.
 
-{% highlight java %}
-import org.apache.spark.api.java.JavaRDD;
-import org.apache.spark.mllib.linalg.distributed.BlockMatrix;
-import org.apache.spark.mllib.linalg.distributed.CoordinateMatrix;
-import org.apache.spark.mllib.linalg.distributed.IndexedRowMatrix;
+{% include_example block_matrix java/org/apache/spark/examples/mllib/JavaDataTypesExamples.java %}
 
-JavaRDD<MatrixEntry> entries = ... // a JavaRDD of (i, j, v) Matrix Entries
-// Create a CoordinateMatrix from a JavaRDD<MatrixEntry>.
-CoordinateMatrix coordMat = new CoordinateMatrix(entries.rdd());
-// Transform the CoordinateMatrix to a BlockMatrix
-BlockMatrix matA = coordMat.toBlockMatrix().cache();
-
-// Validate whether the BlockMatrix is set up properly. Throws an Exception when it is not valid.
-// Nothing happens if it is valid.
-matA.validate();
-
-// Calculate A^T A.
-BlockMatrix ata = matA.transpose().multiply(matA);
-{% endhighlight %}
 </div>
 
 <div data-lang="python" markdown="1">
@@ -646,32 +458,7 @@ can be created from an `RDD` of sub-matrix blocks, where a sub-matrix block is a
 
 Refer to the [`BlockMatrix` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.linalg.distributed.BlockMatrix) for more details on the API.
 
-{% highlight python %}
-from pyspark.mllib.linalg import Matrices
-from pyspark.mllib.linalg.distributed import BlockMatrix
-
-# Create an RDD of sub-matrix blocks.
-blocks = sc.parallelize([((0, 0), Matrices.dense(3, 2, [1, 2, 3, 4, 5, 6])), 
-                         ((1, 0), Matrices.dense(3, 2, [7, 8, 9, 10, 11, 12]))])
-
-# Create a BlockMatrix from an RDD of sub-matrix blocks.
-mat = BlockMatrix(blocks, 3, 2)
-
-# Get its size.
-m = mat.numRows() # 6
-n = mat.numCols() # 2
-
-# Get the blocks as an RDD of sub-matrix blocks.
-blocksRDD = mat.blocks
-
-# Convert to a LocalMatrix.
-localMat = mat.toLocalMatrix()
-
-# Convert to an IndexedRowMatrix.
-indexedRowMat = mat.toIndexedRowMatrix()
+{% include_example block_matrix python/mllib/datatypes_examples.py %}
 
-# Convert to a CoordinateMatrix.
-coordinateMat = mat.toCoordinateMatrix()
-{% endhighlight %}
 </div>
 </div>

From 922ba78801549a801d3f9567bbb065e6ba7fd0d5 Mon Sep 17 00:00:00 2001
From: Xusen Yin <yinxusen@gmail.com>
Date: Thu, 30 Jun 2016 14:41:31 -0700
Subject: [PATCH 03/10] add Java and Python examples; refine modifier

---
 .../examples/mllib/JavaDataTypesExamples.java | 188 +++++++++++++++++
 .../main/python/mllib/datatypes_examples.py   | 197 ++++++++++++++++++
 .../examples/mllib/DataTypesExamples.scala    |  48 ++---
 3 files changed, 409 insertions(+), 24 deletions(-)
 create mode 100644 examples/src/main/java/org/apache/spark/examples/mllib/JavaDataTypesExamples.java
 create mode 100644 examples/src/main/python/mllib/datatypes_examples.py

diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaDataTypesExamples.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaDataTypesExamples.java
new file mode 100644
index 0000000000000..a72a3a41798fa
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaDataTypesExamples.java
@@ -0,0 +1,188 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.mllib;
+
+import java.util.Arrays;
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.SparkContext;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.mllib.linalg.*;
+import org.apache.spark.mllib.linalg.distributed.*;
+import org.apache.spark.mllib.regression.LabeledPoint;
+import org.apache.spark.mllib.util.MLUtils;
+
+
+public class JavaDataTypesExamples {
+
+  private static void localVectorExample() {
+    // $example on:local_vector$
+    // Create a dense vector (1.0, 0.0, 3.0).
+    Vector dv = Vectors.dense(1.0, 0.0, 3.0);
+    // Create a sparse vector (1.0, 0.0, 3.0) by specifying its indices and values corresponding to
+    // nonzero entries.
+    Vector sv = Vectors.sparse(3, new int[] {0, 2}, new double[] {1.0, 3.0});
+    // $example off:local_vector$
+  }
+
+  private static void labeledPointExample() {
+    // $example on:labeled_point$
+    // Create a labeled point with a positive label and a dense feature vector.
+    LabeledPoint pos = new LabeledPoint(1.0, Vectors.dense(1.0, 0.0, 3.0));
+
+    // Create a labeled point with a negative label and a sparse feature vector.
+    LabeledPoint neg =
+      new LabeledPoint(0.0, Vectors.sparse(3, new int[] {0, 2}, new double[] {1.0, 3.0}));
+    // $example off:labeled_point$
+  }
+
+  private static void libsvmExample() {
+    // $example on:libsvm$
+    SparkContext sc = SparkContext.getOrCreate();
+    JavaRDD<LabeledPoint> examples =
+      MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toJavaRDD();
+    // $example off:libsvm$
+  }
+
+  private static void localMatrixExample() {
+    // $example on:local_matrix$
+    // Create a dense matrix ((1.0, 2.0), (3.0, 4.0), (5.0, 6.0))
+    Matrix dm = Matrices.dense(3, 2, new double[] {1.0, 3.0, 5.0, 2.0, 4.0, 6.0});
+
+    // Create a sparse matrix ((9.0, 0.0), (0.0, 8.0), (0.0, 6.0))
+    Matrix sm =
+      Matrices.sparse(3, 2, new int[] {0, 1, 3}, new int[] {0, 2, 1}, new double[] {9, 6, 8});
+    // $example off:local_matrix$
+  }
+
+  private static void rowMatrixExample() {
+    SparkContext sc = SparkContext.getOrCreate();
+    JavaSparkContext jsc = JavaSparkContext.fromSparkContext(sc);
+
+    // $example on:row_matrix$
+    Vector v1 = Vectors.dense(1.0, 10.0, 100.0);
+    Vector v2 = Vectors.dense(2.0, 20.0, 200.0);
+    Vector v3 = Vectors.dense(3.0, 30.0, 300.0);
+
+    // a JavaRDD of local vectors
+    JavaRDD<Vector> rows = jsc.parallelize(Arrays.asList(v1, v2, v3));
+
+    // Create a RowMatrix from an JavaRDD<Vector>.
+    RowMatrix mat = new RowMatrix(rows.rdd());
+
+    // Get its size.
+    long m = mat.numRows();
+    long n = mat.numCols();
+
+    // QR decomposition
+    QRDecomposition<RowMatrix, Matrix> result = mat.tallSkinnyQR(true);
+    // $example off:row_matrix$
+  }
+
+  private static void indexedRowMatrixExample() {
+    SparkContext sc = SparkContext.getOrCreate();
+    JavaSparkContext jsc = JavaSparkContext.fromSparkContext(sc);
+
+    // $example on:indexed_row_matrix$
+    IndexedRow r0 = new IndexedRow(0, Vectors.dense(1, 2, 3));
+    IndexedRow r1 = new IndexedRow(1, Vectors.dense(4, 5, 6));
+    IndexedRow r2 = new IndexedRow(2, Vectors.dense(7, 8, 9));
+    IndexedRow r3 = new IndexedRow(3, Vectors.dense(10, 11, 12));
+
+    // a JavaRDD of indexed rows
+    JavaRDD<IndexedRow> rows = jsc.parallelize(Arrays.asList(r0, r1, r2, r3));
+
+    // Create an IndexedRowMatrix from a JavaRDD<IndexedRow>.
+    IndexedRowMatrix mat = new IndexedRowMatrix(rows.rdd());
+
+    // Get its size.
+    long m = mat.numRows();
+    long n = mat.numCols();
+
+    // Drop its row indices.
+    RowMatrix rowMat = mat.toRowMatrix();
+    // $example off:indexed_row_matrix$
+  }
+
+  private static void coordinateMatrixExample() {
+    SparkContext sc = SparkContext.getOrCreate();
+    JavaSparkContext jsc = JavaSparkContext.fromSparkContext(sc);
+
+    // $example on:coordinate_matrix$
+    MatrixEntry me1 = new MatrixEntry(0, 0, 1.2);
+    MatrixEntry me2 = new MatrixEntry(1, 0, 2.1);
+    MatrixEntry me3 = new MatrixEntry(6, 1, 3.7);
+
+    // a JavaRDD of matrix entries
+    JavaRDD<MatrixEntry> entries = jsc.parallelize(Arrays.asList(me1, me2, me3));
+    // Create a CoordinateMatrix from a JavaRDD<MatrixEntry>.
+    CoordinateMatrix mat = new CoordinateMatrix(entries.rdd());
+
+    // Get its size.
+    long m = mat.numRows();
+    long n = mat.numCols();
+
+    // Convert it to an IndexedRowMatrix whose rows are sparse vectors.
+    IndexedRowMatrix indexedRowMatrix = mat.toIndexedRowMatrix();
+    // $example off:coordinate_matrix$
+  }
+
+  private static void blockMatrixExample() {
+    SparkContext sc = SparkContext.getOrCreate();
+    JavaSparkContext jsc = JavaSparkContext.fromSparkContext(sc);
+
+    // $example on:block_matrix$
+    MatrixEntry me1 = new MatrixEntry(0, 0, 1.2);
+    MatrixEntry me2 = new MatrixEntry(1, 0, 2.1);
+    MatrixEntry me3 = new MatrixEntry(6, 1, 3.7);
+
+    // a JavaRDD of (i, j, v) Matrix Entries
+    JavaRDD<MatrixEntry> entries = jsc.parallelize(Arrays.asList(me1, me2, me3));
+
+    // Create a CoordinateMatrix from a JavaRDD<MatrixEntry>.
+    CoordinateMatrix coordMat = new CoordinateMatrix(entries.rdd());
+    // Transform the CoordinateMatrix to a BlockMatrix
+    BlockMatrix matA = coordMat.toBlockMatrix().cache();
+
+    // Validate whether the BlockMatrix is set up properly.
+    // Throws an Exception when it is not valid. Nothing happens if it is valid.
+    matA.validate();
+
+    // Calculate A^T A.
+    BlockMatrix ata = matA.transpose().multiply(matA);
+    // $example off:block_matrix$
+  }
+
+  public static void main(String[] args) {
+
+    SparkConf conf = new SparkConf().setAppName("JavaDataTypesExample");
+    SparkContext sc = new SparkContext(conf);
+
+    localVectorExample();
+    labeledPointExample();
+    libsvmExample();
+    localMatrixExample();
+    rowMatrixExample();
+    indexedRowMatrixExample();
+    coordinateMatrixExample();
+    blockMatrixExample();
+
+    sc.stop();
+ }
+}
diff --git a/examples/src/main/python/mllib/datatypes_examples.py b/examples/src/main/python/mllib/datatypes_examples.py
new file mode 100644
index 0000000000000..f9e8adf8a9b44
--- /dev/null
+++ b/examples/src/main/python/mllib/datatypes_examples.py
@@ -0,0 +1,197 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from __future__ import print_function
+
+from numpy import array
+
+from pyspark import SparkContext
+from pyspark.mllib.clustering import BisectingKMeans, BisectingKMeansModel
+
+
+def __local_vector_example():
+    # $example on:local_vector$
+    import numpy as np
+    import scipy.sparse as sps
+    from pyspark.mllib.linalg import Vectors
+
+    # Use a NumPy array as a dense vector.
+    dv1 = np.array([1.0, 0.0, 3.0])
+    # Use a Python list as a dense vector.
+    dv2 = [1.0, 0.0, 3.0]
+    # Create a SparseVector.
+    sv1 = Vectors.sparse(3, [0, 2], [1.0, 3.0])
+    # Use a single-column SciPy csc_matrix as a sparse vector.
+    sv2 = sps.csc_matrix((np.array([1.0, 3.0]), np.array([0, 2]), np.array([0, 2])), shape=(3, 1))
+    # $example off:local_vector$
+
+
+def __labeled_point_example():
+    # $example on:labeled_point$
+    from pyspark.mllib.linalg import SparseVector
+    from pyspark.mllib.regression import LabeledPoint
+
+    # Create a labeled point with a positive label and a dense feature vector.
+    pos = LabeledPoint(1.0, [1.0, 0.0, 3.0])
+
+    # Create a labeled point with a negative label and a sparse feature vector.
+    neg = LabeledPoint(0.0, SparseVector(3, [0, 2], [1.0, 3.0]))
+    # $example off:labeled_point$
+
+
+def __libsvm_example():
+    # $example on:libsvm$
+    from pyspark.mllib.util import MLUtils
+
+    examples = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
+    # $example off:libsvm$
+
+
+def __local_matrix_example():
+    # $example on:local_matrix$
+    from pyspark.mllib.linalg import Matrix, Matrices
+
+    # Create a dense matrix ((1.0, 2.0), (3.0, 4.0), (5.0, 6.0)) from column-major values.
+    dm2 = Matrices.dense(3, 2, [1, 3, 5, 2, 4, 6])
+
+    # Create a sparse matrix ((9.0, 0.0), (0.0, 8.0), (0.0, 6.0))
+    sm = Matrices.sparse(3, 2, [0, 1, 3], [0, 2, 1], [9, 6, 8])
+    # $example off:local_matrix$
+
+
+def __row_matrix_example():
+    # $example on:row_matrix$
+    from pyspark.mllib.linalg.distributed import RowMatrix
+
+    # Create an RDD of vectors.
+    rows = sc.parallelize([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]])
+
+    # Create a RowMatrix from an RDD of vectors.
+    mat = RowMatrix(rows)
+
+    # Get its size.
+    m = mat.numRows()  # 4
+    n = mat.numCols()  # 3
+
+    # Get the rows as an RDD of vectors again.
+    rowsRDD = mat.rows
+    # $example off:row_matrix$
+
+
+def __indexed_row_matrix_example():
+    # $example on:indexed_row_matrix$
+    from pyspark.mllib.linalg.distributed import IndexedRow, IndexedRowMatrix
+
+    # Create an RDD of indexed rows.
+    #   - This can be done explicitly with the IndexedRow class:
+    indexedRows = sc.parallelize([IndexedRow(0, [1, 2, 3]),
+                                  IndexedRow(1, [4, 5, 6]),
+                                  IndexedRow(2, [7, 8, 9]),
+                                  IndexedRow(3, [10, 11, 12])])
+    #   - or by using (long, vector) tuples:
+    indexedRows = sc.parallelize([(0, [1, 2, 3]), (1, [4, 5, 6]),
+                                  (2, [7, 8, 9]), (3, [10, 11, 12])])
+
+    # Create an IndexedRowMatrix from an RDD of IndexedRows.
+    mat = IndexedRowMatrix(indexedRows)
+
+    # Get its size.
+    m = mat.numRows()  # 4
+    n = mat.numCols()  # 3
+
+    # Get the rows as an RDD of IndexedRows.
+    rowsRDD = mat.rows
+
+    # Convert to a RowMatrix by dropping the row indices.
+    rowMat = mat.toRowMatrix()
+    # $example off:indexed_row_matrix$
+
+
+def __coordinate_matrix_example():
+    # $example on:coordinate_matrix$
+    from pyspark.mllib.linalg.distributed import CoordinateMatrix, MatrixEntry
+
+    # Create an RDD of coordinate entries.
+    #   - This can be done explicitly with the MatrixEntry class:
+    entries = sc.parallelize([MatrixEntry(0, 0, 1.2), MatrixEntry(1, 0, 2.1), MatrixEntry(6, 1, 3.7)])
+    #   - or using (long, long, float) tuples:
+    entries = sc.parallelize([(0, 0, 1.2), (1, 0, 2.1), (2, 1, 3.7)])
+
+    # Create a CoordinateMatrix from an RDD of MatrixEntries.
+    mat = CoordinateMatrix(entries)
+
+    # Get its size.
+    m = mat.numRows()  # 3
+    n = mat.numCols()  # 2
+
+    # Get the entries as an RDD of MatrixEntries.
+    entriesRDD = mat.entries
+
+    # Convert to a RowMatrix.
+    rowMat = mat.toRowMatrix()
+
+    # Convert to an IndexedRowMatrix.
+    indexedRowMat = mat.toIndexedRowMatrix()
+
+    # Convert to a BlockMatrix.
+    blockMat = mat.toBlockMatrix()
+    # $example off:coordinate_matrix$
+
+
+def __block_matrix():
+    # $example on:block_matrix$
+    from pyspark.mllib.linalg import Matrices
+    from pyspark.mllib.linalg.distributed import BlockMatrix
+
+    # Create an RDD of sub-matrix blocks.
+    blocks = sc.parallelize([((0, 0), Matrices.dense(3, 2, [1, 2, 3, 4, 5, 6])),
+                             ((1, 0), Matrices.dense(3, 2, [7, 8, 9, 10, 11, 12]))])
+
+    # Create a BlockMatrix from an RDD of sub-matrix blocks.
+    mat = BlockMatrix(blocks, 3, 2)
+
+    # Get its size.
+    m = mat.numRows() # 6
+    n = mat.numCols() # 2
+
+    # Get the blocks as an RDD of sub-matrix blocks.
+    blocksRDD = mat.blocks
+
+    # Convert to a LocalMatrix.
+    localMat = mat.toLocalMatrix()
+
+    # Convert to an IndexedRowMatrix.
+    indexedRowMat = mat.toIndexedRowMatrix()
+
+    # Convert to a CoordinateMatrix.
+    coordinateMat = mat.toCoordinateMatrix()
+    # $example off:block_matrix$
+
+
+if __name__ == "__main__":
+    sc = SparkContext(appName="PythonDataTypesExamples")  # SparkContext
+
+    __local_vector_example()
+    __labeled_point_example()
+    __libsvm_example()
+    __local_matrix_example()
+    __row_matrix_example()
+    __indexed_row_matrix_example()
+    __coordinate_matrix_example()
+    __block_matrix()
+
+    sc.stop()
diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/DataTypesExamples.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/DataTypesExamples.scala
index 223aa93b3f2c1..6b678cd853b31 100644
--- a/examples/src/main/scala/org/apache/spark/examples/mllib/DataTypesExamples.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/mllib/DataTypesExamples.scala
@@ -20,9 +20,7 @@ package org.apache.spark.examples.mllib
 
 import org.apache.spark.{SparkConf, SparkContext}
 import org.apache.spark.mllib.linalg.{Matrices, Matrix}
-// $example on:local-vector$
 import org.apache.spark.mllib.linalg.{Vector, Vectors}
-// $example off:local-vector$
 import org.apache.spark.mllib.linalg.distributed.{BlockMatrix, CoordinateMatrix, MatrixEntry}
 import org.apache.spark.mllib.linalg.distributed.{IndexedRow, IndexedRowMatrix, RowMatrix}
 import org.apache.spark.mllib.regression.LabeledPoint
@@ -32,8 +30,10 @@ import org.apache.spark.rdd.RDD
 
 object DataTypesExamples {
 
-  def localVectorExample(): Unit = {
-    // $example on:local-vector$
+  private def localVectorExample(): Unit = {
+    import org.apache.spark.mllib.linalg.{Vector, Vectors}
+
+    // $example on:local_vector$
     // Create a dense vector (1.0, 0.0, 3.0).
     val dv: Vector = Vectors.dense(1.0, 0.0, 3.0)
     // Create a sparse vector (1.0, 0.0, 3.0) by specifying its indices and values corresponding to
@@ -41,20 +41,20 @@ object DataTypesExamples {
     val sv1: Vector = Vectors.sparse(3, Array(0, 2), Array(1.0, 3.0))
     // Create a sparse vector (1.0, 0.0, 3.0) by specifying its nonzero entries.
     val sv2: Vector = Vectors.sparse(3, Seq((0, 1.0), (2, 3.0)))
-    // $example off:local-vector$
+    // $example off:local_vector$
   }
 
-  def labeledPointExample(): Unit = {
-    // $example on:labeled-point$
+  private def labeledPointExample(): Unit = {
+    // $example on:labeled_point$
     // Create a labeled point with a positive label and a dense feature vector.
     val pos = LabeledPoint(1.0, Vectors.dense(1.0, 0.0, 3.0))
 
     // Create a labeled point with a negative label and a sparse feature vector.
     val neg = LabeledPoint(0.0, Vectors.sparse(3, Array(0, 2), Array(1.0, 3.0)))
-    // $example off:labeled-point$
+    // $example off:labeled_point$
   }
 
-  def libsvmExample(): Unit = {
+  private def libsvmExample(): Unit = {
     val sc = SparkContext.getOrCreate()
     // $example on:libsvm$
     val examples: RDD[LabeledPoint] =
@@ -62,19 +62,19 @@ object DataTypesExamples {
     // $example off:libsvm$
   }
 
-  def localMatrixExample(): Unit = {
-    // $example on:local-matrix$
+  private def localMatrixExample(): Unit = {
+    // $example on:local_matrix$
     // Create a dense matrix ((1.0, 2.0), (3.0, 4.0), (5.0, 6.0))
     val dm: Matrix = Matrices.dense(3, 2, Array(1.0, 3.0, 5.0, 2.0, 4.0, 6.0))
 
     // Create a sparse matrix ((9.0, 0.0), (0.0, 8.0), (0.0, 6.0))
     val sm: Matrix = Matrices.sparse(3, 2, Array(0, 1, 3), Array(0, 2, 1), Array(9, 6, 8))
-    // $example off:local-matrix$
+    // $example off:local_matrix$
   }
 
-  def rowMatrixExample(): Unit = {
+  private def rowMatrixExample(): Unit = {
     val sc = SparkContext.getOrCreate()
-    // $example on:row-matrix$
+    // $example on:row_matrix$
     val v1 = Vectors.dense(1.0, 10.0, 100.0)
     val v2 = Vectors.dense(2.0, 20.0, 200.0)
     val v3 = Vectors.dense(3.0, 30.0, 300.0)
@@ -89,13 +89,13 @@ object DataTypesExamples {
 
     // QR decomposition
     val qrResult = mat.tallSkinnyQR(true)
-    // $example off:row-matrix$
+    // $example off:row_matrix$
   }
 
-  def indexedRowMatrixExample(): Unit = {
+  private def indexedRowMatrixExample(): Unit = {
     val sc = SparkContext.getOrCreate()
 
-    // $example on:indexed-row-matrix$
+    // $example on:indexed_row_matrix$
     val r0 = IndexedRow(0, Vectors.dense(1, 2, 3))
     val r1 = IndexedRow(1, Vectors.dense(4, 5, 6))
     val r2 = IndexedRow(2, Vectors.dense(7, 8, 9))
@@ -111,13 +111,13 @@ object DataTypesExamples {
 
     // Drop its row indices.
     val rowMat: RowMatrix = mat.toRowMatrix()
-    // $example off:indexed-row-matrix$
+    // $example off:indexed_row_matrix$
   }
 
-  def coordinateMatrixExample(): Unit = {
+  private def coordinateMatrixExample(): Unit = {
     val sc = SparkContext.getOrCreate()
 
-    // $example on:coordinate-row-matrix$
+    // $example on:coordinate_matrix$
     val me1 = MatrixEntry(0, 0, 1.2)
     val me2 = MatrixEntry(1, 0, 2.1)
     val me3 = MatrixEntry(6, 1, 3.7)
@@ -132,13 +132,13 @@ object DataTypesExamples {
 
     // Convert it to an IndexRowMatrix whose rows are sparse vectors.
     val indexedRowMatrix = mat.toIndexedRowMatrix()
-    // $example off:coordinate-row-matrix$
+    // $example off:coordinate_matrix$
   }
 
-  def blockMatrixExample(): Unit = {
+  private def blockMatrixExample(): Unit = {
     val sc = SparkContext.getOrCreate()
 
-    // $example on:block-matrix$
+    // $example on:block_matrix$
     val me1 = MatrixEntry(0, 0, 1.2)
     val me2 = MatrixEntry(1, 0, 2.1)
     val me3 = MatrixEntry(6, 1, 3.7)
@@ -157,7 +157,7 @@ object DataTypesExamples {
 
     // Calculate A^T A.
     val ata = matA.transpose.multiply(matA)
-    // $example off:block-matrix$
+    // $example off:block_matrix$
   }
 
   def main(args: Array[String]): Unit = {

From b0e74e34accd768d2d16e4afc41d46a7cde4da8b Mon Sep 17 00:00:00 2001
From: Xusen Yin <yinxusen@gmail.com>
Date: Thu, 30 Jun 2016 15:32:30 -0700
Subject: [PATCH 04/10] add sc

---
 .../examples/mllib/JavaDataTypesExamples.java |  1 -
 .../main/python/mllib/datatypes_examples.py   | 20 +++++++----
 .../examples/mllib/DataTypesExamples.scala    | 34 ++++++++++++++-----
 3 files changed, 39 insertions(+), 16 deletions(-)

diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaDataTypesExamples.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaDataTypesExamples.java
index a72a3a41798fa..e98f6ece97c47 100644
--- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaDataTypesExamples.java
+++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaDataTypesExamples.java
@@ -170,7 +170,6 @@ private static void blockMatrixExample() {
   }
 
   public static void main(String[] args) {
-
     SparkConf conf = new SparkConf().setAppName("JavaDataTypesExample");
     SparkContext sc = new SparkContext(conf);
 
diff --git a/examples/src/main/python/mllib/datatypes_examples.py b/examples/src/main/python/mllib/datatypes_examples.py
index f9e8adf8a9b44..c45b884ec2d42 100644
--- a/examples/src/main/python/mllib/datatypes_examples.py
+++ b/examples/src/main/python/mllib/datatypes_examples.py
@@ -17,10 +17,7 @@
 
 from __future__ import print_function
 
-from numpy import array
-
 from pyspark import SparkContext
-from pyspark.mllib.clustering import BisectingKMeans, BisectingKMeansModel
 
 
 def __local_vector_example():
@@ -54,6 +51,8 @@ def __labeled_point_example():
 
 
 def __libsvm_example():
+    sc = SparkContext.getOrCreate()
+
     # $example on:libsvm$
     from pyspark.mllib.util import MLUtils
 
@@ -74,6 +73,8 @@ def __local_matrix_example():
 
 
 def __row_matrix_example():
+    sc = SparkContext.getOrCreate()
+
     # $example on:row_matrix$
     from pyspark.mllib.linalg.distributed import RowMatrix
 
@@ -93,6 +94,8 @@ def __row_matrix_example():
 
 
 def __indexed_row_matrix_example():
+    sc = SparkContext.getOrCreate()
+
     # $example on:indexed_row_matrix$
     from pyspark.mllib.linalg.distributed import IndexedRow, IndexedRowMatrix
 
@@ -122,12 +125,15 @@ def __indexed_row_matrix_example():
 
 
 def __coordinate_matrix_example():
+    sc = SparkContext.getOrCreate()
+
     # $example on:coordinate_matrix$
     from pyspark.mllib.linalg.distributed import CoordinateMatrix, MatrixEntry
 
     # Create an RDD of coordinate entries.
     #   - This can be done explicitly with the MatrixEntry class:
-    entries = sc.parallelize([MatrixEntry(0, 0, 1.2), MatrixEntry(1, 0, 2.1), MatrixEntry(6, 1, 3.7)])
+    entries =\
+        sc.parallelize([MatrixEntry(0, 0, 1.2), MatrixEntry(1, 0, 2.1), MatrixEntry(6, 1, 3.7)])
     #   - or using (long, long, float) tuples:
     entries = sc.parallelize([(0, 0, 1.2), (1, 0, 2.1), (2, 1, 3.7)])
 
@@ -153,6 +159,8 @@ def __coordinate_matrix_example():
 
 
 def __block_matrix():
+    sc = SparkContext.getOrCreate()
+
     # $example on:block_matrix$
     from pyspark.mllib.linalg import Matrices
     from pyspark.mllib.linalg.distributed import BlockMatrix
@@ -165,8 +173,8 @@ def __block_matrix():
     mat = BlockMatrix(blocks, 3, 2)
 
     # Get its size.
-    m = mat.numRows() # 6
-    n = mat.numCols() # 2
+    m = mat.numRows()  # 6
+    n = mat.numCols()  # 2
 
     # Get the blocks as an RDD of sub-matrix blocks.
     blocksRDD = mat.blocks
diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/DataTypesExamples.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/DataTypesExamples.scala
index 6b678cd853b31..28c41b8d64988 100644
--- a/examples/src/main/scala/org/apache/spark/examples/mllib/DataTypesExamples.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/mllib/DataTypesExamples.scala
@@ -19,21 +19,14 @@
 package org.apache.spark.examples.mllib
 
 import org.apache.spark.{SparkConf, SparkContext}
-import org.apache.spark.mllib.linalg.{Matrices, Matrix}
-import org.apache.spark.mllib.linalg.{Vector, Vectors}
-import org.apache.spark.mllib.linalg.distributed.{BlockMatrix, CoordinateMatrix, MatrixEntry}
-import org.apache.spark.mllib.linalg.distributed.{IndexedRow, IndexedRowMatrix, RowMatrix}
-import org.apache.spark.mllib.regression.LabeledPoint
-import org.apache.spark.mllib.util.MLUtils
-import org.apache.spark.rdd.RDD
 
 
 object DataTypesExamples {
 
   private def localVectorExample(): Unit = {
+    // $example on:local_vector$
     import org.apache.spark.mllib.linalg.{Vector, Vectors}
 
-    // $example on:local_vector$
     // Create a dense vector (1.0, 0.0, 3.0).
     val dv: Vector = Vectors.dense(1.0, 0.0, 3.0)
     // Create a sparse vector (1.0, 0.0, 3.0) by specifying its indices and values corresponding to
@@ -46,6 +39,9 @@ object DataTypesExamples {
 
   private def labeledPointExample(): Unit = {
     // $example on:labeled_point$
+    import org.apache.spark.mllib.linalg.Vectors
+    import org.apache.spark.mllib.regression.LabeledPoint
+
     // Create a labeled point with a positive label and a dense feature vector.
     val pos = LabeledPoint(1.0, Vectors.dense(1.0, 0.0, 3.0))
 
@@ -57,6 +53,10 @@ object DataTypesExamples {
   private def libsvmExample(): Unit = {
     val sc = SparkContext.getOrCreate()
     // $example on:libsvm$
+    import org.apache.spark.mllib.regression.LabeledPoint
+    import org.apache.spark.mllib.util.MLUtils
+    import org.apache.spark.rdd.RDD
+
     val examples: RDD[LabeledPoint] =
       MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
     // $example off:libsvm$
@@ -64,6 +64,8 @@ object DataTypesExamples {
 
   private def localMatrixExample(): Unit = {
     // $example on:local_matrix$
+    import org.apache.spark.mllib.linalg.{Matrix, Matrices}
+
     // Create a dense matrix ((1.0, 2.0), (3.0, 4.0), (5.0, 6.0))
     val dm: Matrix = Matrices.dense(3, 2, Array(1.0, 3.0, 5.0, 2.0, 4.0, 6.0))
 
@@ -75,6 +77,10 @@ object DataTypesExamples {
   private def rowMatrixExample(): Unit = {
     val sc = SparkContext.getOrCreate()
     // $example on:row_matrix$
+    import org.apache.spark.mllib.linalg.{Vector, Vectors}
+    import org.apache.spark.mllib.linalg.distributed.RowMatrix
+    import org.apache.spark.rdd.RDD
+
     val v1 = Vectors.dense(1.0, 10.0, 100.0)
     val v2 = Vectors.dense(2.0, 20.0, 200.0)
     val v3 = Vectors.dense(3.0, 30.0, 300.0)
@@ -96,6 +102,10 @@ object DataTypesExamples {
     val sc = SparkContext.getOrCreate()
 
     // $example on:indexed_row_matrix$
+    import org.apache.spark.mllib.linalg.Vectors
+    import org.apache.spark.mllib.linalg.distributed.{IndexedRow, IndexedRowMatrix, RowMatrix}
+    import org.apache.spark.rdd.RDD
+
     val r0 = IndexedRow(0, Vectors.dense(1, 2, 3))
     val r1 = IndexedRow(1, Vectors.dense(4, 5, 6))
     val r2 = IndexedRow(2, Vectors.dense(7, 8, 9))
@@ -118,6 +128,9 @@ object DataTypesExamples {
     val sc = SparkContext.getOrCreate()
 
     // $example on:coordinate_matrix$
+    import org.apache.spark.mllib.linalg.distributed.{CoordinateMatrix, MatrixEntry}
+    import org.apache.spark.rdd.RDD
+
     val me1 = MatrixEntry(0, 0, 1.2)
     val me2 = MatrixEntry(1, 0, 2.1)
     val me3 = MatrixEntry(6, 1, 3.7)
@@ -139,6 +152,9 @@ object DataTypesExamples {
     val sc = SparkContext.getOrCreate()
 
     // $example on:block_matrix$
+    import org.apache.spark.mllib.linalg.distributed.{BlockMatrix, CoordinateMatrix, MatrixEntry}
+    import org.apache.spark.rdd.RDD
+
     val me1 = MatrixEntry(0, 0, 1.2)
     val me2 = MatrixEntry(1, 0, 2.1)
     val me3 = MatrixEntry(6, 1, 3.7)
@@ -161,7 +177,7 @@ object DataTypesExamples {
   }
 
   def main(args: Array[String]): Unit = {
-    val conf = new SparkConf().setAppName("DataTypeExamples")
+    val conf = new SparkConf().setAppName("DataTypesExamples")
     val sc = new SparkContext(conf)
 
     localVectorExample()

From 51018c96f5329c6bdd7c550456f936d26c5b13c7 Mon Sep 17 00:00:00 2001
From: Xusen Yin <yinxusen@gmail.com>
Date: Thu, 30 Jun 2016 16:02:32 -0700
Subject: [PATCH 05/10] fix java imports

---
 .../examples/mllib/JavaDataTypesExamples.java | 43 +++++++++++++++++--
 1 file changed, 40 insertions(+), 3 deletions(-)

diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaDataTypesExamples.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaDataTypesExamples.java
index e98f6ece97c47..955e9ac78f65e 100644
--- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaDataTypesExamples.java
+++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaDataTypesExamples.java
@@ -21,12 +21,49 @@
 
 import org.apache.spark.SparkConf;
 import org.apache.spark.SparkContext;
-import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.JavaSparkContext;
-import org.apache.spark.mllib.linalg.*;
-import org.apache.spark.mllib.linalg.distributed.*;
+
+// $example on:local_vector$
+import org.apache.spark.mllib.linalg.Vector;
+import org.apache.spark.mllib.linalg.Vectors;
+// $example off:local_vector$
+// $example on:labeled_point$
+import org.apache.spark.mllib.linalg.Vectors;
+import org.apache.spark.mllib.regression.LabeledPoint;
+// $example off:labeled_point$
+// $example on:libsvm$
 import org.apache.spark.mllib.regression.LabeledPoint;
 import org.apache.spark.mllib.util.MLUtils;
+import org.apache.spark.api.java.JavaRDD;
+// $example off:libsvm$
+// $example on:local_matrix$
+import org.apache.spark.mllib.linalg.Matrix;
+import org.apache.spark.mllib.linalg.Matrices;
+// $example off:local_matrix$
+// $example on:row_matrix$
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.mllib.linalg.Vector;
+import org.apache.spark.mllib.linalg.distributed.RowMatrix;
+import org.apache.spark.mllib.linalg.QRDecomposition;
+// $example off:row_matrix$
+// $example on:indexed_row_matrix$
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.mllib.linalg.distributed.IndexedRow;
+import org.apache.spark.mllib.linalg.distributed.IndexedRowMatrix;
+import org.apache.spark.mllib.linalg.distributed.RowMatrix;
+// $example off:indexed_row_matrix$
+// $example on:coordinate_matrix$
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.mllib.linalg.distributed.CoordinateMatrix;
+import org.apache.spark.mllib.linalg.distributed.IndexedRowMatrix;
+import org.apache.spark.mllib.linalg.distributed.MatrixEntry;
+// $example off:coordinate_matrix$
+// $example on:block_matrix$
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.mllib.linalg.distributed.BlockMatrix;
+import org.apache.spark.mllib.linalg.distributed.CoordinateMatrix;
+import org.apache.spark.mllib.linalg.distributed.MatrixEntry;
+// $example off:block_matrix$
 
 
 public class JavaDataTypesExamples {

From 176a2406ae9cf2495d4c1f8eb782c5316929738d Mon Sep 17 00:00:00 2001
From: Xusen Yin <yinxusen@gmail.com>
Date: Thu, 30 Jun 2016 22:59:23 -0700
Subject: [PATCH 06/10] refine error prompt

---
 docs/_plugins/include_example.rb | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/docs/_plugins/include_example.rb b/docs/_plugins/include_example.rb
index c90d73f024744..e4c383b7ad9c1 100644
--- a/docs/_plugins/include_example.rb
+++ b/docs/_plugins/include_example.rb
@@ -85,10 +85,11 @@ def select_lines(code)
         .select { |l, i| l.include? "$example off#{@snippet_label}$" }
         .map { |l, i| i }
 
-      raise "Start indices amount is not equal to end indices amount, see #{@file}, #{@snippet_label}." \
+      raise "Start indices amount is not equal to end indices amount, "\
+            "see #{@file}, [labeled=#{@snippet_label}]." \
         unless startIndices.size == endIndices.size
 
-      raise "No code is selected by include_example, see #{@file}, #{@snippet_label}." \
+      raise "No code is selected by include_example, see #{@file}, [labeled=#{@snippet_label}]." \
         if startIndices.size == 0
 
       # Select and join code blocks together, with a space line between each of two continuous
@@ -96,9 +97,11 @@ def select_lines(code)
       lastIndex = -1
       result = ""
       startIndices.zip(endIndices).each do |start, endline|
-        raise "Overlapping between two example code blocks are not allowed, see #{@file}, #{@snippet_label}." \
+        raise "Overlapping between two example code blocks are not allowed, "\
+	      "see #{@file}, [labeled=#{@snippet_label}]." \
             if start <= lastIndex
-        raise "$example on$ should not be in the same line with $example off$, see #{@file}, #{@snippet_label}." \
+        raise "$example on$ should not be in the same line with $example off$, "\
+              "see #{@file}, [labeled=#{@snippet_label}]." \
             if start == endline
         lastIndex = endline
         range = Range.new(start + 1, endline - 1)

From ed271b001f0b34215a11795b737eddb719d81f12 Mon Sep 17 00:00:00 2001
From: Xusen Yin <yinxusen@gmail.com>
Date: Sun, 3 Jul 2016 19:37:41 -0700
Subject: [PATCH 07/10] fix QR decomposition error

---
 .../apache/spark/examples/mllib/JavaDataTypesExamples.java  | 2 +-
 examples/src/main/python/mllib/datatypes_examples.py        | 6 +++---
 .../org/apache/spark/examples/mllib/DataTypesExamples.scala | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaDataTypesExamples.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaDataTypesExamples.java
index 955e9ac78f65e..1f7f7509b6fcb 100644
--- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaDataTypesExamples.java
+++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaDataTypesExamples.java
@@ -118,7 +118,7 @@ private static void rowMatrixExample() {
     Vector v3 = Vectors.dense(3.0, 30.0, 300.0);
 
     // a JavaRDD of local vectors
-    JavaRDD<Vector> rows = jsc.parallelize(Arrays.asList(v1, v2, v3));
+    JavaRDD<Vector> rows = jsc.parallelize(Arrays.asList(v1, v2, v3), 1);
 
     // Create a RowMatrix from an JavaRDD<Vector>.
     RowMatrix mat = new RowMatrix(rows.rdd());
diff --git a/examples/src/main/python/mllib/datatypes_examples.py b/examples/src/main/python/mllib/datatypes_examples.py
index c45b884ec2d42..01a4bf509b924 100644
--- a/examples/src/main/python/mllib/datatypes_examples.py
+++ b/examples/src/main/python/mllib/datatypes_examples.py
@@ -79,7 +79,7 @@ def __row_matrix_example():
     from pyspark.mllib.linalg.distributed import RowMatrix
 
     # Create an RDD of vectors.
-    rows = sc.parallelize([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]])
+    rows = sc.parallelize([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]], 1)
 
     # Create a RowMatrix from an RDD of vectors.
     mat = RowMatrix(rows)
@@ -88,8 +88,8 @@ def __row_matrix_example():
     m = mat.numRows()  # 4
     n = mat.numCols()  # 3
 
-    # Get the rows as an RDD of vectors again.
-    rowsRDD = mat.rows
+    # QR decomposition
+    qrResult = mat.tallSkinnyQR(True)
     # $example off:row_matrix$
 
 
diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/DataTypesExamples.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/DataTypesExamples.scala
index 28c41b8d64988..e408cf614e64f 100644
--- a/examples/src/main/scala/org/apache/spark/examples/mllib/DataTypesExamples.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/mllib/DataTypesExamples.scala
@@ -85,7 +85,7 @@ object DataTypesExamples {
     val v2 = Vectors.dense(2.0, 20.0, 200.0)
     val v3 = Vectors.dense(3.0, 30.0, 300.0)
 
-    val rows: RDD[Vector] = sc.parallelize(Seq(v1, v2, v3)) // an RDD of local vectors
+    val rows: RDD[Vector] = sc.parallelize(Seq(v1, v2, v3), 1) // an RDD of local vectors
     // Create a RowMatrix from an RDD[Vector].
     val mat: RowMatrix = new RowMatrix(rows)
 

From 9e102a6cbfa02b8c06f398df2dbba99305669c5d Mon Sep 17 00:00:00 2001
From: Xusen Yin <yinxusen@gmail.com>
Date: Sun, 3 Jul 2016 20:03:48 -0700
Subject: [PATCH 08/10] add spark sqlcontext for toDF

---
 examples/src/main/python/mllib/datatypes_examples.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/examples/src/main/python/mllib/datatypes_examples.py b/examples/src/main/python/mllib/datatypes_examples.py
index 01a4bf509b924..bf12c273602ae 100644
--- a/examples/src/main/python/mllib/datatypes_examples.py
+++ b/examples/src/main/python/mllib/datatypes_examples.py
@@ -18,6 +18,7 @@
 from __future__ import print_function
 
 from pyspark import SparkContext
+from pyspark.sql import SQLContext
 
 
 def __local_vector_example():
@@ -95,6 +96,7 @@ def __row_matrix_example():
 
 def __indexed_row_matrix_example():
     sc = SparkContext.getOrCreate()
+    sqlContext = SQLContext.getOrCreate(sc)
 
     # $example on:indexed_row_matrix$
     from pyspark.mllib.linalg.distributed import IndexedRow, IndexedRowMatrix

From 47c7b165086324a473dc659fbb216ef6601194bf Mon Sep 17 00:00:00 2001
From: Xusen Yin <yinxusen@gmail.com>
Date: Sun, 3 Jul 2016 20:18:38 -0700
Subject: [PATCH 09/10] fix type erasure for RowMatrix

---
 .../org/apache/spark/mllib/linalg/distributed/RowMatrix.scala   | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala
index cd5209d0ebe20..43f89bf91e120 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala
@@ -50,7 +50,7 @@ class RowMatrix @Since("1.0.0") (
 
   /** Alternative constructor leaving matrix dimensions to be determined automatically. */
   @Since("1.0.0")
-  def this(rows: RDD[Vector]) = this(rows, 0L, 0)
+  def this(rows: RDD[Vector]) = this(rows.retag(classOf[Vector]), 0L, 0)
 
   /** Gets or computes the number of columns. */
   @Since("1.0.0")

From c28fdb8f0adce0daf3078e6fe770cb3464c76ed5 Mon Sep 17 00:00:00 2001
From: Xusen Yin <yinxusen@gmail.com>
Date: Fri, 8 Jul 2016 11:43:17 -0700
Subject: [PATCH 10/10] revert some code

---
 .../apache/spark/examples/mllib/JavaDataTypesExamples.java    | 4 ++--
 .../org/apache/spark/examples/mllib/DataTypesExamples.scala   | 2 +-
 .../org/apache/spark/mllib/linalg/distributed/RowMatrix.scala | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaDataTypesExamples.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaDataTypesExamples.java
index 1f7f7509b6fcb..24926d4201935 100644
--- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaDataTypesExamples.java
+++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaDataTypesExamples.java
@@ -118,7 +118,7 @@ private static void rowMatrixExample() {
     Vector v3 = Vectors.dense(3.0, 30.0, 300.0);
 
     // a JavaRDD of local vectors
-    JavaRDD<Vector> rows = jsc.parallelize(Arrays.asList(v1, v2, v3), 1);
+    JavaRDD<Vector> rows = jsc.parallelize(Arrays.asList(v1, v2, v3));
 
     // Create a RowMatrix from an JavaRDD<Vector>.
     RowMatrix mat = new RowMatrix(rows.rdd());
@@ -189,7 +189,7 @@ private static void blockMatrixExample() {
     MatrixEntry me2 = new MatrixEntry(1, 0, 2.1);
     MatrixEntry me3 = new MatrixEntry(6, 1, 3.7);
 
-    // a JavaRDD of (i, j, v) Matrix Entries
+    // A JavaRDD of (i, j, v) Matrix Entries
     JavaRDD<MatrixEntry> entries = jsc.parallelize(Arrays.asList(me1, me2, me3));
 
     // Create a CoordinateMatrix from a JavaRDD<MatrixEntry>.
diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/DataTypesExamples.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/DataTypesExamples.scala
index e408cf614e64f..28c41b8d64988 100644
--- a/examples/src/main/scala/org/apache/spark/examples/mllib/DataTypesExamples.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/mllib/DataTypesExamples.scala
@@ -85,7 +85,7 @@ object DataTypesExamples {
     val v2 = Vectors.dense(2.0, 20.0, 200.0)
     val v3 = Vectors.dense(3.0, 30.0, 300.0)
 
-    val rows: RDD[Vector] = sc.parallelize(Seq(v1, v2, v3), 1) // an RDD of local vectors
+    val rows: RDD[Vector] = sc.parallelize(Seq(v1, v2, v3)) // an RDD of local vectors
     // Create a RowMatrix from an RDD[Vector].
     val mat: RowMatrix = new RowMatrix(rows)
 
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala
index 3cea334866619..ec32e37afb792 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala
@@ -50,7 +50,7 @@ class RowMatrix @Since("1.0.0") (
 
   /** Alternative constructor leaving matrix dimensions to be determined automatically. */
   @Since("1.0.0")
-  def this(rows: RDD[Vector]) = this(rows.retag(classOf[Vector]), 0L, 0)
+  def this(rows: RDD[Vector]) = this(rows, 0L, 0)
 
   /** Gets or computes the number of columns. */
   @Since("1.0.0")