From 4fa23b4fb74e252bc9b8cd9cda0f1453752639bd Mon Sep 17 00:00:00 2001 From: Xusen Yin Date: Thu, 30 Jun 2016 12:02:43 -0700 Subject: [PATCH 01/10] add scala example and fix error prompt in include_example --- docs/_plugins/include_example.rb | 8 +- docs/mllib-data-types.md | 96 +--------- .../examples/mllib/DataTypesExamples.scala | 179 ++++++++++++++++++ 3 files changed, 191 insertions(+), 92 deletions(-) create mode 100644 examples/src/main/scala/org/apache/spark/examples/mllib/DataTypesExamples.scala diff --git a/docs/_plugins/include_example.rb b/docs/_plugins/include_example.rb index 306888801df21..c90d73f024744 100644 --- a/docs/_plugins/include_example.rb +++ b/docs/_plugins/include_example.rb @@ -85,10 +85,10 @@ def select_lines(code) .select { |l, i| l.include? "$example off#{@snippet_label}$" } .map { |l, i| i } - raise "Start indices amount is not equal to end indices amount, see #{@file}." \ + raise "Start indices amount is not equal to end indices amount, see #{@file}, #{@snippet_label}." \ unless startIndices.size == endIndices.size - raise "No code is selected by include_example, see #{@file}." \ + raise "No code is selected by include_example, see #{@file}, #{@snippet_label}." \ if startIndices.size == 0 # Select and join code blocks together, with a space line between each of two continuous @@ -96,9 +96,9 @@ def select_lines(code) lastIndex = -1 result = "" startIndices.zip(endIndices).each do |start, endline| - raise "Overlapping between two example code blocks are not allowed, see #{@file}." \ + raise "Overlapping between two example code blocks are not allowed, see #{@file}, #{@snippet_label}." \ if start <= lastIndex - raise "$example on$ should not be in the same line with $example off$, see #{@file}." \ + raise "$example on$ should not be in the same line with $example off$, see #{@file}, #{@snippet_label}." 
\ if start == endline lastIndex = endline range = Range.new(start + 1, endline - 1) diff --git a/docs/mllib-data-types.md b/docs/mllib-data-types.md index ef56aebbc3608..3f9c092fc814b 100644 --- a/docs/mllib-data-types.md +++ b/docs/mllib-data-types.md @@ -35,16 +35,7 @@ using the factory methods implemented in Refer to the [`Vector` Scala docs](api/scala/index.html#org.apache.spark.mllib.linalg.Vector) and [`Vectors` Scala docs](api/scala/index.html#org.apache.spark.mllib.linalg.Vectors$) for details on the API. -{% highlight scala %} -import org.apache.spark.mllib.linalg.{Vector, Vectors} - -// Create a dense vector (1.0, 0.0, 3.0). -val dv: Vector = Vectors.dense(1.0, 0.0, 3.0) -// Create a sparse vector (1.0, 0.0, 3.0) by specifying its indices and values corresponding to nonzero entries. -val sv1: Vector = Vectors.sparse(3, Array(0, 2), Array(1.0, 3.0)) -// Create a sparse vector (1.0, 0.0, 3.0) by specifying its nonzero entries. -val sv2: Vector = Vectors.sparse(3, Seq((0, 1.0), (2, 3.0))) -{% endhighlight %} +{% include_example local_vector scala/org/apache/spark/examples/mllib/DataTypesExamples.scala %} ***Note:*** Scala imports `scala.collection.immutable.Vector` by default, so you have to import @@ -127,16 +118,8 @@ A labeled point is represented by the case class Refer to the [`LabeledPoint` Scala docs](api/scala/index.html#org.apache.spark.mllib.regression.LabeledPoint) for details on the API. -{% highlight scala %} -import org.apache.spark.mllib.linalg.Vectors -import org.apache.spark.mllib.regression.LabeledPoint +{% include_example labeled_point scala/org/apache/spark/examples/mllib/DataTypesExamples.scala %} -// Create a labeled point with a positive label and a dense feature vector. -val pos = LabeledPoint(1.0, Vectors.dense(1.0, 0.0, 3.0)) - -// Create a labeled point with a negative label and a sparse feature vector. -val neg = LabeledPoint(0.0, Vectors.sparse(3, Array(0, 2), Array(1.0, 3.0))) -{% endhighlight %}
@@ -201,13 +184,8 @@ examples stored in LIBSVM format. Refer to the [`MLUtils` Scala docs](api/scala/index.html#org.apache.spark.mllib.util.MLUtils$) for details on the API. -{% highlight scala %} -import org.apache.spark.mllib.regression.LabeledPoint -import org.apache.spark.mllib.util.MLUtils -import org.apache.spark.rdd.RDD +{% include_example libsvm scala/org/apache/spark/examples/mllib/DataTypesExamples.scala %} -val examples: RDD[LabeledPoint] = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt") -{% endhighlight %}
@@ -266,15 +244,8 @@ matrices. Remember, local matrices in MLlib are stored in column-major order. Refer to the [`Matrix` Scala docs](api/scala/index.html#org.apache.spark.mllib.linalg.Matrix) and [`Matrices` Scala docs](api/scala/index.html#org.apache.spark.mllib.linalg.Matrices$) for details on the API. -{% highlight scala %} -import org.apache.spark.mllib.linalg.{Matrix, Matrices} +{% include_example local_matrix scala/org/apache/spark/examples/mllib/DataTypesExamples.scala %} -// Create a dense matrix ((1.0, 2.0), (3.0, 4.0), (5.0, 6.0)) -val dm: Matrix = Matrices.dense(3, 2, Array(1.0, 3.0, 5.0, 2.0, 4.0, 6.0)) - -// Create a sparse matrix ((9.0, 0.0), (0.0, 8.0), (0.0, 6.0)) -val sm: Matrix = Matrices.sparse(3, 2, Array(0, 1, 3), Array(0, 2, 1), Array(9, 6, 8)) -{% endhighlight %}
@@ -369,21 +340,8 @@ For [singular value decomposition (SVD)](https://en.wikipedia.org/wiki/Singular_ Refer to the [`RowMatrix` Scala docs](api/scala/index.html#org.apache.spark.mllib.linalg.distributed.RowMatrix) for details on the API. -{% highlight scala %} -import org.apache.spark.mllib.linalg.Vector -import org.apache.spark.mllib.linalg.distributed.RowMatrix - -val rows: RDD[Vector] = ... // an RDD of local vectors -// Create a RowMatrix from an RDD[Vector]. -val mat: RowMatrix = new RowMatrix(rows) +{% include_example row_matrix scala/org/apache/spark/examples/mllib/DataTypesExamples.scala %} -// Get its size. -val m = mat.numRows() -val n = mat.numCols() - -// QR decomposition -val qrResult = mat.tallSkinnyQR(true) -{% endhighlight %}
@@ -456,20 +414,8 @@ its row indices. Refer to the [`IndexedRowMatrix` Scala docs](api/scala/index.html#org.apache.spark.mllib.linalg.distributed.IndexedRowMatrix) for details on the API. -{% highlight scala %} -import org.apache.spark.mllib.linalg.distributed.{IndexedRow, IndexedRowMatrix, RowMatrix} +{% include_example indexed_row_matrix scala/org/apache/spark/examples/mllib/DataTypesExamples.scala %} -val rows: RDD[IndexedRow] = ... // an RDD of indexed rows -// Create an IndexedRowMatrix from an RDD[IndexedRow]. -val mat: IndexedRowMatrix = new IndexedRowMatrix(rows) - -// Get its size. -val m = mat.numRows() -val n = mat.numCols() - -// Drop its row indices. -val rowMat: RowMatrix = mat.toRowMatrix() -{% endhighlight %}
@@ -562,20 +508,8 @@ with sparse rows by calling `toIndexedRowMatrix`. Other computations for Refer to the [`CoordinateMatrix` Scala docs](api/scala/index.html#org.apache.spark.mllib.linalg.distributed.CoordinateMatrix) for details on the API. -{% highlight scala %} -import org.apache.spark.mllib.linalg.distributed.{CoordinateMatrix, MatrixEntry} +{% include_example coordinate_row_matrix scala/org/apache/spark/examples/mllib/DataTypesExamples.scala %} -val entries: RDD[MatrixEntry] = ... // an RDD of matrix entries -// Create a CoordinateMatrix from an RDD[MatrixEntry]. -val mat: CoordinateMatrix = new CoordinateMatrix(entries) - -// Get its size. -val m = mat.numRows() -val n = mat.numCols() - -// Convert it to an IndexRowMatrix whose rows are sparse vectors. -val indexedRowMatrix = mat.toIndexedRowMatrix() -{% endhighlight %}
@@ -670,22 +604,8 @@ Users may change the block size by supplying the values through `toBlockMatrix(r Refer to the [`BlockMatrix` Scala docs](api/scala/index.html#org.apache.spark.mllib.linalg.distributed.BlockMatrix) for details on the API. -{% highlight scala %} -import org.apache.spark.mllib.linalg.distributed.{BlockMatrix, CoordinateMatrix, MatrixEntry} - -val entries: RDD[MatrixEntry] = ... // an RDD of (i, j, v) matrix entries -// Create a CoordinateMatrix from an RDD[MatrixEntry]. -val coordMat: CoordinateMatrix = new CoordinateMatrix(entries) -// Transform the CoordinateMatrix to a BlockMatrix -val matA: BlockMatrix = coordMat.toBlockMatrix().cache() - -// Validate whether the BlockMatrix is set up properly. Throws an Exception when it is not valid. -// Nothing happens if it is valid. -matA.validate() +{% include_example block_matrix scala/org/apache/spark/examples/mllib/DataTypesExamples.scala %} -// Calculate A^T A. -val ata = matA.transpose.multiply(matA) -{% endhighlight %}
diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/DataTypesExamples.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/DataTypesExamples.scala new file mode 100644 index 0000000000000..223aa93b3f2c1 --- /dev/null +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/DataTypesExamples.scala @@ -0,0 +1,179 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// scalastyle:off println +package org.apache.spark.examples.mllib + +import org.apache.spark.{SparkConf, SparkContext} +import org.apache.spark.mllib.linalg.{Matrices, Matrix} +// $example on:local_vector$ +import org.apache.spark.mllib.linalg.{Vector, Vectors} +// $example off:local_vector$ +import org.apache.spark.mllib.linalg.distributed.{BlockMatrix, CoordinateMatrix, MatrixEntry} +import org.apache.spark.mllib.linalg.distributed.{IndexedRow, IndexedRowMatrix, RowMatrix} +import org.apache.spark.mllib.regression.LabeledPoint +import org.apache.spark.mllib.util.MLUtils +import org.apache.spark.rdd.RDD + + +object DataTypesExamples { + + def localVectorExample(): Unit = { + // $example on:local_vector$ + // Create a dense vector (1.0, 0.0, 3.0). 
+ val dv: Vector = Vectors.dense(1.0, 0.0, 3.0) + // Create a sparse vector (1.0, 0.0, 3.0) by specifying its indices and values corresponding to + // nonzero entries. + val sv1: Vector = Vectors.sparse(3, Array(0, 2), Array(1.0, 3.0)) + // Create a sparse vector (1.0, 0.0, 3.0) by specifying its nonzero entries. + val sv2: Vector = Vectors.sparse(3, Seq((0, 1.0), (2, 3.0))) + // $example off:local_vector$ + } + + def labeledPointExample(): Unit = { + // $example on:labeled_point$ + // Create a labeled point with a positive label and a dense feature vector. + val pos = LabeledPoint(1.0, Vectors.dense(1.0, 0.0, 3.0)) + + // Create a labeled point with a negative label and a sparse feature vector. + val neg = LabeledPoint(0.0, Vectors.sparse(3, Array(0, 2), Array(1.0, 3.0))) + // $example off:labeled_point$ + } + + def libsvmExample(): Unit = { + val sc = SparkContext.getOrCreate() + // $example on:libsvm$ + val examples: RDD[LabeledPoint] = + MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt") + // $example off:libsvm$ + } + + def localMatrixExample(): Unit = { + // $example on:local_matrix$ + // Create a dense matrix ((1.0, 2.0), (3.0, 4.0), (5.0, 6.0)) + val dm: Matrix = Matrices.dense(3, 2, Array(1.0, 3.0, 5.0, 2.0, 4.0, 6.0)) + + // Create a sparse matrix ((9.0, 0.0), (0.0, 8.0), (0.0, 6.0)) + val sm: Matrix = Matrices.sparse(3, 2, Array(0, 1, 3), Array(0, 2, 1), Array(9, 6, 8)) + // $example off:local_matrix$ + } + + def rowMatrixExample(): Unit = { + val sc = SparkContext.getOrCreate() + // $example on:row_matrix$ + val v1 = Vectors.dense(1.0, 10.0, 100.0) + val v2 = Vectors.dense(2.0, 20.0, 200.0) + val v3 = Vectors.dense(3.0, 30.0, 300.0) + + val rows: RDD[Vector] = sc.parallelize(Seq(v1, v2, v3)) // an RDD of local vectors + // Create a RowMatrix from an RDD[Vector]. + val mat: RowMatrix = new RowMatrix(rows) + + // Get its size. 
+ val m = mat.numRows() + val n = mat.numCols() + + // QR decomposition + val qrResult = mat.tallSkinnyQR(true) + // $example off:row_matrix$ + } + + def indexedRowMatrixExample(): Unit = { + val sc = SparkContext.getOrCreate() + + // $example on:indexed_row_matrix$ + val r0 = IndexedRow(0, Vectors.dense(1, 2, 3)) + val r1 = IndexedRow(1, Vectors.dense(4, 5, 6)) + val r2 = IndexedRow(2, Vectors.dense(7, 8, 9)) + val r3 = IndexedRow(3, Vectors.dense(10, 11, 12)) + + val rows: RDD[IndexedRow] = sc.parallelize(Seq(r0, r1, r2, r3)) // an RDD of indexed rows + // Create an IndexedRowMatrix from an RDD[IndexedRow]. + val mat: IndexedRowMatrix = new IndexedRowMatrix(rows) + + // Get its size. + val m = mat.numRows() + val n = mat.numCols() + + // Drop its row indices. + val rowMat: RowMatrix = mat.toRowMatrix() + // $example off:indexed_row_matrix$ + } + + def coordinateMatrixExample(): Unit = { + val sc = SparkContext.getOrCreate() + + // $example on:coordinate_row_matrix$ + val me1 = MatrixEntry(0, 0, 1.2) + val me2 = MatrixEntry(1, 0, 2.1) + val me3 = MatrixEntry(6, 1, 3.7) + + val entries: RDD[MatrixEntry] = sc.parallelize(Seq(me1, me2, me3)) // an RDD of matrix entries + // Create a CoordinateMatrix from an RDD[MatrixEntry]. + val mat: CoordinateMatrix = new CoordinateMatrix(entries) + + // Get its size. + val m = mat.numRows() + val n = mat.numCols() + + // Convert it to an IndexedRowMatrix whose rows are sparse vectors. + val indexedRowMatrix = mat.toIndexedRowMatrix() + // $example off:coordinate_row_matrix$ + } + + def blockMatrixExample(): Unit = { + val sc = SparkContext.getOrCreate() + + // $example on:block_matrix$ + val me1 = MatrixEntry(0, 0, 1.2) + val me2 = MatrixEntry(1, 0, 2.1) + val me3 = MatrixEntry(6, 1, 3.7) + + // an RDD of (i, j, v) matrix entries + val entries: RDD[MatrixEntry] = sc.parallelize(Seq(me1, me2, me3)) + // Create a CoordinateMatrix from an RDD[MatrixEntry]. 
+ val coordMat: CoordinateMatrix = new CoordinateMatrix(entries) + // Transform the CoordinateMatrix to a BlockMatrix + val matA: BlockMatrix = coordMat.toBlockMatrix().cache() + + // Validate whether the BlockMatrix is set up properly. + // Throws an Exception when it is not valid. + // Nothing happens if it is valid. + matA.validate() + + // Calculate A^T A. + val ata = matA.transpose.multiply(matA) + // $example off:block_matrix$ + } + + def main(args: Array[String]): Unit = { + val conf = new SparkConf().setAppName("DataTypeExamples") + val sc = new SparkContext(conf) + + localVectorExample() + labeledPointExample() + libsvmExample() + localMatrixExample() + rowMatrixExample() + indexedRowMatrixExample() + coordinateMatrixExample() + blockMatrixExample() + + sc.stop() + } +} +// scalastyle:on println From 5ce1ef1948434162961139222d23ff3bd7fa2b8b Mon Sep 17 00:00:00 2001 From: Xusen Yin Date: Thu, 30 Jun 2016 14:40:58 -0700 Subject: [PATCH 02/10] change md file --- docs/mllib-data-types.md | 247 +++------------------------------------ 1 file changed, 17 insertions(+), 230 deletions(-) diff --git a/docs/mllib-data-types.md b/docs/mllib-data-types.md index 3f9c092fc814b..2ad38dc85ea40 100644 --- a/docs/mllib-data-types.md +++ b/docs/mllib-data-types.md @@ -54,15 +54,8 @@ using the factory methods implemented in Refer to the [`Vector` Java docs](api/java/org/apache/spark/mllib/linalg/Vector.html) and [`Vectors` Java docs](api/java/org/apache/spark/mllib/linalg/Vectors.html) for details on the API. -{% highlight java %} -import org.apache.spark.mllib.linalg.Vector; -import org.apache.spark.mllib.linalg.Vectors; +{% include_example local_vector java/org/apache/spark/examples/mllib/JavaDataTypesExamples.java %} -// Create a dense vector (1.0, 0.0, 3.0). -Vector dv = Vectors.dense(1.0, 0.0, 3.0); -// Create a sparse vector (1.0, 0.0, 3.0) by specifying its indices and values corresponding to nonzero entries. 
-Vector sv = Vectors.sparse(3, new int[] {0, 2}, new double[] {1.0, 3.0}); -{% endhighlight %}
@@ -83,20 +76,7 @@ in [`Vectors`](api/python/pyspark.mllib.html#pyspark.mllib.linalg.Vectors) to cr Refer to the [`Vectors` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.linalg.Vectors) for more details on the API. -{% highlight python %} -import numpy as np -import scipy.sparse as sps -from pyspark.mllib.linalg import Vectors - -# Use a NumPy array as a dense vector. -dv1 = np.array([1.0, 0.0, 3.0]) -# Use a Python list as a dense vector. -dv2 = [1.0, 0.0, 3.0] -# Create a SparseVector. -sv1 = Vectors.sparse(3, [0, 2], [1.0, 3.0]) -# Use a single-column SciPy csc_matrix as a sparse vector. -sv2 = sps.csc_matrix((np.array([1.0, 3.0]), np.array([0, 2]), np.array([0, 2])), shape = (3, 1)) -{% endhighlight %} +{% include_example local_vector python/mllib/datatypes_examples.py %}
@@ -129,16 +109,8 @@ A labeled point is represented by Refer to the [`LabeledPoint` Java docs](api/java/org/apache/spark/mllib/regression/LabeledPoint.html) for details on the API. -{% highlight java %} -import org.apache.spark.mllib.linalg.Vectors; -import org.apache.spark.mllib.regression.LabeledPoint; - -// Create a labeled point with a positive label and a dense feature vector. -LabeledPoint pos = new LabeledPoint(1.0, Vectors.dense(1.0, 0.0, 3.0)); +{% include_example labeled_point java/org/apache/spark/examples/mllib/JavaDataTypesExamples.java %} -// Create a labeled point with a negative label and a sparse feature vector. -LabeledPoint neg = new LabeledPoint(0.0, Vectors.sparse(3, new int[] {0, 2}, new double[] {1.0, 3.0})); -{% endhighlight %}
@@ -148,16 +120,8 @@ A labeled point is represented by Refer to the [`LabeledPoint` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.regression.LabeledPoint) for more details on the API. -{% highlight python %} -from pyspark.mllib.linalg import SparseVector -from pyspark.mllib.regression import LabeledPoint - -# Create a labeled point with a positive label and a dense feature vector. -pos = LabeledPoint(1.0, [1.0, 0.0, 3.0]) +{% include_example labeled_point python/mllib/datatypes_examples.py %} -# Create a labeled point with a negative label and a sparse feature vector. -neg = LabeledPoint(0.0, SparseVector(3, [0, 2], [1.0, 3.0])) -{% endhighlight %}
@@ -194,14 +158,8 @@ examples stored in LIBSVM format. Refer to the [`MLUtils` Java docs](api/java/org/apache/spark/mllib/util/MLUtils.html) for details on the API. -{% highlight java %} -import org.apache.spark.mllib.regression.LabeledPoint; -import org.apache.spark.mllib.util.MLUtils; -import org.apache.spark.api.java.JavaRDD; +{% include_example libsvm java/org/apache/spark/examples/mllib/JavaDataTypesExamples.java %} -JavaRDD examples = - MLUtils.loadLibSVMFile(jsc.sc(), "data/mllib/sample_libsvm_data.txt").toJavaRDD(); -{% endhighlight %}
@@ -210,11 +168,8 @@ examples stored in LIBSVM format. Refer to the [`MLUtils` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.util.MLUtils) for more details on the API. -{% highlight python %} -from pyspark.mllib.util import MLUtils +{% include_example libsvm python/mllib/datatypes_examples.py %} -examples = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt") -{% endhighlight %}
@@ -260,16 +215,8 @@ matrices. Remember, local matrices in MLlib are stored in column-major order. Refer to the [`Matrix` Java docs](api/java/org/apache/spark/mllib/linalg/Matrix.html) and [`Matrices` Java docs](api/java/org/apache/spark/mllib/linalg/Matrices.html) for details on the API. -{% highlight java %} -import org.apache.spark.mllib.linalg.Matrix; -import org.apache.spark.mllib.linalg.Matrices; +{% include_example local_matrix java/org/apache/spark/examples/mllib/JavaDataTypesExamples.java %} -// Create a dense matrix ((1.0, 2.0), (3.0, 4.0), (5.0, 6.0)) -Matrix dm = Matrices.dense(3, 2, new double[] {1.0, 3.0, 5.0, 2.0, 4.0, 6.0}); - -// Create a sparse matrix ((9.0, 0.0), (0.0, 8.0), (0.0, 6.0)) -Matrix sm = Matrices.sparse(3, 2, new int[] {0, 1, 3}, new int[] {0, 2, 1}, new double[] {9, 6, 8}); -{% endhighlight %}
@@ -284,15 +231,8 @@ matrices. Remember, local matrices in MLlib are stored in column-major order. Refer to the [`Matrix` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.linalg.Matrix) and [`Matrices` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.linalg.Matrices) for more details on the API. -{% highlight python %} -from pyspark.mllib.linalg import Matrix, Matrices - -# Create a dense matrix ((1.0, 2.0), (3.0, 4.0), (5.0, 6.0)) -dm2 = Matrices.dense(3, 2, [1, 2, 3, 4, 5, 6]) +{% include_example local_matrix python/mllib/datatypes_examples.py %} -# Create a sparse matrix ((9.0, 0.0), (0.0, 8.0), (0.0, 6.0)) -sm = Matrices.sparse(3, 2, [0, 1, 3], [0, 2, 1], [9, 6, 8]) -{% endhighlight %}
@@ -351,22 +291,8 @@ created from a `JavaRDD` instance. Then we can compute its column summa Refer to the [`RowMatrix` Java docs](api/java/org/apache/spark/mllib/linalg/distributed/RowMatrix.html) for details on the API. -{% highlight java %} -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.mllib.linalg.Vector; -import org.apache.spark.mllib.linalg.distributed.RowMatrix; +{% include_example row_matrix java/org/apache/spark/examples/mllib/JavaDataTypesExamples.java %} -JavaRDD rows = ... // a JavaRDD of local vectors -// Create a RowMatrix from an JavaRDD. -RowMatrix mat = new RowMatrix(rows.rdd()); - -// Get its size. -long m = mat.numRows(); -long n = mat.numCols(); - -// QR decomposition -QRDecomposition result = mat.tallSkinnyQR(true); -{% endhighlight %}
@@ -376,24 +302,9 @@ created from an `RDD` of vectors. Refer to the [`RowMatrix` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.linalg.distributed.RowMatrix) for more details on the API. -{% highlight python %} -from pyspark.mllib.linalg.distributed import RowMatrix - -# Create an RDD of vectors. -rows = sc.parallelize([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]]) - -# Create a RowMatrix from an RDD of vectors. -mat = RowMatrix(rows) +{% include_example row_matrix python/mllib/datatypes_examples.py %} -# Get its size. -m = mat.numRows() # 4 -n = mat.numCols() # 3 - -# Get the rows as an RDD of vectors again. -rowsRDD = mat.rows -{% endhighlight %}
- ### IndexedRowMatrix @@ -429,23 +340,8 @@ its row indices. Refer to the [`IndexedRowMatrix` Java docs](api/java/org/apache/spark/mllib/linalg/distributed/IndexedRowMatrix.html) for details on the API. -{% highlight java %} -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.mllib.linalg.distributed.IndexedRow; -import org.apache.spark.mllib.linalg.distributed.IndexedRowMatrix; -import org.apache.spark.mllib.linalg.distributed.RowMatrix; - -JavaRDD rows = ... // a JavaRDD of indexed rows -// Create an IndexedRowMatrix from a JavaRDD. -IndexedRowMatrix mat = new IndexedRowMatrix(rows.rdd()); +{% include_example indexed_row_matrix java/org/apache/spark/examples/mllib/JavaDataTypesExamples.java %} -// Get its size. -long m = mat.numRows(); -long n = mat.numCols(); - -// Drop its row indices. -RowMatrix rowMat = mat.toRowMatrix(); -{% endhighlight %}
@@ -458,34 +354,9 @@ its row indices. Refer to the [`IndexedRowMatrix` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.linalg.distributed.IndexedRowMatrix) for more details on the API. -{% highlight python %} -from pyspark.mllib.linalg.distributed import IndexedRow, IndexedRowMatrix - -# Create an RDD of indexed rows. -# - This can be done explicitly with the IndexedRow class: -indexedRows = sc.parallelize([IndexedRow(0, [1, 2, 3]), - IndexedRow(1, [4, 5, 6]), - IndexedRow(2, [7, 8, 9]), - IndexedRow(3, [10, 11, 12])]) -# - or by using (long, vector) tuples: -indexedRows = sc.parallelize([(0, [1, 2, 3]), (1, [4, 5, 6]), - (2, [7, 8, 9]), (3, [10, 11, 12])]) +{% include_example indexed_row_matrix python/mllib/datatypes_examples.py %} -# Create an IndexedRowMatrix from an RDD of IndexedRows. -mat = IndexedRowMatrix(indexedRows) - -# Get its size. -m = mat.numRows() # 4 -n = mat.numCols() # 3 - -# Get the rows as an RDD of IndexedRows. -rowsRDD = mat.rows - -# Convert to a RowMatrix by dropping the row indices. -rowMat = mat.toRowMatrix() -{% endhighlight %}
- ### CoordinateMatrix @@ -508,7 +379,7 @@ with sparse rows by calling `toIndexedRowMatrix`. Other computations for Refer to the [`CoordinateMatrix` Scala docs](api/scala/index.html#org.apache.spark.mllib.linalg.distributed.CoordinateMatrix) for details on the API. -{% include_example coordinate_row_matrix scala/org/apache/spark/examples/mllib/DataTypesExamples.scala %} +{% include_example coordinate_matrix scala/org/apache/spark/examples/mllib/DataTypesExamples.scala %} @@ -524,23 +395,8 @@ with sparse rows by calling `toIndexedRowMatrix`. Other computations for Refer to the [`CoordinateMatrix` Java docs](api/java/org/apache/spark/mllib/linalg/distributed/CoordinateMatrix.html) for details on the API. -{% highlight java %} -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.mllib.linalg.distributed.CoordinateMatrix; -import org.apache.spark.mllib.linalg.distributed.IndexedRowMatrix; -import org.apache.spark.mllib.linalg.distributed.MatrixEntry; +{% include_example coordinate_matrix java/org/apache/spark/examples/mllib/JavaDataTypesExamples.java %} -JavaRDD entries = ... // a JavaRDD of matrix entries -// Create a CoordinateMatrix from a JavaRDD. -CoordinateMatrix mat = new CoordinateMatrix(entries.rdd()); - -// Get its size. -long m = mat.numRows(); -long n = mat.numCols(); - -// Convert it to an IndexRowMatrix whose rows are sparse vectors. -IndexedRowMatrix indexedRowMatrix = mat.toIndexedRowMatrix(); -{% endhighlight %}
@@ -553,36 +409,9 @@ calling `toRowMatrix`, or to an `IndexedRowMatrix` with sparse rows by calling ` Refer to the [`CoordinateMatrix` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.linalg.distributed.CoordinateMatrix) for more details on the API. -{% highlight python %} -from pyspark.mllib.linalg.distributed import CoordinateMatrix, MatrixEntry - -# Create an RDD of coordinate entries. -# - This can be done explicitly with the MatrixEntry class: -entries = sc.parallelize([MatrixEntry(0, 0, 1.2), MatrixEntry(1, 0, 2.1), MatrixEntry(6, 1, 3.7)]) -# - or using (long, long, float) tuples: -entries = sc.parallelize([(0, 0, 1.2), (1, 0, 2.1), (2, 1, 3.7)]) - -# Create an CoordinateMatrix from an RDD of MatrixEntries. -mat = CoordinateMatrix(entries) - -# Get its size. -m = mat.numRows() # 3 -n = mat.numCols() # 2 +{% include_example coordinate_matrix python/mllib/datatypes_examples.py %} -# Get the entries as an RDD of MatrixEntries. -entriesRDD = mat.entries - -# Convert to a RowMatrix. -rowMat = mat.toRowMatrix() - -# Convert to an IndexedRowMatrix. -indexedRowMat = mat.toIndexedRowMatrix() - -# Convert to a BlockMatrix. -blockMat = mat.toBlockMatrix() -{% endhighlight %}
- ### BlockMatrix @@ -617,25 +446,8 @@ Users may change the block size by supplying the values through `toBlockMatrix(r Refer to the [`BlockMatrix` Java docs](api/java/org/apache/spark/mllib/linalg/distributed/BlockMatrix.html) for details on the API. -{% highlight java %} -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.mllib.linalg.distributed.BlockMatrix; -import org.apache.spark.mllib.linalg.distributed.CoordinateMatrix; -import org.apache.spark.mllib.linalg.distributed.IndexedRowMatrix; +{% include_example block_matrix java/org/apache/spark/examples/mllib/JavaDataTypesExamples.java %} -JavaRDD entries = ... // a JavaRDD of (i, j, v) Matrix Entries -// Create a CoordinateMatrix from a JavaRDD. -CoordinateMatrix coordMat = new CoordinateMatrix(entries.rdd()); -// Transform the CoordinateMatrix to a BlockMatrix -BlockMatrix matA = coordMat.toBlockMatrix().cache(); - -// Validate whether the BlockMatrix is set up properly. Throws an Exception when it is not valid. -// Nothing happens if it is valid. -matA.validate(); - -// Calculate A^T A. -BlockMatrix ata = matA.transpose().multiply(matA); -{% endhighlight %}
@@ -646,32 +458,7 @@ can be created from an `RDD` of sub-matrix blocks, where a sub-matrix block is a Refer to the [`BlockMatrix` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.linalg.distributed.BlockMatrix) for more details on the API. -{% highlight python %} -from pyspark.mllib.linalg import Matrices -from pyspark.mllib.linalg.distributed import BlockMatrix - -# Create an RDD of sub-matrix blocks. -blocks = sc.parallelize([((0, 0), Matrices.dense(3, 2, [1, 2, 3, 4, 5, 6])), - ((1, 0), Matrices.dense(3, 2, [7, 8, 9, 10, 11, 12]))]) - -# Create a BlockMatrix from an RDD of sub-matrix blocks. -mat = BlockMatrix(blocks, 3, 2) - -# Get its size. -m = mat.numRows() # 6 -n = mat.numCols() # 2 - -# Get the blocks as an RDD of sub-matrix blocks. -blocksRDD = mat.blocks - -# Convert to a LocalMatrix. -localMat = mat.toLocalMatrix() - -# Convert to an IndexedRowMatrix. -indexedRowMat = mat.toIndexedRowMatrix() +{% include_example block_matrix python/mllib/datatypes_examples.py %} -# Convert to a CoordinateMatrix. -coordinateMat = mat.toCoordinateMatrix() -{% endhighlight %}
From 922ba78801549a801d3f9567bbb065e6ba7fd0d5 Mon Sep 17 00:00:00 2001 From: Xusen Yin Date: Thu, 30 Jun 2016 14:41:31 -0700 Subject: [PATCH 03/10] refine modifier --- .../examples/mllib/JavaDataTypesExamples.java | 188 +++++++++++++++++ .../main/python/mllib/datatypes_examples.py | 197 ++++++++++++++++++ .../examples/mllib/DataTypesExamples.scala | 48 ++--- 3 files changed, 409 insertions(+), 24 deletions(-) create mode 100644 examples/src/main/java/org/apache/spark/examples/mllib/JavaDataTypesExamples.java create mode 100644 examples/src/main/python/mllib/datatypes_examples.py diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaDataTypesExamples.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaDataTypesExamples.java new file mode 100644 index 0000000000000..a72a3a41798fa --- /dev/null +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaDataTypesExamples.java @@ -0,0 +1,188 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.examples.mllib; + +import java.util.Arrays; + +import org.apache.spark.SparkConf; +import org.apache.spark.SparkContext; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.mllib.linalg.*; +import org.apache.spark.mllib.linalg.distributed.*; +import org.apache.spark.mllib.regression.LabeledPoint; +import org.apache.spark.mllib.util.MLUtils; + + +public class JavaDataTypesExamples { + + private static void localVectorExample() { + // $example on:local_vector$ + // Create a dense vector (1.0, 0.0, 3.0). + Vector dv = Vectors.dense(1.0, 0.0, 3.0); + // Create a sparse vector (1.0, 0.0, 3.0) by specifying its indices and values corresponding to + // nonzero entries. + Vector sv = Vectors.sparse(3, new int[] {0, 2}, new double[] {1.0, 3.0}); + // $example off:local_vector$ + } + + private static void labeledPointExample() { + // $example on:labeled_point$ + // Create a labeled point with a positive label and a dense feature vector. + LabeledPoint pos = new LabeledPoint(1.0, Vectors.dense(1.0, 0.0, 3.0)); + + // Create a labeled point with a negative label and a sparse feature vector. 
+ LabeledPoint neg = + new LabeledPoint(0.0, Vectors.sparse(3, new int[] {0, 2}, new double[] {1.0, 3.0})); + // $example off:labeled_point$ + } + + private static void libsvmExample() { + // $example on:libsvm$ + SparkContext sc = SparkContext.getOrCreate(); + JavaRDD examples = + MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toJavaRDD(); + // $example off:libsvm$ + } + + private static void localMatrixExample() { + // $example on:local_matrix$ + // Create a dense matrix ((1.0, 2.0), (3.0, 4.0), (5.0, 6.0)) + Matrix dm = Matrices.dense(3, 2, new double[] {1.0, 3.0, 5.0, 2.0, 4.0, 6.0}); + + // Create a sparse matrix ((9.0, 0.0), (0.0, 8.0), (0.0, 6.0)) + Matrix sm = + Matrices.sparse(3, 2, new int[] {0, 1, 3}, new int[] {0, 2, 1}, new double[] {9, 6, 8}); + // $example off:local_matrix$ + } + + private static void rowMatrixExample() { + SparkContext sc = SparkContext.getOrCreate(); + JavaSparkContext jsc = JavaSparkContext.fromSparkContext(sc); + + // $example on:row_matrix$ + Vector v1 = Vectors.dense(1.0, 10.0, 100.0); + Vector v2 = Vectors.dense(2.0, 20.0, 200.0); + Vector v3 = Vectors.dense(3.0, 30.0, 300.0); + + // a JavaRDD of local vectors + JavaRDD rows = jsc.parallelize(Arrays.asList(v1, v2, v3)); + + // Create a RowMatrix from an JavaRDD. + RowMatrix mat = new RowMatrix(rows.rdd()); + + // Get its size. 
+ long m = mat.numRows(); + long n = mat.numCols(); + + // QR decomposition + QRDecomposition result = mat.tallSkinnyQR(true); + // $example off:row_matrix$ + } + + private static void indexedRowMatrixExample() { + SparkContext sc = SparkContext.getOrCreate(); + JavaSparkContext jsc = JavaSparkContext.fromSparkContext(sc); + + // $example on:indexed_row_matrix$ + IndexedRow r0 = new IndexedRow(0, Vectors.dense(1, 2, 3)); + IndexedRow r1 = new IndexedRow(1, Vectors.dense(4, 5, 6)); + IndexedRow r2 = new IndexedRow(2, Vectors.dense(7, 8, 9)); + IndexedRow r3 = new IndexedRow(3, Vectors.dense(10, 11, 12)); + + // a JavaRDD of indexed rows + JavaRDD rows = jsc.parallelize(Arrays.asList(r0, r1, r2, r3)); + + // Create an IndexedRowMatrix from a JavaRDD. + IndexedRowMatrix mat = new IndexedRowMatrix(rows.rdd()); + + // Get its size. + long m = mat.numRows(); + long n = mat.numCols(); + + // Drop its row indices. + RowMatrix rowMat = mat.toRowMatrix(); + // $example off:indexed_row_matrix$ + } + + private static void coordinateMatrixExample() { + SparkContext sc = SparkContext.getOrCreate(); + JavaSparkContext jsc = JavaSparkContext.fromSparkContext(sc); + + // $example on:coordinate_matrix$ + MatrixEntry me1 = new MatrixEntry(0, 0, 1.2); + MatrixEntry me2 = new MatrixEntry(1, 0, 2.1); + MatrixEntry me3 = new MatrixEntry(6, 1, 3.7); + + // a JavaRDD of matrix entries + JavaRDD entries = jsc.parallelize(Arrays.asList(me1, me2, me3)); + // Create a CoordinateMatrix from a JavaRDD. + CoordinateMatrix mat = new CoordinateMatrix(entries.rdd()); + + // Get its size. + long m = mat.numRows(); + long n = mat.numCols(); + + // Convert it to an IndexRowMatrix whose rows are sparse vectors. 
+ IndexedRowMatrix indexedRowMatrix = mat.toIndexedRowMatrix(); + // $example off:coordinate_matrix$ + } + + private static void blockMatrixExample() { + SparkContext sc = SparkContext.getOrCreate(); + JavaSparkContext jsc = JavaSparkContext.fromSparkContext(sc); + + // $example on:block_matrix$ + MatrixEntry me1 = new MatrixEntry(0, 0, 1.2); + MatrixEntry me2 = new MatrixEntry(1, 0, 2.1); + MatrixEntry me3 = new MatrixEntry(6, 1, 3.7); + + // a JavaRDD of (i, j, v) Matrix Entries + JavaRDD entries = jsc.parallelize(Arrays.asList(me1, me2, me3)); + + // Create a CoordinateMatrix from a JavaRDD. + CoordinateMatrix coordMat = new CoordinateMatrix(entries.rdd()); + // Transform the CoordinateMatrix to a BlockMatrix + BlockMatrix matA = coordMat.toBlockMatrix().cache(); + + // Validate whether the BlockMatrix is set up properly. + // Throws an Exception when it is not valid. Nothing happens if it is valid. + matA.validate(); + + // Calculate A^T A. + BlockMatrix ata = matA.transpose().multiply(matA); + // $example off:block_matrix$ + } + + public static void main(String[] args) { + + SparkConf conf = new SparkConf().setAppName("JavaDataTypesExample"); + SparkContext sc = new SparkContext(conf); + + localVectorExample(); + labeledPointExample(); + libsvmExample(); + localMatrixExample(); + rowMatrixExample(); + indexedRowMatrixExample(); + coordinateMatrixExample(); + blockMatrixExample(); + + sc.stop(); + } +} diff --git a/examples/src/main/python/mllib/datatypes_examples.py b/examples/src/main/python/mllib/datatypes_examples.py new file mode 100644 index 0000000000000..f9e8adf8a9b44 --- /dev/null +++ b/examples/src/main/python/mllib/datatypes_examples.py @@ -0,0 +1,197 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. 
+# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from __future__ import print_function + +from numpy import array + +from pyspark import SparkContext +from pyspark.mllib.clustering import BisectingKMeans, BisectingKMeansModel + + +def __local_vector_example(): + # $example on:local_vector$ + import numpy as np + import scipy.sparse as sps + from pyspark.mllib.linalg import Vectors + + # Use a NumPy array as a dense vector. + dv1 = np.array([1.0, 0.0, 3.0]) + # Use a Python list as a dense vector. + dv2 = [1.0, 0.0, 3.0] + # Create a SparseVector. + sv1 = Vectors.sparse(3, [0, 2], [1.0, 3.0]) + # Use a single-column SciPy csc_matrix as a sparse vector. + sv2 = sps.csc_matrix((np.array([1.0, 3.0]), np.array([0, 2]), np.array([0, 2])), shape=(3, 1)) + # $example off:local_vector$ + + +def __labeled_point_example(): + # $example on:labeled_point$ + from pyspark.mllib.linalg import SparseVector + from pyspark.mllib.regression import LabeledPoint + + # Create a labeled point with a positive label and a dense feature vector. + pos = LabeledPoint(1.0, [1.0, 0.0, 3.0]) + + # Create a labeled point with a negative label and a sparse feature vector. 
+ neg = LabeledPoint(0.0, SparseVector(3, [0, 2], [1.0, 3.0])) + # $example off:labeled_point$ + + +def __libsvm_example(): + # $example on:libsvm$ + from pyspark.mllib.util import MLUtils + + examples = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt") + # $example off:libsvm$ + + +def __local_matrix_example(): + # $example on:local_matrix$ + from pyspark.mllib.linalg import Matrix, Matrices + + # Create a dense matrix ((1.0, 4.0), (2.0, 5.0), (3.0, 6.0)); values are given in column-major order + dm2 = Matrices.dense(3, 2, [1, 2, 3, 4, 5, 6]) + + # Create a sparse matrix ((9.0, 0.0), (0.0, 8.0), (0.0, 6.0)) + sm = Matrices.sparse(3, 2, [0, 1, 3], [0, 2, 1], [9, 6, 8]) + # $example off:local_matrix$ + + +def __row_matrix_example(): + # $example on:row_matrix$ + from pyspark.mllib.linalg.distributed import RowMatrix + + # Create an RDD of vectors. + rows = sc.parallelize([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]]) + + # Create a RowMatrix from an RDD of vectors. + mat = RowMatrix(rows) + + # Get its size. + m = mat.numRows() # 4 + n = mat.numCols() # 3 + + # Get the rows as an RDD of vectors again. + rowsRDD = mat.rows + # $example off:row_matrix$ + + +def __indexed_row_matrix_example(): + # $example on:indexed_row_matrix$ + from pyspark.mllib.linalg.distributed import IndexedRow, IndexedRowMatrix + + # Create an RDD of indexed rows. + # - This can be done explicitly with the IndexedRow class: + indexedRows = sc.parallelize([IndexedRow(0, [1, 2, 3]), + IndexedRow(1, [4, 5, 6]), + IndexedRow(2, [7, 8, 9]), + IndexedRow(3, [10, 11, 12])]) + # - or by using (long, vector) tuples: + indexedRows = sc.parallelize([(0, [1, 2, 3]), (1, [4, 5, 6]), + (2, [7, 8, 9]), (3, [10, 11, 12])]) + + # Create an IndexedRowMatrix from an RDD of IndexedRows. + mat = IndexedRowMatrix(indexedRows) + + # Get its size. + m = mat.numRows() # 4 + n = mat.numCols() # 3 + + # Get the rows as an RDD of IndexedRows. + rowsRDD = mat.rows + + # Convert to a RowMatrix by dropping the row indices.
+ rowMat = mat.toRowMatrix() + # $example off:indexed_row_matrix$ + + +def __coordinate_matrix_example(): + # $example on:coordinate_matrix$ + from pyspark.mllib.linalg.distributed import CoordinateMatrix, MatrixEntry + + # Create an RDD of coordinate entries. + # - This can be done explicitly with the MatrixEntry class: + entries = sc.parallelize([MatrixEntry(0, 0, 1.2), MatrixEntry(1, 0, 2.1), MatrixEntry(6, 1, 3.7)]) + # - or using (long, long, float) tuples: + entries = sc.parallelize([(0, 0, 1.2), (1, 0, 2.1), (2, 1, 3.7)]) + + # Create an CoordinateMatrix from an RDD of MatrixEntries. + mat = CoordinateMatrix(entries) + + # Get its size. + m = mat.numRows() # 3 + n = mat.numCols() # 2 + + # Get the entries as an RDD of MatrixEntries. + entriesRDD = mat.entries + + # Convert to a RowMatrix. + rowMat = mat.toRowMatrix() + + # Convert to an IndexedRowMatrix. + indexedRowMat = mat.toIndexedRowMatrix() + + # Convert to a BlockMatrix. + blockMat = mat.toBlockMatrix() + # $example off:coordinate_matrix$ + + +def __block_matrix(): + # $example on:block_matrix$ + from pyspark.mllib.linalg import Matrices + from pyspark.mllib.linalg.distributed import BlockMatrix + + # Create an RDD of sub-matrix blocks. + blocks = sc.parallelize([((0, 0), Matrices.dense(3, 2, [1, 2, 3, 4, 5, 6])), + ((1, 0), Matrices.dense(3, 2, [7, 8, 9, 10, 11, 12]))]) + + # Create a BlockMatrix from an RDD of sub-matrix blocks. + mat = BlockMatrix(blocks, 3, 2) + + # Get its size. + m = mat.numRows() # 6 + n = mat.numCols() # 2 + + # Get the blocks as an RDD of sub-matrix blocks. + blocksRDD = mat.blocks + + # Convert to a LocalMatrix. + localMat = mat.toLocalMatrix() + + # Convert to an IndexedRowMatrix. + indexedRowMat = mat.toIndexedRowMatrix() + + # Convert to a CoordinateMatrix. 
+ coordinateMat = mat.toCoordinateMatrix() + # $example off:block_matrix$ + + +if __name__ == "__main__": + sc = SparkContext(appName="PythonDataTypesExamples") # SparkContext + + __local_vector_example() + __labeled_point_example() + __libsvm_example() + __local_matrix_example() + __row_matrix_example() + __indexed_row_matrix_example() + __coordinate_matrix_example() + __block_matrix() + + sc.stop() diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/DataTypesExamples.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/DataTypesExamples.scala index 223aa93b3f2c1..6b678cd853b31 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/DataTypesExamples.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/DataTypesExamples.scala @@ -20,9 +20,7 @@ package org.apache.spark.examples.mllib import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.linalg.{Matrices, Matrix} -// $example on:local-vector$ import org.apache.spark.mllib.linalg.{Vector, Vectors} -// $example off:local-vector$ import org.apache.spark.mllib.linalg.distributed.{BlockMatrix, CoordinateMatrix, MatrixEntry} import org.apache.spark.mllib.linalg.distributed.{IndexedRow, IndexedRowMatrix, RowMatrix} import org.apache.spark.mllib.regression.LabeledPoint @@ -32,8 +30,10 @@ import org.apache.spark.rdd.RDD object DataTypesExamples { - def localVectorExample(): Unit = { - // $example on:local-vector$ + private def localVectorExample(): Unit = { + import org.apache.spark.mllib.linalg.{Vector, Vectors} + + // $example on:local_vector$ // Create a dense vector (1.0, 0.0, 3.0). val dv: Vector = Vectors.dense(1.0, 0.0, 3.0) // Create a sparse vector (1.0, 0.0, 3.0) by specifying its indices and values corresponding to @@ -41,20 +41,20 @@ object DataTypesExamples { val sv1: Vector = Vectors.sparse(3, Array(0, 2), Array(1.0, 3.0)) // Create a sparse vector (1.0, 0.0, 3.0) by specifying its nonzero entries. 
val sv2: Vector = Vectors.sparse(3, Seq((0, 1.0), (2, 3.0))) - // $example off:local-vector$ + // $example off:local_vector$ } - def labeledPointExample(): Unit = { - // $example on:labeled-point$ + private def labeledPointExample(): Unit = { + // $example on:labeled_point$ // Create a labeled point with a positive label and a dense feature vector. val pos = LabeledPoint(1.0, Vectors.dense(1.0, 0.0, 3.0)) // Create a labeled point with a negative label and a sparse feature vector. val neg = LabeledPoint(0.0, Vectors.sparse(3, Array(0, 2), Array(1.0, 3.0))) - // $example off:labeled-point$ + // $example off:labeled_point$ } - def libsvmExample(): Unit = { + private def libsvmExample(): Unit = { val sc = SparkContext.getOrCreate() // $example on:libsvm$ val examples: RDD[LabeledPoint] = @@ -62,19 +62,19 @@ object DataTypesExamples { // $example off:libsvm$ } - def localMatrixExample(): Unit = { - // $example on:local-matrix$ + private def localMatrixExample(): Unit = { + // $example on:local_matrix$ // Create a dense matrix ((1.0, 2.0), (3.0, 4.0), (5.0, 6.0)) val dm: Matrix = Matrices.dense(3, 2, Array(1.0, 3.0, 5.0, 2.0, 4.0, 6.0)) // Create a sparse matrix ((9.0, 0.0), (0.0, 8.0), (0.0, 6.0)) val sm: Matrix = Matrices.sparse(3, 2, Array(0, 1, 3), Array(0, 2, 1), Array(9, 6, 8)) - // $example off:local-matrix$ + // $example off:local_matrix$ } - def rowMatrixExample(): Unit = { + private def rowMatrixExample(): Unit = { val sc = SparkContext.getOrCreate() - // $example on:row-matrix$ + // $example on:row_matrix$ val v1 = Vectors.dense(1.0, 10.0, 100.0) val v2 = Vectors.dense(2.0, 20.0, 200.0) val v3 = Vectors.dense(3.0, 30.0, 300.0) @@ -89,13 +89,13 @@ object DataTypesExamples { // QR decomposition val qrResult = mat.tallSkinnyQR(true) - // $example off:row-matrix$ + // $example off:row_matrix$ } - def indexedRowMatrixExample(): Unit = { + private def indexedRowMatrixExample(): Unit = { val sc = SparkContext.getOrCreate() - // $example on:indexed-row-matrix$ + // 
$example on:indexed_row_matrix$ val r0 = IndexedRow(0, Vectors.dense(1, 2, 3)) val r1 = IndexedRow(1, Vectors.dense(4, 5, 6)) val r2 = IndexedRow(2, Vectors.dense(7, 8, 9)) @@ -111,13 +111,13 @@ object DataTypesExamples { // Drop its row indices. val rowMat: RowMatrix = mat.toRowMatrix() - // $example off:indexed-row-matrix$ + // $example off:indexed_row_matrix$ } - def coordinateMatrixExample(): Unit = { + private def coordinateMatrixExample(): Unit = { val sc = SparkContext.getOrCreate() - // $example on:coordinate-row-matrix$ + // $example on:coordinate_matrix$ val me1 = MatrixEntry(0, 0, 1.2) val me2 = MatrixEntry(1, 0, 2.1) val me3 = MatrixEntry(6, 1, 3.7) @@ -132,13 +132,13 @@ object DataTypesExamples { // Convert it to an IndexRowMatrix whose rows are sparse vectors. val indexedRowMatrix = mat.toIndexedRowMatrix() - // $example off:coordinate-row-matrix$ + // $example off:coordinate_matrix$ } - def blockMatrixExample(): Unit = { + private def blockMatrixExample(): Unit = { val sc = SparkContext.getOrCreate() - // $example on:block-matrix$ + // $example on:block_matrix$ val me1 = MatrixEntry(0, 0, 1.2) val me2 = MatrixEntry(1, 0, 2.1) val me3 = MatrixEntry(6, 1, 3.7) @@ -157,7 +157,7 @@ object DataTypesExamples { // Calculate A^T A. 
val ata = matA.transpose.multiply(matA) - // $example off:block-matrix$ + // $example off:block_matrix$ } def main(args: Array[String]): Unit = { From b0e74e34accd768d2d16e4afc41d46a7cde4da8b Mon Sep 17 00:00:00 2001 From: Xusen Yin Date: Thu, 30 Jun 2016 15:32:30 -0700 Subject: [PATCH 04/10] add sc --- .../examples/mllib/JavaDataTypesExamples.java | 1 - .../main/python/mllib/datatypes_examples.py | 20 +++++++---- .../examples/mllib/DataTypesExamples.scala | 34 ++++++++++++++----- 3 files changed, 39 insertions(+), 16 deletions(-) diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaDataTypesExamples.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaDataTypesExamples.java index a72a3a41798fa..e98f6ece97c47 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaDataTypesExamples.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaDataTypesExamples.java @@ -170,7 +170,6 @@ private static void blockMatrixExample() { } public static void main(String[] args) { - SparkConf conf = new SparkConf().setAppName("JavaDataTypesExample"); SparkContext sc = new SparkContext(conf); diff --git a/examples/src/main/python/mllib/datatypes_examples.py b/examples/src/main/python/mllib/datatypes_examples.py index f9e8adf8a9b44..c45b884ec2d42 100644 --- a/examples/src/main/python/mllib/datatypes_examples.py +++ b/examples/src/main/python/mllib/datatypes_examples.py @@ -17,10 +17,7 @@ from __future__ import print_function -from numpy import array - from pyspark import SparkContext -from pyspark.mllib.clustering import BisectingKMeans, BisectingKMeansModel def __local_vector_example(): @@ -54,6 +51,8 @@ def __labeled_point_example(): def __libsvm_example(): + sc = SparkContext.getOrCreate() + # $example on:libsvm$ from pyspark.mllib.util import MLUtils @@ -74,6 +73,8 @@ def __local_matrix_example(): def __row_matrix_example(): + sc = SparkContext.getOrCreate() + # $example on:row_matrix$ from 
pyspark.mllib.linalg.distributed import RowMatrix @@ -93,6 +94,8 @@ def __row_matrix_example(): def __indexed_row_matrix_example(): + sc = SparkContext.getOrCreate() + # $example on:indexed_row_matrix$ from pyspark.mllib.linalg.distributed import IndexedRow, IndexedRowMatrix @@ -122,12 +125,15 @@ def __indexed_row_matrix_example(): def __coordinate_matrix_example(): + sc = SparkContext.getOrCreate() + # $example on:coordinate_matrix$ from pyspark.mllib.linalg.distributed import CoordinateMatrix, MatrixEntry # Create an RDD of coordinate entries. # - This can be done explicitly with the MatrixEntry class: - entries = sc.parallelize([MatrixEntry(0, 0, 1.2), MatrixEntry(1, 0, 2.1), MatrixEntry(6, 1, 3.7)]) + entries =\ + sc.parallelize([MatrixEntry(0, 0, 1.2), MatrixEntry(1, 0, 2.1), MatrixEntry(6, 1, 3.7)]) # - or using (long, long, float) tuples: entries = sc.parallelize([(0, 0, 1.2), (1, 0, 2.1), (2, 1, 3.7)]) @@ -153,6 +159,8 @@ def __coordinate_matrix_example(): def __block_matrix(): + sc = SparkContext.getOrCreate() + # $example on:block_matrix$ from pyspark.mllib.linalg import Matrices from pyspark.mllib.linalg.distributed import BlockMatrix @@ -165,8 +173,8 @@ def __block_matrix(): mat = BlockMatrix(blocks, 3, 2) # Get its size. - m = mat.numRows() # 6 - n = mat.numCols() # 2 + m = mat.numRows() # 6 + n = mat.numCols() # 2 # Get the blocks as an RDD of sub-matrix blocks. 
blocksRDD = mat.blocks diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/DataTypesExamples.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/DataTypesExamples.scala index 6b678cd853b31..28c41b8d64988 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/DataTypesExamples.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/DataTypesExamples.scala @@ -19,21 +19,14 @@ package org.apache.spark.examples.mllib import org.apache.spark.{SparkConf, SparkContext} -import org.apache.spark.mllib.linalg.{Matrices, Matrix} -import org.apache.spark.mllib.linalg.{Vector, Vectors} -import org.apache.spark.mllib.linalg.distributed.{BlockMatrix, CoordinateMatrix, MatrixEntry} -import org.apache.spark.mllib.linalg.distributed.{IndexedRow, IndexedRowMatrix, RowMatrix} -import org.apache.spark.mllib.regression.LabeledPoint -import org.apache.spark.mllib.util.MLUtils -import org.apache.spark.rdd.RDD object DataTypesExamples { private def localVectorExample(): Unit = { + // $example on:local_vector$ import org.apache.spark.mllib.linalg.{Vector, Vectors} - // $example on:local_vector$ // Create a dense vector (1.0, 0.0, 3.0). val dv: Vector = Vectors.dense(1.0, 0.0, 3.0) // Create a sparse vector (1.0, 0.0, 3.0) by specifying its indices and values corresponding to @@ -46,6 +39,9 @@ object DataTypesExamples { private def labeledPointExample(): Unit = { // $example on:labeled_point$ + import org.apache.spark.mllib.linalg.Vectors + import org.apache.spark.mllib.regression.LabeledPoint + // Create a labeled point with a positive label and a dense feature vector. 
val pos = LabeledPoint(1.0, Vectors.dense(1.0, 0.0, 3.0)) @@ -57,6 +53,10 @@ object DataTypesExamples { private def libsvmExample(): Unit = { val sc = SparkContext.getOrCreate() // $example on:libsvm$ + import org.apache.spark.mllib.regression.LabeledPoint + import org.apache.spark.mllib.util.MLUtils + import org.apache.spark.rdd.RDD + val examples: RDD[LabeledPoint] = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt") // $example off:libsvm$ @@ -64,6 +64,8 @@ object DataTypesExamples { private def localMatrixExample(): Unit = { // $example on:local_matrix$ + import org.apache.spark.mllib.linalg.{Matrix, Matrices} + // Create a dense matrix ((1.0, 2.0), (3.0, 4.0), (5.0, 6.0)) val dm: Matrix = Matrices.dense(3, 2, Array(1.0, 3.0, 5.0, 2.0, 4.0, 6.0)) @@ -75,6 +77,10 @@ object DataTypesExamples { private def rowMatrixExample(): Unit = { val sc = SparkContext.getOrCreate() // $example on:row_matrix$ + import org.apache.spark.mllib.linalg.{Vector, Vectors} + import org.apache.spark.mllib.linalg.distributed.RowMatrix + import org.apache.spark.rdd.RDD + val v1 = Vectors.dense(1.0, 10.0, 100.0) val v2 = Vectors.dense(2.0, 20.0, 200.0) val v3 = Vectors.dense(3.0, 30.0, 300.0) @@ -96,6 +102,10 @@ object DataTypesExamples { val sc = SparkContext.getOrCreate() // $example on:indexed_row_matrix$ + import org.apache.spark.mllib.linalg.Vectors + import org.apache.spark.mllib.linalg.distributed.{IndexedRow, IndexedRowMatrix, RowMatrix} + import org.apache.spark.rdd.RDD + val r0 = IndexedRow(0, Vectors.dense(1, 2, 3)) val r1 = IndexedRow(1, Vectors.dense(4, 5, 6)) val r2 = IndexedRow(2, Vectors.dense(7, 8, 9)) @@ -118,6 +128,9 @@ object DataTypesExamples { val sc = SparkContext.getOrCreate() // $example on:coordinate_matrix$ + import org.apache.spark.mllib.linalg.distributed.{CoordinateMatrix, MatrixEntry} + import org.apache.spark.rdd.RDD + val me1 = MatrixEntry(0, 0, 1.2) val me2 = MatrixEntry(1, 0, 2.1) val me3 = MatrixEntry(6, 1, 3.7) @@ -139,6 +152,9 @@ object 
DataTypesExamples { val sc = SparkContext.getOrCreate() // $example on:block_matrix$ + import org.apache.spark.mllib.linalg.distributed.{BlockMatrix, CoordinateMatrix, MatrixEntry} + import org.apache.spark.rdd.RDD + val me1 = MatrixEntry(0, 0, 1.2) val me2 = MatrixEntry(1, 0, 2.1) val me3 = MatrixEntry(6, 1, 3.7) @@ -161,7 +177,7 @@ object DataTypesExamples { } def main(args: Array[String]): Unit = { - val conf = new SparkConf().setAppName("DataTypeExamples") + val conf = new SparkConf().setAppName("DataTypesExamples") val sc = new SparkContext(conf) localVectorExample() From 51018c96f5329c6bdd7c550456f936d26c5b13c7 Mon Sep 17 00:00:00 2001 From: Xusen Yin Date: Thu, 30 Jun 2016 16:02:32 -0700 Subject: [PATCH 05/10] fix java imports --- .../examples/mllib/JavaDataTypesExamples.java | 43 +++++++++++++++++-- 1 file changed, 40 insertions(+), 3 deletions(-) diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaDataTypesExamples.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaDataTypesExamples.java index e98f6ece97c47..955e9ac78f65e 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaDataTypesExamples.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaDataTypesExamples.java @@ -21,12 +21,49 @@ import org.apache.spark.SparkConf; import org.apache.spark.SparkContext; -import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.mllib.linalg.*; -import org.apache.spark.mllib.linalg.distributed.*; + +// $example on:local_vector$ +import org.apache.spark.mllib.linalg.Vector; +import org.apache.spark.mllib.linalg.Vectors; +// $example off:local_vector$ +// $example on:labeled_point$ +import org.apache.spark.mllib.linalg.Vectors; +import org.apache.spark.mllib.regression.LabeledPoint; +// $example off:labeled_point$ +// $example on:libsvm$ import org.apache.spark.mllib.regression.LabeledPoint; import org.apache.spark.mllib.util.MLUtils; +import 
org.apache.spark.api.java.JavaRDD; +// $example off:libsvm$ +// $example on:local_matrix$ +import org.apache.spark.mllib.linalg.Matrix; +import org.apache.spark.mllib.linalg.Matrices; +// $example off:local_matrix$ +// $example on:row_matrix$ +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.mllib.linalg.Vector; +import org.apache.spark.mllib.linalg.distributed.RowMatrix; +import org.apache.spark.mllib.linalg.QRDecomposition; +// $example off:row_matrix$ +// $example on:indexed_row_matrix$ +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.mllib.linalg.distributed.IndexedRow; +import org.apache.spark.mllib.linalg.distributed.IndexedRowMatrix; +import org.apache.spark.mllib.linalg.distributed.RowMatrix; +// $example off:indexed_row_matrix$ +// $example on:coordinate_matrix$ +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.mllib.linalg.distributed.CoordinateMatrix; +import org.apache.spark.mllib.linalg.distributed.IndexedRowMatrix; +import org.apache.spark.mllib.linalg.distributed.MatrixEntry; +// $example off:coordinate_matrix$ +// $example on:block_matrix$ +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.mllib.linalg.distributed.BlockMatrix; +import org.apache.spark.mllib.linalg.distributed.CoordinateMatrix; +import org.apache.spark.mllib.linalg.distributed.IndexedRowMatrix; +// $example off:block_matrix$ public class JavaDataTypesExamples { From 176a2406ae9cf2495d4c1f8eb782c5316929738d Mon Sep 17 00:00:00 2001 From: Xusen Yin Date: Thu, 30 Jun 2016 22:59:23 -0700 Subject: [PATCH 06/10] refine error prompt --- docs/_plugins/include_example.rb | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/docs/_plugins/include_example.rb b/docs/_plugins/include_example.rb index c90d73f024744..e4c383b7ad9c1 100644 --- a/docs/_plugins/include_example.rb +++ b/docs/_plugins/include_example.rb @@ -85,10 +85,11 @@ def select_lines(code) .select { |l, i| l.include? 
"$example off#{@snippet_label}$" } .map { |l, i| i } - raise "Start indices amount is not equal to end indices amount, see #{@file}, #{@snippet_label}." \ + raise "Start indices amount is not equal to end indices amount, "\ + "see #{@file}, [labeled=#{@snippet_label}]." \ unless startIndices.size == endIndices.size - raise "No code is selected by include_example, see #{@file}, #{@snippet_label}." \ + raise "No code is selected by include_example, see #{@file}, [labeled=#{@snippet_label}]." \ if startIndices.size == 0 # Select and join code blocks together, with a space line between each of two continuous @@ -96,9 +97,11 @@ def select_lines(code) lastIndex = -1 result = "" startIndices.zip(endIndices).each do |start, endline| - raise "Overlapping between two example code blocks are not allowed, see #{@file}, #{@snippet_label}." \ + raise "Overlapping between two example code blocks are not allowed, "\ + "see #{@file}, [labeled=#{@snippet_label}]." \ if start <= lastIndex - raise "$example on$ should not be in the same line with $example off$, see #{@file}, #{@snippet_label}." \ + raise "$example on$ should not be in the same line with $example off$, "\ + "see #{@file}, [labeled=#{@snippet_label}]." 
\ if start == endline lastIndex = endline range = Range.new(start + 1, endline - 1) From ed271b001f0b34215a11795b737eddb719d81f12 Mon Sep 17 00:00:00 2001 From: Xusen Yin Date: Sun, 3 Jul 2016 19:37:41 -0700 Subject: [PATCH 07/10] fix QR decomposition error --- .../apache/spark/examples/mllib/JavaDataTypesExamples.java | 2 +- examples/src/main/python/mllib/datatypes_examples.py | 6 +++--- .../org/apache/spark/examples/mllib/DataTypesExamples.scala | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaDataTypesExamples.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaDataTypesExamples.java index 955e9ac78f65e..1f7f7509b6fcb 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaDataTypesExamples.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaDataTypesExamples.java @@ -118,7 +118,7 @@ private static void rowMatrixExample() { Vector v3 = Vectors.dense(3.0, 30.0, 300.0); // a JavaRDD of local vectors - JavaRDD rows = jsc.parallelize(Arrays.asList(v1, v2, v3)); + JavaRDD rows = jsc.parallelize(Arrays.asList(v1, v2, v3), 1); // Create a RowMatrix from an JavaRDD. RowMatrix mat = new RowMatrix(rows.rdd()); diff --git a/examples/src/main/python/mllib/datatypes_examples.py b/examples/src/main/python/mllib/datatypes_examples.py index c45b884ec2d42..01a4bf509b924 100644 --- a/examples/src/main/python/mllib/datatypes_examples.py +++ b/examples/src/main/python/mllib/datatypes_examples.py @@ -79,7 +79,7 @@ def __row_matrix_example(): from pyspark.mllib.linalg.distributed import RowMatrix # Create an RDD of vectors. - rows = sc.parallelize([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]]) + rows = sc.parallelize([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]], 1) # Create a RowMatrix from an RDD of vectors.
mat = RowMatrix(rows) @@ -88,8 +88,8 @@ def __row_matrix_example(): m = mat.numRows() # 4 n = mat.numCols() # 3 - # Get the rows as an RDD of vectors again. - rowsRDD = mat.rows + # QR decomposition + qrResult = mat.tallSkinnyQR(True) # $example off:row_matrix$ diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/DataTypesExamples.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/DataTypesExamples.scala index 28c41b8d64988..e408cf614e64f 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/DataTypesExamples.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/DataTypesExamples.scala @@ -85,7 +85,7 @@ object DataTypesExamples { val v2 = Vectors.dense(2.0, 20.0, 200.0) val v3 = Vectors.dense(3.0, 30.0, 300.0) - val rows: RDD[Vector] = sc.parallelize(Seq(v1, v2, v3)) // an RDD of local vectors + val rows: RDD[Vector] = sc.parallelize(Seq(v1, v2, v3), 1) // an RDD of local vectors // Create a RowMatrix from an RDD[Vector]. val mat: RowMatrix = new RowMatrix(rows) From 9e102a6cbfa02b8c06f398df2dbba99305669c5d Mon Sep 17 00:00:00 2001 From: Xusen Yin Date: Sun, 3 Jul 2016 20:03:48 -0700 Subject: [PATCH 08/10] add spark sqlcontext for toDF --- examples/src/main/python/mllib/datatypes_examples.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/examples/src/main/python/mllib/datatypes_examples.py b/examples/src/main/python/mllib/datatypes_examples.py index 01a4bf509b924..bf12c273602ae 100644 --- a/examples/src/main/python/mllib/datatypes_examples.py +++ b/examples/src/main/python/mllib/datatypes_examples.py @@ -18,6 +18,7 @@ from __future__ import print_function from pyspark import SparkContext +from pyspark.sql import SQLContext def __local_vector_example(): @@ -95,6 +96,7 @@ def __row_matrix_example(): def __indexed_row_matrix_example(): sc = SparkContext.getOrCreate() + sqlContext = SQLContext.getOrCreate(sc) # $example on:indexed_row_matrix$ from pyspark.mllib.linalg.distributed import IndexedRow, 
IndexedRowMatrix From 47c7b165086324a473dc659fbb216ef6601194bf Mon Sep 17 00:00:00 2001 From: Xusen Yin Date: Sun, 3 Jul 2016 20:18:38 -0700 Subject: [PATCH 09/10] fix type erasure for RowMatrix --- .../org/apache/spark/mllib/linalg/distributed/RowMatrix.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala index cd5209d0ebe20..43f89bf91e120 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala @@ -50,7 +50,7 @@ class RowMatrix @Since("1.0.0") ( /** Alternative constructor leaving matrix dimensions to be determined automatically. */ @Since("1.0.0") - def this(rows: RDD[Vector]) = this(rows, 0L, 0) + def this(rows: RDD[Vector]) = this(rows.retag(classOf[Vector]), 0L, 0) /** Gets or computes the number of columns. 
*/ @Since("1.0.0") From c28fdb8f0adce0daf3078e6fe770cb3464c76ed5 Mon Sep 17 00:00:00 2001 From: Xusen Yin Date: Fri, 8 Jul 2016 11:43:17 -0700 Subject: [PATCH 10/10] revert some code --- .../apache/spark/examples/mllib/JavaDataTypesExamples.java | 4 ++-- .../org/apache/spark/examples/mllib/DataTypesExamples.scala | 2 +- .../org/apache/spark/mllib/linalg/distributed/RowMatrix.scala | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaDataTypesExamples.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaDataTypesExamples.java index 1f7f7509b6fcb..24926d4201935 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaDataTypesExamples.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaDataTypesExamples.java @@ -118,7 +118,7 @@ private static void rowMatrixExample() { Vector v3 = Vectors.dense(3.0, 30.0, 300.0); // a JavaRDD of local vectors - JavaRDD rows = jsc.parallelize(Arrays.asList(v1, v2, v3), 1); + JavaRDD rows = jsc.parallelize(Arrays.asList(v1, v2, v3)); // Create a RowMatrix from an JavaRDD. RowMatrix mat = new RowMatrix(rows.rdd()); @@ -189,7 +189,7 @@ private static void blockMatrixExample() { MatrixEntry me2 = new MatrixEntry(1, 0, 2.1); MatrixEntry me3 = new MatrixEntry(6, 1, 3.7); - // a JavaRDD of (i, j, v) Matrix Entries + // A JavaRDD of (i, j, v) Matrix Entries JavaRDD entries = jsc.parallelize(Arrays.asList(me1, me2, me3)); // Create a CoordinateMatrix from a JavaRDD. 
diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/DataTypesExamples.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/DataTypesExamples.scala index e408cf614e64f..28c41b8d64988 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/DataTypesExamples.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/DataTypesExamples.scala @@ -85,7 +85,7 @@ object DataTypesExamples { val v2 = Vectors.dense(2.0, 20.0, 200.0) val v3 = Vectors.dense(3.0, 30.0, 300.0) - val rows: RDD[Vector] = sc.parallelize(Seq(v1, v2, v3), 1) // an RDD of local vectors + val rows: RDD[Vector] = sc.parallelize(Seq(v1, v2, v3)) // an RDD of local vectors // Create a RowMatrix from an RDD[Vector]. val mat: RowMatrix = new RowMatrix(rows) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala index 3cea334866619..ec32e37afb792 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala @@ -50,7 +50,7 @@ class RowMatrix @Since("1.0.0") ( /** Alternative constructor leaving matrix dimensions to be determined automatically. */ @Since("1.0.0") - def this(rows: RDD[Vector]) = this(rows.retag(classOf[Vector]), 0L, 0) + def this(rows: RDD[Vector]) = this(rows, 0L, 0) /** Gets or computes the number of columns. */ @Since("1.0.0")