From 0619127ac1a11f94dd5d0400b8768200318e7441 Mon Sep 17 00:00:00 2001 From: Zheng RuiFeng Date: Fri, 4 Mar 2016 11:51:50 +0800 Subject: [PATCH 01/12] create bkm_example --- data/mllib/bisecting_kmeans_data.txt | 6 +++ .../python/mllib/bisecting_k_means_example.py | 50 +++++++++++++++++++ 2 files changed, 56 insertions(+) create mode 100644 data/mllib/bisecting_kmeans_data.txt create mode 100644 examples/src/main/python/mllib/bisecting_k_means_example.py diff --git a/data/mllib/bisecting_kmeans_data.txt b/data/mllib/bisecting_kmeans_data.txt new file mode 100644 index 0000000000000..be500232c0882 --- /dev/null +++ b/data/mllib/bisecting_kmeans_data.txt @@ -0,0 +1,6 @@ +0.1 0.1 0.1 +0.3 0.3 0.25 +0.1 0.1 -0.1 +20.3 20.1 19.9 +20.2 20.1 19.7 +18.9 20.0 19.7 diff --git a/examples/src/main/python/mllib/bisecting_k_means_example.py b/examples/src/main/python/mllib/bisecting_k_means_example.py new file mode 100644 index 0000000000000..ccd7fb4bce362 --- /dev/null +++ b/examples/src/main/python/mllib/bisecting_k_means_example.py @@ -0,0 +1,50 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +from __future__ import print_function + +# $example on$ +from numpy import array +from math import sqrt +# $example off$ + +from pyspark import SparkContext +# $example on$ +from pyspark.mllib.clustering import BisectingKMeans, BisectingKMeansModel +# $example off$ + +if __name__ == "__main__": + sc = SparkContext(appName="BisectingKMeansExample") # SparkContext + + # $example on$ + # Load and parse the data + data = sc.textFile("data/mllib/bisecting_kmeans_data.txt") + parsedData = data.map(lambda line: array([float(x) for x in line.split(' ')])) + + # Build the model (cluster the data) + clusters = BisectingKMeans.train(parsedData, 2, maxIterations=5) + + # Evaluate clustering + cost = clusters.computeCost(parsedData) + print("Bisecting K-means cost = " + str(cost)) + + # Save and load model + clusters.save(sc, "target/org/apache/spark/PythonKMeansExample/BisectingKMeansModel") + sameModel = BisectingKMeansModel.load(sc, "target/org/apache/spark/PythonKMeansExample/BisectingKMeansModel") + # $example off$ + + sc.stop() From 871c5c03d49a187bda5df5c7ec83f9161ddfa624 Mon Sep 17 00:00:00 2001 From: Zheng RuiFeng Date: Fri, 4 Mar 2016 15:47:46 +0800 Subject: [PATCH 02/12] update db --- data/mllib/bisecting_kmeans_data.txt | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/data/mllib/bisecting_kmeans_data.txt b/data/mllib/bisecting_kmeans_data.txt index be500232c0882..ff83945804ed3 100644 --- a/data/mllib/bisecting_kmeans_data.txt +++ b/data/mllib/bisecting_kmeans_data.txt @@ -1,6 +1,10 @@ 0.1 0.1 0.1 0.3 0.3 0.25 0.1 0.1 -0.1 +0.0 0.1 0.2 +-0.2 0.0 0.1 20.3 20.1 19.9 20.2 20.1 19.7 18.9 20.0 19.7 +21.0 21.2 19.9 +20.0 19.1 20.0 From 948f50c6bead6286b38a686b57131efbe2f98d1d Mon Sep 17 00:00:00 2001 From: Zheng RuiFeng Date: Fri, 4 Mar 2016 15:50:33 +0800 Subject: [PATCH 03/12] update path --- examples/src/main/python/mllib/bisecting_k_means_example.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/examples/src/main/python/mllib/bisecting_k_means_example.py b/examples/src/main/python/mllib/bisecting_k_means_example.py index ccd7fb4bce362..f2d13375aad7f 100644 --- a/examples/src/main/python/mllib/bisecting_k_means_example.py +++ b/examples/src/main/python/mllib/bisecting_k_means_example.py @@ -43,8 +43,8 @@ print("Bisecting K-means cost = " + str(cost)) # Save and load model - clusters.save(sc, "target/org/apache/spark/PythonKMeansExample/BisectingKMeansModel") - sameModel = BisectingKMeansModel.load(sc, "target/org/apache/spark/PythonKMeansExample/BisectingKMeansModel") + clusters.save(sc, "target/org/apache/spark/PythonBisectingKMeansExample/BisectingKMeansModel") + sameModel = BisectingKMeansModel.load(sc, "target/org/apache/spark/PythonBisectingKMeansExample/BisectingKMeansModel") # $example off$ sc.stop() From 31fead0419f7ed2f950efa319220791517bb34e1 Mon Sep 17 00:00:00 2001 From: Zheng RuiFeng Date: Fri, 4 Mar 2016 16:16:18 +0800 Subject: [PATCH 04/12] add to ml --- .../python/ml/bisecting_k_means_example.py | 62 +++++++++++++++++++ .../python/mllib/bisecting_k_means_example.py | 2 +- 2 files changed, 63 insertions(+), 1 deletion(-) create mode 100644 examples/src/main/python/ml/bisecting_k_means_example.py diff --git a/examples/src/main/python/ml/bisecting_k_means_example.py b/examples/src/main/python/ml/bisecting_k_means_example.py new file mode 100644 index 0000000000000..58a18b59bd1c9 --- /dev/null +++ b/examples/src/main/python/ml/bisecting_k_means_example.py @@ -0,0 +1,62 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from __future__ import print_function + +import sys +import re + +import numpy as np +from pyspark import SparkContext +from pyspark.ml.clustering import BisectingKMeans, BisectingKMeansModel +from pyspark.mllib.linalg import VectorUDT, _convert_to_vector, Vectors +from pyspark.sql import SQLContext +from pyspark.sql.types import Row, StructField, StructType + +""" +A simple example demonstrating a bisecting k-means clustering. +""" + +if __name__ == "__main__": + + sc = SparkContext(appName="PythonBisectingKMeansExample") + sqlContext = SQLContext(sc) + + # $example on$ + training = sqlContext.createDataFrame([ + (0, Vectors.dense(0.1, 0.1, 0.1)), + (1, Vectors.dense(0.3, 0.3, 0.25)), + (2, Vectors.dense(0.1, 0.1, -0.1)), + (3, Vectors.dense(20.3, 20.1, 19.9)), + (4, Vectors.dense(20.2, 20.1, 19.7)), + (5, Vectors.dense(18.9, 20.0, 19.7))], ["id", "features"]) + + k = 2 + kmeans = BisectingKMeans().setK(k).setSeed(1).setFeaturesCol("features") + + model = kmeans.fit(training) + + # Evaluate clustering + cost = model.computeCost(training) + print("Bisecting K-means Cost = " + str(cost)) + + centers = model.clusterCenters() + print("Cluster Centers: ") + for center in centers: + print(center) + + sc.stop() \ No newline at end of file diff --git a/examples/src/main/python/mllib/bisecting_k_means_example.py b/examples/src/main/python/mllib/bisecting_k_means_example.py index f2d13375aad7f..cdad449a51955 100644 --- a/examples/src/main/python/mllib/bisecting_k_means_example.py +++ b/examples/src/main/python/mllib/bisecting_k_means_example.py @@ -40,7 +40,7 @@ 
# Evaluate clustering cost = clusters.computeCost(parsedData) - print("Bisecting K-means cost = " + str(cost)) + print("Bisecting K-means Cost = " + str(cost)) # Save and load model clusters.save(sc, "target/org/apache/spark/PythonBisectingKMeansExample/BisectingKMeansModel") From b05680f35061ef3fd0aad99d3da121417b4f3cad Mon Sep 17 00:00:00 2001 From: Zheng RuiFeng Date: Fri, 4 Mar 2016 16:17:39 +0800 Subject: [PATCH 05/12] format --- examples/src/main/python/ml/bisecting_k_means_example.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/src/main/python/ml/bisecting_k_means_example.py b/examples/src/main/python/ml/bisecting_k_means_example.py index 58a18b59bd1c9..d5bbe277a13ab 100644 --- a/examples/src/main/python/ml/bisecting_k_means_example.py +++ b/examples/src/main/python/ml/bisecting_k_means_example.py @@ -59,4 +59,4 @@ for center in centers: print(center) - sc.stop() \ No newline at end of file + sc.stop() From 6bce85f15dc03385dcbc1b32fa8250006fe7dedb Mon Sep 17 00:00:00 2001 From: Zheng RuiFeng Date: Fri, 4 Mar 2016 16:36:52 +0800 Subject: [PATCH 06/12] add example off --- examples/src/main/python/ml/bisecting_k_means_example.py | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/src/main/python/ml/bisecting_k_means_example.py b/examples/src/main/python/ml/bisecting_k_means_example.py index d5bbe277a13ab..d73b68e0ac039 100644 --- a/examples/src/main/python/ml/bisecting_k_means_example.py +++ b/examples/src/main/python/ml/bisecting_k_means_example.py @@ -58,5 +58,6 @@ print("Cluster Centers: ") for center in centers: print(center) + # $example off$ sc.stop() From be718beec427bc567cdbd419963ecfb525c96a98 Mon Sep 17 00:00:00 2001 From: Zheng RuiFeng Date: Fri, 4 Mar 2016 16:41:57 +0800 Subject: [PATCH 07/12] format --- examples/src/main/python/mllib/bisecting_k_means_example.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/examples/src/main/python/mllib/bisecting_k_means_example.py 
b/examples/src/main/python/mllib/bisecting_k_means_example.py index cdad449a51955..337f478424422 100644 --- a/examples/src/main/python/mllib/bisecting_k_means_example.py +++ b/examples/src/main/python/mllib/bisecting_k_means_example.py @@ -43,8 +43,9 @@ print("Bisecting K-means Cost = " + str(cost)) # Save and load model - clusters.save(sc, "target/org/apache/spark/PythonBisectingKMeansExample/BisectingKMeansModel") - sameModel = BisectingKMeansModel.load(sc, "target/org/apache/spark/PythonBisectingKMeansExample/BisectingKMeansModel") + path = "target/org/apache/spark/PythonBisectingKMeansExample/BisectingKMeansModel" + clusters.save(sc, path) + sameModel = BisectingKMeansModel.load(sc, path) # $example off$ sc.stop() From cea8ddfe291b79f19393144145e48286a1e39e8c Mon Sep 17 00:00:00 2001 From: Zheng RuiFeng Date: Wed, 9 Mar 2016 21:01:45 +0800 Subject: [PATCH 08/12] del unnecessary dataset,import and add missing annotation --- data/mllib/bisecting_kmeans_data.txt | 10 ---------- .../src/main/python/ml/bisecting_k_means_example.py | 7 ++----- .../src/main/python/mllib/bisecting_k_means_example.py | 4 ++-- python/pyspark/mllib/clustering.py | 1 + 4 files changed, 5 insertions(+), 17 deletions(-) delete mode 100644 data/mllib/bisecting_kmeans_data.txt diff --git a/data/mllib/bisecting_kmeans_data.txt b/data/mllib/bisecting_kmeans_data.txt deleted file mode 100644 index ff83945804ed3..0000000000000 --- a/data/mllib/bisecting_kmeans_data.txt +++ /dev/null @@ -1,10 +0,0 @@ -0.1 0.1 0.1 -0.3 0.3 0.25 -0.1 0.1 -0.1 -0.0 0.1 0.2 --0.2 0.0 0.1 -20.3 20.1 19.9 -20.2 20.1 19.7 -18.9 20.0 19.7 -21.0 21.2 19.9 -20.0 19.1 20.0 diff --git a/examples/src/main/python/ml/bisecting_k_means_example.py b/examples/src/main/python/ml/bisecting_k_means_example.py index d73b68e0ac039..bd59adfb5afb7 100644 --- a/examples/src/main/python/ml/bisecting_k_means_example.py +++ b/examples/src/main/python/ml/bisecting_k_means_example.py @@ -17,15 +17,12 @@ from __future__ import print_function 
-import sys -import re - -import numpy as np from pyspark import SparkContext +# $example on$ from pyspark.ml.clustering import BisectingKMeans, BisectingKMeansModel from pyspark.mllib.linalg import VectorUDT, _convert_to_vector, Vectors +# $example off$ from pyspark.sql import SQLContext -from pyspark.sql.types import Row, StructField, StructType """ A simple example demonstrating a bisecting k-means clustering. diff --git a/examples/src/main/python/mllib/bisecting_k_means_example.py b/examples/src/main/python/mllib/bisecting_k_means_example.py index 337f478424422..c959eb9a189e6 100644 --- a/examples/src/main/python/mllib/bisecting_k_means_example.py +++ b/examples/src/main/python/mllib/bisecting_k_means_example.py @@ -28,11 +28,11 @@ # $example off$ if __name__ == "__main__": - sc = SparkContext(appName="BisectingKMeansExample") # SparkContext + sc = SparkContext(appName="PythonBisectingKMeansExample") # SparkContext # $example on$ # Load and parse the data - data = sc.textFile("data/mllib/bisecting_kmeans_data.txt") + data = sc.textFile("data/mllib/kmeans_data.txt") parsedData = data.map(lambda line: array([float(x) for x in line.split(' ')])) # Build the model (cluster the data) diff --git a/python/pyspark/mllib/clustering.py b/python/pyspark/mllib/clustering.py index 5a5bf59dd5fe3..23d118bd40900 100644 --- a/python/pyspark/mllib/clustering.py +++ b/python/pyspark/mllib/clustering.py @@ -142,6 +142,7 @@ class BisectingKMeans(object): .. 
versionadded:: 2.0.0 """ + @classmethod @since('2.0.0') def train(self, rdd, k=4, maxIterations=20, minDivisibleClusterSize=1.0, seed=-1888008604): """ From 399290cd9345bea981963a1df66c403727c82a7f Mon Sep 17 00:00:00 2001 From: Zheng RuiFeng Date: Fri, 11 Mar 2016 11:12:19 +0800 Subject: [PATCH 09/12] add include_example --- docs/mllib-clustering.md | 6 ++++++ .../python/ml/bisecting_k_means_example.py | 18 ++++++++---------- .../python/mllib/bisecting_k_means_example.py | 7 +++---- 3 files changed, 17 insertions(+), 14 deletions(-) diff --git a/docs/mllib-clustering.md b/docs/mllib-clustering.md index 44720147be054..6897ba4a5d57d 100644 --- a/docs/mllib-clustering.md +++ b/docs/mllib-clustering.md @@ -399,6 +399,12 @@ Refer to the [`BisectingKMeans` Java docs](api/java/org/apache/spark/mllib/clust {% include_example java/org/apache/spark/examples/mllib/JavaBisectingKMeansExample.java %} + +
<div data-lang="python" markdown="1"> +Refer to the [`BisectingKMeans` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.clustering.BisectingKMeans) and [`BisectingKMeansModel` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.clustering.BisectingKMeansModel) for more details on the API. + +{% include_example python/mllib/bisecting_k_means_example.py %} +</div>
## Streaming k-means diff --git a/examples/src/main/python/ml/bisecting_k_means_example.py b/examples/src/main/python/ml/bisecting_k_means_example.py index bd59adfb5afb7..c3b34d43c304f 100644 --- a/examples/src/main/python/ml/bisecting_k_means_example.py +++ b/examples/src/main/python/ml/bisecting_k_means_example.py @@ -21,6 +21,8 @@ # $example on$ from pyspark.ml.clustering import BisectingKMeans, BisectingKMeansModel from pyspark.mllib.linalg import VectorUDT, _convert_to_vector, Vectors +from pyspark.mllib.linalg import Vectors +from pyspark.sql.types import Row, StructField, StructType # $example off$ from pyspark.sql import SQLContext @@ -34,16 +36,12 @@ sqlContext = SQLContext(sc) # $example on$ - training = sqlContext.createDataFrame([ - (0, Vectors.dense(0.1, 0.1, 0.1)), - (1, Vectors.dense(0.3, 0.3, 0.25)), - (2, Vectors.dense(0.1, 0.1, -0.1)), - (3, Vectors.dense(20.3, 20.1, 19.9)), - (4, Vectors.dense(20.2, 20.1, 19.7)), - (5, Vectors.dense(18.9, 20.0, 19.7))], ["id", "features"]) - - k = 2 - kmeans = BisectingKMeans().setK(k).setSeed(1).setFeaturesCol("features") + data = sc.textFile("data/mllib/kmeans_data.txt") + parsedData = data.map(lambda line: Row(features=Vectors.dense([float(x) for x in line.split(' ')]))) + schema = StructType([StructField("features", VectorUDT(), False)]) + training = sqlContext.createDataFrame(parsedData, schema) + + kmeans = BisectingKMeans().setK(2).setSeed(1).setFeaturesCol("features") model = kmeans.fit(training) diff --git a/examples/src/main/python/mllib/bisecting_k_means_example.py b/examples/src/main/python/mllib/bisecting_k_means_example.py index c959eb9a189e6..7f4d0402d620c 100644 --- a/examples/src/main/python/mllib/bisecting_k_means_example.py +++ b/examples/src/main/python/mllib/bisecting_k_means_example.py @@ -19,7 +19,6 @@ # $example on$ from numpy import array -from math import sqrt # $example off$ from pyspark import SparkContext @@ -36,15 +35,15 @@ parsedData = data.map(lambda line: array([float(x) for x in 
line.split(' ')])) # Build the model (cluster the data) - clusters = BisectingKMeans.train(parsedData, 2, maxIterations=5) + model = BisectingKMeans.train(parsedData, 2, maxIterations=5) # Evaluate clustering - cost = clusters.computeCost(parsedData) + cost = model.computeCost(parsedData) print("Bisecting K-means Cost = " + str(cost)) # Save and load model path = "target/org/apache/spark/PythonBisectingKMeansExample/BisectingKMeansModel" - clusters.save(sc, path) + model.save(sc, path) sameModel = BisectingKMeansModel.load(sc, path) # $example off$ From 3ab75336ba854e1801e8c29fa8bd5a1a868a5743 Mon Sep 17 00:00:00 2001 From: Zheng RuiFeng Date: Fri, 11 Mar 2016 11:45:46 +0800 Subject: [PATCH 10/12] reformat --- examples/src/main/python/ml/bisecting_k_means_example.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/src/main/python/ml/bisecting_k_means_example.py b/examples/src/main/python/ml/bisecting_k_means_example.py index c3b34d43c304f..b0565e40b662c 100644 --- a/examples/src/main/python/ml/bisecting_k_means_example.py +++ b/examples/src/main/python/ml/bisecting_k_means_example.py @@ -37,9 +37,9 @@ # $example on$ data = sc.textFile("data/mllib/kmeans_data.txt") - parsedData = data.map(lambda line: Row(features=Vectors.dense([float(x) for x in line.split(' ')]))) - schema = StructType([StructField("features", VectorUDT(), False)]) - training = sqlContext.createDataFrame(parsedData, schema) + parsedData = data.map(lambda line: Row(features=Vectors.dense( + [float(x) for x in line.split(' ')]))) + training = sqlContext.createDataFrame(parsedData) kmeans = BisectingKMeans().setK(2).setSeed(1).setFeaturesCol("features") From d4415114a46791fe0ed959c1e8c5031bf743568d Mon Sep 17 00:00:00 2001 From: Zheng RuiFeng Date: Fri, 11 Mar 2016 11:46:35 +0800 Subject: [PATCH 11/12] reformat --- examples/src/main/python/ml/bisecting_k_means_example.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/examples/src/main/python/ml/bisecting_k_means_example.py b/examples/src/main/python/ml/bisecting_k_means_example.py index b0565e40b662c..cbd6bfb0c4c99 100644 --- a/examples/src/main/python/ml/bisecting_k_means_example.py +++ b/examples/src/main/python/ml/bisecting_k_means_example.py @@ -22,7 +22,7 @@ from pyspark.ml.clustering import BisectingKMeans, BisectingKMeansModel from pyspark.mllib.linalg import VectorUDT, _convert_to_vector, Vectors from pyspark.mllib.linalg import Vectors -from pyspark.sql.types import Row, StructField, StructType +from pyspark.sql.types import Row # $example off$ from pyspark.sql import SQLContext From 165a4fe6f0d8cbc9eb5bcdf161dbfe48b2c1f8f9 Mon Sep 17 00:00:00 2001 From: Zheng RuiFeng Date: Fri, 11 Mar 2016 11:53:51 +0800 Subject: [PATCH 12/12] fix python style --- examples/src/main/python/ml/bisecting_k_means_example.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/examples/src/main/python/ml/bisecting_k_means_example.py b/examples/src/main/python/ml/bisecting_k_means_example.py index cbd6bfb0c4c99..e6f6bfd7e84ed 100644 --- a/examples/src/main/python/ml/bisecting_k_means_example.py +++ b/examples/src/main/python/ml/bisecting_k_means_example.py @@ -37,9 +37,8 @@ # $example on$ data = sc.textFile("data/mllib/kmeans_data.txt") - parsedData = data.map(lambda line: Row(features=Vectors.dense( - [float(x) for x in line.split(' ')]))) - training = sqlContext.createDataFrame(parsedData) + parsed = data.map(lambda l: Row(features=Vectors.dense([float(x) for x in l.split(' ')]))) + training = sqlContext.createDataFrame(parsed) kmeans = BisectingKMeans().setK(2).setSeed(1).setFeaturesCol("features")