From ec2e8f723b1bb9cbb603c0c85b97a2b89dc7c066 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Sun, 7 Feb 2016 00:07:15 -0800 Subject: [PATCH 1/3] Remove rounding on integer in MFDataGenerator --- .../scala/org/apache/spark/mllib/util/MFDataGenerator.scala | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/MFDataGenerator.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/MFDataGenerator.scala index 8af6750da4ff3..dfd4115dda703 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/util/MFDataGenerator.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/util/MFDataGenerator.scala @@ -105,8 +105,7 @@ object MFDataGenerator { // optionally generate testing data if (test) { - val testSampSize = math.min( - math.round(sampSize * testSampFact), math.round(mn - sampSize)).toInt + val testSampSize: Int = math.min(math.round(sampSize * testSampFact).toInt, mn - sampSize) val testOmega = shuffled.slice(sampSize, sampSize + testSampSize) val testOrdered = testOmega.sortWith(_ < _).toArray val testData: RDD[(Int, Int, Double)] = sc.parallelize(testOrdered) From 5418c18f1574dcd11812f8101cc5bcf477394f54 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Sun, 7 Feb 2016 00:08:12 -0800 Subject: [PATCH 2/3] Remove redundant Int type annotation on testSampSize --- .../scala/org/apache/spark/mllib/util/MFDataGenerator.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/MFDataGenerator.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/MFDataGenerator.scala index dfd4115dda703..898a09e51636c 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/util/MFDataGenerator.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/util/MFDataGenerator.scala @@ -105,7 +105,7 @@ object MFDataGenerator { // optionally generate testing data if (test) { - val testSampSize: Int = math.min(math.round(sampSize * testSampFact).toInt, mn - sampSize) + val testSampSize = 
math.min(math.round(sampSize * testSampFact).toInt, mn - sampSize) val testOmega = shuffled.slice(sampSize, sampSize + testSampSize) val testOrdered = testOmega.sortWith(_ < _).toArray val testData: RDD[(Int, Int, Double)] = sc.parallelize(testOrdered) From 4deb2491c2af7710afce4f3ffb1935e214e0f74e Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Sun, 7 Feb 2016 22:57:55 -0800 Subject: [PATCH 3/3] Add an internal version of setRuns for the KMeans MLlib implementation and use it internally and when called from Python. Also log a warning that setting the number of runs is deprecated, since people could be setting the number of runs through the train function as well --- .../spark/mllib/api/python/PythonMLLibAPI.scala | 2 +- .../org/apache/spark/mllib/clustering/KMeans.scala | 13 +++++++++++-- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala index 088ec6a0c0465..93cf16e6f0c2a 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala @@ -357,7 +357,7 @@ private[python] class PythonMLLibAPI extends Serializable { val kMeansAlg = new KMeans() .setK(k) .setMaxIterations(maxIterations) - .setRuns(runs) + .internalSetRuns(runs) .setInitializationMode(initializationMode) .setInitializationSteps(initializationSteps) .setEpsilon(epsilon) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala index 901164a391170..67de62bc2e848 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala @@ -119,9 +119,18 @@ class KMeans private ( @Since("0.8.0") @deprecated("Support for runs is deprecated. 
This param will have no effect in 2.0.0.", "1.6.0") def setRuns(runs: Int): this.type = { + internalSetRuns(runs) + } + + // Internal version of setRuns for Python API, this should be removed at the same time as setRuns + // this is done to avoid deprecation warnings in our build. + private[mllib] def internalSetRuns(runs: Int): this.type = { if (runs <= 0) { throw new IllegalArgumentException("Number of runs must be positive") } + if (runs != 1) { + logWarning("Setting number of runs is deprecated and will have no effect in 2.0.0") + } this.runs = runs this } @@ -502,7 +511,7 @@ object KMeans { seed: Long): KMeansModel = { new KMeans().setK(k) .setMaxIterations(maxIterations) - .setRuns(runs) + .internalSetRuns(runs) .setInitializationMode(initializationMode) .setSeed(seed) .run(data) @@ -528,7 +537,7 @@ object KMeans { initializationMode: String): KMeansModel = { new KMeans().setK(k) .setMaxIterations(maxIterations) - .setRuns(runs) + .internalSetRuns(runs) .setInitializationMode(initializationMode) .run(data) }