From 158a766b13b4037ad0bb736cf506c63be1d0e991 Mon Sep 17 00:00:00 2001 From: tanyinyan Date: Tue, 17 Mar 2015 09:46:52 +0800 Subject: [PATCH 1/7] Update SVM.scala --- .../main/scala/org/apache/spark/mllib/classification/SVM.scala | 1 + 1 file changed, 1 insertion(+) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/SVM.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/SVM.scala index cfc7f868a02f0..7b0b22992f4d9 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/classification/SVM.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/SVM.scala @@ -128,6 +128,7 @@ class SVMWithSGD private ( private var miniBatchFraction: Double) extends GeneralizedLinearAlgorithm[SVMModel] with Serializable { + this.setFeatureScaling(true) private val gradient = new HingeGradient() private val updater = new SquaredL2Updater() override val optimizer = new GradientDescent(gradient, updater) From 249d36ac5979cd364de6fec197bcf4c090cf44e0 Mon Sep 17 00:00:00 2001 From: tanyinyan Date: Wed, 18 Mar 2015 09:11:27 +0800 Subject: [PATCH 2/7] Update SVM.scala --- .../main/scala/org/apache/spark/mllib/classification/SVM.scala | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/SVM.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/SVM.scala index 7b0b22992f4d9..40ab273e3f534 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/classification/SVM.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/SVM.scala @@ -121,14 +121,13 @@ object SVMModel extends Loader[SVMModel] { * regularization is used, which can be changed via [[SVMWithSGD.optimizer]]. * NOTE: Labels used in SVM should be {0, 1}. 
*/ -class SVMWithSGD private ( +class SVMWithSGD ( private var stepSize: Double, private var numIterations: Int, private var regParam: Double, private var miniBatchFraction: Double) extends GeneralizedLinearAlgorithm[SVMModel] with Serializable { - this.setFeatureScaling(true) private val gradient = new HingeGradient() private val updater = new SquaredL2Updater() override val optimizer = new GradientDescent(gradient, updater) From ef437cbae52abfa194148f0fbd194c7a9b9697ff Mon Sep 17 00:00:00 2001 From: tanyinyan Date: Wed, 18 Mar 2015 14:27:50 +0800 Subject: [PATCH 3/7] Update SVM.scala Provide an interface in object SVMWithSGD to set useFeatureScaling --- .../spark/mllib/classification/SVM.scala | 32 ++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/SVM.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/SVM.scala index 40ab273e3f534..e4d5c808609ac 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/classification/SVM.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/SVM.scala @@ -121,7 +121,7 @@ object SVMModel extends Loader[SVMModel] { * regularization is used, which can be changed via [[SVMWithSGD.optimizer]]. * NOTE: Labels used in SVM should be {0, 1}. */ -class SVMWithSGD ( +class SVMWithSGD private ( private var stepSize: Double, private var numIterations: Int, private var regParam: Double, @@ -152,6 +152,36 @@ class SVMWithSGD ( * Top-level methods for calling SVM. NOTE: Labels used in SVM should be {0, 1}. */ object SVMWithSGD { + + /** + * Train a SVM model given an RDD of (label, features) pairs. We run a fixed number + * of iterations of gradient descent using the specified step size. Each iteration uses + * `miniBatchFraction` fraction of the data to calculate the gradient. The weights used in + * gradient descent are initialized using the initial weights provided. + * + * NOTE: Labels used in SVM should be {0, 1}. 
+ * + * @param input RDD of (label, array of features) pairs. + * @param numIterations Number of iterations of gradient descent to run. + * @param stepSize Step size to be used for each iteration of gradient descent. + * @param regParam Regularization parameter. + * @param miniBatchFraction Fraction of data to be used per iteration. + * @param initialWeights Initial set of weights to be used. Array should be equal in size to + * the number of features in the data. + * @param useFeatureScaling Set if the algorithm should use feature scaling to improve the convergence during optimization. + */ + def train( + input: RDD[LabeledPoint], + numIterations: Int, + stepSize: Double, + regParam: Double, + miniBatchFraction: Double, + initialWeights: Vector, + useFeatureScaling: Boolean): SVMModel = { + new SVMWithSGD(stepSize, numIterations, regParam, miniBatchFraction).setFeatureScaling(useFeatureScaling) + .run(input, initialWeights) + } + /** * Train a SVM model given an RDD of (label, features) pairs. We run a fixed number From 26558da3edd18417b9d3d287d9142fa199ef04b5 Mon Sep 17 00:00:00 2001 From: tanyinyan Date: Wed, 18 Mar 2015 14:38:37 +0800 Subject: [PATCH 4/7] Update SVM.scala --- .../scala/org/apache/spark/mllib/classification/SVM.scala | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/SVM.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/SVM.scala index e4d5c808609ac..0c89d0fa1902b 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/classification/SVM.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/SVM.scala @@ -168,7 +168,8 @@ object SVMWithSGD { * @param miniBatchFraction Fraction of data to be used per iteration. * @param initialWeights Initial set of weights to be used. Array should be equal in size to * the number of features in the data. 
- * @param useFeatureScaling Set if the algorithm should use feature scaling to improve the convergence during optimization. + * @param useFeatureScaling Set if the algorithm should use feature scaling to improve the + * convergence during optimization. */ def train( input: RDD[LabeledPoint], @@ -178,7 +179,8 @@ object SVMWithSGD { miniBatchFraction: Double, initialWeights: Vector, useFeatureScaling: Boolean): SVMModel = { - new SVMWithSGD(stepSize, numIterations, regParam, miniBatchFraction).setFeatureScaling(useFeatureScaling) + new SVMWithSGD(stepSize, numIterations, regParam, miniBatchFraction) + .setFeatureScaling(useFeatureScaling) .run(input, initialWeights) } From 3c622f8824f22fa143bf42c4c130b4991ea0fd44 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=B0=AD=E9=93=B6=E7=87=95?= Date: Thu, 19 Mar 2015 09:33:21 +0800 Subject: [PATCH 5/7] Update SVM.scala --- .../spark/mllib/classification/SVM.scala | 35 +------------------ 1 file changed, 1 insertion(+), 34 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/SVM.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/SVM.scala index 0c89d0fa1902b..45b1d91938154 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/classification/SVM.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/SVM.scala @@ -121,7 +121,7 @@ object SVMModel extends Loader[SVMModel] { * regularization is used, which can be changed via [[SVMWithSGD.optimizer]]. * NOTE: Labels used in SVM should be {0, 1}. */ -class SVMWithSGD private ( +class SVMWithSGD ( private var stepSize: Double, private var numIterations: Int, private var regParam: Double, @@ -152,39 +152,6 @@ class SVMWithSGD private ( * Top-level methods for calling SVM. NOTE: Labels used in SVM should be {0, 1}. */ object SVMWithSGD { - - /** - * Train a SVM model given an RDD of (label, features) pairs. We run a fixed number - * of iterations of gradient descent using the specified step size. 
Each iteration uses - * `miniBatchFraction` fraction of the data to calculate the gradient. The weights used in - * gradient descent are initialized using the initial weights provided. - * - * NOTE: Labels used in SVM should be {0, 1}. - * - * @param input RDD of (label, array of features) pairs. - * @param numIterations Number of iterations of gradient descent to run. - * @param stepSize Step size to be used for each iteration of gradient descent. - * @param regParam Regularization parameter. - * @param miniBatchFraction Fraction of data to be used per iteration. - * @param initialWeights Initial set of weights to be used. Array should be equal in size to - * the number of features in the data. - * @param useFeatureScaling Set if the algorithm should use feature scaling to improve the - * convergence during optimization. - */ - def train( - input: RDD[LabeledPoint], - numIterations: Int, - stepSize: Double, - regParam: Double, - miniBatchFraction: Double, - initialWeights: Vector, - useFeatureScaling: Boolean): SVMModel = { - new SVMWithSGD(stepSize, numIterations, regParam, miniBatchFraction) - .setFeatureScaling(useFeatureScaling) - .run(input, initialWeights) - } - - /** * Train a SVM model given an RDD of (label, features) pairs. We run a fixed number * of iterations of gradient descent using the specified step size. 
Each iteration uses From 2dc9cb886eaaf27f3bdf761b17da18692ead0906 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=B0=AD=E9=93=B6=E7=87=95?= Date: Thu, 19 Mar 2015 14:28:06 +0800 Subject: [PATCH 6/7] Update GeneralizedLinearAlgorithm.scala make setFeatureScaling public --- .../spark/mllib/regression/GeneralizedLinearAlgorithm.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/regression/GeneralizedLinearAlgorithm.scala b/mllib/src/main/scala/org/apache/spark/mllib/regression/GeneralizedLinearAlgorithm.scala index 7c66e8cdebdbe..31e77d79332e4 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/regression/GeneralizedLinearAlgorithm.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/regression/GeneralizedLinearAlgorithm.scala @@ -131,7 +131,7 @@ abstract class GeneralizedLinearAlgorithm[M <: GeneralizedLinearModel] /** * Set if the algorithm should use feature scaling to improve the convergence during optimization. */ - private[mllib] def setFeatureScaling(useFeatureScaling: Boolean): this.type = { + def setFeatureScaling(useFeatureScaling: Boolean): this.type = { this.useFeatureScaling = useFeatureScaling this } From 32c8507edbbbfe33b6a9b162b1c491b41e577e49 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=B0=AD=E9=93=B6=E7=87=95?= Date: Fri, 20 Mar 2015 10:16:52 +0800 Subject: [PATCH 7/7] Update SVM.scala Document the params of SVMWithSGD constructor and mark it as ::Experimental:: --- .../scala/org/apache/spark/mllib/classification/SVM.scala | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/SVM.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/SVM.scala index 45b1d91938154..d03cad9983377 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/classification/SVM.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/SVM.scala @@ -117,10 +117,18 @@ object SVMModel extends Loader[SVMModel] { } /** + * :: 
Experimental :: * Train a Support Vector Machine (SVM) using Stochastic Gradient Descent. By default L2 * regularization is used, which can be changed via [[SVMWithSGD.optimizer]]. + * * NOTE: Labels used in SVM should be {0, 1}. + * + * @param stepSize Step size to be used for each iteration of gradient descent. + * @param numIterations Number of iterations of gradient descent to run. + * @param regParam Regularization parameter. + * @param miniBatchFraction Fraction of data to be used per iteration. */ +@Experimental class SVMWithSGD ( private var stepSize: Double, private var numIterations: Int,