From 8bbd9a9e068c3531857e5235d9fa15d6130f932a Mon Sep 17 00:00:00 2001
From: Yanbo Liang
Date: Thu, 19 Nov 2015 19:01:22 +0800
Subject: [PATCH 1/5] Refactor StandardScaler

---
 .../spark/ml/feature/StandardScaler.scala      | 27 +++++++++----------
 .../ml/feature/StandardScalerSuite.scala       |  2 --
 2 files changed, 13 insertions(+), 16 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala
index 6d545219ebf49..90e34807c2bf0 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala
@@ -36,20 +36,29 @@ import org.apache.spark.sql.types.{StructField, StructType}
 private[feature] trait StandardScalerParams extends Params with HasInputCol with HasOutputCol {
 
   /**
-   * Centers the data with mean before scaling.
+   * Whether to center the data with mean before scaling.
    * It will build a dense output, so this does not work on sparse input
    * and will raise an exception.
    * Default: false
    * @group param
    */
-  val withMean: BooleanParam = new BooleanParam(this, "withMean", "Center data with mean")
+  val withMean: BooleanParam = new BooleanParam(this, "withMean", "Whether to center data with mean")
+
+  /** @group getParam */
+  def getWithMean: Boolean = $(withMean)
 
   /**
-   * Scales the data to unit standard deviation.
+   * Whether to scale the data to unit standard deviation.
    * Default: true
    * @group param
    */
-  val withStd: BooleanParam = new BooleanParam(this, "withStd", "Scale to unit standard deviation")
+  val withStd: BooleanParam = new BooleanParam(this, "withStd",
+    "Whether to scale the data to unit standard deviation")
+
+  /** @group getParam */
+  def getWithStd: Boolean = $(withStd)
+
+  setDefault(withMean -> false, withStd -> true)
 }
 
 /**
@@ -63,8 +72,6 @@ class StandardScaler(override val uid: String) extends Estimator[StandardScalerM
 
   def this() = this(Identifiable.randomUID("stdScal"))
 
-  setDefault(withMean -> false, withStd -> true)
-
   /** @group setParam */
   def setInputCol(value: String): this.type = set(inputCol, value)
 
@@ -123,14 +130,6 @@ class StandardScalerModel private[ml] (
   /** Mean of the StandardScalerModel */
   val mean: Vector = scaler.mean
 
-  /** Whether to scale to unit standard deviation. */
-  @Since("1.6.0")
-  def getWithStd: Boolean = scaler.withStd
-
-  /** Whether to center data with mean. */
-  @Since("1.6.0")
-  def getWithMean: Boolean = scaler.withMean
-
   /** @group setParam */
   def setInputCol(value: String): this.type = set(inputCol, value)
 
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/StandardScalerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/StandardScalerSuite.scala
index 49a4b2efe0c29..eebe6b62dcfd1 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/StandardScalerSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/StandardScalerSuite.scala
@@ -132,7 +132,5 @@ class StandardScalerSuite extends SparkFunSuite with MLlibTestSparkContext
     val newInstance = testDefaultReadWrite(instance)
     assert(newInstance.std === instance.std)
     assert(newInstance.mean === instance.mean)
-    assert(newInstance.getWithStd === instance.getWithStd)
-    assert(newInstance.getWithMean === instance.getWithMean)
   }
 }
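Patch 1 moves the withMean/withStd params, their getters, and the setDefault call into the shared StandardScalerParams trait, so the estimator and the fitted model expose one consistent param surface instead of hand-written @Since("1.6.0") getters on the model. A minimal usage sketch of the resulting API (assumes an active SQLContext and a DataFrame `dataset` with a Vector column named "features"; column names are illustrative):

    import org.apache.spark.ml.feature.StandardScaler

    val scaler = new StandardScaler()
      .setInputCol("features")
      .setOutputCol("scaledFeatures")

    // Defaults now come from setDefault(...) in the trait, not the estimator.
    assert(!scaler.getWithMean && scaler.getWithStd)

    val model = scaler.fit(dataset)
    // The model inherits the same getters through StandardScalerParams,
    // replacing the getWithStd/getWithMean methods deleted from the model class.
    assert(model.getWithStd && !model.getWithMean)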
From 37fe45de42d850ab2f53020eeac8a37efd9be2f9 Mon Sep 17 00:00:00 2001
From: Yanbo Liang
Date: Thu, 19 Nov 2015 22:11:10 +0800
Subject: [PATCH 2/5] combine read/write test

---
 .../spark/ml/feature/StandardScaler.scala      |  3 +-
 .../ml/feature/StandardScalerSuite.scala       | 30 +++++++++----------
 2 files changed, 16 insertions(+), 17 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala
index 90e34807c2bf0..228bf222cd114 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala
@@ -42,7 +42,8 @@ private[feature] trait StandardScalerParams extends Params with HasInputCol with
    * Default: false
    * @group param
    */
-  val withMean: BooleanParam = new BooleanParam(this, "withMean", "Whether to center data with mean")
+  val withMean: BooleanParam = new BooleanParam(this, "withMean",
+    "Whether to center data with mean")
 
   /** @group getParam */
   def getWithMean: Boolean = $(withMean)
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/StandardScalerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/StandardScalerSuite.scala
index eebe6b62dcfd1..647f37977d832 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/StandardScalerSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/StandardScalerSuite.scala
@@ -116,21 +116,19 @@ class StandardScalerSuite extends SparkFunSuite with MLlibTestSparkContext
     assertResult(standardScaler3.transform(df3))
   }
 
-  test("StandardScaler read/write") {
-    val t = new StandardScaler()
-      .setInputCol("myInputCol")
-      .setOutputCol("myOutputCol")
-      .setWithStd(false)
-      .setWithMean(true)
-    testDefaultReadWrite(t)
-  }
-
-  test("StandardScalerModel read/write") {
-    val oldModel = new feature.StandardScalerModel(
-      Vectors.dense(1.0, 2.0), Vectors.dense(3.0, 4.0), false, true)
-    val instance = new StandardScalerModel("myStandardScalerModel", oldModel)
-    val newInstance = testDefaultReadWrite(instance)
-    assert(newInstance.std === instance.std)
-    assert(newInstance.mean === instance.mean)
+  test("read/write") {
+    def checkModelData(model1: StandardScalerModel, model2: StandardScalerModel): Unit = {
+      assert(model1.mean === model2.mean)
+      assert(model1.std === model2.std)
+    }
+    val allParams: Map[String, Any] = Map(
+      "inputCol" -> "features",
+      "outputCol" -> "standardized_features",
+      "withMean" -> true,
+      "withStd" -> true
+    )
+    val df = sqlContext.createDataFrame(data.zip(resWithBoth)).toDF("features", "expected")
+    val standardScaler = new StandardScaler()
+    testEstimatorAndModelReadWrite(standardScaler, df, allParams, checkModelData)
   }
 }
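Patch 2 wraps the long BooleanParam doc string and replaces the two separate read/write tests with one driven by testEstimatorAndModelReadWrite plus a checkModelData callback. Spelled out by hand, the round trip that combined test is expected to exercise looks roughly like this (a sketch under assumptions: `df` has a Vector column "features", the temp path is illustrative, and the real helper also verifies that every entry in allParams survives the round trip):

    val scaler = new StandardScaler()
      .setInputCol("features")
      .setOutputCol("standardized_features")
      .setWithMean(true)
      .setWithStd(true)
    val model = scaler.fit(df)
    val path = "/tmp/standard-scaler-model"  // illustrative temp path
    model.write.overwrite().save(path)
    val loaded = StandardScalerModel.load(path)
    // Mirrors checkModelData in the patch:
    assert(loaded.mean == model.mean)
    assert(loaded.std == model.std)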
From 76ef33853206b2a95b5cef92ac44233f7a7a861d Mon Sep 17 00:00:00 2001
From: Yanbo Liang
Date: Thu, 19 Nov 2015 22:40:38 +0800
Subject: [PATCH 3/5] update doc

---
 .../main/scala/org/apache/spark/ml/feature/StandardScaler.scala | 1 -
 1 file changed, 1 deletion(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala
index 228bf222cd114..91e59b6d2e35d 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala
@@ -189,7 +189,6 @@ object StandardScalerModel extends MLReadable[StandardScalerModel] {
         sqlContext.read.parquet(dataPath)
           .select("std", "mean", "withStd", "withMean")
           .head()
-      // This is very likely to change in the future because withStd and withMean should be params.
       val oldModel = new feature.StandardScalerModel(std, mean, withStd, withMean)
       val model = new StandardScalerModel(metadata.uid, oldModel)
       DefaultParamsReader.getAndSetParams(model, metadata)

From a8a402eaea28081f06a4a51a4010b9eb961aa05a Mon Sep 17 00:00:00 2001
From: Yanbo Liang
Date: Fri, 20 Nov 2015 18:26:27 +0800
Subject: [PATCH 4/5] refactor ml.StandardScalerModel construction

---
 .../spark/ml/feature/StandardScaler.scala      | 31 ++++++++---------
 .../ml/feature/StandardScalerSuite.scala       | 33 ++++++++++---------
 2 files changed, 31 insertions(+), 33 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala
index 91e59b6d2e35d..d76a9c6275e6b 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala
@@ -90,7 +90,7 @@ class StandardScaler(override val uid: String) extends Estimator[StandardScalerM
     val input = dataset.select($(inputCol)).map { case Row(v: Vector) => v }
     val scaler = new feature.StandardScaler(withMean = $(withMean), withStd = $(withStd))
     val scalerModel = scaler.fit(input)
-    copyValues(new StandardScalerModel(uid, scalerModel).setParent(this))
+    copyValues(new StandardScalerModel(uid, scalerModel.std, scalerModel.mean).setParent(this))
   }
 
   override def transformSchema(schema: StructType): StructType = {
@@ -116,21 +116,19 @@ object StandardScaler extends DefaultParamsReadable[StandardScaler] {
 /**
  * :: Experimental ::
  * Model fitted by [[StandardScaler]].
+ *
+ * @param std Standard deviation of the StandardScalerModel
+ * @param mean Mean of the StandardScalerModel
  */
 @Experimental
 class StandardScalerModel private[ml] (
     override val uid: String,
-    scaler: feature.StandardScalerModel)
+    val std: Vector,
+    val mean: Vector)
   extends Model[StandardScalerModel] with StandardScalerParams with MLWritable {
 
   import StandardScalerModel._
 
-  /** Standard deviation of the StandardScalerModel */
-  val std: Vector = scaler.std
-
-  /** Mean of the StandardScalerModel */
-  val mean: Vector = scaler.mean
-
   /** @group setParam */
   def setInputCol(value: String): this.type = set(inputCol, value)
 
@@ -139,6 +137,7 @@ class StandardScalerModel private[ml] (
 
   override def transform(dataset: DataFrame): DataFrame = {
     transformSchema(dataset.schema, logging = true)
+    val scaler = new feature.StandardScalerModel(std, mean, $(withStd), $(withMean))
     val scale = udf { scaler.transform _ }
     dataset.withColumn($(outputCol), scale(col($(inputCol))))
   }
@@ -154,7 +153,7 @@ class StandardScalerModel private[ml] (
   }
 
   override def copy(extra: ParamMap): StandardScalerModel = {
-    val copied = new StandardScalerModel(uid, scaler)
+    val copied = new StandardScalerModel(uid, std, mean)
     copyValues(copied, extra).setParent(parent)
   }
 
@@ -168,11 +167,11 @@ object StandardScalerModel extends MLReadable[StandardScalerModel] {
   private[StandardScalerModel] class StandardScalerModelWriter(instance: StandardScalerModel)
     extends MLWriter {
 
-    private case class Data(std: Vector, mean: Vector, withStd: Boolean, withMean: Boolean)
+    private case class Data(std: Vector, mean: Vector)
 
     override protected def saveImpl(path: String): Unit = {
       DefaultParamsWriter.saveMetadata(instance, path, sc)
-      val data = Data(instance.std, instance.mean, instance.getWithStd, instance.getWithMean)
+      val data = Data(instance.std, instance.mean)
       val dataPath = new Path(path, "data").toString
       sqlContext.createDataFrame(Seq(data)).repartition(1).write.parquet(dataPath)
     }
@@ -185,12 +184,10 @@ object StandardScalerModel extends MLReadable[StandardScalerModel] {
     override def load(path: String): StandardScalerModel = {
       val metadata = DefaultParamsReader.loadMetadata(path, sc, className)
       val dataPath = new Path(path, "data").toString
-      val Row(std: Vector, mean: Vector, withStd: Boolean, withMean: Boolean) =
-        sqlContext.read.parquet(dataPath)
-          .select("std", "mean", "withStd", "withMean")
-          .head()
-      val oldModel = new feature.StandardScalerModel(std, mean, withStd, withMean)
-      val model = new StandardScalerModel(metadata.uid, oldModel)
+      val Row(std: Vector, mean: Vector) = sqlContext.read.parquet(dataPath)
+        .select("std", "mean")
+        .head()
+      val model = new StandardScalerModel(metadata.uid, std, mean)
       DefaultParamsReader.getAndSetParams(model, metadata)
       model
     }
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/StandardScalerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/StandardScalerSuite.scala
index 647f37977d832..f433e516d9e98 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/StandardScalerSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/StandardScalerSuite.scala
@@ -70,8 +70,8 @@ class StandardScalerSuite extends SparkFunSuite with MLlibTestSparkContext
 
   test("params") {
     ParamsSuite.checkParams(new StandardScaler)
-    val oldModel = new feature.StandardScalerModel(Vectors.dense(1.0), Vectors.dense(2.0))
-    ParamsSuite.checkParams(new StandardScalerModel("empty", oldModel))
+    ParamsSuite.checkParams(new StandardScalerModel("empty",
+      Vectors.dense(1.0), Vectors.dense(2.0)))
   }
 
   test("Standardization with default parameter") {
@@ -116,19 +116,20 @@ class StandardScalerSuite extends SparkFunSuite with MLlibTestSparkContext
     assertResult(standardScaler3.transform(df3))
   }
 
-  test("read/write") {
-    def checkModelData(model1: StandardScalerModel, model2: StandardScalerModel): Unit = {
-      assert(model1.mean === model2.mean)
-      assert(model1.std === model2.std)
-    }
-    val allParams: Map[String, Any] = Map(
-      "inputCol" -> "features",
-      "outputCol" -> "standardized_features",
-      "withMean" -> true,
-      "withStd" -> true
-    )
-    val df = sqlContext.createDataFrame(data.zip(resWithBoth)).toDF("features", "expected")
-    val standardScaler = new StandardScaler()
-    testEstimatorAndModelReadWrite(standardScaler, df, allParams, checkModelData)
+  test("StandardScaler read/write") {
+    val t = new StandardScaler()
+      .setInputCol("myInputCol")
+      .setOutputCol("myOutputCol")
+      .setWithMean(true)
+      .setWithStd(true)
+    testDefaultReadWrite(t)
+  }
+
+  test("StandardScalerModel read/write") {
+    val instance = new StandardScalerModel("myStandardScalerModel",
+      Vectors.dense(0.5, 1.2), Vectors.dense(1.0, 10.0))
+    val newInstance = testDefaultReadWrite(instance)
+    assert(newInstance.std === instance.std)
+    assert(newInstance.mean === instance.mean)
   }
 }
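Patch 4 is the substantive change: the ml model now stores std and mean directly and rebuilds the mllib scaler inside transform from the current param values, which is what lets the writer persist only the two vectors while withStd and withMean travel with the param metadata. Per element, the rebuilt scaler computes approximately the following (a sketch of the standardization formula, not mllib's exact code; the assumption here is that mllib precomputes a 1/std factor and treats a zero std as a zero factor):

    // Illustrative only: mllib operates on whole Vectors, not scalars.
    def standardize(x: Double, mean: Double, std: Double,
        withMean: Boolean, withStd: Boolean): Double = {
      val centered = if (withMean) x - mean else x
      if (withStd) {
        if (std != 0.0) centered / std else 0.0  // zero std maps to 0.0
      } else {
        centered
      }
    }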
From c6b6d7e65d96b681ecaa6ac004f8637f72bc0802 Mon Sep 17 00:00:00 2001
From: Yanbo Liang
Date: Fri, 20 Nov 2015 18:32:15 +0800
Subject: [PATCH 5/5] revert test suite

---
 .../org/apache/spark/ml/feature/StandardScalerSuite.scala | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/StandardScalerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/StandardScalerSuite.scala
index f433e516d9e98..1eae125a524ef 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/StandardScalerSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/StandardScalerSuite.scala
@@ -120,14 +120,14 @@ class StandardScalerSuite extends SparkFunSuite with MLlibTestSparkContext
     val t = new StandardScaler()
       .setInputCol("myInputCol")
       .setOutputCol("myOutputCol")
+      .setWithStd(false)
       .setWithMean(true)
-      .setWithStd(true)
     testDefaultReadWrite(t)
   }
 
   test("StandardScalerModel read/write") {
     val instance = new StandardScalerModel("myStandardScalerModel",
-      Vectors.dense(0.5, 1.2), Vectors.dense(1.0, 10.0))
+      Vectors.dense(1.0, 2.0), Vectors.dense(3.0, 4.0))
     val newInstance = testDefaultReadWrite(instance)
     assert(newInstance.std === instance.std)
     assert(newInstance.mean === instance.mean)
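Net effect of the series on persistence: the parquet data row carries only std and mean, while withStd and withMean round-trip through the param metadata written by DefaultParamsWriter and restored by getAndSetParams on load. A sketch of a post-refactor load and transform (the path and column names are illustrative, and `df` is an assumed DataFrame with a Vector column "features"):

    import org.apache.spark.ml.feature.StandardScalerModel

    val loaded = StandardScalerModel.load("/tmp/standard-scaler-model")
    // std/mean come from the parquet data row; withStd/withMean from metadata params.
    val scaled = loaded
      .setInputCol("features")
      .setOutputCol("scaledFeatures")
      .transform(df)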