Add Binarizer ML Transformer.

apache · Apr 25, 2015 · 1682f8c · 1682f8c
1 parent 59b7cfc
commit 1682f8c
Show file tree

Hide file tree

Showing 2 changed files with 147 additions and 0 deletions.
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Binarizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Binarizer.scala
@@ -0,0 +1,71 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.ml.feature
+
+import org.apache.spark.annotation.AlphaComponent
+import org.apache.spark.ml.Transformer
+import org.apache.spark.ml.attribute.BinaryAttribute
+import org.apache.spark.ml.param._
+import org.apache.spark.ml.param.shared._
+import org.apache.spark.ml.util.SchemaUtils
+import org.apache.spark.sql._
+import org.apache.spark.sql.functions._
+import org.apache.spark.sql.types.{DoubleType, StructType}
+
+/**
+ * :: AlphaComponent ::
+ * Binarize a column of continuous features given a threshold.
+ *
+ * Values strictly greater than the threshold are mapped to 1.0; every other value
+ * (including a value equal to the threshold) is mapped to 0.0. The threshold defaults to 0.0.
+ */
+@AlphaComponent
+final class Binarizer extends Transformer
+    with HasInputCol with HasOutputCol with HasThreshold {
+
+  setDefault(threshold -> 0.0)
+
+  /** @group setParam */
+  def setInputCol(value: String): this.type = set(inputCol, value)
+
+  /** @group setParam */
+  def setOutputCol(value: String): this.type = set(outputCol, value)
+
+  /** @group setParam */
+  def setThreshold(value: Double): this.type = set(threshold, value)
+
+  /**
+   * Validates the schema and returns `dataset` with the binarized output column appended.
+   *
+   * @param dataset input data; the input column is validated by [[transformSchema]]
+   * @param paramMap extra parameters; values given here are looked up through the extracted
+   *                 parameter map for the input column, the output column and the threshold
+   * @return all columns of `dataset` plus the output column holding 1.0 or 0.0
+   */
+  override def transform(dataset: DataFrame, paramMap: ParamMap): DataFrame = {
+    transformSchema(dataset.schema, paramMap, logging = true)
+    val map = extractParamMap(paramMap)
+    // Read the threshold from the extracted map, exactly like the column names, so that a
+    // threshold supplied through `paramMap` is honored rather than silently ignored.
+    val td = map(threshold)
+    val binarizer = udf { in: Double => if (in > td) 1.0 else 0.0 }
+    val outputColName = map(outputCol)
+    // Attach the same binary-attribute metadata that transformSchema declares for this column.
+    val metadata = BinaryAttribute.defaultAttr.withName(outputColName).toStructField().metadata
+    dataset.select(col("*"), binarizer(col(map(inputCol))).as(outputColName, metadata))
+  }
+
+  /**
+   * Derives the output schema: the input fields followed by a binary-attribute field named
+   * after the output column.
+   *
+   * The input column type is checked against `DoubleType` through
+   * `SchemaUtils.checkColumnType`.
+   *
+   * @param schema schema of the input dataset
+   * @param paramMap extra parameters used to resolve the input and output column names
+   * @return `schema` with the output field appended
+   * @throws IllegalArgumentException if a column named like the output column already exists
+   */
+  override def transformSchema(schema: StructType, paramMap: ParamMap): StructType = {
+    val map = extractParamMap(paramMap)
+    SchemaUtils.checkColumnType(schema, map(inputCol), DoubleType)
+
+    val inputFields = schema.fields
+    val outputColName = map(outputCol)
+
+    require(inputFields.forall(_.name != outputColName),
+      s"Output column $outputColName already exists.")
+
+    val attr = BinaryAttribute.defaultAttr.withName(outputColName)
+    val outputFields = inputFields :+ attr.toStructField()
+    StructType(outputFields)
+  }
+}
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/BinarizerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/BinarizerSuite.scala
@@ -0,0 +1,76 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.ml.feature
+
+import org.scalatest.FunSuite
+
+import org.apache.spark.mllib.util.MLlibTestSparkContext
+import org.apache.spark.mllib.util.TestingUtils._
+import org.apache.spark.sql.{DataFrame, Row, SQLContext}
+
+
+/**
+ * Tests for [[Binarizer]]: binarization with the default threshold and with a threshold
+ * configured through the setter.
+ */
+class BinarizerSuite extends FunSuite with MLlibTestSparkContext {
+
+  @transient var data: Array[Double] = _
+  @transient var dataFrame: DataFrame = _
+  @transient var binarizer: Binarizer = _
+  @transient val threshold = 0.2
+  @transient var defaultBinarized: Array[Double] = _
+  @transient var thresholdBinarized: Array[Double] = _
+
+  /** Builds the input values, the expected outputs and the shared default-configured binarizer. */
+  override def beforeAll(): Unit = {
+    super.beforeAll()
+
+    data = Array(0.1, -0.5, 0.2, -0.3, 0.8, 0.7, -0.1, -0.4)
+    defaultBinarized = data.map(x => if (x > 0.0) 1.0 else 0.0)
+    thresholdBinarized = data.map(x => if (x > threshold) 1.0 else 0.0)
+
+    val sqlContext = new SQLContext(sc)
+    dataFrame = sqlContext.createDataFrame(sc.parallelize(data, 2).map(BinarizerSuite.FeatureData))
+    // This instance keeps the default threshold; tests must not mutate it.
+    binarizer = new Binarizer()
+      .setInputCol("feature")
+      .setOutputCol("binarized_feature")
+  }
+
+  /** Collects the "binarized_feature" column of `result` as an array of doubles. */
+  def collectResult(result: DataFrame): Array[Double] = {
+    result.select("binarized_feature").collect().map {
+      case Row(feature: Double) => feature
+    }
+  }
+
+  /** Asserts that `lhs` and `rhs` have the same length and the same values position by position. */
+  def assertValues(lhs: Array[Double], rhs: Array[Double]): Unit = {
+    // `zipped` stops at the shorter array, so without this check an empty or truncated
+    // result would pass the element-wise comparison vacuously.
+    assert(lhs.length === rhs.length, "The number of binarized values is not correct.")
+    assert((lhs, rhs).zipped.forall { (x1, x2) =>
+      x1 === x2
+    }, "The feature value is not correct after binarization.")
+  }
+
+  test("Binarize continuous features with default parameter") {
+    val result = collectResult(binarizer.transform(dataFrame))
+    assertValues(result, defaultBinarized)
+  }
+
+  test("Binarize continuous features with setter") {
+    // Use a dedicated instance: setting the threshold on the shared `binarizer` would leak
+    // into the other test and make the suite depend on execution order.
+    val thresholdBinarizer = new Binarizer()
+      .setInputCol("feature")
+      .setOutputCol("binarized_feature")
+      .setThreshold(threshold)
+    val result = collectResult(thresholdBinarizer.transform(dataFrame))
+    assertValues(result, thresholdBinarized)
+  }
+}
+
+/** Fixtures shared by the tests in the BinarizerSuite class. */
+private object BinarizerSuite {
+  // One record per input value; the suite maps each element of its data array to a FeatureData
+  // before building the DataFrame. Presumably the field name becomes the "feature" column that
+  // the suite's Binarizer reads -- confirm against SQLContext.createDataFrame.
+  case class FeatureData(feature: Double)
+}