From afc1bc72d9e673b1f00a5d22a7cc32000a6026c6 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Fri, 14 Aug 2015 12:15:20 -0700 Subject: [PATCH 1/6] Fri Aug 14 12:15:20 PDT 2015 --- docs/ml-features.md | 116 ++++++++++++++++++ .../apache/spark/ml/feature/RFormula.scala | 4 +- 2 files changed, 118 insertions(+), 2 deletions(-) diff --git a/docs/ml-features.md b/docs/ml-features.md index cec2cbe673407..2d1a45c357d6c 100644 --- a/docs/ml-features.md +++ b/docs/ml-features.md @@ -1387,5 +1387,121 @@ print(output.select("features", "clicked").first()) +## RFormula + +`RFormula` encodes a string column of labels to a column of label indices. +The indices are in `[0, numLabels)`, ordered by label frequencies. +So the most frequent label gets index `0`. +If the input column is numeric, we cast it to string and index the string values. + +**Examples** + +Assume that we have the following DataFrame with columns `id` and `category`: + +~~~~ + id | category +----|---------- + 0 | a + 1 | b + 2 | c + 3 | a + 4 | a + 5 | c +~~~~ + +`category` is a string column with three labels: "a", "b", and "c". +Applying `RFormula` with `category` as the input column and `categoryIndex` as the output +column, we should get the following: + +~~~~ + id | category | categoryIndex +----|----------|--------------- + 0 | a | 0.0 + 1 | b | 2.0 + 2 | c | 1.0 + 3 | a | 0.0 + 4 | a | 0.0 + 5 | c | 1.0 +~~~~ + +"a" gets index `0` because it is the most frequent, followed by "c" with index `1` and "b" with +index `2`. + +
<div class="codetabs"> + +<div data-lang="scala" markdown="1">
+ +[`RFormula`](api/scala/index.html#org.apache.spark.ml.feature.RFormula) takes an input +column name and an output column name. + +{% highlight scala %} +import org.apache.spark.ml.feature.RFormula + +val df = sqlContext.createDataFrame( + Seq((0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")) +).toDF("id", "category") +val indexer = new RFormula() + .setInputCol("category") + .setOutputCol("categoryIndex") +val indexed = indexer.fit(df).transform(df) +indexed.show() +{% endhighlight %} +
</div> + +<div data-lang="java" markdown="1">
+[`RFormula`](api/java/org/apache/spark/ml/feature/RFormula.html) takes an input column +name and an output column name. + +{% highlight java %} +import java.util.Arrays; + +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.ml.feature.RFormula; +import org.apache.spark.sql.DataFrame; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.RowFactory; +import org.apache.spark.sql.types.StructField; +import org.apache.spark.sql.types.StructType; +import static org.apache.spark.sql.types.DataTypes.*; + +JavaRDD<Row> jrdd = jsc.parallelize(Arrays.asList( + RowFactory.create(0, "a"), + RowFactory.create(1, "b"), + RowFactory.create(2, "c"), + RowFactory.create(3, "a"), + RowFactory.create(4, "a"), + RowFactory.create(5, "c") +)); +StructType schema = new StructType(new StructField[] { + createStructField("id", DoubleType, false), + createStructField("category", StringType, false) +}); +DataFrame df = sqlContext.createDataFrame(jrdd, schema); +RFormula indexer = new RFormula() + .setInputCol("category") + .setOutputCol("categoryIndex"); +DataFrame indexed = indexer.fit(df).transform(df); +indexed.show(); +{% endhighlight %} +
</div> + +<div data-lang="python" markdown="1">
+ +[`RFormula`](api/python/pyspark.ml.html#pyspark.ml.feature.RFormula) takes an input +column name and an output column name. + +{% highlight python %} +from pyspark.ml.feature import RFormula + +df = sqlContext.createDataFrame( + [(0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")], + ["id", "category"]) +indexer = RFormula(inputCol="category", outputCol="categoryIndex") +indexed = indexer.fit(df).transform(df) +indexed.show() +{% endhighlight %} +
</div> +</div>
+ # Feature Selectors diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala index a752dacd72d95..a7fa50444209b 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala @@ -42,8 +42,8 @@ private[feature] trait RFormulaBase extends HasFeaturesCol with HasLabelCol { /** * :: Experimental :: * Implements the transforms required for fitting a dataset against an R model formula. Currently - * we support a limited subset of the R operators, including '~' and '+'. Also see the R formula - * docs here: http://stat.ethz.ch/R-manual/R-patched/library/stats/html/formula.html + * we support a limited subset of the R operators, including '.', '~', '+', and '-'. Also see the + * R formula docs here: http://stat.ethz.ch/R-manual/R-patched/library/stats/html/formula.html */ @Experimental class RFormula(override val uid: String) extends Estimator[RFormulaModel] with RFormulaBase { From 32ed60e80daff875b05bce161f4ddcb9260a506a Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Tue, 18 Aug 2015 18:00:49 -0700 Subject: [PATCH 2/6] formula doc --- docs/ml-features.md | 126 +++++++++++++++++++------------------------- 1 file changed, 53 insertions(+), 73 deletions(-) diff --git a/docs/ml-features.md b/docs/ml-features.md index 2d1a45c357d6c..078ab314ea731 100644 --- a/docs/ml-features.md +++ b/docs/ml-features.md @@ -1386,119 +1386,99 @@ print(output.select("features", "clicked").first()) {% endhighlight %} - ## RFormula -`RFormula` encodes a string column of labels to a column of label indices. -The indices are in `[0, numLabels)`, ordered by label frequencies. -So the most frequent label gets index `0`. -If the input column is numeric, we cast it to string and index the string values. 
+`RFormula` selects columns specified by an [R model formula](https://stat.ethz.ch/R-manual/R-devel/library/stats/html/formula.html). It produces a vector column of features and a numeric column of labels. Like when formulas are used in R for linear regression, string input columns will be one-hot encoded, and numeric columns will be cast to doubles. If not already present in the DataFrame, the output label column will be created from the specified response variable in the formula. **Examples** -Assume that we have the following DataFrame with columns `id` and `category`: - -~~~~ - id | category -----|---------- - 0 | a - 1 | b - 2 | c - 3 | a - 4 | a - 5 | c -~~~~ +Assume that we have a DataFrame with the columns `id`, `country`, `hour`, and `clicked`: -`category` is a string column with three labels: "a", "b", and "c". -Applying `RFormula` with `category` as the input column and `categoryIndex` as the output -column, we should get the following: +~~~ +id | country | hour | clicked +---|---------|------|--------- + 7 | "US" | 18 | 1.0 +~~~ -~~~~ - id | category | categoryIndex -----|----------|--------------- - 0 | a | 0.0 - 1 | b | 2.0 - 2 | c | 1.0 - 3 | a | 0.0 - 4 | a | 0.0 - 5 | c | 1.0 -~~~~ +If we use `RFormula` with a formula string of `clicked ~ country + hour`, which indicates that we want to +predict `clicked` based on `country` and `hour`, after transformation we should get the following DataFrame: -"a" gets index `0` because it is the most frequent, followed by "c" with index `1` and "b" with -index `2`. +~~~ +id | country | hour | clicked | label | features +---|---------|------|---------|-------|----------------------------- + 7 | "US" | 18 | 1.0 | 1.0 | [0.0, 1.0, 18.0] +~~~
-
-[`RFormula`](api/scala/index.html#org.apache.spark.ml.feature.RFormula) takes an input -column name and an output column name. +[`RFormula`](api/scala/index.html#org.apache.spark.ml.feature.RFormula) takes an R formula string, and optional parameters for the names of its output columns. {% highlight scala %} import org.apache.spark.ml.feature.RFormula -val df = sqlContext.createDataFrame( - Seq((0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")) -).toDF("id", "category") -val indexer = new RFormula() - .setInputCol("category") - .setOutputCol("categoryIndex") -val indexed = indexer.fit(df).transform(df) -indexed.show() +val dataset = sqlContext.createDataFrame( + Seq((7, "US", 18, 1.0)) +).toDF("id", "country", "hour", "clicked") +val formula = new RFormula() + .setFormula("clicked ~ country + hour") + .setFeaturesCol("features") + .setLabelCol("label") +val output = formula.fit(dataset).transform(dataset) +println(output.select("features", "label").first()) {% endhighlight %}
-[`RFormula`](api/java/org/apache/spark/ml/feature/RFormula.html) takes an input column -name and an output column name. + +[`RFormula`](api/java/org/apache/spark/ml/feature/RFormula.html) takes an R formula string, and optional parameters for the names of its output columns. {% highlight java %} import java.util.Arrays; import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.ml.feature.RFormula; import org.apache.spark.sql.DataFrame; import org.apache.spark.sql.Row; import org.apache.spark.sql.RowFactory; -import org.apache.spark.sql.types.StructField; -import org.apache.spark.sql.types.StructType; +import org.apache.spark.sql.types.*; import static org.apache.spark.sql.types.DataTypes.*; -JavaRDD<Row> jrdd = jsc.parallelize(Arrays.asList( - RowFactory.create(0, "a"), - RowFactory.create(1, "b"), - RowFactory.create(2, "c"), - RowFactory.create(3, "a"), - RowFactory.create(4, "a"), - RowFactory.create(5, "c") -)); -StructType schema = new StructType(new StructField[] { - createStructField("id", DoubleType, false), - createStructField("category", StringType, false) +StructType schema = createStructType(new StructField[] { + createStructField("id", IntegerType, false), + createStructField("country", StringType, false), + createStructField("hour", IntegerType, false), + createStructField("clicked", DoubleType, false) }); -DataFrame df = sqlContext.createDataFrame(jrdd, schema); -RFormula indexer = new RFormula() - .setInputCol("category") - .setOutputCol("categoryIndex"); -DataFrame indexed = indexer.fit(df).transform(df); -indexed.show(); +Row row = RowFactory.create(7, "US", 18, 1.0); +JavaRDD<Row> rdd = jsc.parallelize(Arrays.asList(row)); +DataFrame dataset = sqlContext.createDataFrame(rdd, schema); + +RFormula formula = new RFormula() + .setFormula("clicked ~ country + hour") + .setFeaturesCol("features") + .setLabelCol("label") + +DataFrame output = formula.fit(dataset).transform(dataset); +System.out.println(output.select("features", "label").first()); {% 
endhighlight %}
-[`RFormula`](api/python/pyspark.ml.html#pyspark.ml.feature.RFormula) takes an input -column name and an output column name. +[`RFormula`](api/python/pyspark.ml.html#pyspark.ml.feature.RFormula) takes an R formula string, and optional parameters for the names of its output columns. {% highlight python %} from pyspark.ml.feature import RFormula -df = sqlContext.createDataFrame( - [(0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")], - ["id", "category"]) -indexer = RFormula(inputCol="category", outputCol="categoryIndex") -indexed = indexer.fit(df).transform(df) -indexed.show() +dataset = sqlContext.createDataFrame( + [(7, "US", 18, 1.0)], + ["id", "country", "hour", "clicked"]) +formula = RFormula( + formula="clicked ~ country + hour", + featuresCol="features", + labelCol="label") +output = formula.fit(dataset).transform(dataset) +print(output.select("features", "label").first()) {% endhighlight %}
From 83961ad53d512f9a9b26d9565e00864fd40b0e0d Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Tue, 18 Aug 2015 18:01:19 -0700 Subject: [PATCH 3/6] Tue Aug 18 18:01:19 PDT 2015 --- docs/ml-features.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/ml-features.md b/docs/ml-features.md index 078ab314ea731..44c2dfa52e6ab 100644 --- a/docs/ml-features.md +++ b/docs/ml-features.md @@ -1388,7 +1388,7 @@ print(output.select("features", "clicked").first()) ## RFormula -`RFormula` selects columns specified by an [R model formula](https://stat.ethz.ch/R-manual/R-devel/library/stats/html/formula.html). It produces a vector column of features and a numeric column of labels. Like when formulas are used in R for linear regression, string input columns will be one-hot encoded, and numeric columns will be cast to doubles. If not already present in the DataFrame, the output label column will be created from the specified response variable in the formula. +`RFormula` selects columns specified by an [R model formula](https://stat.ethz.ch/R-manual/R-devel/library/stats/html/formula.html). It produces a vector column of features and a double column of labels. Like when formulas are used in R for linear regression, string input columns will be one-hot encoded, and numeric columns will be cast to doubles. If not already present in the DataFrame, the output label column will be created from the specified response variable in the formula. 
**Examples** From 6999d764d8930a8ab100bca75b34faa8e1c98afa Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Wed, 19 Aug 2015 11:59:07 -0700 Subject: [PATCH 4/6] comments --- docs/ml-features.md | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/docs/ml-features.md b/docs/ml-features.md index 44c2dfa52e6ab..97e8063e990ad 100644 --- a/docs/ml-features.md +++ b/docs/ml-features.md @@ -1386,6 +1386,7 @@ print(output.select("features", "clicked").first()) {% endhighlight %} + ## RFormula `RFormula` selects columns specified by an [R model formula](https://stat.ethz.ch/R-manual/R-devel/library/stats/html/formula.html). It produces a vector column of features and a double column of labels. Like when formulas are used in R for linear regression, string input columns will be one-hot encoded, and numeric columns will be cast to doubles. If not already present in the DataFrame, the output label column will be created from the specified response variable in the formula. @@ -1398,6 +1399,7 @@ Assume that we have a DataFrame with the columns `id`, `country`, `hour`, and `c id | country | hour | clicked ---|---------|------|--------- 7 | "US" | 18 | 1.0 + 8 | "CA" | 12 | 0.0 ~~~ If we use `RFormula` with a formula string of `clicked ~ country + hour`, which indicates that we want to @@ -1407,6 +1409,7 @@ predict `clicked` based on `country` and `hour`, after transformation we should id | country | hour | clicked | label | features ---|---------|------|---------|-------|----------------------------- 7 | "US" | 18 | 1.0 | 1.0 | [0.0, 1.0, 18.0] + 8 | "CA" | 12 | 0.0 | 0.0 | [1.0, 0.0, 12.0] ~~~
@@ -1418,7 +1421,8 @@ id | country | hour | clicked | label | features import org.apache.spark.ml.feature.RFormula val dataset = sqlContext.createDataFrame( - Seq((7, "US", 18, 1.0)) + Seq((7, "US", 18, 1.0)), + Seq((8, "CA", 12, 0.0)) ).toDF("id", "country", "hour", "clicked") val formula = new RFormula() .setFormula("clicked ~ country + hour") @@ -1449,14 +1453,15 @@ StructType schema = createStructType(new StructField[] { createStructField("hour", IntegerType, false), createStructField("clicked", DoubleType, false) }); -Row row = RowFactory.create(7, "US", 18, 1.0); -JavaRDD<Row> rdd = jsc.parallelize(Arrays.asList(row)); +Row row1 = RowFactory.create(7, "US", 18, 1.0); +Row row2 = RowFactory.create(8, "CA", 12, 0.0); +JavaRDD<Row> rdd = jsc.parallelize(Arrays.asList(row1, row2)); DataFrame dataset = sqlContext.createDataFrame(rdd, schema); RFormula formula = new RFormula() .setFormula("clicked ~ country + hour") .setFeaturesCol("features") - .setLabelCol("label") + .setLabelCol("label"); DataFrame output = formula.fit(dataset).transform(dataset); System.out.println(output.select("features", "label").first()); @@ -1472,6 +1477,7 @@ from pyspark.ml.feature import RFormula dataset = sqlContext.createDataFrame( [(7, "US", 18, 1.0)], + [(8, "CA", 12, 0.0)], ["id", "country", "hour", "clicked"]) formula = RFormula( formula="clicked ~ country + hour", From b6f699e116336acc47d8fc9d3cd039da76ddf534 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Wed, 19 Aug 2015 14:15:29 -0700 Subject: [PATCH 5/6] fix bugs --- docs/ml-features.md | 39 +++++++++++++++++++++++---------------- 1 file changed, 23 insertions(+), 16 deletions(-) diff --git a/docs/ml-features.md b/docs/ml-features.md index 97e8063e990ad..740846b728ac2 100644 --- a/docs/ml-features.md +++ b/docs/ml-features.md @@ -1400,16 +1400,18 @@ id | country | hour | clicked ---|---------|------|--------- 7 | "US" | 18 | 1.0 8 | "CA" | 12 | 0.0 + 9 | "NZ" | 15 | 0.0 ~~~ If we use `RFormula` with a formula string of `clicked ~ 
country + hour`, which indicates that we want to predict `clicked` based on `country` and `hour`, after transformation we should get the following DataFrame: ~~~ -id | country | hour | clicked | label | features ----|---------|------|---------|-------|----------------------------- - 7 | "US" | 18 | 1.0 | 1.0 | [0.0, 1.0, 18.0] - 8 | "CA" | 12 | 0.0 | 0.0 | [1.0, 0.0, 12.0] +id | country | hour | clicked | features | label +---|---------|------|---------|------------------|------- + 7 | "US" | 18 | 1.0 | [0.0, 0.0, 18.0] | 1.0 + 8 | "CA" | 12 | 0.0 | [0.0, 1.0, 12.0] | 0.0 + 9 | "NZ" | 15 | 0.0 | [1.0, 0.0, 15.0] | 0.0 ~~~
@@ -1420,16 +1422,17 @@ id | country | hour | clicked | label | features {% highlight scala %} import org.apache.spark.ml.feature.RFormula -val dataset = sqlContext.createDataFrame( - Seq((7, "US", 18, 1.0)), - Seq((8, "CA", 12, 0.0)) -).toDF("id", "country", "hour", "clicked") +val dataset = sqlContext.createDataFrame(Seq( + (7, "US", 18, 1.0), + (8, "CA", 12, 0.0), + (9, "NZ", 15, 0.0) +)).toDF("id", "country", "hour", "clicked") val formula = new RFormula() .setFormula("clicked ~ country + hour") .setFeaturesCol("features") .setLabelCol("label") val output = formula.fit(dataset).transform(dataset) -println(output.select("features", "label").first()) +output.select("features", "label").show() {% endhighlight %}
@@ -1441,6 +1444,7 @@ println(output.select("features", "label").first()) import java.util.Arrays; import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.ml.feature.RFormula; import org.apache.spark.sql.DataFrame; import org.apache.spark.sql.Row; import org.apache.spark.sql.RowFactory; @@ -1453,9 +1457,11 @@ StructType schema = createStructType(new StructField[] { createStructField("hour", IntegerType, false), createStructField("clicked", DoubleType, false) }); -Row row1 = RowFactory.create(7, "US", 18, 1.0); -Row row2 = RowFactory.create(8, "CA", 12, 0.0); -JavaRDD<Row> rdd = jsc.parallelize(Arrays.asList(row1, row2)); +JavaRDD<Row> rdd = jsc.parallelize(Arrays.asList( + RowFactory.create(7, "US", 18, 1.0), + RowFactory.create(8, "CA", 12, 0.0), + RowFactory.create(9, "NZ", 15, 0.0) +)); DataFrame dataset = sqlContext.createDataFrame(rdd, schema); RFormula formula = new RFormula() @@ -1464,7 +1470,7 @@ RFormula formula = new RFormula() .setLabelCol("label"); DataFrame output = formula.fit(dataset).transform(dataset); -System.out.println(output.select("features", "label").first()); +output.select("features", "label").show(); {% endhighlight %}
@@ -1476,15 +1482,16 @@ System.out.println(output.select("features", "label").first()); from pyspark.ml.feature import RFormula dataset = sqlContext.createDataFrame( - [(7, "US", 18, 1.0)], - [(8, "CA", 12, 0.0)], + [(7, "US", 18, 1.0), + (8, "CA", 12, 0.0), + (9, "NZ", 15, 0.0)], ["id", "country", "hour", "clicked"]) formula = RFormula( formula="clicked ~ country + hour", featuresCol="features", labelCol="label") output = formula.fit(dataset).transform(dataset) -print(output.select("features", "label").first()) +output.select("features", "label").show() {% endhighlight %} From 562b074ad7373ae9e329e85bc05f2f567ca04c25 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Wed, 19 Aug 2015 14:16:46 -0700 Subject: [PATCH 6/6] fix merge --- docs/ml-features.md | 3 --- 1 file changed, 3 deletions(-) diff --git a/docs/ml-features.md b/docs/ml-features.md index 4e93a9fdc7b15..6309db97be4d0 100644 --- a/docs/ml-features.md +++ b/docs/ml-features.md @@ -1585,6 +1585,3 @@ output.select("features", "label").show() {% endhighlight %} - -# Feature Selectors -