[SPARK-11723][ML][DOC] Use LibSVM data source rather than MLUtils.loadLibSVMFile to load DataFrame

Use the LibSVM data source rather than MLUtils.loadLibSVMFile to load DataFrames. This includes:
* Use the libsvm data source for all example code under examples/ml, and remove unused imports.
* Use the libsvm data source for the user guides under ml-* that were omitted by #8697.
* Fix bug: the Java side should use ```sqlContext.read().format("libsvm").load(path)```, but the API doc and user guides misused the Scala form ```sqlContext.read.format("libsvm").load(path)``` (see the sketch after this list).
* Code cleanup.
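For context, a minimal sketch of the corrected Java usage. The class name, `show()` call, and `stop()` call are illustrative additions, not part of this commit; the reader API shown (`SQLContext.read()` with the `libsvm` format) is the one this diff switches the docs to.

```java
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.SQLContext;

public class JavaLibSVMLoadSketch {
  public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("JavaLibSVMLoadSketch");
    JavaSparkContext jsc = new JavaSparkContext(conf);
    SQLContext sqlContext = new SQLContext(jsc);

    // In Java, read() is a method call on SQLContext. The Scala-style
    // field access sqlContext.read does not compile in Java, which is
    // the misuse this commit fixes in the API doc and user guides.
    DataFrame data = sqlContext.read().format("libsvm")
      .load("data/mllib/sample_libsvm_data.txt");

    data.show();
    jsc.stop();
  }
}
```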

mengxr

Author: Yanbo Liang <ybliang8@gmail.com>

Closes #9690 from yanboliang/spark-11723.
yanboliang authored and mengxr committed Nov 13, 2015
1 parent 61a2848 commit 99693fe
Showing 26 changed files with 79 additions and 130 deletions.
10 changes: 5 additions & 5 deletions docs/ml-ensembles.md
@@ -195,7 +195,7 @@ import org.apache.spark.ml.feature.*;
import org.apache.spark.sql.DataFrame;

// Load and parse the data file, converting it to a DataFrame.
- DataFrame data = sqlContext.read.format("libsvm")
+ DataFrame data = sqlContext.read().format("libsvm")
.load("data/mllib/sample_libsvm_data.txt");

// Index labels, adding metadata to the label column.
@@ -384,7 +384,7 @@ import org.apache.spark.ml.regression.RandomForestRegressor;
import org.apache.spark.sql.DataFrame;

// Load and parse the data file, converting it to a DataFrame.
- DataFrame data = sqlContext.read.format("libsvm")
+ DataFrame data = sqlContext.read().format("libsvm")
.load("data/mllib/sample_libsvm_data.txt");

// Automatically identify categorical features, and index them.
@@ -640,7 +640,7 @@ import org.apache.spark.ml.feature.*;
import org.apache.spark.sql.DataFrame;

// Load and parse the data file, converting it to a DataFrame.
- DataFrame data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt");
+ DataFrame data = sqlContext.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt");

// Index labels, adding metadata to the label column.
// Fit on whole dataset to include all labels in index.
@@ -830,7 +830,7 @@ import org.apache.spark.ml.regression.GBTRegressor;
import org.apache.spark.sql.DataFrame;

// Load and parse the data file, converting it to a DataFrame.
- DataFrame data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt");
+ DataFrame data = sqlContext.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt");

// Automatically identify categorical features, and index them.
// Set maxCategories so features with > 4 distinct values are treated as continuous.
@@ -1000,7 +1000,7 @@ SparkConf conf = new SparkConf().setAppName("JavaOneVsRestExample");
JavaSparkContext jsc = new JavaSparkContext(conf);
SQLContext jsql = new SQLContext(jsc);

- DataFrame dataFrame = sqlContext.read.format("libsvm")
+ DataFrame dataFrame = sqlContext.read().format("libsvm")
.load("data/mllib/sample_multiclass_classification_data.txt");

DataFrame[] splits = dataFrame.randomSplit(new double[] {0.7, 0.3}, 12345);
8 changes: 4 additions & 4 deletions docs/ml-features.md
@@ -1109,7 +1109,7 @@ import org.apache.spark.ml.feature.VectorIndexer;
import org.apache.spark.ml.feature.VectorIndexerModel;
import org.apache.spark.sql.DataFrame;

- DataFrame data = sqlContext.read.format("libsvm")
+ DataFrame data = sqlContext.read().format("libsvm")
.load("data/mllib/sample_libsvm_data.txt");
VectorIndexer indexer = new VectorIndexer()
.setInputCol("features")
@@ -1187,7 +1187,7 @@ for more details on the API.
import org.apache.spark.ml.feature.Normalizer;
import org.apache.spark.sql.DataFrame;

- DataFrame dataFrame = sqlContext.read.format("libsvm")
+ DataFrame dataFrame = sqlContext.read().format("libsvm")
.load("data/mllib/sample_libsvm_data.txt");

// Normalize each Vector using $L^1$ norm.
@@ -1273,7 +1273,7 @@ import org.apache.spark.ml.feature.StandardScaler;
import org.apache.spark.ml.feature.StandardScalerModel;
import org.apache.spark.sql.DataFrame;

- DataFrame dataFrame = sqlContext.read.format("libsvm")
+ DataFrame dataFrame = sqlContext.read().format("libsvm")
.load("data/mllib/sample_libsvm_data.txt");
StandardScaler scaler = new StandardScaler()
.setInputCol("features")
@@ -1366,7 +1366,7 @@ import org.apache.spark.ml.feature.MinMaxScaler;
import org.apache.spark.ml.feature.MinMaxScalerModel;
import org.apache.spark.sql.DataFrame;

- DataFrame dataFrame = sqlContext.read.format("libsvm")
+ DataFrame dataFrame = sqlContext.read().format("libsvm")
.load("data/mllib/sample_libsvm_data.txt");
MinMaxScaler scaler = new MinMaxScaler()
.setInputCol("features")
10 changes: 2 additions & 8 deletions docs/ml-guide.md
@@ -867,10 +867,9 @@ The `ParamMap` which produces the best evaluation metric is selected as the best
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.ml.tuning.{ParamGridBuilder, TrainValidationSplit}
- import org.apache.spark.mllib.util.MLUtils

// Prepare training and test data.
- val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF()
+ val data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
val Array(training, test) = data.randomSplit(Array(0.9, 0.1), seed = 12345)

val lr = new LinearRegression()
@@ -911,14 +910,9 @@ import org.apache.spark.ml.evaluation.RegressionEvaluator;
import org.apache.spark.ml.param.ParamMap;
import org.apache.spark.ml.regression.LinearRegression;
import org.apache.spark.ml.tuning.*;
- import org.apache.spark.mllib.regression.LabeledPoint;
- import org.apache.spark.mllib.util.MLUtils;
- import org.apache.spark.rdd.RDD;
import org.apache.spark.sql.DataFrame;

- DataFrame data = sqlContext.createDataFrame(
-   MLUtils.loadLibSVMFile(jsc.sc(), "data/mllib/sample_libsvm_data.txt"),
-   LabeledPoint.class);
+ DataFrame data = jsql.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt");

// Prepare training and test data.
DataFrame[] splits = data.randomSplit(new double[] {0.9, 0.1}, 12345);
4 changes: 2 additions & 2 deletions docs/ml-linear-methods.md
@@ -95,7 +95,7 @@ public class LogisticRegressionWithElasticNetExample {
String path = "data/mllib/sample_libsvm_data.txt";

// Load training data
- DataFrame training = sqlContext.read.format("libsvm").load(path);
+ DataFrame training = sqlContext.read().format("libsvm").load(path);

LogisticRegression lr = new LogisticRegression()
.setMaxIter(10)
@@ -292,7 +292,7 @@ public class LinearRegressionWithElasticNetExample {
String path = "data/mllib/sample_libsvm_data.txt";

// Load training data
- DataFrame training = sqlContext.read.format("libsvm").load(path);
+ DataFrame training = sqlContext.read().format("libsvm").load(path);

LinearRegression lr = new LinearRegression()
.setMaxIter(10)
@@ -26,9 +26,6 @@
import org.apache.spark.ml.classification.DecisionTreeClassificationModel;
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator;
import org.apache.spark.ml.feature.*;
- import org.apache.spark.mllib.regression.LabeledPoint;
- import org.apache.spark.mllib.util.MLUtils;
- import org.apache.spark.rdd.RDD;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.SQLContext;
// $example off$
@@ -40,9 +37,8 @@ public static void main(String[] args) {
SQLContext sqlContext = new SQLContext(jsc);

// $example on$
- // Load and parse the data file, converting it to a DataFrame.
- RDD<LabeledPoint> rdd = MLUtils.loadLibSVMFile(jsc.sc(), "data/mllib/sample_libsvm_data.txt");
- DataFrame data = sqlContext.createDataFrame(rdd, LabeledPoint.class);
+ // Load the data stored in LIBSVM format as a DataFrame.
+ DataFrame data = sqlContext.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt");

// Index labels, adding metadata to the label column.
// Fit on whole dataset to include all labels in index.
@@ -27,9 +27,6 @@
import org.apache.spark.ml.feature.VectorIndexerModel;
import org.apache.spark.ml.regression.DecisionTreeRegressionModel;
import org.apache.spark.ml.regression.DecisionTreeRegressor;
- import org.apache.spark.mllib.regression.LabeledPoint;
- import org.apache.spark.mllib.util.MLUtils;
- import org.apache.spark.rdd.RDD;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.SQLContext;
// $example off$
@@ -40,9 +37,9 @@ public static void main(String[] args) {
JavaSparkContext jsc = new JavaSparkContext(conf);
SQLContext sqlContext = new SQLContext(jsc);
// $example on$
- // Load and parse the data file, converting it to a DataFrame.
- RDD<LabeledPoint> rdd = MLUtils.loadLibSVMFile(jsc.sc(), "data/mllib/sample_libsvm_data.txt");
- DataFrame data = sqlContext.createDataFrame(rdd, LabeledPoint.class);
+ // Load the data stored in LIBSVM format as a DataFrame.
+ DataFrame data = sqlContext.read().format("libsvm")
+   .load("data/mllib/sample_libsvm_data.txt");

// Automatically identify categorical features, and index them.
// Set maxCategories so features with > 4 distinct values are treated as continuous.
@@ -21,12 +21,9 @@
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SQLContext;
- import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.ml.classification.MultilayerPerceptronClassificationModel;
import org.apache.spark.ml.classification.MultilayerPerceptronClassifier;
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator;
- import org.apache.spark.mllib.regression.LabeledPoint;
- import org.apache.spark.mllib.util.MLUtils;
import org.apache.spark.sql.DataFrame;
// $example off$

@@ -43,8 +40,7 @@ public static void main(String[] args) {
// $example on$
// Load training data
String path = "data/mllib/sample_multiclass_classification_data.txt";
- JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(jsc.sc(), path).toJavaRDD();
- DataFrame dataFrame = jsql.createDataFrame(data, LabeledPoint.class);
+ DataFrame dataFrame = jsql.read().format("libsvm").load(path);
// Split the data into train and test
DataFrame[] splits = dataFrame.randomSplit(new double[]{0.6, 0.4}, 1234L);
DataFrame train = splits[0];
@@ -27,9 +27,7 @@
import org.apache.spark.ml.util.MetadataUtils;
import org.apache.spark.mllib.evaluation.MulticlassMetrics;
import org.apache.spark.mllib.linalg.Matrix;
- import org.apache.spark.mllib.regression.LabeledPoint;
- import org.apache.spark.mllib.util.MLUtils;
- import org.apache.spark.rdd.RDD;
+ import org.apache.spark.mllib.linalg.Vector;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.types.StructField;
@@ -80,31 +78,30 @@ public static void main(String[] args) {
OneVsRest ovr = new OneVsRest().setClassifier(classifier);

String input = params.input;
- RDD<LabeledPoint> inputData = MLUtils.loadLibSVMFile(jsc.sc(), input);
- RDD<LabeledPoint> train;
- RDD<LabeledPoint> test;
+ DataFrame inputData = jsql.read().format("libsvm").load(input);
+ DataFrame train;
+ DataFrame test;

// compute the train/ test split: if testInput is not provided use part of input
String testInput = params.testInput;
if (testInput != null) {
train = inputData;
// compute the number of features in the training set.
- int numFeatures = inputData.first().features().size();
- test = MLUtils.loadLibSVMFile(jsc.sc(), testInput, numFeatures);
+ int numFeatures = inputData.first().<Vector>getAs(1).size();
+ test = jsql.read().format("libsvm").option("numFeatures",
+   String.valueOf(numFeatures)).load(testInput);
} else {
double f = params.fracTest;
- RDD<LabeledPoint>[] tmp = inputData.randomSplit(new double[]{1 - f, f}, 12345);
+ DataFrame[] tmp = inputData.randomSplit(new double[]{1 - f, f}, 12345);
train = tmp[0];
test = tmp[1];
}

// train the multiclass model
- DataFrame trainingDataFrame = jsql.createDataFrame(train, LabeledPoint.class);
- OneVsRestModel ovrModel = ovr.fit(trainingDataFrame.cache());
+ OneVsRestModel ovrModel = ovr.fit(train.cache());

// score the model on test data
- DataFrame testDataFrame = jsql.createDataFrame(test, LabeledPoint.class);
- DataFrame predictions = ovrModel.transform(testDataFrame.cache())
+ DataFrame predictions = ovrModel.transform(test.cache())
.select("prediction", "label");

// obtain metrics
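The hunk above relies on the libsvm data source's `numFeatures` option: when test data is loaded separately, it must be pinned to the training set's feature dimension so the two DataFrames agree. A condensed sketch of that pattern, with an illustrative class name and method (the `getAs(1)` call assumes the libsvm source's schema of `label` at column 0 and `features` at column 1, as in the diff):

```java
import org.apache.spark.mllib.linalg.Vector;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.SQLContext;

public class NumFeaturesSketch {
  // jsql is an existing SQLContext; testPath is an illustrative path.
  static DataFrame loadTestWithTrainDims(SQLContext jsql, DataFrame train, String testPath) {
    // Column 1 is the "features" vector in the libsvm source's schema.
    int numFeatures = train.first().<Vector>getAs(1).size();
    // Pin the test set to the training dimensionality so vectors align
    // even if the test file's highest feature index is smaller.
    return jsql.read().format("libsvm")
      .option("numFeatures", String.valueOf(numFeatures))
      .load(testPath);
  }
}
```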
@@ -23,8 +23,6 @@
import org.apache.spark.ml.param.ParamMap;
import org.apache.spark.ml.regression.LinearRegression;
import org.apache.spark.ml.tuning.*;
- import org.apache.spark.mllib.regression.LabeledPoint;
- import org.apache.spark.mllib.util.MLUtils;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.SQLContext;

@@ -46,9 +44,7 @@ public static void main(String[] args) {
JavaSparkContext jsc = new JavaSparkContext(conf);
SQLContext jsql = new SQLContext(jsc);

- DataFrame data = jsql.createDataFrame(
-   MLUtils.loadLibSVMFile(jsc.sc(), "data/mllib/sample_libsvm_data.txt"),
-   LabeledPoint.class);
+ DataFrame data = jsql.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt");

// Prepare training and test data.
DataFrame[] splits = data.randomSplit(new double [] {0.9, 0.1}, 12345);
@@ -28,16 +28,15 @@
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
- from pyspark.mllib.util import MLUtils
# $example off$

if __name__ == "__main__":
sc = SparkContext(appName="decision_tree_classification_example")
sqlContext = SQLContext(sc)

# $example on$
- # Load and parse the data file, converting it to a DataFrame.
- data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF()
+ # Load the data stored in LIBSVM format as a DataFrame.
+ data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")

# Index labels, adding metadata to the label column.
# Fit on whole dataset to include all labels in index.
@@ -28,16 +28,15 @@
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator
- from pyspark.mllib.util import MLUtils
# $example off$

if __name__ == "__main__":
sc = SparkContext(appName="decision_tree_classification_example")
sqlContext = SQLContext(sc)

# $example on$
- # Load and parse the data file, converting it to a DataFrame.
- data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF()
+ # Load the data stored in LIBSVM format as a DataFrame.
+ data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")

# Automatically identify categorical features, and index them.
# We specify maxCategories so features with > 4 distinct values are treated as continuous.
5 changes: 2 additions & 3 deletions examples/src/main/python/ml/gradient_boosted_trees.py
@@ -24,7 +24,6 @@
from pyspark.ml.feature import StringIndexer
from pyspark.ml.regression import GBTRegressor
from pyspark.mllib.evaluation import BinaryClassificationMetrics, RegressionMetrics
- from pyspark.mllib.util import MLUtils
from pyspark.sql import Row, SQLContext

"""
@@ -70,8 +69,8 @@ def testRegression(train, test):
sc = SparkContext(appName="PythonGBTExample")
sqlContext = SQLContext(sc)

- # Load and parse the data file into a dataframe.
- df = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF()
+ # Load the data stored in LIBSVM format as a DataFrame.
+ df = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")

# Map labels into an indexed column of labels in [0, numLabels)
stringIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel")
5 changes: 2 additions & 3 deletions examples/src/main/python/ml/logistic_regression.py
@@ -23,7 +23,6 @@
from pyspark.ml.classification import LogisticRegression
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.ml.feature import StringIndexer
- from pyspark.mllib.util import MLUtils
from pyspark.sql import SQLContext

"""
@@ -41,8 +40,8 @@
sc = SparkContext(appName="PythonLogisticRegressionExample")
sqlContext = SQLContext(sc)

- # Load and parse the data file into a dataframe.
- df = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF()
+ # Load the data stored in LIBSVM format as a DataFrame.
+ df = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")

# Map labels into an indexed column of labels in [0, numLabels)
stringIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel")
@@ -22,7 +22,6 @@
# $example on$
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
- from pyspark.mllib.util import MLUtils
# $example off$

if __name__ == "__main__":
Expand All @@ -32,8 +31,8 @@

# $example on$
# Load training data
- data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_multiclass_classification_data.txt")\
-     .toDF()
+ data = sqlContext.read.format("libsvm")\
+     .load("data/mllib/sample_multiclass_classification_data.txt")
# Split the data into train and test
splits = data.randomSplit([0.6, 0.4], 1234)
train = splits[0]
4 changes: 2 additions & 2 deletions examples/src/main/python/ml/random_forest_example.py
@@ -74,8 +74,8 @@ def testRegression(train, test):
sc = SparkContext(appName="PythonRandomForestExample")
sqlContext = SQLContext(sc)

- # Load and parse the data file into a dataframe.
- df = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF()
+ # Load the data stored in LIBSVM format as a DataFrame.
+ df = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")

# Map labels into an indexed column of labels in [0, numLabels)
stringIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel")
@@ -26,18 +26,16 @@ import org.apache.spark.ml.classification.DecisionTreeClassifier
import org.apache.spark.ml.classification.DecisionTreeClassificationModel
import org.apache.spark.ml.feature.{StringIndexer, IndexToString, VectorIndexer}
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
- import org.apache.spark.mllib.util.MLUtils
// $example off$

object DecisionTreeClassificationExample {
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setAppName("DecisionTreeClassificationExample")
val sc = new SparkContext(conf)
val sqlContext = new SQLContext(sc)
- import sqlContext.implicits._
// $example on$
- // Load and parse the data file, converting it to a DataFrame.
- val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF()
+ // Load the data stored in LIBSVM format as a DataFrame.
+ val data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")

// Index labels, adding metadata to the label column.
// Fit on whole dataset to include all labels in index.