[SPARK-11723][ML][DOC] Use LibSVM data source rather than MLUtils.loadLibSVMFile to load DataFrame

Use the LibSVM data source rather than MLUtils.loadLibSVMFile to load DataFrames. This includes:
* Use the libsvm data source for all example code under examples/ml, and remove unused imports.
* Use the libsvm data source for the user guides under ml-* that were omitted by #8697.
* Fix bug: the Java side should use ```sqlContext.read().format("libsvm").load(path)```, but the API doc and user guides misused the Scala form ```sqlContext.read.format("libsvm").load(path)``` (see the sketch after this list).
* Code cleanup.
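For context, a minimal sketch of the corrected Java usage. The class name, `show()` call, and `stop()` call are illustrative additions, not part of this commit; the reader API shown (`SQLContext.read()` with the `libsvm` format) is the one this diff switches the docs to.

```java
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.SQLContext;

public class JavaLibSVMLoadSketch {
  public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("JavaLibSVMLoadSketch");
    JavaSparkContext jsc = new JavaSparkContext(conf);
    SQLContext sqlContext = new SQLContext(jsc);

    // In Java, read() is a method call on SQLContext. The Scala-style
    // field access sqlContext.read does not compile in Java, which is
    // the misuse this commit fixes in the API doc and user guides.
    DataFrame data = sqlContext.read().format("libsvm")
      .load("data/mllib/sample_libsvm_data.txt");

    data.show();
    jsc.stop();
  }
}
```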

mengxr

Author: Yanbo Liang <ybliang8@gmail.com>

Closes #9690 from yanboliang/spark-11723.
yanboliang authored and mengxr committed Nov 13, 2015
1 parent 61a2848 commit 99693fe
Showing 26 changed files with 79 additions and 130 deletions.
10 changes: 5 additions & 5 deletions docs/ml-ensembles.md
@@ -195,7 +195,7 @@ import org.apache.spark.ml.feature.*;
import org.apache.spark.sql.DataFrame;

// Load and parse the data file, converting it to a DataFrame.
- DataFrame data = sqlContext.read.format("libsvm")
+ DataFrame data = sqlContext.read().format("libsvm")
.load("data/mllib/sample_libsvm_data.txt");

// Index labels, adding metadata to the label column.
@@ -384,7 +384,7 @@ import org.apache.spark.ml.regression.RandomForestRegressor;
import org.apache.spark.sql.DataFrame;

// Load and parse the data file, converting it to a DataFrame.
- DataFrame data = sqlContext.read.format("libsvm")
+ DataFrame data = sqlContext.read().format("libsvm")
.load("data/mllib/sample_libsvm_data.txt");

// Automatically identify categorical features, and index them.
@@ -640,7 +640,7 @@ import org.apache.spark.ml.feature.*;
import org.apache.spark.sql.DataFrame;

// Load and parse the data file, converting it to a DataFrame.
- DataFrame data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt");
+ DataFrame data = sqlContext.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt");

// Index labels, adding metadata to the label column.
// Fit on whole dataset to include all labels in index.
@@ -830,7 +830,7 @@ import org.apache.spark.ml.regression.GBTRegressor;
import org.apache.spark.sql.DataFrame;

// Load and parse the data file, converting it to a DataFrame.
- DataFrame data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt");
+ DataFrame data = sqlContext.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt");

// Automatically identify categorical features, and index them.
// Set maxCategories so features with > 4 distinct values are treated as continuous.
@@ -1000,7 +1000,7 @@ SparkConf conf = new SparkConf().setAppName("JavaOneVsRestExample");
JavaSparkContext jsc = new JavaSparkContext(conf);
SQLContext jsql = new SQLContext(jsc);

- DataFrame dataFrame = sqlContext.read.format("libsvm")
+ DataFrame dataFrame = sqlContext.read().format("libsvm")
.load("data/mllib/sample_multiclass_classification_data.txt");

DataFrame[] splits = dataFrame.randomSplit(new double[] {0.7, 0.3}, 12345);
8 changes: 4 additions & 4 deletions docs/ml-features.md
@@ -1109,7 +1109,7 @@ import org.apache.spark.ml.feature.VectorIndexer;
import org.apache.spark.ml.feature.VectorIndexerModel;
import org.apache.spark.sql.DataFrame;

- DataFrame data = sqlContext.read.format("libsvm")
+ DataFrame data = sqlContext.read().format("libsvm")
.load("data/mllib/sample_libsvm_data.txt");
VectorIndexer indexer = new VectorIndexer()
.setInputCol("features")
@@ -1187,7 +1187,7 @@ for more details on the API.
import org.apache.spark.ml.feature.Normalizer;
import org.apache.spark.sql.DataFrame;

- DataFrame dataFrame = sqlContext.read.format("libsvm")
+ DataFrame dataFrame = sqlContext.read().format("libsvm")
.load("data/mllib/sample_libsvm_data.txt");

// Normalize each Vector using $L^1$ norm.
@@ -1273,7 +1273,7 @@ import org.apache.spark.ml.feature.StandardScaler;
import org.apache.spark.ml.feature.StandardScalerModel;
import org.apache.spark.sql.DataFrame;

- DataFrame dataFrame = sqlContext.read.format("libsvm")
+ DataFrame dataFrame = sqlContext.read().format("libsvm")
.load("data/mllib/sample_libsvm_data.txt");
StandardScaler scaler = new StandardScaler()
.setInputCol("features")
@@ -1366,7 +1366,7 @@ import org.apache.spark.ml.feature.MinMaxScaler;
import org.apache.spark.ml.feature.MinMaxScalerModel;
import org.apache.spark.sql.DataFrame;

- DataFrame dataFrame = sqlContext.read.format("libsvm")
+ DataFrame dataFrame = sqlContext.read().format("libsvm")
.load("data/mllib/sample_libsvm_data.txt");
MinMaxScaler scaler = new MinMaxScaler()
.setInputCol("features")
10 changes: 2 additions & 8 deletions docs/ml-guide.md
@@ -867,10 +867,9 @@ The `ParamMap` which produces the best evaluation metric is selected as the best
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.ml.tuning.{ParamGridBuilder, TrainValidationSplit}
- import org.apache.spark.mllib.util.MLUtils

// Prepare training and test data.
- val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF()
+ val data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
val Array(training, test) = data.randomSplit(Array(0.9, 0.1), seed = 12345)

val lr = new LinearRegression()
@@ -911,14 +910,9 @@ import org.apache.spark.ml.evaluation.RegressionEvaluator;
import org.apache.spark.ml.param.ParamMap;
import org.apache.spark.ml.regression.LinearRegression;
import org.apache.spark.ml.tuning.*;
- import org.apache.spark.mllib.regression.LabeledPoint;
- import org.apache.spark.mllib.util.MLUtils;
- import org.apache.spark.rdd.RDD;
import org.apache.spark.sql.DataFrame;

- DataFrame data = sqlContext.createDataFrame(
-   MLUtils.loadLibSVMFile(jsc.sc(), "data/mllib/sample_libsvm_data.txt"),
-   LabeledPoint.class);
+ DataFrame data = jsql.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt");

// Prepare training and test data.
DataFrame[] splits = data.randomSplit(new double[] {0.9, 0.1}, 12345);
4 changes: 2 additions & 2 deletions docs/ml-linear-methods.md
@@ -95,7 +95,7 @@ public class LogisticRegressionWithElasticNetExample {
String path = "data/mllib/sample_libsvm_data.txt";

// Load training data
- DataFrame training = sqlContext.read.format("libsvm").load(path);
+ DataFrame training = sqlContext.read().format("libsvm").load(path);

LogisticRegression lr = new LogisticRegression()
.setMaxIter(10)
@@ -292,7 +292,7 @@ public class LinearRegressionWithElasticNetExample {
String path = "data/mllib/sample_libsvm_data.txt";

// Load training data
- DataFrame training = sqlContext.read.format("libsvm").load(path);
+ DataFrame training = sqlContext.read().format("libsvm").load(path);

LinearRegression lr = new LinearRegression()
.setMaxIter(10)
@@ -26,9 +26,6 @@
import org.apache.spark.ml.classification.DecisionTreeClassificationModel;
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator;
import org.apache.spark.ml.feature.*;
- import org.apache.spark.mllib.regression.LabeledPoint;
- import org.apache.spark.mllib.util.MLUtils;
- import org.apache.spark.rdd.RDD;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.SQLContext;
// $example off$
@@ -40,9 +37,8 @@ public static void main(String[] args) {
SQLContext sqlContext = new SQLContext(jsc);

// $example on$
- // Load and parse the data file, converting it to a DataFrame.
- RDD<LabeledPoint> rdd = MLUtils.loadLibSVMFile(jsc.sc(), "data/mllib/sample_libsvm_data.txt");
- DataFrame data = sqlContext.createDataFrame(rdd, LabeledPoint.class);
+ // Load the data stored in LIBSVM format as a DataFrame.
+ DataFrame data = sqlContext.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt");

// Index labels, adding metadata to the label column.
// Fit on whole dataset to include all labels in index.
@@ -27,9 +27,6 @@
import org.apache.spark.ml.feature.VectorIndexerModel;
import org.apache.spark.ml.regression.DecisionTreeRegressionModel;
import org.apache.spark.ml.regression.DecisionTreeRegressor;
- import org.apache.spark.mllib.regression.LabeledPoint;
- import org.apache.spark.mllib.util.MLUtils;
- import org.apache.spark.rdd.RDD;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.SQLContext;
// $example off$
@@ -40,9 +37,9 @@ public static void main(String[] args) {
JavaSparkContext jsc = new JavaSparkContext(conf);
SQLContext sqlContext = new SQLContext(jsc);
// $example on$
- // Load and parse the data file, converting it to a DataFrame.
- RDD<LabeledPoint> rdd = MLUtils.loadLibSVMFile(jsc.sc(), "data/mllib/sample_libsvm_data.txt");
- DataFrame data = sqlContext.createDataFrame(rdd, LabeledPoint.class);
+ // Load the data stored in LIBSVM format as a DataFrame.
+ DataFrame data = sqlContext.read().format("libsvm")
+   .load("data/mllib/sample_libsvm_data.txt");

// Automatically identify categorical features, and index them.
// Set maxCategories so features with > 4 distinct values are treated as continuous.
@@ -21,12 +21,9 @@
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SQLContext;
- import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.ml.classification.MultilayerPerceptronClassificationModel;
import org.apache.spark.ml.classification.MultilayerPerceptronClassifier;
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator;
- import org.apache.spark.mllib.regression.LabeledPoint;
- import org.apache.spark.mllib.util.MLUtils;
import org.apache.spark.sql.DataFrame;
// $example off$

@@ -43,8 +40,7 @@ public static void main(String[] args) {
// $example on$
// Load training data
String path = "data/mllib/sample_multiclass_classification_data.txt";
- JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(jsc.sc(), path).toJavaRDD();
- DataFrame dataFrame = jsql.createDataFrame(data, LabeledPoint.class);
+ DataFrame dataFrame = jsql.read().format("libsvm").load(path);
// Split the data into train and test
DataFrame[] splits = dataFrame.randomSplit(new double[]{0.6, 0.4}, 1234L);
DataFrame train = splits[0];
@@ -27,9 +27,7 @@
import org.apache.spark.ml.util.MetadataUtils;
import org.apache.spark.mllib.evaluation.MulticlassMetrics;
import org.apache.spark.mllib.linalg.Matrix;
- import org.apache.spark.mllib.regression.LabeledPoint;
- import org.apache.spark.mllib.util.MLUtils;
- import org.apache.spark.rdd.RDD;
+ import org.apache.spark.mllib.linalg.Vector;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.types.StructField;
@@ -80,31 +78,30 @@ public static void main(String[] args) {
OneVsRest ovr = new OneVsRest().setClassifier(classifier);

String input = params.input;
- RDD<LabeledPoint> inputData = MLUtils.loadLibSVMFile(jsc.sc(), input);
- RDD<LabeledPoint> train;
- RDD<LabeledPoint> test;
+ DataFrame inputData = jsql.read().format("libsvm").load(input);
+ DataFrame train;
+ DataFrame test;

// compute the train/ test split: if testInput is not provided use part of input
String testInput = params.testInput;
if (testInput != null) {
train = inputData;
// compute the number of features in the training set.
- int numFeatures = inputData.first().features().size();
- test = MLUtils.loadLibSVMFile(jsc.sc(), testInput, numFeatures);
+ int numFeatures = inputData.first().<Vector>getAs(1).size();
+ test = jsql.read().format("libsvm").option("numFeatures",
+   String.valueOf(numFeatures)).load(testInput);
} else {
double f = params.fracTest;
- RDD<LabeledPoint>[] tmp = inputData.randomSplit(new double[]{1 - f, f}, 12345);
+ DataFrame[] tmp = inputData.randomSplit(new double[]{1 - f, f}, 12345);
train = tmp[0];
test = tmp[1];
}

// train the multiclass model
- DataFrame trainingDataFrame = jsql.createDataFrame(train, LabeledPoint.class);
- OneVsRestModel ovrModel = ovr.fit(trainingDataFrame.cache());
+ OneVsRestModel ovrModel = ovr.fit(train.cache());

// score the model on test data
- DataFrame testDataFrame = jsql.createDataFrame(test, LabeledPoint.class);
- DataFrame predictions = ovrModel.transform(testDataFrame.cache())
+ DataFrame predictions = ovrModel.transform(test.cache())
.select("prediction", "label");

// obtain metrics
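The hunk above relies on the libsvm data source's `numFeatures` option: when test data is loaded separately, it must be pinned to the training set's feature dimension so the two DataFrames agree. A condensed sketch of that pattern, with an illustrative class name and method (the `getAs(1)` call assumes the libsvm source's schema of `label` at column 0 and `features` at column 1, as in the diff):

```java
import org.apache.spark.mllib.linalg.Vector;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.SQLContext;

public class NumFeaturesSketch {
  // jsql is an existing SQLContext; testPath is an illustrative path.
  static DataFrame loadTestWithTrainDims(SQLContext jsql, DataFrame train, String testPath) {
    // Column 1 is the "features" vector in the libsvm source's schema.
    int numFeatures = train.first().<Vector>getAs(1).size();
    // Pin the test set to the training dimensionality so vectors align
    // even if the test file's highest feature index is smaller.
    return jsql.read().format("libsvm")
      .option("numFeatures", String.valueOf(numFeatures))
      .load(testPath);
  }
}
```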
@@ -23,8 +23,6 @@
import org.apache.spark.ml.param.ParamMap;
import org.apache.spark.ml.regression.LinearRegression;
import org.apache.spark.ml.tuning.*;
- import org.apache.spark.mllib.regression.LabeledPoint;
- import org.apache.spark.mllib.util.MLUtils;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.SQLContext;

@@ -46,9 +44,7 @@ public static void main(String[] args) {
JavaSparkContext jsc = new JavaSparkContext(conf);
SQLContext jsql = new SQLContext(jsc);

- DataFrame data = jsql.createDataFrame(
-   MLUtils.loadLibSVMFile(jsc.sc(), "data/mllib/sample_libsvm_data.txt"),
-   LabeledPoint.class);
+ DataFrame data = jsql.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt");

// Prepare training and test data.
DataFrame[] splits = data.randomSplit(new double [] {0.9, 0.1}, 12345);
@@ -28,16 +28,15 @@
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
- from pyspark.mllib.util import MLUtils
# $example off$

if __name__ == "__main__":
sc = SparkContext(appName="decision_tree_classification_example")
sqlContext = SQLContext(sc)

# $example on$
- # Load and parse the data file, converting it to a DataFrame.
- data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF()
+ # Load the data stored in LIBSVM format as a DataFrame.
+ data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")

# Index labels, adding metadata to the label column.
# Fit on whole dataset to include all labels in index.
@@ -28,16 +28,15 @@
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator
- from pyspark.mllib.util import MLUtils
# $example off$

if __name__ == "__main__":
sc = SparkContext(appName="decision_tree_classification_example")
sqlContext = SQLContext(sc)

# $example on$
- # Load and parse the data file, converting it to a DataFrame.
- data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF()
+ # Load the data stored in LIBSVM format as a DataFrame.
+ data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")

# Automatically identify categorical features, and index them.
# We specify maxCategories so features with > 4 distinct values are treated as continuous.
5 changes: 2 additions & 3 deletions examples/src/main/python/ml/gradient_boosted_trees.py
@@ -24,7 +24,6 @@
from pyspark.ml.feature import StringIndexer
from pyspark.ml.regression import GBTRegressor
from pyspark.mllib.evaluation import BinaryClassificationMetrics, RegressionMetrics
- from pyspark.mllib.util import MLUtils
from pyspark.sql import Row, SQLContext

"""
@@ -70,8 +69,8 @@ def testRegression(train, test):
sc = SparkContext(appName="PythonGBTExample")
sqlContext = SQLContext(sc)

- # Load and parse the data file into a dataframe.
- df = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF()
+ # Load the data stored in LIBSVM format as a DataFrame.
+ df = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")

# Map labels into an indexed column of labels in [0, numLabels)
stringIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel")
5 changes: 2 additions & 3 deletions examples/src/main/python/ml/logistic_regression.py
@@ -23,7 +23,6 @@
from pyspark.ml.classification import LogisticRegression
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.ml.feature import StringIndexer
- from pyspark.mllib.util import MLUtils
from pyspark.sql import SQLContext

"""
@@ -41,8 +40,8 @@
sc = SparkContext(appName="PythonLogisticRegressionExample")
sqlContext = SQLContext(sc)

- # Load and parse the data file into a dataframe.
- df = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF()
+ # Load the data stored in LIBSVM format as a DataFrame.
+ df = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")

# Map labels into an indexed column of labels in [0, numLabels)
stringIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel")
@@ -22,7 +22,6 @@
# $example on$
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
- from pyspark.mllib.util import MLUtils
# $example off$

if __name__ == "__main__":
Expand All @@ -32,8 +31,8 @@

# $example on$
# Load training data
- data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_multiclass_classification_data.txt")\
-     .toDF()
+ data = sqlContext.read.format("libsvm")\
+     .load("data/mllib/sample_multiclass_classification_data.txt")
# Split the data into train and test
splits = data.randomSplit([0.6, 0.4], 1234)
train = splits[0]
4 changes: 2 additions & 2 deletions examples/src/main/python/ml/random_forest_example.py
@@ -74,8 +74,8 @@ def testRegression(train, test):
sc = SparkContext(appName="PythonRandomForestExample")
sqlContext = SQLContext(sc)

- # Load and parse the data file into a dataframe.
- df = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF()
+ # Load the data stored in LIBSVM format as a DataFrame.
+ df = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")

# Map labels into an indexed column of labels in [0, numLabels)
stringIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel")
@@ -26,18 +26,16 @@ import org.apache.spark.ml.classification.DecisionTreeClassifier
import org.apache.spark.ml.classification.DecisionTreeClassificationModel
import org.apache.spark.ml.feature.{StringIndexer, IndexToString, VectorIndexer}
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
- import org.apache.spark.mllib.util.MLUtils
// $example off$

object DecisionTreeClassificationExample {
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setAppName("DecisionTreeClassificationExample")
val sc = new SparkContext(conf)
val sqlContext = new SQLContext(sc)
- import sqlContext.implicits._
// $example on$
- // Load and parse the data file, converting it to a DataFrame.
- val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF()
+ // Load the data stored in LIBSVM format as a DataFrame.
+ val data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")

// Index labels, adding metadata to the label column.
// Fit on whole dataset to include all labels in index.