From a5a63f3121c532a3cdb7e255031514380d268018 Mon Sep 17 00:00:00 2001 From: y-shimizu Date: Thu, 10 Sep 2015 20:58:41 +0900 Subject: [PATCH 1/6] Update code examples in spark.ml user guide to use LIBSVM data source instead of MLUtils --- docs/ml-ensembles.md | 33 ++++++++++++++------------------- 1 file changed, 14 insertions(+), 19 deletions(-) diff --git a/docs/ml-ensembles.md b/docs/ml-ensembles.md index 62749909e01dc..3ff2c7958ad3a 100644 --- a/docs/ml-ensembles.md +++ b/docs/ml-ensembles.md @@ -124,7 +124,7 @@ import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator import org.apache.spark.mllib.util.MLUtils // Load and parse the data file, converting it to a DataFrame. -val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF() +val data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") // Index labels, adding metadata to the label column. // Fit on whole dataset to include all labels in index. @@ -199,8 +199,7 @@ import org.apache.spark.rdd.RDD; import org.apache.spark.sql.DataFrame; // Load and parse the data file, converting it to a DataFrame. -RDD<LabeledPoint> rdd = MLUtils.loadLibSVMFile(sc.sc(), "data/mllib/sample_libsvm_data.txt"); -DataFrame data = jsql.createDataFrame(rdd, LabeledPoint.class); +val data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt"); // Index labels, adding metadata to the label column. // Fit on whole dataset to include all labels in index. @@ -271,7 +270,7 @@ from pyspark.ml.evaluation import MulticlassClassificationEvaluator from pyspark.mllib.util import MLUtils # Load and parse the data file, converting it to a DataFrame. -data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF() +data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") # Index labels, adding metadata to the label column. # Fit on whole dataset to include all labels in index. @@ -330,7 +329,7 @@ import org.apache.spark.ml.evaluation.RegressionEvaluator import org.apache.spark.mllib.util.MLUtils // Load and parse the data file, converting it to a DataFrame. -val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF() +val data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") // Automatically identify categorical features, and index them. // Set maxCategories so features with > 4 distinct values are treated as continuous. @@ -393,8 +392,7 @@ import org.apache.spark.rdd.RDD; import org.apache.spark.sql.DataFrame; // Load and parse the data file, converting it to a DataFrame. -RDD<LabeledPoint> rdd = MLUtils.loadLibSVMFile(sc.sc(), "data/mllib/sample_libsvm_data.txt"); -DataFrame data = jsql.createDataFrame(rdd, LabeledPoint.class); +val data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt"); // Automatically identify categorical features, and index them. // Set maxCategories so features with > 4 distinct values are treated as continuous. @@ -453,7 +451,7 @@ from pyspark.ml.evaluation import RegressionEvaluator from pyspark.mllib.util import MLUtils # Load and parse the data file, converting it to a DataFrame. -data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF() +data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") # Automatically identify categorical features, and index them. # Set maxCategories so features with > 4 distinct values are treated as continuous.
@@ -579,7 +577,7 @@ import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator import org.apache.spark.mllib.util.MLUtils // Load and parse the data file, converting it to a DataFrame. -val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF() +val data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") // Index labels, adding metadata to the label column. // Fit on whole dataset to include all labels in index. @@ -654,8 +652,7 @@ import org.apache.spark.rdd.RDD; import org.apache.spark.sql.DataFrame; // Load and parse the data file, converting it to a DataFrame. -RDD<LabeledPoint> rdd = MLUtils.loadLibSVMFile(sc.sc(), "data/mllib/sample_libsvm_data.txt"); -DataFrame data = jsql.createDataFrame(rdd, LabeledPoint.class); +val data sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt"); // Index labels, adding metadata to the label column. // Fit on whole dataset to include all labels in index. @@ -727,7 +724,7 @@ from pyspark.ml.evaluation import MulticlassClassificationEvaluator from pyspark.mllib.util import MLUtils # Load and parse the data file, converting it to a DataFrame. -data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF() +data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") # Index labels, adding metadata to the label column. # Fit on whole dataset to include all labels in index. @@ -786,7 +783,7 @@ import org.apache.spark.ml.evaluation.RegressionEvaluator import org.apache.spark.mllib.util.MLUtils // Load and parse the data file, converting it to a DataFrame. -val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF() +val data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") // Automatically identify categorical features, and index them. // Set maxCategories so features with > 4 distinct values are treated as continuous. @@ -850,8 +847,7 @@ import org.apache.spark.rdd.RDD; import org.apache.spark.sql.DataFrame; // Load and parse the data file, converting it to a DataFrame. -RDD<LabeledPoint> rdd = MLUtils.loadLibSVMFile(sc.sc(), "data/mllib/sample_libsvm_data.txt"); -DataFrame data = jsql.createDataFrame(rdd, LabeledPoint.class); +val data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt"); // Automatically identify categorical features, and index them. // Set maxCategories so features with > 4 distinct values are treated as continuous. @@ -911,7 +907,7 @@ from pyspark.ml.evaluation import RegressionEvaluator from pyspark.mllib.util import MLUtils # Load and parse the data file, converting it to a DataFrame. -data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF() +data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") # Automatically identify categorical features, and index them. # Set maxCategories so features with > 4 distinct values are treated as continuous.
@@ -976,9 +972,8 @@ import org.apache.spark.sql.{Row, SQLContext} val sqlContext = new SQLContext(sc) // parse data into dataframe -val data = MLUtils.loadLibSVMFile(sc, - "data/mllib/sample_multiclass_classification_data.txt") -val Array(train, test) = data.toDF().randomSplit(Array(0.7, 0.3)) +val data = sqlContext.read.format("libsvm").load("data/mllib/sample_multiclass_classification_data.txt") +val Array(train, test) = data.randomSplit(Array(0.7, 0.3)) // instantiate multiclass learner and train val ovr = new OneVsRest().setClassifier(new LogisticRegression) From 0a4db4b0ce6f75ab7ed6a650356705cc132f6920 Mon Sep 17 00:00:00 2001 From: y-shimizu Date: Fri, 11 Sep 2015 10:31:47 +0900 Subject: [PATCH 2/6] remove leftover MLUtils usage in OneVsRest example --- docs/ml-ensembles.md | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/docs/ml-ensembles.md b/docs/ml-ensembles.md index 3ff2c7958ad3a..a2495fc0f7b91 100644 --- a/docs/ml-ensembles.md +++ b/docs/ml-ensembles.md @@ -1021,10 +1021,8 @@ SparkConf conf = new SparkConf().setAppName("JavaOneVsRestExample"); JavaSparkContext jsc = new JavaSparkContext(conf); SQLContext jsql = new SQLContext(jsc); -RDD<LabeledPoint> data = MLUtils.loadLibSVMFile(jsc.sc(), - "data/mllib/sample_multiclass_classification_data.txt"); +DataFrame dataFrame = jsql.read().format("libsvm").load("data/mllib/sample_multiclass_classification_data.txt"); -DataFrame dataFrame = jsql.createDataFrame(data, LabeledPoint.class); DataFrame[] splits = dataFrame.randomSplit(new double[] {0.7, 0.3}, 12345); DataFrame train = splits[0]; DataFrame test = splits[1]; From 193bddf51a6452c0dd67de2c3f872964467af088 Mon Sep 17 00:00:00 2001 From: y-shimizu Date: Fri, 11 Sep 2015 10:35:40 +0900 Subject: [PATCH 3/6] fix typos in Java code: 'val' to 'DataFrame', missing '=', and missing read() parentheses --- docs/ml-ensembles.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/ml-ensembles.md b/docs/ml-ensembles.md index a2495fc0f7b91..cd5e3f4785b27 100644 --- a/docs/ml-ensembles.md +++ b/docs/ml-ensembles.md @@ -199,7 +199,7 @@ import org.apache.spark.rdd.RDD; import org.apache.spark.sql.DataFrame; // Load and parse the data file, converting it to a DataFrame. -val data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt"); +DataFrame data = sqlContext.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt"); // Index labels, adding metadata to the label column. // Fit on whole dataset to include all labels in index. @@ -392,7 +392,7 @@ import org.apache.spark.rdd.RDD; import org.apache.spark.sql.DataFrame; // Load and parse the data file, converting it to a DataFrame. -val data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt"); +DataFrame data = sqlContext.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt"); // Automatically identify categorical features, and index them. // Set maxCategories so features with > 4 distinct values are treated as continuous. @@ -652,7 +652,7 @@ import org.apache.spark.rdd.RDD; import org.apache.spark.sql.DataFrame; // Load and parse the data file, converting it to a DataFrame. -val data sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt"); +DataFrame data = sqlContext.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt"); // Index labels, adding metadata to the label column. // Fit on whole dataset to include all labels in index. @@ -847,7 +847,7 @@ import org.apache.spark.rdd.RDD; import org.apache.spark.sql.DataFrame; // Load and parse the data file, converting it to a DataFrame.
-val data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt"); +DataFrame data = sqlContext.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt"); // Automatically identify categorical features, and index them. // Set maxCategories so features with > 4 distinct values are treated as continuous. From 8d4c0527f90163caaecfb7f1268d899af927c6fc Mon Sep 17 00:00:00 2001 From: y-shimizu Date: Fri, 11 Sep 2015 10:41:46 +0900 Subject: [PATCH 4/6] remove unused imports --- docs/ml-ensembles.md | 24 ------------------------ 1 file changed, 24 deletions(-) diff --git a/docs/ml-ensembles.md b/docs/ml-ensembles.md index cd5e3f4785b27..349d99b5bf5ee 100644 --- a/docs/ml-ensembles.md +++ b/docs/ml-ensembles.md @@ -121,7 +121,6 @@ import org.apache.spark.ml.classification.RandomForestClassifier import org.apache.spark.ml.classification.RandomForestClassificationModel import org.apache.spark.ml.feature.{StringIndexer, IndexToString, VectorIndexer} import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator -import org.apache.spark.mllib.util.MLUtils // Load and parse the data file, converting it to a DataFrame. val data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") @@ -192,9 +192,6 @@ import org.apache.spark.ml.classification.RandomForestClassifier; import org.apache.spark.ml.classification.RandomForestClassificationModel; import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator; import org.apache.spark.ml.feature.*; -import org.apache.spark.mllib.regression.LabeledPoint; -import org.apache.spark.mllib.util.MLUtils; -import org.apache.spark.rdd.RDD; import org.apache.spark.sql.DataFrame; // Load and parse the data file, converting it to a DataFrame. @@ -267,7 +263,6 @@ from pyspark.ml import Pipeline from pyspark.ml.classification import RandomForestClassifier from pyspark.ml.feature import StringIndexer, VectorIndexer from pyspark.ml.evaluation import MulticlassClassificationEvaluator -from pyspark.mllib.util import MLUtils # Load and parse the data file, converting it to a DataFrame. data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") @@ -326,7 +321,6 @@ import org.apache.spark.ml.regression.RandomForestRegressor import org.apache.spark.ml.regression.RandomForestRegressionModel import org.apache.spark.ml.feature.VectorIndexer import org.apache.spark.ml.evaluation.RegressionEvaluator -import org.apache.spark.mllib.util.MLUtils // Load and parse the data file, converting it to a DataFrame. val data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") @@ -386,9 +380,6 @@ import org.apache.spark.ml.feature.VectorIndexer; import org.apache.spark.ml.feature.VectorIndexerModel; import org.apache.spark.ml.regression.RandomForestRegressionModel; import org.apache.spark.ml.regression.RandomForestRegressor; -import org.apache.spark.mllib.regression.LabeledPoint; -import org.apache.spark.mllib.util.MLUtils; -import org.apache.spark.rdd.RDD; import org.apache.spark.sql.DataFrame; // Load and parse the data file, converting it to a DataFrame. @@ -448,7 +439,6 @@ from pyspark.ml import Pipeline from pyspark.ml.regression import RandomForestRegressor from pyspark.ml.feature import VectorIndexer from pyspark.ml.evaluation import RegressionEvaluator -from pyspark.mllib.util import MLUtils # Load and parse the data file, converting it to a DataFrame.
data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") @@ -574,7 +564,6 @@ import org.apache.spark.ml.classification.GBTClassifier import org.apache.spark.ml.classification.GBTClassificationModel import org.apache.spark.ml.feature.{StringIndexer, IndexToString, VectorIndexer} import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator -import org.apache.spark.mllib.util.MLUtils // Load and parse the data file, converting it to a DataFrame. val data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") @@ -646,9 +635,6 @@ import org.apache.spark.ml.classification.GBTClassifier; import org.apache.spark.ml.classification.GBTClassificationModel; import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator; import org.apache.spark.ml.feature.*; -import org.apache.spark.mllib.regression.LabeledPoint; -import org.apache.spark.mllib.util.MLUtils; -import org.apache.spark.rdd.RDD; import org.apache.spark.sql.DataFrame; // Load and parse the data file, converting it to a DataFrame. @@ -721,7 +707,6 @@ from pyspark.ml import Pipeline from pyspark.ml.classification import GBTClassifier from pyspark.ml.feature import StringIndexer, VectorIndexer from pyspark.ml.evaluation import MulticlassClassificationEvaluator -from pyspark.mllib.util import MLUtils # Load and parse the data file, converting it to a DataFrame. data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") @@ -780,7 +765,6 @@ import org.apache.spark.ml.regression.GBTRegressor import org.apache.spark.ml.regression.GBTRegressionModel import org.apache.spark.ml.feature.VectorIndexer import org.apache.spark.ml.evaluation.RegressionEvaluator -import org.apache.spark.mllib.util.MLUtils // Load and parse the data file, converting it to a DataFrame. val data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") @@ -841,9 +825,6 @@ import org.apache.spark.ml.feature.VectorIndexer; import org.apache.spark.ml.feature.VectorIndexerModel; import org.apache.spark.ml.regression.GBTRegressionModel; import org.apache.spark.ml.regression.GBTRegressor; -import org.apache.spark.mllib.regression.LabeledPoint; -import org.apache.spark.mllib.util.MLUtils; -import org.apache.spark.rdd.RDD; import org.apache.spark.sql.DataFrame; // Load and parse the data file, converting it to a DataFrame. @@ -904,7 +885,6 @@ from pyspark.ml import Pipeline from pyspark.ml.regression import GBTRegressor from pyspark.ml.feature import VectorIndexer from pyspark.ml.evaluation import RegressionEvaluator -from pyspark.mllib.util import MLUtils # Load and parse the data file, converting it to a DataFrame. 
data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") @@ -966,7 +946,6 @@ Refer to the [Scala API docs](api/scala/index.html#org.apache.spark.ml.classifie {% highlight scala %} import org.apache.spark.ml.classification.{LogisticRegression, OneVsRest} import org.apache.spark.mllib.evaluation.MulticlassMetrics -import org.apache.spark.mllib.util.MLUtils import org.apache.spark.sql.{Row, SQLContext} val sqlContext = new SQLContext(sc) @@ -1011,9 +990,6 @@ import org.apache.spark.ml.classification.OneVsRest; import org.apache.spark.ml.classification.OneVsRestModel; import org.apache.spark.mllib.evaluation.MulticlassMetrics; import org.apache.spark.mllib.linalg.Matrix; -import org.apache.spark.mllib.regression.LabeledPoint; -import org.apache.spark.mllib.util.MLUtils; -import org.apache.spark.rdd.RDD; import org.apache.spark.sql.DataFrame; import org.apache.spark.sql.SQLContext; From dbf13b7ee18432c92850480f8a09acdb3150573e Mon Sep 17 00:00:00 2001 From: y-shimizu Date: Fri, 11 Sep 2015 10:49:59 +0900 Subject: [PATCH 5/6] insert line breaks in wide lines --- docs/ml-ensembles.md | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/docs/ml-ensembles.md b/docs/ml-ensembles.md index 349d99b5bf5ee..58f566c9b4b55 100644 --- a/docs/ml-ensembles.md +++ b/docs/ml-ensembles.md @@ -195,7 +195,8 @@ import org.apache.spark.ml.feature.*; import org.apache.spark.sql.DataFrame; // Load and parse the data file, converting it to a DataFrame. -DataFrame data = sqlContext.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt"); +DataFrame data = sqlContext.read().format("libsvm") + .load("data/mllib/sample_libsvm_data.txt"); // Index labels, adding metadata to the label column. // Fit on whole dataset to include all labels in index. @@ -383,7 +384,8 @@ import org.apache.spark.ml.regression.RandomForestRegressor; import org.apache.spark.sql.DataFrame; // Load and parse the data file, converting it to a DataFrame. -DataFrame data = sqlContext.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt"); +DataFrame data = sqlContext.read().format("libsvm") + .load("data/mllib/sample_libsvm_data.txt"); // Automatically identify categorical features, and index them. // Set maxCategories so features with > 4 distinct values are treated as continuous.
@@ -951,7 +953,8 @@ import org.apache.spark.sql.{Row, SQLContext} val sqlContext = new SQLContext(sc) // parse data into dataframe -val data = sqlContext.read.format("libsvm").load("data/mllib/sample_multiclass_classification_data.txt") +val data = sqlContext.read.format("libsvm") + .load("data/mllib/sample_multiclass_classification_data.txt") val Array(train, test) = data.randomSplit(Array(0.7, 0.3)) // instantiate multiclass learner and train @@ -997,7 +1000,8 @@ SparkConf conf = new SparkConf().setAppName("JavaOneVsRestExample"); JavaSparkContext jsc = new JavaSparkContext(conf); SQLContext jsql = new SQLContext(jsc); -DataFrame dataFrame = jsql.read().format("libsvm").load("data/mllib/sample_multiclass_classification_data.txt"); +DataFrame dataFrame = jsql.read().format("libsvm") + .load("data/mllib/sample_multiclass_classification_data.txt"); DataFrame[] splits = dataFrame.randomSplit(new double[] {0.7, 0.3}, 12345); DataFrame train = splits[0]; From 9bd663b478f386f0233a93c2218615a7e3c96a34 Mon Sep 17 00:00:00 2001 From: y-shimizu Date: Fri, 11 Sep 2015 17:04:08 +0900 Subject: [PATCH 6/6] update ml-features.md and ml-linear-methods.md guides --- docs/ml-features.md | 64 ++++++++++++++------------------- docs/ml-linear-methods.md | 22 ++++---------- 2 files changed, 28 insertions(+), 58 deletions(-) diff --git a/docs/ml-features.md b/docs/ml-features.md index 58b31a5a5cc47..a414c21b5c280 100644 --- a/docs/ml-features.md +++ b/docs/ml-features.md @@ -1179,9 +1179,9 @@ In the example below, we read in a dataset of labeled points and then use `Vecto
{% highlight scala %} import org.apache.spark.ml.feature.VectorIndexer -import org.apache.spark.mllib.util.MLUtils -val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF() +val data = sqlContext.read.format("libsvm") + .load("data/mllib/sample_libsvm_data.txt") val indexer = new VectorIndexer() .setInputCol("features") .setOutputCol("indexed") .setMaxCategories(10) val indexerModel = indexer.fit(data) @@ -1200,16 +1200,12 @@ val indexedData = indexerModel.transform(data) {% highlight java %} import java.util.Map; -import org.apache.spark.api.java.JavaRDD; import org.apache.spark.ml.feature.VectorIndexer; import org.apache.spark.ml.feature.VectorIndexerModel; -import org.apache.spark.mllib.regression.LabeledPoint; -import org.apache.spark.mllib.util.MLUtils; import org.apache.spark.sql.DataFrame; -JavaRDD<LabeledPoint> rdd = MLUtils.loadLibSVMFile(sc.sc(), "data/mllib/sample_libsvm_data.txt").toJavaRDD(); -DataFrame data = sqlContext.createDataFrame(rdd, LabeledPoint.class); +DataFrame data = sqlContext.read().format("libsvm") + .load("data/mllib/sample_libsvm_data.txt"); VectorIndexer indexer = new VectorIndexer() .setInputCol("features") .setOutputCol("indexed") .setMaxCategories(10); VectorIndexerModel indexerModel = indexer.fit(data); @@ -1230,9 +1226,9 @@ DataFrame indexedData = indexerModel.transform(data);
{% highlight python %} from pyspark.ml.feature import VectorIndexer -from pyspark.mllib.util import MLUtils -data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF() +data = sqlContext.read.format("libsvm") \ + .load("data/mllib/sample_libsvm_data.txt") indexer = VectorIndexer(inputCol="features", outputCol="indexed", maxCategories=10) indexerModel = indexer.fit(data) @@ -1253,10 +1249,9 @@ The following example demonstrates how to load a dataset in libsvm format and th
{% highlight scala %} import org.apache.spark.ml.feature.Normalizer -val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt") -val dataFrame = sqlContext.createDataFrame(data) +val dataFrame = sqlContext.read.format("libsvm") + .load("data/mllib/sample_libsvm_data.txt") // Normalize each Vector using $L^1$ norm. val normalizer = new Normalizer() .setInputCol("features") .setOutputCol("normFeatures") .setP(1.0) val l1NormData = normalizer.transform(dataFrame) val lInfNormData = normalizer.transform(dataFrame, normalizer.p -> Double.PositiveInfinity)
{% highlight java %} -import org.apache.spark.api.java.JavaRDD; import org.apache.spark.ml.feature.Normalizer; -import org.apache.spark.mllib.regression.LabeledPoint; -import org.apache.spark.mllib.util.MLUtils; import org.apache.spark.sql.DataFrame; -JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(jsc.sc(), "data/mllib/sample_libsvm_data.txt").toJavaRDD(); -DataFrame dataFrame = jsql.createDataFrame(data, LabeledPoint.class); +DataFrame dataFrame = sqlContext.read().format("libsvm") + .load("data/mllib/sample_libsvm_data.txt"); // Normalize each Vector using $L^1$ norm. Normalizer normalizer = new Normalizer() .setInputCol("features") .setOutputCol("normFeatures") .setP(1.0); DataFrame l1NormData = normalizer.transform(dataFrame); DataFrame lInfNormData = normalizer.transform(dataFrame, normalizer.p().w(Double.POSITIVE_INFINITY));
{% highlight python %} -from pyspark.mllib.util import MLUtils from pyspark.ml.feature import Normalizer -data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt") -dataFrame = sqlContext.createDataFrame(data) +dataFrame = sqlContext.read.format("libsvm") \ + .load("data/mllib/sample_libsvm_data.txt") # Normalize each Vector using $L^1$ norm. normalizer = Normalizer(inputCol="features", outputCol="normFeatures", p=1.0) @@ -1335,10 +1325,9 @@ The following example demonstrates how to load a dataset in libsvm format and th
{% highlight scala %} import org.apache.spark.ml.feature.StandardScaler -import org.apache.spark.mllib.util.MLUtils -val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt") -val dataFrame = sqlContext.createDataFrame(data) +val dataFrame = sqlContext.read.format("libsvm") + .load("data/mllib/sample_libsvm_data.txt") val scaler = new StandardScaler() .setInputCol("features") .setOutputCol("scaledFeatures") @@ -1355,16 +1344,12 @@ val scaledData = scalerModel.transform(dataFrame)
{% highlight java %} -import org.apache.spark.api.java.JavaRDD; import org.apache.spark.ml.feature.StandardScaler; import org.apache.spark.ml.feature.StandardScalerModel; -import org.apache.spark.mllib.regression.LabeledPoint; -import org.apache.spark.mllib.util.MLUtils; import org.apache.spark.sql.DataFrame; -JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(jsc.sc(), "data/mllib/sample_libsvm_data.txt").toJavaRDD(); -DataFrame dataFrame = jsql.createDataFrame(data, LabeledPoint.class); +DataFrame dataFrame = sqlContext.read().format("libsvm") + .load("data/mllib/sample_libsvm_data.txt"); StandardScaler scaler = new StandardScaler() .setInputCol("features") .setOutputCol("scaledFeatures") .setWithStd(true) .setWithMean(false); // Compute summary statistics by fitting the StandardScaler StandardScalerModel scalerModel = scaler.fit(dataFrame); // Normalize each feature to have unit standard deviation. DataFrame scaledData = scalerModel.transform(dataFrame);
{% highlight python %} -from pyspark.mllib.util import MLUtils from pyspark.ml.feature import StandardScaler -data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt") -dataFrame = sqlContext.createDataFrame(data) +dataFrame = sqlContext.read.format("libsvm") \ + .load("data/mllib/sample_libsvm_data.txt") scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures", withStd=True, withMean=False) @@ -1424,10 +1408,9 @@ More details can be found in the API docs for [MinMaxScalerModel](api/scala/index.html#org.apache.spark.ml.feature.MinMaxScalerModel). {% highlight scala %} import org.apache.spark.ml.feature.MinMaxScaler -import org.apache.spark.mllib.util.MLUtils -val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt") -val dataFrame = sqlContext.createDataFrame(data) +val dataFrame = sqlContext.read.format("libsvm") + .load("data/mllib/sample_libsvm_data.txt") val scaler = new MinMaxScaler() .setInputCol("features") .setOutputCol("scaledFeatures") @@ -1448,13 +1431,10 @@ More details can be found in the API docs for import org.apache.spark.api.java.JavaRDD; import org.apache.spark.ml.feature.MinMaxScaler; import org.apache.spark.ml.feature.MinMaxScalerModel; -import org.apache.spark.mllib.regression.LabeledPoint; -import org.apache.spark.mllib.util.MLUtils; import org.apache.spark.sql.DataFrame; -JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(jsc.sc(), "data/mllib/sample_libsvm_data.txt").toJavaRDD(); -DataFrame dataFrame = jsql.createDataFrame(data, LabeledPoint.class); +DataFrame dataFrame = sqlContext.read().format("libsvm") + .load("data/mllib/sample_libsvm_data.txt"); MinMaxScaler scaler = new MinMaxScaler() .setInputCol("features") .setOutputCol("scaledFeatures"); diff --git a/docs/ml-linear-methods.md b/docs/ml-linear-methods.md index cdd9d4999fa1b..4e94e2f9c708d 100644 --- a/docs/ml-linear-methods.md +++ b/docs/ml-linear-methods.md @@ -59,10 +59,9 @@ $\alpha$ and `regParam` corresponds to $\lambda$.
{% highlight scala %} import org.apache.spark.ml.classification.LogisticRegression -import org.apache.spark.mllib.util.MLUtils // Load training data -val training = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF() +val training = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") val lr = new LogisticRegression() .setMaxIter(10) .setRegParam(0.3) .setElasticNetParam(0.8) // Fit the model val lrModel = lr.fit(training) // Print the weights and intercept for logistic regression println(s"Weights: ${lrModel.weights} Intercept: ${lrModel.intercept}") {% highlight java %} import org.apache.spark.ml.classification.LogisticRegression; import org.apache.spark.ml.classification.LogisticRegressionModel; -import org.apache.spark.mllib.regression.LabeledPoint; -import org.apache.spark.mllib.util.MLUtils; import org.apache.spark.SparkConf; import org.apache.spark.SparkContext; import org.apache.spark.sql.DataFrame; @@ -98,7 +95,7 @@ public class LogisticRegressionWithElasticNetExample { String path = "data/mllib/sample_libsvm_data.txt"; // Load training data - DataFrame training = sql.createDataFrame(MLUtils.loadLibSVMFile(sc, path).toJavaRDD(), LabeledPoint.class); + DataFrame training = sql.read().format("libsvm").load(path); LogisticRegression lr = new LogisticRegression() .setMaxIter(10) .setRegParam(0.3) .setElasticNetParam(0.8) @@ -118,11 +115,9 @@
{% highlight python %} from pyspark.ml.classification import LogisticRegression -from pyspark.mllib.regression import LabeledPoint -from pyspark.mllib.util import MLUtils # Load training data -training = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF() +training = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8) @@ -251,10 +246,9 @@ regression model and extracting model summary statistics.
{% highlight scala %} import org.apache.spark.ml.regression.LinearRegression -import org.apache.spark.mllib.util.MLUtils // Load training data -val training = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF() +val training = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") val lr = new LinearRegression() .setMaxIter(10) .setRegParam(0.3) .setElasticNetParam(0.8) @@ -283,8 +277,6 @@ import org.apache.spark.ml.regression.LinearRegression; import org.apache.spark.ml.regression.LinearRegressionModel; import org.apache.spark.ml.regression.LinearRegressionTrainingSummary; import org.apache.spark.mllib.linalg.Vectors; -import org.apache.spark.mllib.regression.LabeledPoint; -import org.apache.spark.mllib.util.MLUtils; import org.apache.spark.SparkConf; import org.apache.spark.SparkContext; import org.apache.spark.sql.DataFrame; @@ -300,7 +292,7 @@ public class LinearRegressionWithElasticNetExample { String path = "data/mllib/sample_libsvm_data.txt"; // Load training data - DataFrame training = sql.createDataFrame(MLUtils.loadLibSVMFile(sc, path).toJavaRDD(), LabeledPoint.class); + DataFrame training = sql.read().format("libsvm").load(path); LinearRegression lr = new LinearRegression() .setMaxIter(10) .setRegParam(0.3) .setElasticNetParam(0.8) @@ -329,11 +321,9 @@ {% highlight python %} from pyspark.ml.regression import LinearRegression -from pyspark.mllib.regression import LabeledPoint -from pyspark.mllib.util import MLUtils # Load training data -training = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF() +training = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") lr = LinearRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)
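Note: every hunk in this series boils down to the same one-line swap, so it can be sanity-checked in isolation. The following is a minimal sketch, not taken from the patches themselves; it assumes a Spark 1.6 spark-shell (where `sc` and `sqlContext` are predefined) and the `libsvm` data source introduced by SPARK-10117, and `oldDF`/`newDF` are illustrative names:

{% highlight scala %}
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.sql.DataFrame

// Old pattern (what the series removes): load an RDD[LabeledPoint], then
// convert it; toDF() needs the SQLContext implicits in scope.
import sqlContext.implicits._
val oldDF: DataFrame =
  MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF()

// New pattern (what the series adds): read through the data source API.
val newDF: DataFrame = sqlContext.read.format("libsvm")
  .load("data/mllib/sample_libsvm_data.txt")

// Both yield the same schema: "label" (Double) and "features" (Vector),
// which is why each doc example needs only the one-line change.
newDF.printSchema()
newDF.show(5)
{% endhighlight %}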