From 1b45c368c5c85bb72f89424069f1e31fa7800107 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Wed, 23 Sep 2015 11:22:06 -0700 Subject: [PATCH 1/3] Create test dataframes from lists of rows instead of RDDs of rows in Java tests in mllib/ml --- .../ml/classification/JavaNaiveBayesSuite.java | 7 ++++--- .../spark/ml/feature/JavaBucketizerSuite.java | 14 +++++++------- .../org/apache/spark/ml/feature/JavaDCTSuite.java | 11 +++++------ .../spark/ml/feature/JavaHashingTFSuite.java | 7 ++++--- .../ml/feature/JavaPolynomialExpansionSuite.java | 5 +++-- .../ml/feature/JavaStopWordsRemoverSuite.java | 7 ++++--- .../spark/ml/feature/JavaStringIndexerSuite.java | 7 ++++--- .../spark/ml/feature/JavaVectorAssemblerSuite.java | 3 +-- .../spark/ml/feature/JavaVectorSlicerSuite.java | 7 ++++--- .../apache/spark/ml/feature/JavaWord2VecSuite.java | 12 ++++++------ 10 files changed, 42 insertions(+), 38 deletions(-) diff --git a/mllib/src/test/java/org/apache/spark/ml/classification/JavaNaiveBayesSuite.java b/mllib/src/test/java/org/apache/spark/ml/classification/JavaNaiveBayesSuite.java index 075a62c493f1..b165d60e3fd2 100644 --- a/mllib/src/test/java/org/apache/spark/ml/classification/JavaNaiveBayesSuite.java +++ b/mllib/src/test/java/org/apache/spark/ml/classification/JavaNaiveBayesSuite.java @@ -19,6 +19,7 @@ import java.io.Serializable; import java.util.Arrays; +import java.util.List; import org.junit.After; import org.junit.Before; @@ -75,21 +76,21 @@ public void naiveBayesDefaultParams() { @Test public void testNaiveBayes() { - JavaRDD jrdd = jsc.parallelize(Arrays.asList( + List data = Arrays.asList( RowFactory.create(0.0, Vectors.dense(1.0, 0.0, 0.0)), RowFactory.create(0.0, Vectors.dense(2.0, 0.0, 0.0)), RowFactory.create(1.0, Vectors.dense(0.0, 1.0, 0.0)), RowFactory.create(1.0, Vectors.dense(0.0, 2.0, 0.0)), RowFactory.create(2.0, Vectors.dense(0.0, 0.0, 1.0)), RowFactory.create(2.0, Vectors.dense(0.0, 0.0, 2.0)) - )); + ); StructType schema = new StructType(new 
StructField[]{ new StructField("label", DataTypes.DoubleType, false, Metadata.empty()), new StructField("features", new VectorUDT(), false, Metadata.empty()) }); - DataFrame dataset = jsql.createDataFrame(jrdd, schema); + DataFrame dataset = jsql.createDataFrame(data, schema); NaiveBayes nb = new NaiveBayes().setSmoothing(0.5).setModelType("multinomial"); NaiveBayesModel model = nb.fit(dataset); diff --git a/mllib/src/test/java/org/apache/spark/ml/feature/JavaBucketizerSuite.java b/mllib/src/test/java/org/apache/spark/ml/feature/JavaBucketizerSuite.java index 47d68de599da..84801b7a3d44 100644 --- a/mllib/src/test/java/org/apache/spark/ml/feature/JavaBucketizerSuite.java +++ b/mllib/src/test/java/org/apache/spark/ml/feature/JavaBucketizerSuite.java @@ -55,16 +55,16 @@ public void tearDown() { public void bucketizerTest() { double[] splits = {-0.5, 0.0, 0.5}; - JavaRDD data = jsc.parallelize(Arrays.asList( - RowFactory.create(-0.5), - RowFactory.create(-0.3), - RowFactory.create(0.0), - RowFactory.create(0.2) - )); StructType schema = new StructType(new StructField[] { new StructField("feature", DataTypes.DoubleType, false, Metadata.empty()) }); - DataFrame dataset = jsql.createDataFrame(data, schema); + DataFrame dataset = jsql.createDataFrame( + Arrays.asList( + RowFactory.create(-0.5), + RowFactory.create(-0.3), + RowFactory.create(0.0), + RowFactory.create(0.2)), + schema); Bucketizer bucketizer = new Bucketizer() .setInputCol("feature") diff --git a/mllib/src/test/java/org/apache/spark/ml/feature/JavaDCTSuite.java b/mllib/src/test/java/org/apache/spark/ml/feature/JavaDCTSuite.java index 0f6ec64d97d3..0cc55e636a5b 100644 --- a/mllib/src/test/java/org/apache/spark/ml/feature/JavaDCTSuite.java +++ b/mllib/src/test/java/org/apache/spark/ml/feature/JavaDCTSuite.java @@ -57,12 +57,11 @@ public void tearDown() { @Test public void javaCompatibilityTest() { double[] input = new double[] {1D, 2D, 3D, 4D}; - JavaRDD data = jsc.parallelize(Arrays.asList( - 
RowFactory.create(Vectors.dense(input)) - )); - DataFrame dataset = jsql.createDataFrame(data, new StructType(new StructField[]{ - new StructField("vec", (new VectorUDT()), false, Metadata.empty()) - })); + DataFrame dataset = jsql.createDataFrame( + Arrays.asList(RowFactory.create(Vectors.dense(input))), + new StructType(new StructField[]{ + new StructField("vec", (new VectorUDT()), false, Metadata.empty()) + })); double[] expectedResult = input.clone(); (new DoubleDCT_1D(input.length)).forward(expectedResult, true); diff --git a/mllib/src/test/java/org/apache/spark/ml/feature/JavaHashingTFSuite.java b/mllib/src/test/java/org/apache/spark/ml/feature/JavaHashingTFSuite.java index 03dd5369bddf..d12332c2a02a 100644 --- a/mllib/src/test/java/org/apache/spark/ml/feature/JavaHashingTFSuite.java +++ b/mllib/src/test/java/org/apache/spark/ml/feature/JavaHashingTFSuite.java @@ -18,6 +18,7 @@ package org.apache.spark.ml.feature; import java.util.Arrays; +import java.util.List; import org.junit.After; import org.junit.Assert; @@ -55,17 +56,17 @@ public void tearDown() { @Test public void hashingTF() { - JavaRDD jrdd = jsc.parallelize(Arrays.asList( + List data = Arrays.asList( RowFactory.create(0.0, "Hi I heard about Spark"), RowFactory.create(0.0, "I wish Java could use case classes"), RowFactory.create(1.0, "Logistic regression models are neat") - )); + ); StructType schema = new StructType(new StructField[]{ new StructField("label", DataTypes.DoubleType, false, Metadata.empty()), new StructField("sentence", DataTypes.StringType, false, Metadata.empty()) }); - DataFrame sentenceData = jsql.createDataFrame(jrdd, schema); + DataFrame sentenceData = jsql.createDataFrame(data, schema); Tokenizer tokenizer = new Tokenizer() .setInputCol("sentence") .setOutputCol("words"); diff --git a/mllib/src/test/java/org/apache/spark/ml/feature/JavaPolynomialExpansionSuite.java b/mllib/src/test/java/org/apache/spark/ml/feature/JavaPolynomialExpansionSuite.java index 
834fedbb59e1..bf8eefd71905 100644 --- a/mllib/src/test/java/org/apache/spark/ml/feature/JavaPolynomialExpansionSuite.java +++ b/mllib/src/test/java/org/apache/spark/ml/feature/JavaPolynomialExpansionSuite.java @@ -18,6 +18,7 @@ package org.apache.spark.ml.feature; import java.util.Arrays; +import java.util.List; import org.junit.After; import org.junit.Assert; @@ -60,7 +61,7 @@ public void polynomialExpansionTest() { .setOutputCol("polyFeatures") .setDegree(3); - JavaRDD data = jsc.parallelize(Arrays.asList( + List data = Arrays.asList( RowFactory.create( Vectors.dense(-2.0, 2.3), Vectors.dense(-2.0, 4.0, -8.0, 2.3, -4.6, 9.2, 5.29, -10.58, 12.17) @@ -70,7 +71,7 @@ public void polynomialExpansionTest() { Vectors.dense(0.6, -1.1), Vectors.dense(0.6, 0.36, 0.216, -1.1, -0.66, -0.396, 1.21, 0.726, -1.331) ) - )); + ); StructType schema = new StructType(new StructField[] { new StructField("features", new VectorUDT(), false, Metadata.empty()), diff --git a/mllib/src/test/java/org/apache/spark/ml/feature/JavaStopWordsRemoverSuite.java b/mllib/src/test/java/org/apache/spark/ml/feature/JavaStopWordsRemoverSuite.java index 76cdd0fae84a..848d9f8aa928 100644 --- a/mllib/src/test/java/org/apache/spark/ml/feature/JavaStopWordsRemoverSuite.java +++ b/mllib/src/test/java/org/apache/spark/ml/feature/JavaStopWordsRemoverSuite.java @@ -18,6 +18,7 @@ package org.apache.spark.ml.feature; import java.util.Arrays; +import java.util.List; import org.junit.After; import org.junit.Before; @@ -58,14 +59,14 @@ public void javaCompatibilityTest() { .setInputCol("raw") .setOutputCol("filtered"); - JavaRDD rdd = jsc.parallelize(Arrays.asList( + List data = Arrays.asList( RowFactory.create(Arrays.asList("I", "saw", "the", "red", "baloon")), RowFactory.create(Arrays.asList("Mary", "had", "a", "little", "lamb")) - )); + ); StructType schema = new StructType(new StructField[] { new StructField("raw", DataTypes.createArrayType(DataTypes.StringType), false, Metadata.empty()) }); - DataFrame dataset = 
jsql.createDataFrame(rdd, schema); + DataFrame dataset = jsql.createDataFrame(data, schema); remover.transform(dataset).collect(); } diff --git a/mllib/src/test/java/org/apache/spark/ml/feature/JavaStringIndexerSuite.java b/mllib/src/test/java/org/apache/spark/ml/feature/JavaStringIndexerSuite.java index 35b18c5308f6..d7aca700e1ad 100644 --- a/mllib/src/test/java/org/apache/spark/ml/feature/JavaStringIndexerSuite.java +++ b/mllib/src/test/java/org/apache/spark/ml/feature/JavaStringIndexerSuite.java @@ -18,6 +18,7 @@ package org.apache.spark.ml.feature; import java.util.Arrays; +import java.util.List; import org.junit.After; import org.junit.Assert; @@ -56,9 +57,9 @@ public void testStringIndexer() { createStructField("id", IntegerType, false), createStructField("label", StringType, false) }); - JavaRDD rdd = jsc.parallelize( - Arrays.asList(c(0, "a"), c(1, "b"), c(2, "c"), c(3, "a"), c(4, "a"), c(5, "c"))); - DataFrame dataset = sqlContext.createDataFrame(rdd, schema); + List data = Arrays.asList( + c(0, "a"), c(1, "b"), c(2, "c"), c(3, "a"), c(4, "a"), c(5, "c")); + DataFrame dataset = sqlContext.createDataFrame(data, schema); StringIndexer indexer = new StringIndexer() .setInputCol("label") diff --git a/mllib/src/test/java/org/apache/spark/ml/feature/JavaVectorAssemblerSuite.java b/mllib/src/test/java/org/apache/spark/ml/feature/JavaVectorAssemblerSuite.java index b7c564caad3b..e28377757093 100644 --- a/mllib/src/test/java/org/apache/spark/ml/feature/JavaVectorAssemblerSuite.java +++ b/mllib/src/test/java/org/apache/spark/ml/feature/JavaVectorAssemblerSuite.java @@ -65,8 +65,7 @@ public void testVectorAssembler() { Row row = RowFactory.create( 0, 0.0, Vectors.dense(1.0, 2.0), "a", Vectors.sparse(2, new int[] {1}, new double[] {3.0}), 10L); - JavaRDD rdd = jsc.parallelize(Arrays.asList(row)); - DataFrame dataset = sqlContext.createDataFrame(rdd, schema); + DataFrame dataset = sqlContext.createDataFrame(Arrays.asList(row), schema); VectorAssembler assembler = new 
VectorAssembler() .setInputCols(new String[] {"x", "y", "z", "n"}) .setOutputCol("features"); diff --git a/mllib/src/test/java/org/apache/spark/ml/feature/JavaVectorSlicerSuite.java b/mllib/src/test/java/org/apache/spark/ml/feature/JavaVectorSlicerSuite.java index f95336142758..00174e6a683d 100644 --- a/mllib/src/test/java/org/apache/spark/ml/feature/JavaVectorSlicerSuite.java +++ b/mllib/src/test/java/org/apache/spark/ml/feature/JavaVectorSlicerSuite.java @@ -18,6 +18,7 @@ package org.apache.spark.ml.feature; import java.util.Arrays; +import java.util.List; import org.junit.After; import org.junit.Assert; @@ -63,12 +64,12 @@ public void vectorSlice() { }; AttributeGroup group = new AttributeGroup("userFeatures", attrs); - JavaRDD jrdd = jsc.parallelize(Arrays.asList( + List data = Arrays.asList( RowFactory.create(Vectors.sparse(3, new int[]{0, 1}, new double[]{-2.0, 2.3})), RowFactory.create(Vectors.dense(-2.0, 2.3, 0.0)) - )); + ); - DataFrame dataset = jsql.createDataFrame(jrdd, (new StructType()).add(group.toStructField())); + DataFrame dataset = jsql.createDataFrame(data, (new StructType()).add(group.toStructField())); VectorSlicer vectorSlicer = new VectorSlicer() .setInputCol("userFeatures").setOutputCol("features"); diff --git a/mllib/src/test/java/org/apache/spark/ml/feature/JavaWord2VecSuite.java b/mllib/src/test/java/org/apache/spark/ml/feature/JavaWord2VecSuite.java index 70f5ad943221..294c85c09af5 100644 --- a/mllib/src/test/java/org/apache/spark/ml/feature/JavaWord2VecSuite.java +++ b/mllib/src/test/java/org/apache/spark/ml/feature/JavaWord2VecSuite.java @@ -51,15 +51,15 @@ public void tearDown() { @Test public void testJavaWord2Vec() { - JavaRDD jrdd = jsc.parallelize(Arrays.asList( - RowFactory.create(Arrays.asList("Hi I heard about Spark".split(" "))), - RowFactory.create(Arrays.asList("I wish Java could use case classes".split(" "))), - RowFactory.create(Arrays.asList("Logistic regression models are neat".split(" "))) - )); StructType schema = new 
StructType(new StructField[]{ new StructField("text", new ArrayType(DataTypes.StringType, true), false, Metadata.empty()) }); - DataFrame documentDF = sqlContext.createDataFrame(jrdd, schema); + DataFrame documentDF = sqlContext.createDataFrame( + Arrays.asList( + RowFactory.create(Arrays.asList("Hi I heard about Spark".split(" "))), + RowFactory.create(Arrays.asList("I wish Java could use case classes".split(" "))), + RowFactory.create(Arrays.asList("Logistic regression models are neat".split(" ")))), + schema); Word2Vec word2Vec = new Word2Vec() .setInputCol("text") From 4bf4ae67970e838a1aad3bea18238ada9cb89bbe Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Wed, 23 Sep 2015 11:34:38 -0700 Subject: [PATCH 2/3] weird indentation fix --- .../apache/spark/ml/classification/JavaNaiveBayesSuite.java | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mllib/src/test/java/org/apache/spark/ml/classification/JavaNaiveBayesSuite.java b/mllib/src/test/java/org/apache/spark/ml/classification/JavaNaiveBayesSuite.java index b165d60e3fd2..f5f690eabd12 100644 --- a/mllib/src/test/java/org/apache/spark/ml/classification/JavaNaiveBayesSuite.java +++ b/mllib/src/test/java/org/apache/spark/ml/classification/JavaNaiveBayesSuite.java @@ -82,8 +82,7 @@ public void testNaiveBayes() { RowFactory.create(1.0, Vectors.dense(0.0, 1.0, 0.0)), RowFactory.create(1.0, Vectors.dense(0.0, 2.0, 0.0)), RowFactory.create(2.0, Vectors.dense(0.0, 0.0, 1.0)), - RowFactory.create(2.0, Vectors.dense(0.0, 0.0, 2.0)) - ); + RowFactory.create(2.0, Vectors.dense(0.0, 0.0, 2.0))); StructType schema = new StructType(new StructField[]{ new StructField("label", DataTypes.DoubleType, false, Metadata.empty()), From a668f0dddbacbe7edfab1550f31db1be88ea6ca8 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Wed, 23 Sep 2015 15:54:06 -0700 Subject: [PATCH 3/3] Fix indentation issues (accidentally used different style config) --- .../apache/spark/ml/feature/JavaBucketizerSuite.java | 12 ++++++------ 
.../org/apache/spark/ml/feature/JavaDCTSuite.java | 8 ++++---- .../spark/ml/feature/JavaStringIndexerSuite.java | 2 +- .../apache/spark/ml/feature/JavaWord2VecSuite.java | 10 +++++----- 4 files changed, 16 insertions(+), 16 deletions(-) diff --git a/mllib/src/test/java/org/apache/spark/ml/feature/JavaBucketizerSuite.java b/mllib/src/test/java/org/apache/spark/ml/feature/JavaBucketizerSuite.java index 84801b7a3d44..8a1e5ef01565 100644 --- a/mllib/src/test/java/org/apache/spark/ml/feature/JavaBucketizerSuite.java +++ b/mllib/src/test/java/org/apache/spark/ml/feature/JavaBucketizerSuite.java @@ -59,12 +59,12 @@ public void bucketizerTest() { new StructField("feature", DataTypes.DoubleType, false, Metadata.empty()) }); DataFrame dataset = jsql.createDataFrame( - Arrays.asList( - RowFactory.create(-0.5), - RowFactory.create(-0.3), - RowFactory.create(0.0), - RowFactory.create(0.2)), - schema); + Arrays.asList( + RowFactory.create(-0.5), + RowFactory.create(-0.3), + RowFactory.create(0.0), + RowFactory.create(0.2)), + schema); Bucketizer bucketizer = new Bucketizer() .setInputCol("feature") diff --git a/mllib/src/test/java/org/apache/spark/ml/feature/JavaDCTSuite.java b/mllib/src/test/java/org/apache/spark/ml/feature/JavaDCTSuite.java index 0cc55e636a5b..39da47381b12 100644 --- a/mllib/src/test/java/org/apache/spark/ml/feature/JavaDCTSuite.java +++ b/mllib/src/test/java/org/apache/spark/ml/feature/JavaDCTSuite.java @@ -58,10 +58,10 @@ public void tearDown() { public void javaCompatibilityTest() { double[] input = new double[] {1D, 2D, 3D, 4D}; DataFrame dataset = jsql.createDataFrame( - Arrays.asList(RowFactory.create(Vectors.dense(input))), - new StructType(new StructField[]{ - new StructField("vec", (new VectorUDT()), false, Metadata.empty()) - })); + Arrays.asList(RowFactory.create(Vectors.dense(input))), + new StructType(new StructField[]{ + new StructField("vec", (new VectorUDT()), false, Metadata.empty()) + })); double[] expectedResult = input.clone(); (new 
DoubleDCT_1D(input.length)).forward(expectedResult, true); diff --git a/mllib/src/test/java/org/apache/spark/ml/feature/JavaStringIndexerSuite.java b/mllib/src/test/java/org/apache/spark/ml/feature/JavaStringIndexerSuite.java index d7aca700e1ad..6b2c48ef1c34 100644 --- a/mllib/src/test/java/org/apache/spark/ml/feature/JavaStringIndexerSuite.java +++ b/mllib/src/test/java/org/apache/spark/ml/feature/JavaStringIndexerSuite.java @@ -58,7 +58,7 @@ public void testStringIndexer() { createStructField("label", StringType, false) }); List data = Arrays.asList( - c(0, "a"), c(1, "b"), c(2, "c"), c(3, "a"), c(4, "a"), c(5, "c")); + c(0, "a"), c(1, "b"), c(2, "c"), c(3, "a"), c(4, "a"), c(5, "c")); DataFrame dataset = sqlContext.createDataFrame(data, schema); StringIndexer indexer = new StringIndexer() diff --git a/mllib/src/test/java/org/apache/spark/ml/feature/JavaWord2VecSuite.java b/mllib/src/test/java/org/apache/spark/ml/feature/JavaWord2VecSuite.java index 294c85c09af5..0c0c1c4d12d0 100644 --- a/mllib/src/test/java/org/apache/spark/ml/feature/JavaWord2VecSuite.java +++ b/mllib/src/test/java/org/apache/spark/ml/feature/JavaWord2VecSuite.java @@ -55,11 +55,11 @@ public void testJavaWord2Vec() { new StructField("text", new ArrayType(DataTypes.StringType, true), false, Metadata.empty()) }); DataFrame documentDF = sqlContext.createDataFrame( - Arrays.asList( - RowFactory.create(Arrays.asList("Hi I heard about Spark".split(" "))), - RowFactory.create(Arrays.asList("I wish Java could use case classes".split(" "))), - RowFactory.create(Arrays.asList("Logistic regression models are neat".split(" ")))), - schema); + Arrays.asList( + RowFactory.create(Arrays.asList("Hi I heard about Spark".split(" "))), + RowFactory.create(Arrays.asList("I wish Java could use case classes".split(" "))), + RowFactory.create(Arrays.asList("Logistic regression models are neat".split(" ")))), + schema); Word2Vec word2Vec = new Word2Vec() .setInputCol("text")