Commit

Address comments
gengliangwang committed May 29, 2018
1 parent 8ffba61 commit e90fa00
Showing 1 changed file with 37 additions and 33 deletions.
@@ -55,44 +55,43 @@ object DataSourceWriteBenchmark {
     }
   }
 
-  def writeInt(table: String, format: String, benchmark: Benchmark): Unit = {
-    spark.sql(s"create table $table(c1 INT, c2 STRING) using $format")
-    benchmark.addCase("Output Single Int Column") { _ =>
-      spark.sql(s"INSERT overwrite table $table select cast(id as INT) as " +
-        s"c1, cast(id as STRING) as c2 from $tempTable")
+  def writeNumeric(table: String, format: String, benchmark: Benchmark, dataType: String): Unit = {
+    spark.sql(s"create table $table(id $dataType) using $format")
+    benchmark.addCase(s"Output Single $dataType Column") { _ =>
+      spark.sql(s"INSERT OVERWRITE TABLE $table SELECT CAST(id AS $dataType) AS c1 FROM $tempTable")
     }
   }
 
   def writeIntString(table: String, format: String, benchmark: Benchmark): Unit = {
-    spark.sql(s"create table $table(c1 INT, c2 STRING) using $format")
+    spark.sql(s"CREATE TABLE $table(c1 INT, c2 STRING) USING $format")
     benchmark.addCase("Output Int and String Column") { _ =>
-      spark.sql(s"INSERT overwrite table $table select cast(id as INT) as " +
-        s"c1, cast(id as STRING) as c2 from $tempTable")
+      spark.sql(s"INSERT OVERWRITE TABLE $table SELECT CAST(id AS INT) AS " +
+        s"c1, CAST(id AS STRING) AS c2 FROM $tempTable")
     }
   }
 
   def writePartition(table: String, format: String, benchmark: Benchmark): Unit = {
-    spark.sql(s"create table $table(p INT, id INT) using $format PARTITIONED BY (p)")
+    spark.sql(s"CREATE TABLE $table(p INT, id INT) USING $format PARTITIONED BY (p)")
     benchmark.addCase("Output Partitions") { _ =>
-      spark.sql(s"INSERT overwrite table $table select cast(id as INT) as id," +
-        s" cast(id % 2 as INT) as p from $tempTable")
+      spark.sql(s"INSERT OVERWRITE TABLE $table SELECT CAST(id AS INT) AS id," +
+        s" CAST(id % 2 AS INT) AS p FROM $tempTable")
     }
   }
 
   def writeBucket(table: String, format: String, benchmark: Benchmark): Unit = {
-    spark.sql(s"create table $table(c1 INT, c2 INT) using $format CLUSTERED BY (c2) INTO 2 BUCKETS")
+    spark.sql(s"CREATE TABLE $table(c1 INT, c2 INT) USING $format CLUSTERED BY (c2) INTO 2 BUCKETS")
     benchmark.addCase("Output Buckets") { _ =>
-      spark.sql(s"INSERT overwrite table $table select cast(id as INT) as " +
-        s"c1, cast(id as INT) as c2 from $tempTable")
+      spark.sql(s"INSERT OVERWRITE TABLE $table SELECT CAST(id AS INT) AS " +
+        s"c1, CAST(id AS INT) AS c2 FROM $tempTable")
     }
   }
 
   def main(args: Array[String]): Unit = {
     val tableInt = "tableInt"
+    val tableDouble = "tableDouble"
     val tableIntString = "tableIntString"
     val tablePartition = "tablePartition"
     val tableBucket = "tableBucket"
-    // If the
     val formats: Seq[String] = if (args.isEmpty) {
       Seq("Parquet", "ORC", "JSON", "CSV")
     } else {
@@ -102,38 +101,43 @@ object DataSourceWriteBenchmark {
     Intel(R) Core(TM) i7-6920HQ CPU @ 2.90GHz
     Parquet writer benchmark:               Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
     ------------------------------------------------------------------------------------------------
-    Output Single Int Column                     6054 / 6070          2.6         384.9       1.0X
-    Output Int and String Column                 5784 / 5800          2.7         367.8       1.0X
-    Output Partitions                            3891 / 3904          4.0         247.4       1.6X
-    Output Buckets                               5446 / 5729          2.9         346.2       1.1X
+    Output Single Int Column                     1815 / 1932          8.7         115.4       1.0X
+    Output Single Double Column                  1877 / 1878          8.4         119.3       1.0X
+    Output Int and String Column                 6265 / 6543          2.5         398.3       0.3X
+    Output Partitions                            4067 / 4457          3.9         258.6       0.4X
+    Output Buckets                               5608 / 5820          2.8         356.6       0.3X
     ORC writer benchmark:                   Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
     ------------------------------------------------------------------------------------------------
-    Output Single Int Column                     5734 / 5823          2.7         364.6       1.0X
-    Output Int and String Column                 5802 / 5839          2.7         368.9       1.0X
-    Output Partitions                            3384 / 3671          4.6         215.1       1.7X
-    Output Buckets                               4950 / 4988          3.2         314.7       1.2X
+    Output Single Int Column                     1201 / 1239         13.1          76.3       1.0X
+    Output Single Double Column                  1542 / 1600         10.2          98.0       0.8X
+    Output Int and String Column                 6495 / 6580          2.4         412.9       0.2X
+    Output Partitions                            3648 / 3842          4.3         231.9       0.3X
+    Output Buckets                               5022 / 5145          3.1         319.3       0.2X
    JSON writer benchmark:                   Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
     ------------------------------------------------------------------------------------------------
-    Output Single Int Column                     5576 / 5594          2.8         354.5       1.0X
-    Output Int and String Column                 5550 / 5620          2.8         352.9       1.0X
-    Output Partitions                            3727 / 4100          4.2         237.0       1.5X
-    Output Buckets                               5316 / 5852          3.0         338.0       1.0X
+    Output Single Int Column                     1988 / 2093          7.9         126.4       1.0X
+    Output Single Double Column                  2854 / 2911          5.5         181.4       0.7X
+    Output Int and String Column                 6467 / 6653          2.4         411.1       0.3X
+    Output Partitions                            4548 / 5055          3.5         289.1       0.4X
+    Output Buckets                               5664 / 5765          2.8         360.1       0.4X
     CSV writer benchmark:                   Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
     ------------------------------------------------------------------------------------------------
-    Output Single Int Column                     7064 / 8714          2.2         449.1       1.0X
-    Output Int and String Column                 7114 / 7663          2.2         452.3       1.0X
-    Output Partitions                            5771 / 6228          2.7         366.9       1.2X
-    Output Buckets                               7414 / 7479          2.1         471.3       1.0X
+    Output Single Int Column                     3025 / 3190          5.2         192.3       1.0X
+    Output Single Double Column                  3575 / 3634          4.4         227.3       0.8X
+    Output Int and String Column                 7313 / 7399          2.2         464.9       0.4X
+    Output Partitions                            5105 / 5190          3.1         324.6       0.6X
+    Output Buckets                               6986 / 6992          2.3         444.1       0.4X
     */
     withTempTable(tempTable) {
       spark.range(numRows).createOrReplaceTempView(tempTable)
       formats.foreach { format =>
-        withTable(tableInt, tableIntString, tablePartition, tableBucket) {
+        withTable(tableInt, tableDouble, tableIntString, tablePartition, tableBucket) {
           val benchmark = new Benchmark(s"$format writer benchmark", numRows)
-          writeInt(tableInt, format, benchmark)
+          writeNumeric(tableInt, format, benchmark, "Int")
+          writeNumeric(tableDouble, format, benchmark, "Double")
          writeIntString(tableIntString, format, benchmark)
          writePartition(tablePartition, format, benchmark)
          writeBucket(tableBucket, format, benchmark)
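Note: the refactored writeNumeric parameterizes the single-column write case over a SQL data type, so adding another numeric benchmark only requires a new table name and one extra call. Below is a minimal sketch of how a further type could be wired in, assuming the surrounding file's spark session, numRows, tempTable, Benchmark class and the withTempTable/withTable helpers; the tableLong name and the BIGINT case are illustrative and not part of this commit.

    // Illustrative sketch only, not part of this commit: reuses the pattern above
    // to benchmark writing a single BIGINT column for one format.
    val tableLong = "tableLong"  // hypothetical extra table name
    withTempTable(tempTable) {
      spark.range(numRows).createOrReplaceTempView(tempTable)
      withTable(tableLong) {
        val benchmark = new Benchmark("Parquet writer benchmark", numRows)
        writeNumeric(tableLong, "Parquet", benchmark, "BIGINT")
        benchmark.run()  // prints best/avg time, rate, and per-row cost, as in the tables above
      }
    }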
