From 024dfccaf8969edb8f1bc719063d3bc97ebff64b Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Wed, 5 Jul 2017 08:35:46 +0900 Subject: [PATCH 1/4] Do not allow partially parsing double and floats via NumberFormat in CSV --- .../datasources/csv/UnivocityParser.scala | 8 ++----- .../execution/datasources/csv/CSVSuite.scala | 21 ++++++++++++++++++ .../csv/UnivocityParserSuite.scala | 22 +++++++++---------- 3 files changed, 34 insertions(+), 17 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala index c3657acb7d867..0e41f3c7aa6b8 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala @@ -111,9 +111,7 @@ class UnivocityParser( case options.nanValue => Float.NaN case options.negativeInf => Float.NegativeInfinity case options.positiveInf => Float.PositiveInfinity - case datum => - Try(datum.toFloat) - .getOrElse(NumberFormat.getInstance(Locale.US).parse(datum).floatValue()) + case datum => datum.toFloat } case _: DoubleType => (d: String) => @@ -121,9 +119,7 @@ class UnivocityParser( case options.nanValue => Double.NaN case options.negativeInf => Double.NegativeInfinity case options.positiveInf => Double.PositiveInfinity - case datum => - Try(datum.toDouble) - .getOrElse(NumberFormat.getInstance(Locale.US).parse(datum).doubleValue()) + case datum => datum.toDouble } case _: BooleanType => (d: String) => diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala index 89d9b69dec7ef..da9905e2a142b 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala @@ -1174,4 +1174,25 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils { } } } + + test("Do not partially lose data when parsing float and double") { + val exception = intercept[SparkException] { + spark.read.schema("a DOUBLE") + .option("mode", "FAILFAST") + .csv(Seq("10u12").toDS()) + .collect() + } + assert(exception.getMessage.contains("10u12")) + + val count = spark.read.schema("a FLOAT") + .option("mode", "DROPMALFORMED") + .csv(Seq("10u12").toDS()) + .count() + assert(count == 0) + + val results = spark.read.schema("a FLOAT") + .option("mode", "PERMISSIVE") + .csv(Seq("10u12").toDS()) + checkAnswer(results, Row(null)) + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParserSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParserSuite.scala index a74b22a4a88a6..55cdfeff71702 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParserSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParserSuite.scala @@ -130,17 +130,17 @@ class UnivocityParserSuite extends SparkFunSuite { DateTimeUtils.millisToDays(DateTimeUtils.stringToTime("2015-01-01").getTime)) } - test("Float and Double Types are cast without respect to platform default Locale") { - val originalLocale = Locale.getDefault - try { - Locale.setDefault(new Locale("fr", "FR")) - // Would parse as 1.0 in fr-FR - val options = new CSVOptions(Map.empty[String, String], "GMT") - assert(parser.makeConverter("_1", FloatType, options = options).apply("1,00") == 100.0) - assert(parser.makeConverter("_1", DoubleType, options = options).apply("1,00") == 100.0) - } finally { - Locale.setDefault(originalLocale) - } + test("Float and Double Types do not allow partial results") { + val options = new CSVOptions(Map.empty[String, String], "GMT") + var message = intercept[NumberFormatException] { + parser.makeConverter("_1", FloatType, options = options).apply("10u000") + }.getMessage + assert(message.contains("10u000")) + + message = intercept[NumberFormatException] { + parser.makeConverter("_1", DoubleType, options = options).apply("10u000") + }.getMessage + assert(message.contains("10u000")) } test("Float NaN values are parsed correctly") { From 32233dd1d5772b5df57919fabc467a197574fa72 Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Wed, 5 Jul 2017 09:04:37 +0900 Subject: [PATCH 2/4] Rename the title of the test in CSVSuite --- .../apache/spark/sql/execution/datasources/csv/CSVSuite.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala index da9905e2a142b..9c2ee4d69c69c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala @@ -1175,7 +1175,7 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils { } } - test("Do not partially lose data when parsing float and double") { + test("SPARK-21263: Invalid float and double are handled correctly in different modes") { val exception = intercept[SparkException] { spark.read.schema("a DOUBLE") .option("mode", "FAILFAST") From a41c028392acf70600f2e882cafdf59ad50def14 Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Wed, 5 Jul 2017 09:17:13 +0900 Subject: [PATCH 3/4] Rename the title of test in UnivocityParserSuite too --- .../sql/execution/datasources/csv/UnivocityParserSuite.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParserSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParserSuite.scala index 55cdfeff71702..186f7bf982058 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParserSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParserSuite.scala @@ -130,7 +130,7 @@ class UnivocityParserSuite extends SparkFunSuite { DateTimeUtils.millisToDays(DateTimeUtils.stringToTime("2015-01-01").getTime)) } - test("Float and Double Types do not allow partial results") { + test("Throws exception for casting an invalid string to Float and Double Types") { val options = new CSVOptions(Map.empty[String, String], "GMT") var message = intercept[NumberFormatException] { parser.makeConverter("_1", FloatType, options = options).apply("10u000") From c1967f80e340b374e2d6b0e7b9759d0d61a9df13 Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Thu, 6 Jul 2017 11:37:53 +0900 Subject: [PATCH 4/4] Address comments --- .../execution/datasources/csv/CSVSuite.scala | 2 +- .../csv/UnivocityParserSuite.scala | 19 ++++++++++--------- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala index 9c2ee4d69c69c..487c84f629be3 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala @@ -1182,7 +1182,7 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils { .csv(Seq("10u12").toDS()) .collect() } - assert(exception.getMessage.contains("10u12")) + assert(exception.getMessage.contains("""input string: "10u12"""")) val count = spark.read.schema("a FLOAT") .option("mode", "DROPMALFORMED") diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParserSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParserSuite.scala index 186f7bf982058..efbf73534bd19 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParserSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParserSuite.scala @@ -132,15 +132,16 @@ class UnivocityParserSuite extends SparkFunSuite { test("Throws exception for casting an invalid string to Float and Double Types") { val options = new CSVOptions(Map.empty[String, String], "GMT") - var message = intercept[NumberFormatException] { - parser.makeConverter("_1", FloatType, options = options).apply("10u000") - }.getMessage - assert(message.contains("10u000")) - - message = intercept[NumberFormatException] { - parser.makeConverter("_1", DoubleType, options = options).apply("10u000") - }.getMessage - assert(message.contains("10u000")) + val types = Seq(DoubleType, FloatType) + val input = Seq("10u000", "abc", "1 2/3") + types.foreach { dt => + input.foreach { v => + val message = intercept[NumberFormatException] { + parser.makeConverter("_1", dt, options = options).apply(v) + }.getMessage + assert(message.contains(v)) + } + } } test("Float NaN values are parsed correctly") {