From dd16ca349dd20626b6a8ae00f6dd445c269104b8 Mon Sep 17 00:00:00 2001
From: Maxim Gekk
Date: Sat, 6 Oct 2018 11:12:33 +0200
Subject: [PATCH 1/5] Test for backward slash as the delimiter

---
 .../spark/sql/execution/datasources/csv/CSVSuite.scala | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
index f70df0bcecde7..87b22ea4a4851 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
@@ -1820,4 +1820,10 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils with Te
     checkAnswer(spark.read.option("multiLine", true).schema(schema).csv(input), Row(null))
     assert(spark.read.csv(input).collect().toSet == Set(Row()))
   }
+
+  test("using the backward slash as the delimiter") {
+    val input = Seq("""abc\1""").toDS()
+    val df = spark.read.option("delimiter", "\\").csv(input)
+    checkAnswer(df, Row("abc", "1"))
+  }
 }
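The new test mixes two flavours of Scala string literal: the triple-quoted (raw) """abc\1""" contains a literal backslash between abc and 1, while the plain "\\" passed to the delimiter option is a single backslash character. The rest of the series turns on that distinction, so here is a short, self-contained sketch (the object and value names are illustrative, not from the patch) that spells out how many backslashes each form actually carries:

```scala
// Standalone illustration of the string literals used in the test above.
// Only standard Scala string syntax is involved.
object DelimiterLiterals {
  def main(args: Array[String]): Unit = {
    // Triple-quoted strings are raw: no escape processing, so this is the five
    // characters a, b, c, \ and 1.
    val input = """abc\1"""
    assert(input.length == 5)

    // In an ordinary string literal, "\\" denotes a single backslash character.
    val singleBackslash = "\\"
    assert(singleBackslash.length == 1)

    // In a raw string, """\\""" really contains two backslash characters.
    val twoBackslashes = """\\"""
    assert(twoBackslashes.length == 2)

    println(s"input of length ${input.length}, " +
      s"option value of length ${singleBackslash.length}, " +
      s"escaped form of length ${twoBackslashes.length}")
  }
}
```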
From 7bf453a5a5ddedda4319d114bbbb6b8c125fa0d4 Mon Sep 17 00:00:00 2001
From: Maxim Gekk
Date: Sat, 6 Oct 2018 11:55:15 +0200
Subject: [PATCH 2/5] Bug fix + tests

---
 .../execution/datasources/csv/CSVUtils.scala  | 32 +++++++++----------
 .../execution/datasources/csv/CSVSuite.scala  |  7 ++--
 .../datasources/csv/CSVUtilsSuite.scala       | 10 ++++++
 3 files changed, 30 insertions(+), 19 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVUtils.scala
index 7ce65fa89b02d..87503e36f53c7 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVUtils.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVUtils.scala
@@ -97,23 +97,21 @@ object CSVUtils {
    */
   @throws[IllegalArgumentException]
   def toChar(str: String): Char = {
-    if (str.charAt(0) == '\\') {
-      str.charAt(1)
-      match {
-        case 't' => '\t'
-        case 'r' => '\r'
-        case 'b' => '\b'
-        case 'f' => '\f'
-        case '\"' => '\"' // In case user changes quote char and uses \" as delimiter in options
-        case '\'' => '\''
-        case 'u' if str == """\u0000""" => '\u0000'
-        case _ =>
-          throw new IllegalArgumentException(s"Unsupported special character for delimiter: $str")
-      }
-    } else if (str.length == 1) {
-      str.charAt(0)
-    } else {
-      throw new IllegalArgumentException(s"Delimiter cannot be more than one character: $str")
+    (str: Seq[Char]) match {
+      case Seq() => throw new IllegalArgumentException(s"Delimiter cannot be empty string")
+      case Seq(c) => c
+      case Seq('\\', 't') => '\t'
+      case Seq('\\', 'r') => '\r'
+      case Seq('\\', 'b') => '\b'
+      case Seq('\\', 'f') => '\f'
+      // In case user changes quote char and uses \" as delimiter in options
+      case Seq('\\', '\"') => '\"'
+      case Seq('\\', '\'') => '\''
+      case _ if str == """\u0000""" => '\u0000'
+      case Seq('\\', _) =>
+        throw new IllegalArgumentException(s"Unsupported special character for delimiter: $str")
+      case _ =>
+        throw new IllegalArgumentException(s"Delimiter cannot be more than one character: $str")
     }
   }
 
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
index 87b22ea4a4851..5be783ad6c878 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
@@ -1823,7 +1823,10 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils with Te
 
   test("using the backward slash as the delimiter") {
     val input = Seq("""abc\1""").toDS()
-    val df = spark.read.option("delimiter", "\\").csv(input)
-    checkAnswer(df, Row("abc", "1"))
+    checkAnswer(spark.read.option("delimiter", "\\").csv(input), Row("abc", "1"))
+    checkAnswer(spark.read.option("inferSchema", true).option("delimiter", "\\").csv(input),
+      Row("abc", 1))
+    val schema = new StructType().add("a", StringType).add("b", IntegerType)
+    checkAnswer(spark.read.schema(schema).option("delimiter", "\\").csv(input), Row("abc", 1))
   }
 }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVUtilsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVUtilsSuite.scala
index 221e44ce2cff6..12ff81e25cc21 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVUtilsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVUtilsSuite.scala
@@ -44,4 +44,14 @@ class CSVUtilsSuite extends SparkFunSuite {
     assert(exception.getMessage.contains("Unsupported special character for delimiter"))
   }
 
+  test("string with one backward slash should be converted to the backward slash char") {
+    assert(CSVUtils.toChar("""\""") == '\\')
+  }
+
+  test("output proper error message for empty string") {
+    val exception = intercept[IllegalArgumentException]{
+      CSVUtils.toChar("")
+    }
+    assert(exception.getMessage.contains("Delimiter cannot be empty string"))
+  }
 }

From 728aac2abf4023e08f87c0bb5ba541f250ab6537 Mon Sep 17 00:00:00 2001
From: Maxim Gekk
Date: Tue, 9 Oct 2018 11:24:38 +0200
Subject: [PATCH 3/5] Removing string interpolation

---
 .../apache/spark/sql/execution/datasources/csv/CSVUtils.scala | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVUtils.scala
index 87503e36f53c7..f4e46f5fe27e3 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVUtils.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVUtils.scala
@@ -98,7 +98,7 @@ object CSVUtils {
   @throws[IllegalArgumentException]
   def toChar(str: String): Char = {
     (str: Seq[Char]) match {
-      case Seq() => throw new IllegalArgumentException(s"Delimiter cannot be empty string")
+      case Seq() => throw new IllegalArgumentException("Delimiter cannot be empty string")
       case Seq(c) => c
       case Seq('\\', 't') => '\t'
      case Seq('\\', 'r') => '\r'
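The rewritten toChar relies on the standard implicit view from String to Seq[Char] (Predef.wrapString): the ascription (str: Seq[Char]) triggers it, and from there ordinary Seq patterns can destructure the characters one by one. Below is a minimal standalone sketch of the same idiom, assuming nothing beyond the Scala standard library; it is a simplified analogue, not the Spark source.

```scala
// Demonstrates matching a String with Seq patterns via the implicit
// String => Seq[Char] view that the type ascription brings into play.
object SeqCharMatchSketch {
  def describe(str: String): String = (str: Seq[Char]) match {
    case Seq()          => "empty string"
    case Seq(c)         => s"single character '$c'"
    case Seq('\\', esc) => s"backslash escape for '$esc'"
    case _              => s"multi-character string of length ${str.length}"
  }

  def main(args: Array[String]): Unit = {
    println(describe(""))        // empty string
    println(describe("|"))       // single character '|'
    println(describe("""\t""")) // backslash escape for 't'
    println(describe("abc"))     // multi-character string of length 3
  }
}
```

Ordering matters in the real method: the single-character case precedes the backslash cases, so at this point in the series a delimiter of exactly one backslash is still accepted as a literal backslash, which the new CSVUtilsSuite test pins down.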
From 1c2ac25c4c225ab63591976bbd0eec866ae40d06 Mon Sep 17 00:00:00 2001
From: Maxim Gekk
Date: Tue, 9 Oct 2018 12:04:49 +0200
Subject: [PATCH 4/5] Support backslash escaped by backslash

---
 .../apache/spark/sql/execution/datasources/csv/CSVUtils.scala      | 1 +
 .../spark/sql/execution/datasources/csv/CSVUtilsSuite.scala        | 1 +
 2 files changed, 2 insertions(+)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVUtils.scala
index f4e46f5fe27e3..9e6f67cae6752 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVUtils.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVUtils.scala
@@ -107,6 +107,7 @@ object CSVUtils {
       // In case user changes quote char and uses \" as delimiter in options
       case Seq('\\', '\"') => '\"'
       case Seq('\\', '\'') => '\''
+      case Seq('\\', '\\') => '\\'
       case _ if str == """\u0000""" => '\u0000'
       case Seq('\\', _) =>
         throw new IllegalArgumentException(s"Unsupported special character for delimiter: $str")
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVUtilsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVUtilsSuite.scala
index 12ff81e25cc21..670431ac2d57e 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVUtilsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVUtilsSuite.scala
@@ -28,6 +28,7 @@ class CSVUtilsSuite extends SparkFunSuite {
     assert(CSVUtils.toChar("""\"""") === '\"')
     assert(CSVUtils.toChar("""\'""") === '\'')
     assert(CSVUtils.toChar("""\u0000""") === '\u0000')
+    assert(CSVUtils.toChar("""\\""") === '\\')
   }
 
   test("Does not accept delimiter larger than one character") {
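With the escaped form accepted, a caller can request a literal backslash delimiter by passing the two-character string """\\""", which the patched toChar maps to the single backslash character. The end-to-end sketch below is a hedged illustration of that usage against the public DataFrameReader API; the local SparkSession settings and the sample data are assumptions made for the example, not part of the patch.

```scala
import org.apache.spark.sql.SparkSession

// Reads a backslash-separated line using the escaped delimiter form """\\""".
object EscapedBackslashDelimiterExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("escaped-backslash-delimiter")
      .master("local[*]")
      .getOrCreate()
    import spark.implicits._

    // One CSV line whose two fields are separated by a literal backslash.
    val input = Seq("""abc\1""").toDS()

    val df = spark.read
      .option("inferSchema", true)
      .option("delimiter", """\\""") // two characters in source, decoded to '\'
      .csv(input)

    df.show() // expected: one row with the values "abc" and 1

    spark.stop()
  }
}
```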
+ + " To get the backslash character, pass a string with two backslashes as the delimiter.") case Seq(c) => c case Seq('\\', 't') => '\t' case Seq('\\', 'r') => '\r' diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala index 60b49c58cfffe..d59035b716cf0 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala @@ -1829,10 +1829,11 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils with Te test("using the backward slash as the delimiter") { val input = Seq("""abc\1""").toDS() - checkAnswer(spark.read.option("delimiter", "\\").csv(input), Row("abc", "1")) - checkAnswer(spark.read.option("inferSchema", true).option("delimiter", "\\").csv(input), + val delimiter = """\\""" + checkAnswer(spark.read.option("delimiter", delimiter).csv(input), Row("abc", "1")) + checkAnswer(spark.read.option("inferSchema", true).option("delimiter", delimiter).csv(input), Row("abc", 1)) val schema = new StructType().add("a", StringType).add("b", IntegerType) - checkAnswer(spark.read.schema(schema).option("delimiter", "\\").csv(input), Row("abc", 1)) + checkAnswer(spark.read.schema(schema).option("delimiter", delimiter).csv(input), Row("abc", 1)) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVUtilsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVUtilsSuite.scala index 670431ac2d57e..60fcbd2ff008c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVUtilsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVUtilsSuite.scala @@ -45,8 +45,11 @@ class CSVUtilsSuite extends SparkFunSuite { assert(exception.getMessage.contains("Unsupported special character for delimiter")) } - test("string with one backward slash should be converted to the backward slash char") { - assert(CSVUtils.toChar("""\""") == '\\') + test("string with one backward slash is prohibited") { + val exception = intercept[IllegalArgumentException]{ + CSVUtils.toChar("""\""") + } + assert(exception.getMessage.contains("Single backslash is prohibited")) } test("output proper error message for empty string") {