[SPARK-21024][SQL] CSV parse mode handles Univocity parser exceptions #18250

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed

maropu wants to merge 2 commits into apache:master from maropu:SPARK-21024

-Original file line number
+Diff line change
@@ Expand Up / @@ -188,7 +188,15 @@ class UnivocityParser( @@
        * Parses a single CSV string and turns it into either one resulting row or no row (if the
        * record is malformed).
        */
-      def parse(input: String): InternalRow = convert(tokenizer.parseLine(input))
+      def parse(input: String): InternalRow = {
+        // Tokenize first. Any non-fatal exception raised by the tokenizer is rethrown as a
+        // BadRecordException that carries a thunk for the current input, a thunk yielding no
+        // partial result (None), and the original exception as the cause.
+        val lineTokens =
+          try tokenizer.parseLine(input)
+          catch {
+            case NonFatal(cause) =>
+              throw BadRecordException(() => getCurrentInput, () => None, cause)
+          }
+        // Only successfully tokenized lines are handed to `convert`.
+        convert(lineTokens)
+      }
       private def convert(tokens: Array[String]): InternalRow = {
         if (tokens.length != schema.length) {
@@ Expand Down @@

-Original file line number
+Diff line change
@@ Expand Up @@
             }
           }
       }
+      test("SPARK-21024 CSV parser mode controls parser exceptions") {
+        // Two text records are written: "0,1" has two fields, while "0,1,2,3" has more fields
+        // than the `maxColumns` value of 2 configured on both readers below.
+        withTempPath { dir =>
+          val inputPath = dir.getAbsolutePath
+          Seq("0,1", "0,1,2,3").toDF().write.text(inputPath)
+          for (wholeFile <- Seq(false)) {
+            // FAILFAST: the read must fail with a SparkException whose message contains the
+            // column-limit text asserted below.
+            val failure = intercept[SparkException] {
+              spark.read.format("csv")
+                .schema("a INT, b INT")
+                .option("maxColumns", "2")
+                .option("mode", "FAILFAST")
+                .option("wholeFile", wholeFile)
+                .load(inputPath)
+                .collect()
+            }
+            val msg = failure.getMessage
+            assert(msg.contains("Number of columns processed may have exceeded limit of 2 columns."))
+            // PERMISSIVE: the same input is read with an extra string column named by
+            // `columnNameOfCorruptRecord`; the expected rows are listed explicitly below.
+            val columnNameOfCorruptRecord = "_unparsed"
+            val permissiveDf = spark.read.format("csv")
+              .schema(s"a INT, b INT, $columnNameOfCorruptRecord STRING")
+              .option("maxColumns", "2")
+              .option("mode", "PERMISSIVE")
+              .option("columnNameOfCorruptRecord", columnNameOfCorruptRecord)
+              .option("wholeFile", wholeFile)
+              .load(inputPath)
+            val expectedRows = Seq(Row(0, 1, null), Row(null, null, "0,1,2,"))
+            checkAnswer(permissiveDf, expectedRows)
+          }
+        }
+      }
     }

Provide feedback