From ff249c3bad2d9a42e42934ebb44d9d6132f49fb8 Mon Sep 17 00:00:00 2001 From: Jiaan Geng Date: Wed, 24 Nov 2021 16:17:43 +0800 Subject: [PATCH] [SPARK-37389][SQL] Check unclosed bracketed comments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### What changes were proposed in this pull request? The SQL below has unclosed bracketed comment. ``` /*abc*/ select 1 as a /* 2 as b /*abc*/ , 3 as c /**/ ; ``` But Spark outputs: a | -- | 1 | PostgreSQL also supports the feature, but output: ``` SQL 错误 [42601]: Unterminated block comment started at position 22 in SQL /*abc*/ select 1 as a /* 2 as b /*abc*/ , 3 as c /**/ ;. Expected */ sequence ``` ### Why are the changes needed? The execute plan is not expected, if we don't check unclosed bracketed comments. ### Does this PR introduce _any_ user-facing change? 'Yes'. The behavior of bracketed comments will more correctly. ### How was this patch tested? New tests. Closes #34668 from beliefer/SPARK-37389. Authored-by: Jiaan Geng Signed-off-by: Wenchen Fan --- .../spark/sql/catalyst/parser/SqlBase.g4 | 17 ++++- .../sql/catalyst/parser/ParseDriver.scala | 56 ++++++++++++++++ .../spark/sql/errors/QueryParsingErrors.scala | 4 ++ .../sql/catalyst/parser/PlanParserSuite.scala | 50 +++++++++++++++ .../resources/sql-tests/inputs/comments.sql | 29 +++++++++ .../sql-tests/results/comments.sql.out | 64 ++++++++++++++++++- 6 files changed, 218 insertions(+), 2 deletions(-) diff --git a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 index 426f529c93055..bd2dd0801bb82 100644 --- a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 +++ b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 @@ -36,6 +36,11 @@ grammar SqlBase; } @lexer::members { + /** + * When true, parser should throw ParseExcetion for unclosed bracketed comment. + */ + public boolean has_unclosed_bracketed_comment = false; + /** * Verify whether current token is a valid decimal token (which contains dot). * Returns true if the character that follows the token is not a digit or letter or underscore. @@ -73,6 +78,16 @@ grammar SqlBase; return false; } } + + /** + * This method will be called when the character stream ends and try to find out the + * unclosed bracketed comment. + * If the method be called, it means the end of the entire character stream match, + * and we set the flag and fail later. + */ + public void markUnclosedComment() { + has_unclosed_bracketed_comment = true; + } } singleStatement @@ -1926,7 +1941,7 @@ SIMPLE_COMMENT ; BRACKETED_COMMENT - : '/*' {!isHint()}? (BRACKETED_COMMENT|.)*? '*/' -> channel(HIDDEN) + : '/*' {!isHint()}? ( BRACKETED_COMMENT | . )*? ('*/' | {markUnclosedComment();} EOF) -> channel(HIDDEN) ; WS diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParseDriver.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParseDriver.scala index f53c0d36f2779..7917d42e3666f 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParseDriver.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParseDriver.scala @@ -101,6 +101,7 @@ abstract class AbstractSqlParser extends ParserInterface with SQLConfHelper with val tokenStream = new CommonTokenStream(lexer) val parser = new SqlBaseParser(tokenStream) parser.addParseListener(PostProcessor) + parser.addParseListener(UnclosedCommentProcessor(command, tokenStream)) parser.removeErrorListeners() parser.addErrorListener(ParseErrorListener) parser.legacy_setops_precedence_enabled = conf.setOpsPrecedenceEnforced @@ -313,3 +314,58 @@ case object PostProcessor extends SqlBaseBaseListener { parent.addChild(new TerminalNodeImpl(f(newToken))) } } + +/** + * The post-processor checks the unclosed bracketed comment. + */ +case class UnclosedCommentProcessor( + command: String, tokenStream: CommonTokenStream) extends SqlBaseBaseListener { + + override def exitSingleDataType(ctx: SqlBaseParser.SingleDataTypeContext): Unit = { + checkUnclosedComment(tokenStream, command) + } + + override def exitSingleExpression(ctx: SqlBaseParser.SingleExpressionContext): Unit = { + checkUnclosedComment(tokenStream, command) + } + + override def exitSingleTableIdentifier(ctx: SqlBaseParser.SingleTableIdentifierContext): Unit = { + checkUnclosedComment(tokenStream, command) + } + + override def exitSingleFunctionIdentifier( + ctx: SqlBaseParser.SingleFunctionIdentifierContext): Unit = { + checkUnclosedComment(tokenStream, command) + } + + override def exitSingleMultipartIdentifier( + ctx: SqlBaseParser.SingleMultipartIdentifierContext): Unit = { + checkUnclosedComment(tokenStream, command) + } + + override def exitSingleTableSchema(ctx: SqlBaseParser.SingleTableSchemaContext): Unit = { + checkUnclosedComment(tokenStream, command) + } + + override def exitQuery(ctx: SqlBaseParser.QueryContext): Unit = { + checkUnclosedComment(tokenStream, command) + } + + override def exitSingleStatement(ctx: SqlBaseParser.SingleStatementContext): Unit = { + checkUnclosedComment(tokenStream, command) + } + + /** check `has_unclosed_bracketed_comment` to find out the unclosed bracketed comment. */ + private def checkUnclosedComment(tokenStream: CommonTokenStream, command: String) = { + assert(tokenStream.getTokenSource.isInstanceOf[SqlBaseLexer]) + val lexer = tokenStream.getTokenSource.asInstanceOf[SqlBaseLexer] + if (lexer.has_unclosed_bracketed_comment) { + // The last token is 'EOF' and the penultimate is unclosed bracketed comment + val failedToken = tokenStream.get(tokenStream.size() - 2) + assert(failedToken.getType() == SqlBaseParser.BRACKETED_COMMENT) + val position = Origin(Option(failedToken.getLine), Option(failedToken.getCharPositionInLine)) + throw QueryParsingErrors.unclosedBracketedCommentError(command, position) + } + } + +} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryParsingErrors.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryParsingErrors.scala index 090f73d192dd5..486094b63da9c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryParsingErrors.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryParsingErrors.scala @@ -425,4 +425,8 @@ object QueryParsingErrors { new ParseException( s"Specifying a database in CREATE TEMPORARY FUNCTION is not allowed: '$databaseName'", ctx) } + + def unclosedBracketedCommentError(command: String, position: Origin): Throwable = { + new ParseException(Some(command), "Unclosed bracketed comment", position, position) + } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/PlanParserSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/PlanParserSuite.scala index ebafb9db1be32..9f9eb537d1db0 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/PlanParserSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/PlanParserSuite.scala @@ -160,6 +160,56 @@ class PlanParserSuite extends AnalysisTest { """.stripMargin, plan) } + test("nested bracketed comment case seven") { + val plan = OneRowRelation().select(Literal(1).as("a")) + assertEqual( + """ + |/*abc*/ + |select 1 as a + |/* + | + |2 as b + |/*abc */ + |, 3 as c + | + |/**/ + |*/ + """.stripMargin, plan) + } + + test("unclosed bracketed comment one") { + val query = """ + |/*abc*/ + |select 1 as a + |/* + | + |2 as b + |/*abc */ + |, 3 as c + | + |/**/ + |""".stripMargin + val e = intercept[ParseException](parsePlan(query)) + assert(e.getMessage.contains(s"Unclosed bracketed comment")) + } + + test("unclosed bracketed comment two") { + val query = """ + |/*abc*/ + |select 1 as a + |/* + | + |2 as b + |/*abc */ + |, 3 as c + | + |/**/ + |select 4 as d + |""".stripMargin + val e = intercept[ParseException](parsePlan(query)) + assert(e.getMessage.contains(s"Unclosed bracketed comment")) + } + test("case insensitive") { val plan = table("a").select(star()) assertEqual("sELEct * FroM a", plan) diff --git a/sql/core/src/test/resources/sql-tests/inputs/comments.sql b/sql/core/src/test/resources/sql-tests/inputs/comments.sql index 19f11de22dfd1..da5e57a942920 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/comments.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/comments.sql @@ -88,3 +88,32 @@ Other information of first level. /*/**/*/ SELECT 'selected content' AS tenth; --QUERY-DELIMITER-END + +-- the first case of unclosed bracketed comment +--QUERY-DELIMITER-START +/*abc*/ +select 1 as a +/* + +2 as b +/*abc*/ +, 3 as c + +/**/ +; +--QUERY-DELIMITER-END + +-- the second case of unclosed bracketed comment +--QUERY-DELIMITER-START +/*abc*/ +select 1 as a +/* + +2 as b +/*abc*/ +, 3 as c + +/**/ +select 4 as d +; +--QUERY-DELIMITER-END diff --git a/sql/core/src/test/resources/sql-tests/results/comments.sql.out b/sql/core/src/test/resources/sql-tests/results/comments.sql.out index fd58a33595fe6..da9dbd5fa32d8 100644 --- a/sql/core/src/test/resources/sql-tests/results/comments.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/comments.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 10 +-- Number of queries: 12 -- !query @@ -119,3 +119,65 @@ SELECT 'selected content' AS tenth struct -- !query output selected content + + +-- !query +/*abc*/ +select 1 as a +/* + +2 as b +/*abc*/ +, 3 as c + +/**/ +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +Unclosed bracketed comment(line 3, pos 0) + +== SQL == +/*abc*/ +select 1 as a +/* +^^^ + +2 as b +/*abc*/ +, 3 as c + +/**/ + + +-- !query +/*abc*/ +select 1 as a +/* + +2 as b +/*abc*/ +, 3 as c + +/**/ +select 4 as d +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +Unclosed bracketed comment(line 3, pos 0) + +== SQL == +/*abc*/ +select 1 as a +/* +^^^ + +2 as b +/*abc*/ +, 3 as c + +/**/ +select 4 as d