From 9a9fd5291fd070bf9df224b8b56807cf56d35073 Mon Sep 17 00:00:00 2001 From: Joey Hain Date: Tue, 23 Sep 2025 16:59:51 -0700 Subject: [PATCH 1/5] Correctly tokenize nested comments in Databricks --- src/dialect/databricks.rs | 5 ++ src/tokenizer.rs | 141 ++++++++++++++++++-------------------- 2 files changed, 70 insertions(+), 76 deletions(-) diff --git a/src/dialect/databricks.rs b/src/dialect/databricks.rs index a3476b1b8..147329298 100644 --- a/src/dialect/databricks.rs +++ b/src/dialect/databricks.rs @@ -64,4 +64,9 @@ impl Dialect for DatabricksDialect { fn supports_struct_literal(&self) -> bool { true } + + // https://docs.databricks.com/aws/en/sql/language-manual/sql-ref-syntax-comment + fn supports_nested_comments(&self) -> bool { + true + } } diff --git a/src/tokenizer.rs b/src/tokenizer.rs index 8382a5344..54a158c1f 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -2419,7 +2419,7 @@ mod tests { use crate::dialect::{ BigQueryDialect, ClickHouseDialect, HiveDialect, MsSqlDialect, MySqlDialect, SQLiteDialect, }; - use crate::test_utils::all_dialects_where; + use crate::test_utils::{all_dialects_except, all_dialects_where}; use core::fmt::Debug; #[test] @@ -3169,90 +3169,79 @@ mod tests { #[test] fn tokenize_nested_multiline_comment() { - let dialect = GenericDialect {}; - let test_cases = vec![ - ( - "0/*multi-line\n* \n/* comment \n /*comment*/*/ */ /comment*/1", - vec![ - Token::Number("0".to_string(), false), - Token::Whitespace(Whitespace::MultiLineComment( - "multi-line\n* \n/* comment \n /*comment*/*/ ".into(), - )), - Token::Whitespace(Whitespace::Space), - Token::Div, - Token::Word(Word { - value: "comment".to_string(), - quote_style: None, - keyword: Keyword::COMMENT, - }), - Token::Mul, - Token::Div, - Token::Number("1".to_string(), false), - ], - ), - ( - "0/*multi-line\n* \n/* comment \n /*comment/**/ */ /comment*/*/1", - vec![ - Token::Number("0".to_string(), false), - Token::Whitespace(Whitespace::MultiLineComment( - "multi-line\n* \n/* 
comment \n /*comment/**/ */ /comment*/".into(), - )), - Token::Number("1".to_string(), false), - ], - ), - ( - "SELECT 1/* a /* b */ c */0", - vec![ - Token::make_keyword("SELECT"), - Token::Whitespace(Whitespace::Space), - Token::Number("1".to_string(), false), - Token::Whitespace(Whitespace::MultiLineComment(" a /* b */ c ".to_string())), - Token::Number("0".to_string(), false), - ], - ), - ]; + all_dialects_where(|d| d.supports_nested_comments()).tokenizes_to( + "0/*multi-line\n* \n/* comment \n /*comment*/*/ */ /comment*/1", + vec![ + Token::Number("0".to_string(), false), + Token::Whitespace(Whitespace::MultiLineComment( + "multi-line\n* \n/* comment \n /*comment*/*/ ".into(), + )), + Token::Whitespace(Whitespace::Space), + Token::Div, + Token::Word(Word { + value: "comment".to_string(), + quote_style: None, + keyword: Keyword::COMMENT, + }), + Token::Mul, + Token::Div, + Token::Number("1".to_string(), false), + ], + ); - for (sql, expected) in test_cases { - let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap(); - compare(expected, tokens); - } + all_dialects_where(|d| d.supports_nested_comments()).tokenizes_to( + "0/*multi-line\n* \n/* comment \n /*comment/**/ */ /comment*/*/1", + vec![ + Token::Number("0".to_string(), false), + Token::Whitespace(Whitespace::MultiLineComment( + "multi-line\n* \n/* comment \n /*comment/**/ */ /comment*/".into(), + )), + Token::Number("1".to_string(), false), + ], + ); + + all_dialects_where(|d| d.supports_nested_comments()).tokenizes_to( + "SELECT 1/* a /* b */ c */0", + vec![ + Token::make_keyword("SELECT"), + Token::Whitespace(Whitespace::Space), + Token::Number("1".to_string(), false), + Token::Whitespace(Whitespace::MultiLineComment(" a /* b */ c ".to_string())), + Token::Number("0".to_string(), false), + ], + ); } #[test] fn tokenize_nested_multiline_comment_empty() { - let sql = "select 1/*/**/*/0"; - - let dialect = GenericDialect {}; - let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap(); - let 
expected = vec![ - Token::make_keyword("select"), - Token::Whitespace(Whitespace::Space), - Token::Number("1".to_string(), false), - Token::Whitespace(Whitespace::MultiLineComment("/**/".to_string())), - Token::Number("0".to_string(), false), - ]; - - compare(expected, tokens); + all_dialects_where(|d| d.supports_nested_comments()).tokenizes_to( + "select 1/*/**/*/0", + vec![ + Token::make_keyword("select"), + Token::Whitespace(Whitespace::Space), + Token::Number("1".to_string(), false), + Token::Whitespace(Whitespace::MultiLineComment("/**/".to_string())), + Token::Number("0".to_string(), false), + ], + ); } #[test] fn tokenize_nested_comments_if_not_supported() { - let dialect = SQLiteDialect {}; - let sql = "SELECT 1/*/* nested comment */*/0"; - let tokens = Tokenizer::new(&dialect, sql).tokenize(); - let expected = vec![ - Token::make_keyword("SELECT"), - Token::Whitespace(Whitespace::Space), - Token::Number("1".to_string(), false), - Token::Whitespace(Whitespace::MultiLineComment( - "/* nested comment ".to_string(), - )), - Token::Mul, - Token::Div, - Token::Number("0".to_string(), false), - ]; - - compare(expected, tokens.unwrap()); + all_dialects_except(|d| d.supports_nested_comments()).tokenizes_to( + "SELECT 1/*/* nested comment */*/0", + vec![ + Token::make_keyword("SELECT"), + Token::Whitespace(Whitespace::Space), + Token::Number("1".to_string(), false), + Token::Whitespace(Whitespace::MultiLineComment( + "/* nested comment ".to_string(), + )), + Token::Mul, + Token::Div, + Token::Number("0".to_string(), false), + ], + ); } #[test] From 124184bec3d3d9bab703724ba80be42461b33f37 Mon Sep 17 00:00:00 2001 From: Joey Hain Date: Tue, 23 Sep 2025 17:08:28 -0700 Subject: [PATCH 2/5] clickhouse too --- src/dialect/clickhouse.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/dialect/clickhouse.rs b/src/dialect/clickhouse.rs index f5e70c309..aeeef6d64 100644 --- a/src/dialect/clickhouse.rs +++ b/src/dialect/clickhouse.rs @@ -94,4 +94,10 @@ impl Dialect 
for ClickHouseDialect { fn supports_group_by_with_modifier(&self) -> bool { true } + + // Supported since 2020. + // See + fn supports_nested_comments(&self) -> bool { + true + } } From 3464a8fda6fa10810dc7778e7520df66ac48dfcd Mon Sep 17 00:00:00 2001 From: Joey Hain Date: Tue, 23 Sep 2025 17:12:36 -0700 Subject: [PATCH 3/5] believe it or not, ansi --- src/dialect/ansi.rs | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/dialect/ansi.rs b/src/dialect/ansi.rs index 32ba7b32a..ce1755a34 100644 --- a/src/dialect/ansi.rs +++ b/src/dialect/ansi.rs @@ -33,4 +33,9 @@ impl Dialect for AnsiDialect { fn require_interval_qualifier(&self) -> bool { true } + + // The SQL standard explictly states that block comments nest. + fn supports_nested_comments(&self) -> bool { + true + } } From ab184876b53fb5364b10d08ca961eeaf950303d1 Mon Sep 17 00:00:00 2001 From: Joey Hain Date: Thu, 25 Sep 2025 08:32:43 -0700 Subject: [PATCH 4/5] address feedback --- src/dialect/ansi.rs | 2 +- src/dialect/clickhouse.rs | 4 ++-- src/dialect/databricks.rs | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/dialect/ansi.rs b/src/dialect/ansi.rs index ce1755a34..ec3c095be 100644 --- a/src/dialect/ansi.rs +++ b/src/dialect/ansi.rs @@ -34,7 +34,7 @@ impl Dialect for AnsiDialect { true } - // The SQL standard explictly states that block comments nest. + /// The SQL standard explicitly states that block comments nest. fn supports_nested_comments(&self) -> bool { true } diff --git a/src/dialect/clickhouse.rs b/src/dialect/clickhouse.rs index aeeef6d64..bdac1f57b 100644 --- a/src/dialect/clickhouse.rs +++ b/src/dialect/clickhouse.rs @@ -95,8 +95,8 @@ impl Dialect for ClickHouseDialect { true } - // Supported since 2020. - // See + /// Supported since 2020. 
+ /// See fn supports_nested_comments(&self) -> bool { true } diff --git a/src/dialect/databricks.rs b/src/dialect/databricks.rs index 147329298..bb27610d4 100644 --- a/src/dialect/databricks.rs +++ b/src/dialect/databricks.rs @@ -65,7 +65,7 @@ impl Dialect for DatabricksDialect { true } - // https://docs.databricks.com/aws/en/sql/language-manual/sql-ref-syntax-comment + /// https://docs.databricks.com/aws/en/sql/language-manual/sql-ref-syntax-comment fn supports_nested_comments(&self) -> bool { true } From db2bbfd3beeb39d7cb6ec439c75f66eb42187f2c Mon Sep 17 00:00:00 2001 From: Joey Hain Date: Thu, 25 Sep 2025 08:34:04 -0700 Subject: [PATCH 5/5] fix doc error --- src/dialect/databricks.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/dialect/databricks.rs b/src/dialect/databricks.rs index bb27610d4..4bb8c8d51 100644 --- a/src/dialect/databricks.rs +++ b/src/dialect/databricks.rs @@ -65,7 +65,7 @@ impl Dialect for DatabricksDialect { true } - /// https://docs.databricks.com/aws/en/sql/language-manual/sql-ref-syntax-comment + /// See <https://docs.databricks.com/aws/en/sql/language-manual/sql-ref-syntax-comment> fn supports_nested_comments(&self) -> bool { true }