From 52338d6ef5354b1441e00536ebe0f24e7bf48911 Mon Sep 17 00:00:00 2001 From: Luca Date: Tue, 28 Oct 2025 15:27:43 +0100 Subject: [PATCH 01/10] Started to remove whitespace --- src/dialect/snowflake.rs | 2 +- src/parser/mod.rs | 63 +------ src/tokenizer.rs | 324 ++++------------------------------- tests/sqlparser_snowflake.rs | 11 -- 4 files changed, 40 insertions(+), 360 deletions(-) diff --git a/src/dialect/snowflake.rs b/src/dialect/snowflake.rs index 825fd45f0..6cb344fac 100644 --- a/src/dialect/snowflake.rs +++ b/src/dialect/snowflake.rs @@ -1051,7 +1051,7 @@ pub fn parse_stage_name_identifier(parser: &mut Parser) -> Result break, + Token::SemiColon => break, Token::Period => { parser.prev_token(); break; diff --git a/src/parser/mod.rs b/src/parser/mod.rs index b44171c7d..ab981f9f5 100644 --- a/src/parser/mod.rs +++ b/src/parser/mod.rs @@ -4034,13 +4034,6 @@ impl<'a> Parser<'a> { core::array::from_fn(|_| loop { let token = self.tokens.get(index); index += 1; - if let Some(TokenWithSpan { - token: Token::Whitespace(_), - span: _, - }) = token - { - continue; - } break token.cloned().unwrap_or(TokenWithSpan { token: Token::EOF, span: Span::empty(), @@ -4057,13 +4050,6 @@ impl<'a> Parser<'a> { core::array::from_fn(|_| loop { let token = self.tokens.get(index); index += 1; - if let Some(TokenWithSpan { - token: Token::Whitespace(_), - span: _, - }) = token - { - continue; - } break token.unwrap_or(&EOF_TOKEN); }) } @@ -4078,18 +4064,10 @@ impl<'a> Parser<'a> { let mut index = self.index; loop { index += 1; - match self.tokens.get(index - 1) { - Some(TokenWithSpan { - token: Token::Whitespace(_), - span: _, - }) => continue, - non_whitespace => { - if n == 0 { - return non_whitespace.unwrap_or(&EOF_TOKEN); - } - n -= 1; - } + if n == 0 { + return self.tokens.get(index - 1).unwrap_or(&EOF_TOKEN); } + n -= 1; } } @@ -4147,16 +4125,7 @@ impl<'a> Parser<'a> { /// /// See [`Self::get_current_token`] to get the current token after advancing pub fn advance_token(&mut self) { - loop { - self.index += 1; - match self.tokens.get(self.index - 1) { - Some(TokenWithSpan { - token: Token::Whitespace(_), - span: _, - }) => continue, - _ => break, - } - } + self.index += 1; } /// Returns a reference to the current token @@ -4187,18 +4156,8 @@ impl<'a> Parser<'a> { /// // TODO rename to backup_token and deprecate prev_token? pub fn prev_token(&mut self) { - loop { - assert!(self.index > 0); - self.index -= 1; - if let Some(TokenWithSpan { - token: Token::Whitespace(_), - span: _, - }) = self.tokens.get(self.index) - { - continue; - } - return; - } + assert!(self.index > 0); + self.index -= 1; } /// Report `found` was encountered instead of `expected` @@ -9999,14 +9958,6 @@ impl<'a> Parser<'a> { let mut content = String::from(""); while let Some(t) = self.next_token_no_skip().map(|t| &t.token) { match t { - Token::Whitespace(Whitespace::Tab) => { - values.push(Some(content.to_string())); - content.clear(); - } - Token::Whitespace(Whitespace::Newline) => { - values.push(Some(content.to_string())); - content.clear(); - } Token::Backslash => { if self.consume_token(&Token::Period) { return values; @@ -11396,7 +11347,7 @@ impl<'a> Parser<'a> { // otherwise foo-123a will be parsed as `foo-123` with the alias `a`. 
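// A minimal sketch of the behavior this commit establishes, assuming the
// crate's public Tokenizer API as exercised by the test suite below: whitespace
// is consumed during tokenization and never enters the token stream, which is
// what lets the parser cursor methods above shrink to plain index arithmetic.
//
//     use sqlparser::dialect::GenericDialect;
//     use sqlparser::tokenizer::{Token, Tokenizer};
//
//     let dialect = GenericDialect {};
//     // The "  \t " between the keyword and the literal produces no tokens.
//     let tokens = Tokenizer::new(&dialect, "SELECT  \t 1").tokenize().unwrap();
//     assert_eq!(
//         tokens,
//         vec![
//             Token::make_keyword("SELECT"),
//             Token::Number(String::from("1"), false),
//         ]
//     );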
if requires_whitespace { let token = self.next_token(); - if !matches!(token.token, Token::EOF | Token::Whitespace(_)) { + if !matches!(token.token, Token::EOF) { return self .expected("whitespace following hyphenated identifier", token); } diff --git a/src/tokenizer.rs b/src/tokenizer.rs index 54a158c1f..451545157 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -106,8 +106,6 @@ pub enum Token { HexStringLiteral(String), /// Comma Comma, - /// Whitespace (space, tab, etc) - Whitespace(Whitespace), /// Double equals sign `==` DoubleEq, /// Equality operator `=` @@ -304,7 +302,6 @@ impl fmt::Display for Token { Token::TripleSingleQuotedRawStringLiteral(ref s) => write!(f, "R'''{s}'''"), Token::TripleDoubleQuotedRawStringLiteral(ref s) => write!(f, "R\"\"\"{s}\"\"\""), Token::Comma => f.write_str(","), - Token::Whitespace(ws) => write!(f, "{ws}"), Token::DoubleEq => f.write_str("=="), Token::Spaceship => f.write_str("<=>"), Token::Eq => f.write_str("="), @@ -449,29 +446,6 @@ impl Word { } } -#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)] -#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] -#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))] -pub enum Whitespace { - Space, - Newline, - Tab, - SingleLineComment { comment: String, prefix: String }, - MultiLineComment(String), -} - -impl fmt::Display for Whitespace { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - match self { - Whitespace::Space => f.write_str(" "), - Whitespace::Newline => f.write_str("\n"), - Whitespace::Tab => f.write_str("\t"), - Whitespace::SingleLineComment { prefix, comment } => write!(f, "{prefix}{comment}"), - Whitespace::MultiLineComment(s) => write!(f, "/*{s}*/"), - } - } -} - /// Location in input string /// /// # Create an "empty" (unknown) `Location` @@ -898,7 +872,7 @@ impl<'a> Tokenizer<'a> { }; let mut location = state.location(); - while let Some(token) = self.next_token(&mut state, buf.last().map(|t| &t.token))? { + while let Some(token) = self.next_token(&mut location, &mut state, buf.last().map(|t| &t.token), false)? 
{ let span = location.span_to(state.location()); buf.push(TokenWithSpan { token, span }); @@ -937,22 +911,18 @@ impl<'a> Tokenizer<'a> { /// Get the next token or return None fn next_token( &self, + location: &mut Location, chars: &mut State, prev_token: Option<&Token>, + preceded_by_whitespace: bool, ) -> Result, TokenizerError> { match chars.peek() { Some(&ch) => match ch { - ' ' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Space)), - '\t' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Tab)), - '\n' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Newline)), - '\r' => { - // Emit a single Whitespace::Newline token for \r and \r\n - chars.next(); - if let Some('\n') = chars.peek() { - chars.next(); - } - Ok(Some(Token::Whitespace(Whitespace::Newline))) - } + ' ' | '\t' | '\n' | '\r' => { + chars.next(); // consume + *location = chars.location(); + self.next_token(location, chars, prev_token, true) + }, // BigQuery and MySQL use b or B for byte string literal, Postgres for bit strings b @ 'B' | b @ 'b' if dialect_of!(self is BigQueryDialect | PostgreSqlDialect | MySqlDialect | GenericDialect) => { @@ -1331,13 +1301,10 @@ impl<'a> Tokenizer<'a> { if is_comment { chars.next(); // consume second '-' - let comment = self.tokenize_single_line_comment(chars); - return Ok(Some(Token::Whitespace( - Whitespace::SingleLineComment { - prefix: "--".to_owned(), - comment, - }, - ))); + // Consume the rest of the line as comment + let _comment = self.tokenize_single_line_comment(chars); + *location = chars.location(); + return self.next_token(location, chars, prev_token, true); } self.start_binop(chars, "-", Token::Minus) @@ -1358,15 +1325,16 @@ impl<'a> Tokenizer<'a> { match chars.peek() { Some('*') => { chars.next(); // consume the '*', starting a multi-line comment - self.tokenize_multiline_comment(chars) + let _comment = self.consume_multiline_comment(chars)?; + *location = chars.location(); + self.next_token(location, chars, prev_token, true) } Some('/') if dialect_of!(self is SnowflakeDialect) => { chars.next(); // consume the second '/', starting a snowflake single-line comment - let comment = self.tokenize_single_line_comment(chars); - Ok(Some(Token::Whitespace(Whitespace::SingleLineComment { - prefix: "//".to_owned(), - comment, - }))) + // Consume the rest of the line as comment + let _comment = self.tokenize_single_line_comment(chars); + *location = chars.location(); + self.next_token(location, chars, prev_token, true) } Some('/') if dialect_of!(self is DuckDbDialect | GenericDialect) => { self.consume_and_return(chars, Token::DuckIntDiv) @@ -1567,11 +1535,10 @@ impl<'a> Tokenizer<'a> { '#' if dialect_of!(self is SnowflakeDialect | BigQueryDialect | MySqlDialect | HiveDialect) => { chars.next(); // consume the '#', starting a snowflake single-line comment - let comment = self.tokenize_single_line_comment(chars); - Ok(Some(Token::Whitespace(Whitespace::SingleLineComment { - prefix: "#".to_owned(), - comment, - }))) + // Consume the rest of the line as comment + let _comment = self.tokenize_single_line_comment(chars); + *location = chars.location(); + self.next_token(location, chars, prev_token, true) } '~' => { chars.next(); // consume @@ -1701,7 +1668,9 @@ impl<'a> Tokenizer<'a> { // whitespace check (including unicode chars) should be last as it covers some of the chars above ch if ch.is_whitespace() => { - self.consume_and_return(chars, Token::Whitespace(Whitespace::Space)) + chars.next(); // consume + *location = chars.location(); + 
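// The hunk above replaces the whitespace token arms with a consume-and-recurse
// step that also resynchronizes `location`, so each emitted token's span starts
// at its first significant character. A self-contained sketch of that pattern,
// written iteratively, with `Cursor` as a stand-in for the crate's `State`
// (not the actual implementation):
//
//     struct Cursor {
//         src: Vec<char>,
//         pos: usize,
//     }
//
//     impl Cursor {
//         fn peek(&self) -> Option<char> {
//             self.src.get(self.pos).copied()
//         }
//
//         // Skip trivia first and only then record the position, so the
//         // caller's span starts at the first significant character.
//         fn next_significant(&mut self) -> Option<(usize, char)> {
//             while matches!(self.peek(), Some(' ' | '\t' | '\n' | '\r')) {
//                 self.pos += 1; // consume trivia without emitting a token
//             }
//             let start = self.pos;
//             let ch = self.peek()?;
//             self.pos += 1;
//             Some((start, ch))
//         }
//     }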
self.next_token(location, chars, prev_token, true) } other => self.consume_and_return(chars, Token::Char(other)), }, @@ -2101,10 +2070,10 @@ impl<'a> Tokenizer<'a> { self.tokenizer_error(error_loc, "Unterminated string literal") } - fn tokenize_multiline_comment( + fn consume_multiline_comment( &self, chars: &mut State, - ) -> Result, TokenizerError> { + ) -> Result, TokenizerError> { let mut s = String::new(); let mut nested = 1; let supports_nested_comments = self.dialect.supports_nested_comments(); @@ -2121,7 +2090,7 @@ impl<'a> Tokenizer<'a> { chars.next(); // consume the '/' nested -= 1; if nested == 0 { - break Ok(Some(Token::Whitespace(Whitespace::MultiLineComment(s)))); + break Ok(Some(s)); } s.push('*'); s.push('/'); @@ -2444,7 +2413,6 @@ mod tests { let expected = vec![ Token::make_keyword("SELECT"), - Token::Whitespace(Whitespace::Space), Token::Number(String::from("1"), false), ]; @@ -2459,7 +2427,6 @@ mod tests { let expected = vec![ Token::make_keyword("SELECT"), - Token::Whitespace(Whitespace::Space), Token::Number(String::from(".1"), false), ]; @@ -2475,7 +2442,6 @@ mod tests { let expected = vec![ Token::make_keyword("SELECT"), - Token::Whitespace(Whitespace::Space), Token::Word(Word { value: "foo".to_string(), quote_style: None, @@ -2496,7 +2462,6 @@ mod tests { let tokens = tokenizer.tokenize().unwrap(); let expected = vec![ Token::make_keyword("SELECT"), - Token::Whitespace(Whitespace::Space), Token::Number("10".to_string(), false), Token::make_word("_000", None), ]; @@ -2506,17 +2471,13 @@ mod tests { "SELECT 10_000, _10_000, 10_00_, 10___0", vec![ Token::make_keyword("SELECT"), - Token::Whitespace(Whitespace::Space), Token::Number("10_000".to_string(), false), Token::Comma, - Token::Whitespace(Whitespace::Space), Token::make_word("_10_000", None), // leading underscore tokenizes as a word (parsed as column identifier) Token::Comma, - Token::Whitespace(Whitespace::Space), Token::Number("10_00".to_string(), false), Token::make_word("_", None), // trailing underscores tokenizes as a word (syntax error in some dialects) Token::Comma, - Token::Whitespace(Whitespace::Space), Token::Number("10".to_string(), false), Token::make_word("___0", None), // multiple underscores tokenizes as a word (syntax error in some dialects) ], @@ -2531,24 +2492,18 @@ mod tests { let expected = vec![ Token::make_keyword("SELECT"), - Token::Whitespace(Whitespace::Space), Token::Number(String::from("1e10"), false), Token::Comma, - Token::Whitespace(Whitespace::Space), Token::Number(String::from("1e-10"), false), Token::Comma, - Token::Whitespace(Whitespace::Space), Token::Number(String::from("1e+10"), false), Token::Comma, - Token::Whitespace(Whitespace::Space), Token::Number(String::from("1"), false), Token::make_word("ea", None), Token::Comma, - Token::Whitespace(Whitespace::Space), Token::Number(String::from("1e-10"), false), Token::make_word("a", None), Token::Comma, - Token::Whitespace(Whitespace::Space), Token::Number(String::from("1e-10"), false), Token::Minus, Token::Number(String::from("10"), false), @@ -2565,7 +2520,6 @@ mod tests { let expected = vec![ Token::make_keyword("SELECT"), - Token::Whitespace(Whitespace::Space), Token::make_word("sqrt", None), Token::LParen, Token::Number(String::from("1"), false), @@ -2583,11 +2537,8 @@ mod tests { let expected = vec![ Token::make_keyword("SELECT"), - Token::Whitespace(Whitespace::Space), Token::SingleQuotedString(String::from("a")), - Token::Whitespace(Whitespace::Space), Token::StringConcat, - Token::Whitespace(Whitespace::Space), 
Token::SingleQuotedString(String::from("b")), ]; @@ -2601,15 +2552,10 @@ mod tests { let expected = vec![ Token::make_keyword("SELECT"), - Token::Whitespace(Whitespace::Space), Token::make_word("one", None), - Token::Whitespace(Whitespace::Space), Token::Pipe, - Token::Whitespace(Whitespace::Space), Token::make_word("two", None), - Token::Whitespace(Whitespace::Space), Token::Caret, - Token::Whitespace(Whitespace::Space), Token::make_word("three", None), ]; compare(expected, tokens); @@ -2624,32 +2570,20 @@ mod tests { let expected = vec![ Token::make_keyword("SELECT"), - Token::Whitespace(Whitespace::Space), Token::make_keyword("true"), - Token::Whitespace(Whitespace::Space), Token::make_keyword("XOR"), - Token::Whitespace(Whitespace::Space), Token::make_keyword("true"), Token::Comma, - Token::Whitespace(Whitespace::Space), Token::make_keyword("false"), - Token::Whitespace(Whitespace::Space), Token::make_keyword("XOR"), - Token::Whitespace(Whitespace::Space), Token::make_keyword("false"), Token::Comma, - Token::Whitespace(Whitespace::Space), Token::make_keyword("true"), - Token::Whitespace(Whitespace::Space), Token::make_keyword("XOR"), - Token::Whitespace(Whitespace::Space), Token::make_keyword("false"), Token::Comma, - Token::Whitespace(Whitespace::Space), Token::make_keyword("false"), - Token::Whitespace(Whitespace::Space), Token::make_keyword("XOR"), - Token::Whitespace(Whitespace::Space), Token::make_keyword("true"), ]; compare(expected, tokens); @@ -2663,23 +2597,14 @@ mod tests { let expected = vec![ Token::make_keyword("SELECT"), - Token::Whitespace(Whitespace::Space), Token::Mul, - Token::Whitespace(Whitespace::Space), Token::make_keyword("FROM"), - Token::Whitespace(Whitespace::Space), Token::make_word("customer", None), - Token::Whitespace(Whitespace::Space), Token::make_keyword("WHERE"), - Token::Whitespace(Whitespace::Space), Token::make_word("id", None), - Token::Whitespace(Whitespace::Space), Token::Eq, - Token::Whitespace(Whitespace::Space), Token::Number(String::from("1"), false), - Token::Whitespace(Whitespace::Space), Token::make_keyword("LIMIT"), - Token::Whitespace(Whitespace::Space), Token::Number(String::from("5"), false), ]; @@ -2694,21 +2619,13 @@ mod tests { let expected = vec![ Token::make_keyword("EXPLAIN"), - Token::Whitespace(Whitespace::Space), Token::make_keyword("SELECT"), - Token::Whitespace(Whitespace::Space), Token::Mul, - Token::Whitespace(Whitespace::Space), Token::make_keyword("FROM"), - Token::Whitespace(Whitespace::Space), Token::make_word("customer", None), - Token::Whitespace(Whitespace::Space), Token::make_keyword("WHERE"), - Token::Whitespace(Whitespace::Space), Token::make_word("id", None), - Token::Whitespace(Whitespace::Space), Token::Eq, - Token::Whitespace(Whitespace::Space), Token::Number(String::from("1"), false), ]; @@ -2723,23 +2640,14 @@ mod tests { let expected = vec![ Token::make_keyword("EXPLAIN"), - Token::Whitespace(Whitespace::Space), Token::make_keyword("ANALYZE"), - Token::Whitespace(Whitespace::Space), Token::make_keyword("SELECT"), - Token::Whitespace(Whitespace::Space), Token::Mul, - Token::Whitespace(Whitespace::Space), Token::make_keyword("FROM"), - Token::Whitespace(Whitespace::Space), Token::make_word("customer", None), - Token::Whitespace(Whitespace::Space), Token::make_keyword("WHERE"), - Token::Whitespace(Whitespace::Space), Token::make_word("id", None), - Token::Whitespace(Whitespace::Space), Token::Eq, - Token::Whitespace(Whitespace::Space), Token::Number(String::from("1"), false), ]; @@ -2754,19 +2662,12 @@ mod 
tests { let expected = vec![ Token::make_keyword("SELECT"), - Token::Whitespace(Whitespace::Space), Token::Mul, - Token::Whitespace(Whitespace::Space), Token::make_keyword("FROM"), - Token::Whitespace(Whitespace::Space), Token::make_word("customer", None), - Token::Whitespace(Whitespace::Space), Token::make_keyword("WHERE"), - Token::Whitespace(Whitespace::Space), Token::make_word("salary", None), - Token::Whitespace(Whitespace::Space), Token::Neq, - Token::Whitespace(Whitespace::Space), Token::SingleQuotedString(String::from("Not Provided")), ]; @@ -2781,7 +2682,6 @@ mod tests { let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap(); // println!("tokens: {:#?}", tokens); let expected = vec![ - Token::Whitespace(Whitespace::Newline), Token::Char('💝'), Token::make_word("مصطفىh", None), ]; @@ -2839,16 +2739,10 @@ mod tests { let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap(); // println!("tokens: {:#?}", tokens); let expected = vec![ - Token::Whitespace(Whitespace::Newline), - Token::Whitespace(Whitespace::Newline), Token::make_keyword("SELECT"), - Token::Whitespace(Whitespace::Space), Token::Mul, - Token::Whitespace(Whitespace::Space), Token::make_keyword("FROM"), - Token::Whitespace(Whitespace::Space), Token::make_keyword("table"), - Token::Whitespace(Whitespace::Tab), Token::Char('💝'), Token::make_word("مصطفىh", None), ]; @@ -2862,7 +2756,6 @@ mod tests { String::from("SELECT $tag$dollar '$' quoted strings have $tags like this$ or like this $$$tag$"), vec![ Token::make_keyword("SELECT"), - Token::Whitespace(Whitespace::Space), Token::DollarQuotedString(DollarQuotedString { value: "dollar '$' quoted strings have $tags like this$ or like this $$".into(), tag: Some("tag".into()), @@ -2873,7 +2766,6 @@ mod tests { String::from("SELECT $abc$x$ab$abc$"), vec![ Token::make_keyword("SELECT"), - Token::Whitespace(Whitespace::Space), Token::DollarQuotedString(DollarQuotedString { value: "x$ab".into(), tag: Some("abc".into()), @@ -2884,7 +2776,6 @@ mod tests { String::from("SELECT $abc$$abc$"), vec![ Token::make_keyword("SELECT"), - Token::Whitespace(Whitespace::Space), Token::DollarQuotedString(DollarQuotedString { value: "".into(), tag: Some("abc".into()), @@ -2961,16 +2852,12 @@ mod tests { tokens, vec![ Token::make_keyword("SELECT"), - Token::Whitespace(Whitespace::Space), Token::Placeholder("$$".into()), Token::Comma, - Token::Whitespace(Whitespace::Space), Token::Placeholder("$$ABC$$".into()), Token::Comma, - Token::Whitespace(Whitespace::Space), Token::Placeholder("$ABC$".into()), Token::Comma, - Token::Whitespace(Whitespace::Space), Token::Placeholder("$ABC".into()), ] ); @@ -2983,7 +2870,6 @@ mod tests { let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap(); let expected = vec![ Token::make_keyword("SELECT"), - Token::Whitespace(Whitespace::Space), Token::DollarQuotedString(DollarQuotedString { value: "dollar $nested$ string".into(), tag: Some("tag".into()), @@ -2999,7 +2885,6 @@ mod tests { let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap(); let expected = vec![ Token::make_keyword("SELECT"), - Token::Whitespace(Whitespace::Space), Token::DollarQuotedString(DollarQuotedString { value: "".into(), tag: None, @@ -3016,7 +2901,6 @@ mod tests { let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap(); let expected = vec![ Token::make_keyword("SELECT"), - Token::Whitespace(Whitespace::Space), Token::DollarQuotedString(DollarQuotedString { value: "within dollar '$' quoted strings have $tags like this$ ".into(), tag: None, @@ -3067,9 +2951,7 @@ 
mod tests { let expected = vec![ Token::make_word("a", None), - Token::Whitespace(Whitespace::Space), Token::make_keyword("IS"), - Token::Whitespace(Whitespace::Space), Token::make_keyword("NULL"), ]; @@ -3083,10 +2965,6 @@ mod tests { String::from("0--this is a comment\n1"), vec![ Token::Number("0".to_string(), false), - Token::Whitespace(Whitespace::SingleLineComment { - prefix: "--".to_string(), - comment: "this is a comment\n".to_string(), - }), Token::Number("1".to_string(), false), ], ), @@ -3094,20 +2972,12 @@ mod tests { String::from("0--this is a comment\r1"), vec![ Token::Number("0".to_string(), false), - Token::Whitespace(Whitespace::SingleLineComment { - prefix: "--".to_string(), - comment: "this is a comment\r1".to_string(), - }), ], ), ( String::from("0--this is a comment\r\n1"), vec![ Token::Number("0".to_string(), false), - Token::Whitespace(Whitespace::SingleLineComment { - prefix: "--".to_string(), - comment: "this is a comment\r\n".to_string(), - }), Token::Number("1".to_string(), false), ], ), @@ -3129,10 +2999,6 @@ mod tests { let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap(); let expected = vec![ Token::Number("1".to_string(), false), - Token::Whitespace(Whitespace::SingleLineComment { - prefix: "--".to_string(), - comment: "\r".to_string(), - }), Token::Number("0".to_string(), false), ]; compare(expected, tokens); @@ -3144,11 +3010,7 @@ mod tests { let dialect = GenericDialect {}; let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap(); - let expected = vec![Token::Whitespace(Whitespace::SingleLineComment { - prefix: "--".to_string(), - comment: "this is a comment".to_string(), - })]; - compare(expected, tokens); + assert!(tokens.is_empty()); } #[test] @@ -3159,9 +3021,6 @@ mod tests { let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap(); let expected = vec![ Token::Number("0".to_string(), false), - Token::Whitespace(Whitespace::MultiLineComment( - "multi-line\n* /comment".to_string(), - )), Token::Number("1".to_string(), false), ]; compare(expected, tokens); @@ -3173,10 +3032,6 @@ mod tests { "0/*multi-line\n* \n/* comment \n /*comment*/*/ */ /comment*/1", vec![ Token::Number("0".to_string(), false), - Token::Whitespace(Whitespace::MultiLineComment( - "multi-line\n* \n/* comment \n /*comment*/*/ ".into(), - )), - Token::Whitespace(Whitespace::Space), Token::Div, Token::Word(Word { value: "comment".to_string(), @@ -3193,9 +3048,6 @@ mod tests { "0/*multi-line\n* \n/* comment \n /*comment/**/ */ /comment*/*/1", vec![ Token::Number("0".to_string(), false), - Token::Whitespace(Whitespace::MultiLineComment( - "multi-line\n* \n/* comment \n /*comment/**/ */ /comment*/".into(), - )), Token::Number("1".to_string(), false), ], ); @@ -3204,9 +3056,7 @@ mod tests { "SELECT 1/* a /* b */ c */0", vec![ Token::make_keyword("SELECT"), - Token::Whitespace(Whitespace::Space), Token::Number("1".to_string(), false), - Token::Whitespace(Whitespace::MultiLineComment(" a /* b */ c ".to_string())), Token::Number("0".to_string(), false), ], ); @@ -3218,9 +3068,7 @@ mod tests { "select 1/*/**/*/0", vec![ Token::make_keyword("select"), - Token::Whitespace(Whitespace::Space), Token::Number("1".to_string(), false), - Token::Whitespace(Whitespace::MultiLineComment("/**/".to_string())), Token::Number("0".to_string(), false), ], ); @@ -3232,11 +3080,7 @@ mod tests { "SELECT 1/*/* nested comment */*/0", vec![ Token::make_keyword("SELECT"), - Token::Whitespace(Whitespace::Space), Token::Number("1".to_string(), false), - 
Token::Whitespace(Whitespace::MultiLineComment( - "/* nested comment ".to_string(), - )), Token::Mul, Token::Div, Token::Number("0".to_string(), false), @@ -3250,12 +3094,7 @@ mod tests { let dialect = GenericDialect {}; let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap(); - let expected = vec![ - Token::Whitespace(Whitespace::Newline), - Token::Whitespace(Whitespace::MultiLineComment("* Comment *".to_string())), - Token::Whitespace(Whitespace::Newline), - ]; - compare(expected, tokens); + assert!(tokens.is_empty()); } #[test] @@ -3264,12 +3103,7 @@ mod tests { let dialect = GenericDialect {}; let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap(); - let expected = vec![ - Token::Whitespace(Whitespace::Space), - Token::Whitespace(Whitespace::Space), - Token::Whitespace(Whitespace::Newline), - ]; - compare(expected, tokens); + assert!(tokens.is_empty()); } #[test] @@ -3295,13 +3129,9 @@ mod tests { let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap(); let expected = vec![ Token::make_word("line1", None), - Token::Whitespace(Whitespace::Newline), Token::make_word("line2", None), - Token::Whitespace(Whitespace::Newline), Token::make_word("line3", None), - Token::Whitespace(Whitespace::Newline), Token::make_word("line4", None), - Token::Whitespace(Whitespace::Newline), ]; compare(expected, tokens); } @@ -3313,15 +3143,10 @@ mod tests { let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap(); let expected = vec![ Token::make_keyword("SELECT"), - Token::Whitespace(Whitespace::Space), Token::make_keyword("TOP"), - Token::Whitespace(Whitespace::Space), Token::Number(String::from("5"), false), - Token::Whitespace(Whitespace::Space), Token::make_word("bar", Some('[')), - Token::Whitespace(Whitespace::Space), Token::make_keyword("FROM"), - Token::Whitespace(Whitespace::Space), Token::make_word("foo", None), ]; compare(expected, tokens); @@ -3334,32 +3159,20 @@ mod tests { let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap(); let expected = vec![ Token::make_keyword("SELECT"), - Token::Whitespace(Whitespace::Space), Token::make_word("col", None), - Token::Whitespace(Whitespace::Space), Token::Tilde, - Token::Whitespace(Whitespace::Space), Token::SingleQuotedString("^a".into()), Token::Comma, - Token::Whitespace(Whitespace::Space), Token::make_word("col", None), - Token::Whitespace(Whitespace::Space), Token::TildeAsterisk, - Token::Whitespace(Whitespace::Space), Token::SingleQuotedString("^a".into()), Token::Comma, - Token::Whitespace(Whitespace::Space), Token::make_word("col", None), - Token::Whitespace(Whitespace::Space), Token::ExclamationMarkTilde, - Token::Whitespace(Whitespace::Space), Token::SingleQuotedString("^a".into()), Token::Comma, - Token::Whitespace(Whitespace::Space), Token::make_word("col", None), - Token::Whitespace(Whitespace::Space), Token::ExclamationMarkTildeAsterisk, - Token::Whitespace(Whitespace::Space), Token::SingleQuotedString("^a".into()), ]; compare(expected, tokens); @@ -3372,32 +3185,20 @@ mod tests { let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap(); let expected = vec![ Token::make_keyword("SELECT"), - Token::Whitespace(Whitespace::Space), Token::make_word("col", None), - Token::Whitespace(Whitespace::Space), Token::DoubleTilde, - Token::Whitespace(Whitespace::Space), Token::SingleQuotedString("_a%".into()), Token::Comma, - Token::Whitespace(Whitespace::Space), Token::make_word("col", None), - Token::Whitespace(Whitespace::Space), Token::DoubleTildeAsterisk, - Token::Whitespace(Whitespace::Space), 
Token::SingleQuotedString("_a%".into()), Token::Comma, - Token::Whitespace(Whitespace::Space), Token::make_word("col", None), - Token::Whitespace(Whitespace::Space), Token::ExclamationMarkDoubleTilde, - Token::Whitespace(Whitespace::Space), Token::SingleQuotedString("_a%".into()), Token::Comma, - Token::Whitespace(Whitespace::Space), Token::make_word("col", None), - Token::Whitespace(Whitespace::Space), Token::ExclamationMarkDoubleTildeAsterisk, - Token::Whitespace(Whitespace::Space), Token::SingleQuotedString("_a%".into()), ]; compare(expected, tokens); @@ -3409,13 +3210,9 @@ mod tests { let dialect = GenericDialect {}; let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap(); let expected = vec![ - Token::Whitespace(Whitespace::Space), Token::make_word(r#"a " b"#, Some('"')), - Token::Whitespace(Whitespace::Space), Token::make_word(r#"a ""#, Some('"')), - Token::Whitespace(Whitespace::Space), Token::make_word(r#"c """#, Some('"')), - Token::Whitespace(Whitespace::Space), ]; compare(expected, tokens); } @@ -3442,13 +3239,9 @@ mod tests { .tokenize() .unwrap(); let expected = vec![ - Token::Whitespace(Whitespace::Space), Token::make_word(r#"a "" b"#, Some('"')), - Token::Whitespace(Whitespace::Space), Token::make_word(r#"a """#, Some('"')), - Token::Whitespace(Whitespace::Space), Token::make_word(r#"c """""#, Some('"')), - Token::Whitespace(Whitespace::Space), ]; compare(expected, tokens); } @@ -3462,23 +3255,8 @@ mod tests { .unwrap(); let expected = vec![ TokenWithSpan::at(Token::make_keyword("SELECT"), (1, 1).into(), (1, 7).into()), - TokenWithSpan::at( - Token::Whitespace(Whitespace::Space), - (1, 7).into(), - (1, 8).into(), - ), TokenWithSpan::at(Token::make_word("a", None), (1, 8).into(), (1, 9).into()), TokenWithSpan::at(Token::Comma, (1, 9).into(), (1, 10).into()), - TokenWithSpan::at( - Token::Whitespace(Whitespace::Newline), - (1, 10).into(), - (2, 1).into(), - ), - TokenWithSpan::at( - Token::Whitespace(Whitespace::Space), - (2, 1).into(), - (2, 2).into(), - ), TokenWithSpan::at(Token::make_word("b", None), (2, 2).into(), (2, 3).into()), ]; compare(expected, tokens); @@ -3600,11 +3378,8 @@ mod tests { let tokens = Tokenizer::new(dialect, sql).tokenize().unwrap(); let expected = vec![ Token::make_keyword("SELECT"), - Token::Whitespace(Whitespace::Space), Token::Mul, - Token::Whitespace(Whitespace::Space), Token::make_keyword("FROM"), - Token::Whitespace(Whitespace::Space), Token::Number(String::from("1"), false), ]; compare(expected, tokens); @@ -3802,9 +3577,7 @@ mod tests { let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap(); let expected = vec![ Token::make_keyword("CREATE"), - Token::Whitespace(Whitespace::Space), Token::make_keyword("USER"), - Token::Whitespace(Whitespace::Space), Token::make_word("root", Some('`')), Token::AtSign, Token::make_word("%", Some('`')), @@ -3820,7 +3593,6 @@ mod tests { let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap(); let expected = vec![ Token::make_keyword("SELECT"), - Token::Whitespace(Whitespace::Space), Token::AtSign, Token::SingleQuotedString("1".to_string()), ]; @@ -3835,12 +3607,9 @@ mod tests { let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap(); let expected = vec![ Token::make_keyword("SELECT"), - Token::Whitespace(Whitespace::Space), Token::AtSign, Token::DoubleQuotedString("bar".to_string()), - Token::Whitespace(Whitespace::Space), Token::make_keyword("FROM"), - Token::Whitespace(Whitespace::Space), Token::make_word("foo", None), ]; compare(expected, tokens); @@ -3853,7 +3622,6 @@ mod tests 
{ "select n'''''\\'", vec![ Token::make_keyword("select"), - Token::Whitespace(Whitespace::Space), Token::NationalStringLiteral("''\\".to_string()), ], ); @@ -3866,7 +3634,6 @@ mod tests { "select n'''''\\''", vec![ Token::make_keyword("select"), - Token::Whitespace(Whitespace::Space), Token::NationalStringLiteral("'''".to_string()), ], ); @@ -3878,7 +3645,6 @@ mod tests { "select e'...'", vec![ Token::make_keyword("select"), - Token::Whitespace(Whitespace::Space), Token::make_word("e", None), Token::SingleQuotedString("...".to_string()), ], @@ -3888,7 +3654,6 @@ mod tests { "select E'...'", vec![ Token::make_keyword("select"), - Token::Whitespace(Whitespace::Space), Token::make_word("E", None), Token::SingleQuotedString("...".to_string()), ], @@ -3901,7 +3666,6 @@ mod tests { "select e'\\''", vec![ Token::make_keyword("select"), - Token::Whitespace(Whitespace::Space), Token::EscapedStringLiteral("'".to_string()), ], ); @@ -3910,7 +3674,6 @@ mod tests { "select E'\\''", vec![ Token::make_keyword("select"), - Token::Whitespace(Whitespace::Space), Token::EscapedStringLiteral("'".to_string()), ], ); @@ -3923,7 +3686,6 @@ mod tests { "SELECT --'abc'", vec![ Token::make_keyword("SELECT"), - Token::Whitespace(Whitespace::Space), Token::Minus, Token::Minus, Token::SingleQuotedString("abc".to_string()), @@ -3935,11 +3697,6 @@ mod tests { "SELECT -- 'abc'", vec![ Token::make_keyword("SELECT"), - Token::Whitespace(Whitespace::Space), - Token::Whitespace(Whitespace::SingleLineComment { - prefix: "--".to_string(), - comment: " 'abc'".to_string(), - }), ], ); @@ -3948,7 +3705,6 @@ mod tests { "SELECT --", vec![ Token::make_keyword("SELECT"), - Token::Whitespace(Whitespace::Space), Token::Minus, Token::Minus, ], @@ -3962,11 +3718,6 @@ mod tests { "SELECT --'abc'", vec![ Token::make_keyword("SELECT"), - Token::Whitespace(Whitespace::Space), - Token::Whitespace(Whitespace::SingleLineComment { - prefix: "--".to_string(), - comment: "'abc'".to_string(), - }), ], ); @@ -3975,11 +3726,6 @@ mod tests { "SELECT -- 'abc'", vec![ Token::make_keyword("SELECT"), - Token::Whitespace(Whitespace::Space), - Token::Whitespace(Whitespace::SingleLineComment { - prefix: "--".to_string(), - comment: " 'abc'".to_string(), - }), ], ); @@ -3988,11 +3734,6 @@ mod tests { "SELECT --", vec![ Token::make_keyword("SELECT"), - Token::Whitespace(Whitespace::Space), - Token::Whitespace(Whitespace::SingleLineComment { - prefix: "--".to_string(), - comment: "".to_string(), - }), ], ); } @@ -4033,7 +3774,6 @@ mod tests { let expected = vec![ Token::make_keyword("SELECT"), - Token::Whitespace(Whitespace::Space), Token::Word(Word { value: "table".to_string(), quote_style: None, diff --git a/tests/sqlparser_snowflake.rs b/tests/sqlparser_snowflake.rs index e7a128343..638b4aca3 100644 --- a/tests/sqlparser_snowflake.rs +++ b/tests/sqlparser_snowflake.rs @@ -563,12 +563,7 @@ fn test_snowflake_single_line_tokenize() { let expected = vec![ Token::make_keyword("CREATE"), - Token::Whitespace(Whitespace::Space), Token::make_keyword("TABLE"), - Token::Whitespace(Whitespace::SingleLineComment { - prefix: "#".to_string(), - comment: " this is a comment \n".to_string(), - }), Token::make_word("table_1", None), ]; @@ -579,13 +574,7 @@ fn test_snowflake_single_line_tokenize() { let expected = vec![ Token::make_keyword("CREATE"), - Token::Whitespace(Whitespace::Space), Token::make_keyword("TABLE"), - Token::Whitespace(Whitespace::Space), - Token::Whitespace(Whitespace::SingleLineComment { - prefix: "//".to_string(), - comment: " this is a comment 
\n".to_string(), - }), Token::make_word("table_1", None), ]; From c75f11bf478a2911fb1ad0ffe70b5aeda8e72bef Mon Sep 17 00:00:00 2001 From: Luca Date: Tue, 28 Oct 2025 16:26:35 +0100 Subject: [PATCH 02/10] Extended placeholder syntax test and moved check in tokenizer --- src/parser/mod.rs | 2 +- src/tokenizer.rs | 26 ++++++++++++++++++++++++-- tests/sqlparser_bigquery.rs | 5 ++++- 3 files changed, 29 insertions(+), 4 deletions(-) diff --git a/src/parser/mod.rs b/src/parser/mod.rs index ab981f9f5..a51781fbc 100644 --- a/src/parser/mod.rs +++ b/src/parser/mod.rs @@ -18475,7 +18475,7 @@ mod tests { #[test] fn test_placeholder_invalid_whitespace() { - for w in [" ", "/*invalid*/"] { + for w in [" ", " ", "/*invalid*/", "\n", "\t", "\r\n", "--comment\n"] { let sql = format!("\nSELECT\n :{w}fooBar"); assert!(Parser::parse_sql(&GenericDialect, &sql).is_err()); } diff --git a/src/tokenizer.rs b/src/tokenizer.rs index 451545157..1dffb8c58 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -908,6 +908,22 @@ impl<'a> Tokenizer<'a> { Ok(Some(Token::make_word(&word, None))) } + /// Returns a standardized error if the previous token is a `:` and + /// the method is expected to be called when a space is found after it. + fn handle_colon_space_error( + &self, + chars: &State, + prev_token: Option<&Token>, + ) -> Result, TokenizerError> { + if let Some(Token::Colon) = prev_token { + return Err(TokenizerError { + message: "Unexpected whitespace after ':'; did you mean ':placeholder' or '::'?".to_string(), + location: chars.location(), + }); + } + Ok(None) + } + /// Get the next token or return None fn next_token( &self, @@ -919,6 +935,7 @@ impl<'a> Tokenizer<'a> { match chars.peek() { Some(&ch) => match ch { ' ' | '\t' | '\n' | '\r' => { + self.handle_colon_space_error(chars, prev_token)?; chars.next(); // consume *location = chars.location(); self.next_token(location, chars, prev_token, true) @@ -1166,7 +1183,7 @@ impl<'a> Tokenizer<'a> { // if the prev token is not a word, then this is not a valid sql // word or number. if ch == '.' && chars.peekable.clone().nth(1) == Some('_') { - if let Some(Token::Word(_)) = prev_token { + if !preceded_by_whitespace { chars.next(); return Ok(Some(Token::Period)); } @@ -1210,7 +1227,7 @@ impl<'a> Tokenizer<'a> { // we should yield the dot as a dedicated token so compound identifiers // starting with digits can be parsed correctly. if s == "." 
&& self.dialect.supports_numeric_prefix() { - if let Some(Token::Word(_)) = prev_token { + if !preceded_by_whitespace { return Ok(Some(Token::Period)); } } @@ -1300,6 +1317,7 @@ impl<'a> Tokenizer<'a> { } if is_comment { + self.handle_colon_space_error(chars, prev_token)?; chars.next(); // consume second '-' // Consume the rest of the line as comment let _comment = self.tokenize_single_line_comment(chars); @@ -1324,12 +1342,14 @@ impl<'a> Tokenizer<'a> { chars.next(); // consume the '/' match chars.peek() { Some('*') => { + self.handle_colon_space_error(chars, prev_token)?; chars.next(); // consume the '*', starting a multi-line comment let _comment = self.consume_multiline_comment(chars)?; *location = chars.location(); self.next_token(location, chars, prev_token, true) } Some('/') if dialect_of!(self is SnowflakeDialect) => { + self.handle_colon_space_error(chars, prev_token)?; chars.next(); // consume the second '/', starting a snowflake single-line comment // Consume the rest of the line as comment let _comment = self.tokenize_single_line_comment(chars); @@ -1534,6 +1554,7 @@ impl<'a> Tokenizer<'a> { '}' => self.consume_and_return(chars, Token::RBrace), '#' if dialect_of!(self is SnowflakeDialect | BigQueryDialect | MySqlDialect | HiveDialect) => { + self.handle_colon_space_error(chars, prev_token)?; chars.next(); // consume the '#', starting a snowflake single-line comment // Consume the rest of the line as comment let _comment = self.tokenize_single_line_comment(chars); @@ -1668,6 +1689,7 @@ impl<'a> Tokenizer<'a> { // whitespace check (including unicode chars) should be last as it covers some of the chars above ch if ch.is_whitespace() => { + self.handle_colon_space_error(chars, prev_token)?; chars.next(); // consume *location = chars.location(); self.next_token(location, chars, prev_token, true) diff --git a/tests/sqlparser_bigquery.rs b/tests/sqlparser_bigquery.rs index 03a0ac813..9f1e72aae 100644 --- a/tests/sqlparser_bigquery.rs +++ b/tests/sqlparser_bigquery.rs @@ -1567,7 +1567,10 @@ fn parse_table_identifiers() { fn test_table_ident_err(ident: &str) { let sql = format!("SELECT 1 FROM {ident}"); - assert!(bigquery().parse_sql_statements(&sql).is_err()); + assert!( + bigquery().parse_sql_statements(&sql).is_err(), + "Expected error parsing identifier: `{ident}`, within SQL: `{sql}`" + ); } test_table_ident("`spa ce`", None, vec![Ident::with_quote('`', "spa ce")]); From 1b8d716182009fbdc5891c1c44bc6c2e22598018 Mon Sep 17 00:00:00 2001 From: Luca Date: Tue, 28 Oct 2025 16:36:51 +0100 Subject: [PATCH 03/10] Made `test_table_ident_err` more verbose --- tests/sqlparser_bigquery.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/sqlparser_bigquery.rs b/tests/sqlparser_bigquery.rs index 9f1e72aae..c29b98da6 100644 --- a/tests/sqlparser_bigquery.rs +++ b/tests/sqlparser_bigquery.rs @@ -1567,9 +1567,10 @@ fn parse_table_identifiers() { fn test_table_ident_err(ident: &str) { let sql = format!("SELECT 1 FROM {ident}"); + let parsed = bigquery().parse_sql_statements(&sql); assert!( - bigquery().parse_sql_statements(&sql).is_err(), - "Expected error parsing identifier: `{ident}`, within SQL: `{sql}`" + parsed.is_err(), + "Expected error parsing identifier: `{ident}`, within SQL: `{sql}` - but got success: {parsed:#?}" ); } From b862dc7eab00b45913cf1af742a7d3b53ab95998 Mon Sep 17 00:00:00 2001 From: Luca Date: Tue, 28 Oct 2025 22:56:31 +0100 Subject: [PATCH 04/10] Added handling of CSVs in COPY STDIN --- Cargo.toml | 1 + src/ast/mod.rs | 78 ++++++++++-- 
src/dialect/bigquery.rs | 2 +- src/dialect/snowflake.rs | 6 +- src/parser/mod.rs | 249 ++++++++++++++++++++++++--------------- src/tokenizer.rs | 174 +++++++++++++++++---------- 6 files changed, 339 insertions(+), 171 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index ed94bbbdd..005cb4567 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -47,6 +47,7 @@ visitor = ["sqlparser_derive"] [dependencies] bigdecimal = { version = "0.4.1", features = ["serde"], optional = true } log = "0.4" +csv = "1.4.0" recursive = { version = "0.1.1", optional = true} serde = { version = "1.0", default-features = false, features = ["derive", "alloc"], optional = true } diff --git a/src/ast/mod.rs b/src/ast/mod.rs index 176d36545..184560a96 100644 --- a/src/ast/mod.rs +++ b/src/ast/mod.rs @@ -3227,7 +3227,7 @@ pub enum Statement { /// WITH options (before PostgreSQL version 9.0) legacy_options: Vec, /// VALUES a vector of values to be copied - values: Vec>, + values: Vec>>, }, /// ```sql /// COPY INTO | @@ -4579,18 +4579,76 @@ impl fmt::Display for Statement { if !legacy_options.is_empty() { write!(f, " {}", display_separated(legacy_options, " "))?; } + + let mut null_symbol = "\\N"; + let mut writer_builder = csv::WriterBuilder::new(); + + // Apply options + for option in options { + match option { + CopyOption::Delimiter(c) => { + writer_builder.delimiter(*c as u8); + } + CopyOption::Quote(c) => { + writer_builder.quote(*c as u8); + } + CopyOption::Escape(c) => { + writer_builder.escape(*c as u8); + } + CopyOption::Null(null) => { + null_symbol = null; + } + _ => {} + } + } + + // Apply legacy options + for option in legacy_options { + match option { + CopyLegacyOption::Delimiter(c) => { + writer_builder.delimiter(*c as u8); + } + CopyLegacyOption::Header => { + writer_builder.has_headers(true); + } + CopyLegacyOption::Null(null) => { + null_symbol = null; + } + CopyLegacyOption::Csv(csv_options) => { + for csv_option in csv_options { + match csv_option { + CopyLegacyCsvOption::Header => { + writer_builder.has_headers(true); + } + CopyLegacyCsvOption::Quote(c) => { + writer_builder.quote(*c as u8); + } + CopyLegacyCsvOption::Escape(c) => { + writer_builder.escape(*c as u8); + } + _ => {} + } + } + } + _ => {} + } + } + if !values.is_empty() { writeln!(f, ";")?; - let mut delim = ""; - for v in values { - write!(f, "{delim}")?; - delim = "\t"; - if let Some(v) = v { - write!(f, "{v}")?; - } else { - write!(f, "\\N")?; - } + let mut writer = writer_builder.from_writer(vec![]); + for row in values { + writer + .write_record( + row.iter() + .map(|column| column.as_deref().unwrap_or(null_symbol)), + ) + .map_err(|_| fmt::Error)? } + writer.flush().map_err(|_| fmt::Error)?; + let data = String::from_utf8(writer.into_inner().map_err(|_| fmt::Error)?) 
+ .map_err(|_| fmt::Error)?; + write!(f, "{}", data)?; write!(f, "\n\\.")?; } Ok(()) diff --git a/src/dialect/bigquery.rs b/src/dialect/bigquery.rs index 27fd3cca3..78b830fc9 100644 --- a/src/dialect/bigquery.rs +++ b/src/dialect/bigquery.rs @@ -83,7 +83,7 @@ impl Dialect for BigQueryDialect { } fn is_identifier_part(&self, ch: char) -> bool { - ch.is_ascii_lowercase() || ch.is_ascii_uppercase() || ch.is_ascii_digit() || ch == '_' + ch.is_ascii_lowercase() || ch.is_ascii_uppercase() || ch.is_ascii_digit() || ch == '_' || ch == '-' } /// See [doc](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals) diff --git a/src/dialect/snowflake.rs b/src/dialect/snowflake.rs index 6cb344fac..6b40125e3 100644 --- a/src/dialect/snowflake.rs +++ b/src/dialect/snowflake.rs @@ -1049,9 +1049,9 @@ pub fn parse_create_stage( pub fn parse_stage_name_identifier(parser: &mut Parser) -> Result { let mut ident = String::new(); - while let Some(next_token) = parser.next_token_no_skip() { - match &next_token.token { - Token::SemiColon => break, + loop { + match &parser.next_token().token { + Token::SemiColon | Token::EOF => break, Token::Period => { parser.prev_token(); break; diff --git a/src/parser/mod.rs b/src/parser/mod.rs index a51781fbc..42dc758fb 100644 --- a/src/parser/mod.rs +++ b/src/parser/mod.rs @@ -4071,23 +4071,6 @@ impl<'a> Parser<'a> { } } - /// Return the first token, possibly whitespace, that has not yet been processed - /// (or None if reached end-of-file). - pub fn peek_token_no_skip(&self) -> TokenWithSpan { - self.peek_nth_token_no_skip(0) - } - - /// Return nth token, possibly whitespace, that has not yet been processed. - pub fn peek_nth_token_no_skip(&self, n: usize) -> TokenWithSpan { - self.tokens - .get(self.index + n) - .cloned() - .unwrap_or(TokenWithSpan { - token: Token::EOF, - span: Span::empty(), - }) - } - /// Return true if the next tokens exactly `expected` /// /// Does not advance the current token. @@ -4115,12 +4098,6 @@ impl<'a> Parser<'a> { self.index.saturating_sub(1) } - /// Return the next unprocessed token, possibly whitespace. - pub fn next_token_no_skip(&mut self) -> Option<&TokenWithSpan> { - self.index += 1; - self.tokens.get(self.index - 1) - } - /// Advances the current token to the next non-whitespace token /// /// See [`Self::get_current_token`] to get the current token after advancing @@ -9556,6 +9533,101 @@ impl<'a> Parser<'a> { } } + pub fn parse_csv_body( + &mut self, + options: &[CopyOption], + legacy_options: &[CopyLegacyOption], + ) -> Result>>, ParserError> { + let Token::CopyFromStdin(body) = self.next_token().token else { + return self.expected( + "COPY ... 
FROM STDIN with CSV body", + self.peek_token(), + ); + }; + + let mut reader_builder = csv::ReaderBuilder::new(); + + let mut null_symbol = "\\N"; + + // Apply options + for option in options { + match option { + CopyOption::Delimiter(c) => { + reader_builder.delimiter(*c as u8); + } + CopyOption::Header(has_header) => { + reader_builder.has_headers(*has_header); + } + CopyOption::Quote(c) => { + reader_builder.quote(*c as u8); + } + CopyOption::Escape(c) => { + reader_builder.escape(Some(*c as u8)); + } + CopyOption::Null(null) => { + null_symbol = null; + } + _ => {} + } + } + + // Apply legacy options + for option in legacy_options { + match option { + CopyLegacyOption::Delimiter(c) => { + reader_builder.delimiter(*c as u8); + } + CopyLegacyOption::Header => { + reader_builder.has_headers(true); + } + CopyLegacyOption::Null(null) => { + null_symbol = null; + } + CopyLegacyOption::Csv(csv_options) => { + for csv_option in csv_options { + match csv_option { + CopyLegacyCsvOption::Header => { + reader_builder.has_headers(true); + } + CopyLegacyCsvOption::Quote(c) => { + reader_builder.quote(*c as u8); + } + CopyLegacyCsvOption::Escape(c) => { + reader_builder.escape(Some(*c as u8)); + } + _ => {} + } + } + } + _ => {} + } + } + + let mut result = vec![]; + let mut reader = reader_builder.from_reader(body.as_bytes()); + for record in reader.records() { + let record = match record { + Ok(rec) => rec, + Err(e) => { + return Err(ParserError::ParserError(format!( + "Error parsing CSV data: {}", + e + ))) + } + }; + let mut row = vec![]; + for field in record.iter() { + if field == null_symbol { + row.push(None); + } else { + row.push(Some(field.to_string())); + } + } + result.push(row); + } + Ok(result) + } + /// Parse a copy statement pub fn parse_copy(&mut self) -> Result { let source; @@ -9609,7 +9681,7 @@ impl<'a> Parser<'a> { } let values = if let CopyTarget::Stdin = target { self.expect_token(&Token::SemiColon)?; - self.parse_tsv() + self.parse_csv_body(&options, &legacy_options)? } else { vec![] }; @@ -9947,35 +10019,6 @@ impl<'a> Parser<'a> { Ok(s.chars().next().unwrap()) } - /// Parse a tab separated values in - /// COPY payload - pub fn parse_tsv(&mut self) -> Vec> { - self.parse_tab_value() - } - - pub fn parse_tab_value(&mut self) -> Vec> { - let mut values = vec![]; - let mut content = String::from(""); - while let Some(t) = self.next_token_no_skip().map(|t| &t.token) { - match t { - Token::Backslash => { - if self.consume_token(&Token::Period) { - return values; - } - if let Token::Word(w) = self.next_token().token { - if w.value == "N" { - values.push(None); - } - } - } - _ => { - content.push_str(&t.to_string()); - } - } - } - values - } - /// Parse a literal value (numbers, strings, date/time, booleans) pub fn parse_value(&mut self) -> Result { let next_token = self.next_token(); @@ -10069,7 +10112,7 @@ impl<'a> Parser<'a> { // 2. Not calling self.next_token() to enforce `tok` // be followed immediately by a word/number, ie. // without any whitespace in between - let next_token = self.next_token_no_skip().unwrap_or(&EOF_TOKEN).clone(); + let next_token = self.next_token(); let ident = match next_token.token { Token::Word(w) => Ok(w.into_ident(next_token.span)), Token::Number(w, false) => Ok(Ident::with_span(next_token.span, w)), @@ -11293,54 +11336,66 @@ impl<'a> Parser<'a> { /// Return a tuple of the identifier and a boolean indicating it ends with a period. 
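// parse_csv_body above delegates the actual record splitting to the csv crate
// added to Cargo.toml in this commit. A small usage sketch of the same
// ReaderBuilder calls; the input literal and the "\N" null marker are
// illustrative, while the builder methods are the csv crate's actual API.
//
//     // Configure the reader the way parse_csv_body does from COPY options.
//     let mut reader = csv::ReaderBuilder::new()
//         .has_headers(false)
//         .delimiter(b',')
//         .from_reader("1,\\N,foo\n".as_bytes());
//
//     // Turn the null marker into None, as parse_csv_body does per field.
//     let row: Vec<Option<String>> = reader
//         .records()
//         .next()
//         .unwrap()
//         .unwrap()
//         .iter()
//         .map(|field| (field != "\\N").then(|| field.to_string()))
//         .collect();
//     assert_eq!(row, vec![Some("1".into()), None, Some("foo".into())]);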
fn parse_unquoted_hyphenated_identifier(&mut self) -> Result<(Ident, bool), ParserError> { match self.peek_token().token { + Token::UnquotedDashStringLiteral(lit) => { + let span = self.next_token().span; + Ok(( + Ident { + value: lit, + quote_style: None, + span, + }, + false, + )) + } Token::Word(w) => { let quote_style_is_none = w.quote_style.is_none(); let mut requires_whitespace = false; let mut ident = w.into_ident(self.next_token().span); if quote_style_is_none { - while matches!(self.peek_token_no_skip().token, Token::Minus) { - self.next_token(); - ident.value.push('-'); - - let token = self - .next_token_no_skip() - .cloned() - .unwrap_or(TokenWithSpan::wrap(Token::EOF)); - requires_whitespace = match token.token { - Token::Word(next_word) if next_word.quote_style.is_none() => { - ident.value.push_str(&next_word.value); - false - } - Token::Number(s, false) => { - // A number token can represent a decimal value ending with a period, e.g., `Number('123.')`. - // However, for an [ObjectName], it is part of a hyphenated identifier, e.g., `foo-123.bar`. - // - // If a number token is followed by a period, it is part of an [ObjectName]. - // Return the identifier with `true` if the number token is followed by a period, indicating that - // parsing should continue for the next part of the hyphenated identifier. - if s.ends_with('.') { - let Some(s) = s.split('.').next().filter(|s| { - !s.is_empty() && s.chars().all(|c| c.is_ascii_digit()) - }) else { - return self.expected( - "continuation of hyphenated identifier", - TokenWithSpan::new(Token::Number(s, false), token.span), - ); - }; - ident.value.push_str(s); - return Ok((ident, true)); - } else { - ident.value.push_str(&s); - } - // If next token is period, then it is part of an ObjectName and we don't expect whitespace - // after the number. - !matches!(self.peek_token().token, Token::Period) - } - _ => { - return self - .expected("continuation of hyphenated identifier", token); - } - } + while matches!(self.peek_token().token, Token::Minus) { + unreachable!("Something went wrong in the tokenizer!"); + // self.next_token(); + // ident.value.push('-'); + + // let token = self + // .next_token_no_skip() + // .cloned() + // .unwrap_or(TokenWithSpan::wrap(Token::EOF)); + // requires_whitespace = match token.token { + // Token::Word(next_word) if next_word.quote_style.is_none() => { + // ident.value.push_str(&next_word.value); + // false + // } + // Token::Number(s, false) => { + // // A number token can represent a decimal value ending with a period, e.g., `Number('123.')`. + // // However, for an [ObjectName], it is part of a hyphenated identifier, e.g., `foo-123.bar`. + // // + // // If a number token is followed by a period, it is part of an [ObjectName]. + // // Return the identifier with `true` if the number token is followed by a period, indicating that + // // parsing should continue for the next part of the hyphenated identifier. + // if s.ends_with('.') { + // let Some(s) = s.split('.').next().filter(|s| { + // !s.is_empty() && s.chars().all(|c| c.is_ascii_digit()) + // }) else { + // return self.expected( + // "continuation of hyphenated identifier", + // TokenWithSpan::new(Token::Number(s, false), token.span), + // ); + // }; + // ident.value.push_str(s); + // return Ok((ident, true)); + // } else { + // ident.value.push_str(&s); + // } + // // If next token is period, then it is part of an ObjectName and we don't expect whitespace + // // after the number. 
+ // !matches!(self.peek_token().token, Token::Period) + // } + // _ => { + // return self + // .expected("continuation of hyphenated identifier", token); + // } + // } } // If the last segment was a number, we must check that it's followed by whitespace, diff --git a/src/tokenizer.rs b/src/tokenizer.rs index 1dffb8c58..82415e056 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -96,6 +96,11 @@ pub enum Token { /// Triple double quoted literal with raw string prefix. Example `R"""abc"""` /// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals) TripleDoubleQuotedRawStringLiteral(String), + /// An unquoted string literal containing dashes, i.e: 'first-second', + /// which is allowed in some BigQuery contexts + UnquotedDashStringLiteral(String), + /// A CSV body from a `COPY ... FROM STDIN` statement + CopyFromStdin(String), /// "National" string literal: i.e: N'string' NationalStringLiteral(String), /// "escaped" string literal, which are an extension to the SQL standard: i.e: e'first \n second' or E 'first \n second' @@ -301,6 +306,8 @@ impl fmt::Display for Token { Token::DoubleQuotedRawStringLiteral(ref s) => write!(f, "R\"{s}\""), Token::TripleSingleQuotedRawStringLiteral(ref s) => write!(f, "R'''{s}'''"), Token::TripleDoubleQuotedRawStringLiteral(ref s) => write!(f, "R\"\"\"{s}\"\"\""), + Token::UnquotedDashStringLiteral(ref s) => write!(f, "{s}"), + Token::CopyFromStdin(ref s) => write!(f, "{s}\\."), Token::Comma => f.write_str(","), Token::DoubleEq => f.write_str("=="), Token::Spaceship => f.write_str("<=>"), @@ -387,14 +394,18 @@ impl fmt::Display for Token { } impl Token { - pub fn make_keyword(keyword: &str) -> Self { + pub fn make_keyword>(keyword: S) -> Self { Token::make_word(keyword, None) } - pub fn make_word(word: &str, quote_style: Option) -> Self { + pub fn make_word>(word: S, quote_style: Option) -> Self { + let word = word.into(); + if quote_style.is_none() && word.contains('-') { + return Token::UnquotedDashStringLiteral(word); + } let word_uppercase = word.to_uppercase(); Token::Word(Word { - value: word.to_string(), + value: word, quote_style, keyword: if quote_style.is_none() { let keyword = ALL_KEYWORDS.binary_search(&word_uppercase.as_str()); @@ -777,6 +788,62 @@ struct TokenizeQuotedStringSettings { backslash_escape: bool, } +#[derive(Clone, Copy, Default)] +/// Helper struct to handle the logic of the `COPY ... FROM STDIN` statement +/// which may occur in PostgreSQL and some other dialects. +struct CopyStdinHandler { + previous_copy_token_found: bool, + previous_stdin_token_found: bool, + current_copy_token_found: bool, + current_stdin_token_found: bool, +} + +impl CopyStdinHandler { + /// Update the internal state based on the provided token. + fn update(&mut self, token: &Token) { + match token { + Token::Word(Word { + keyword: Keyword::COPY, + .. + }) => { + self.current_copy_token_found = true; + } + Token::Word(Word { + keyword: Keyword::STDIN, + .. + }) if self.current_copy_token_found => { + self.current_stdin_token_found = true; + } + Token::SemiColon => { + self.previous_copy_token_found = self.current_copy_token_found; + self.previous_stdin_token_found = self.current_stdin_token_found; + self.current_copy_token_found = false; + self.current_stdin_token_found = false; + } + _ => {} + } + } + + /// Returns whether the previous tokens indicated a `COPY ... FROM STDIN` statement. 
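// A hedged illustration of the stream this handler produces (token names are
// from this patch; exact spans omitted): tokenizing
//
//     COPY t FROM STDIN; 1,2
//     \.
//
// yields COPY, t, FROM, STDIN, and the semicolon as ordinary tokens, followed
// by a single CopyFromStdin token holding everything between the semicolon and
// the terminating \. as a raw string for parse_csv_body to split.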
+ fn is_in_copy_from_stdin(&self) -> bool { + self.previous_copy_token_found && self.previous_stdin_token_found + } + + /// Extracts the CSV string from the provided State. + fn extract_csv_string(&self, state: &mut State) -> Result { + let mut csv_string = String::new(); + let mut last_character_was_cr = false; + while let Some(ch) = state.next() { + if last_character_was_cr && ch == '\\' && state.peek() == Some(&'.') { + break; + } + last_character_was_cr = ch == '\n'; + csv_string.push(ch); + } + Ok(csv_string) + } +} + /// SQL Tokenizer pub struct Tokenizer<'a> { dialect: &'a dyn Dialect, @@ -870,14 +937,29 @@ impl<'a> Tokenizer<'a> { line: 1, col: 1, }; + let mut cs_handler = CopyStdinHandler::default(); let mut location = state.location(); - while let Some(token) = self.next_token(&mut location, &mut state, buf.last().map(|t| &t.token), false)? { + while let Some(token) = self.next_token( + &mut location, + &mut state, + buf.last().map(|t| &t.token), + false, + )? { let span = location.span_to(state.location()); - + cs_handler.update(&token); buf.push(TokenWithSpan { token, span }); - location = state.location(); + + if cs_handler.is_in_copy_from_stdin() { + let csv_string = cs_handler.extract_csv_string(&mut state)?; + let span = location.span_to(state.location()); + buf.push(TokenWithSpan { + token: Token::CopyFromStdin(csv_string), + span, + }); + location = state.location(); + } } Ok(()) } @@ -905,7 +987,7 @@ impl<'a> Tokenizer<'a> { return Ok(Some(Token::Number(s, false))); } - Ok(Some(Token::make_word(&word, None))) + Ok(Some(Token::make_word(word, None))) } /// Returns a standardized error if the previous token is a `:` and @@ -917,7 +999,8 @@ impl<'a> Tokenizer<'a> { ) -> Result, TokenizerError> { if let Some(Token::Colon) = prev_token { return Err(TokenizerError { - message: "Unexpected whitespace after ':'; did you mean ':placeholder' or '::'?".to_string(), + message: "Unexpected whitespace after ':'; did you mean ':placeholder' or '::'?" 
+ .to_string(), location: chars.location(), }); } @@ -939,7 +1022,7 @@ impl<'a> Tokenizer<'a> { chars.next(); // consume *location = chars.location(); self.next_token(location, chars, prev_token, true) - }, + } // BigQuery and MySQL use b or B for byte string literal, Postgres for bit strings b @ 'B' | b @ 'b' if dialect_of!(self is BigQueryDialect | PostgreSqlDialect | MySqlDialect | GenericDialect) => { @@ -976,7 +1059,7 @@ impl<'a> Tokenizer<'a> { _ => { // regular identifier starting with an "b" or "B" let s = self.tokenize_word(b, chars); - Ok(Some(Token::make_word(&s, None))) + Ok(Some(Token::make_word(s, None))) } } } @@ -1003,7 +1086,7 @@ impl<'a> Tokenizer<'a> { _ => { // regular identifier starting with an "r" or "R" let s = self.tokenize_word(b, chars); - Ok(Some(Token::make_word(&s, None))) + Ok(Some(Token::make_word(s, None))) } } } @@ -1022,7 +1105,7 @@ impl<'a> Tokenizer<'a> { _ => { // regular identifier starting with an "N" let s = self.tokenize_word(n, chars); - Ok(Some(Token::make_word(&s, None))) + Ok(Some(Token::make_word(s, None))) } } } @@ -1039,7 +1122,7 @@ impl<'a> Tokenizer<'a> { _ => { // regular identifier starting with an "E" or "e" let s = self.tokenize_word(x, chars); - Ok(Some(Token::make_word(&s, None))) + Ok(Some(Token::make_word(s, None))) } } } @@ -1058,7 +1141,7 @@ impl<'a> Tokenizer<'a> { } // regular identifier starting with an "U" or "u" let s = self.tokenize_word(x, chars); - Ok(Some(Token::make_word(&s, None))) + Ok(Some(Token::make_word(s, None))) } // The spec only allows an uppercase 'X' to introduce a hex // string, but PostgreSQL, at least, allows a lowercase 'x' too. @@ -1073,7 +1156,7 @@ impl<'a> Tokenizer<'a> { _ => { // regular identifier starting with an "X" let s = self.tokenize_word(x, chars); - Ok(Some(Token::make_word(&s, None))) + Ok(Some(Token::make_word(s, None))) } } } @@ -1122,7 +1205,7 @@ impl<'a> Tokenizer<'a> { // delimited (quoted) identifier quote_start if self.dialect.is_delimited_identifier_start(ch) => { let word = self.tokenize_quoted_identifier(quote_start, chars)?; - Ok(Some(Token::make_word(&word, Some(quote_start)))) + Ok(Some(Token::make_word(word, Some(quote_start)))) } // Potentially nested delimited (quoted) identifier quote_start @@ -1146,7 +1229,7 @@ impl<'a> Tokenizer<'a> { let Some(nested_quote_start) = nested_quote_start else { let word = self.tokenize_quoted_identifier(quote_start, chars)?; - return Ok(Some(Token::make_word(&word, Some(quote_start)))); + return Ok(Some(Token::make_word(word, Some(quote_start)))); }; let mut word = vec![]; @@ -1174,7 +1257,7 @@ impl<'a> Tokenizer<'a> { } chars.next(); // skip close delimiter - Ok(Some(Token::make_word(&word.concat(), Some(quote_start)))) + Ok(Some(Token::make_word(word.concat(), Some(quote_start)))) } // numbers and period '0'..='9' | '.' => { @@ -1284,12 +1367,12 @@ impl<'a> Tokenizer<'a> { if !word.is_empty() { s += word.as_str(); - return Ok(Some(Token::make_word(s.as_str(), None))); + return Ok(Some(Token::make_word(s, None))); } } else if prev_token == Some(&Token::Period) { // If the previous token was a period, thus not belonging to a number, // the value we have is part of an identifier. 
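                // e.g. in `db.1e10` the `1e10` follows a Period token, so it
                // belongs to a compound identifier rather than being a number.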
-                        return Ok(Some(Token::make_word(s.as_str(), None)));
+                        return Ok(Some(Token::make_word(s, None)));
                     }
                 }

@@ -1319,7 +1402,7 @@ impl<'a> Tokenizer<'a> {
                         if is_comment {
                             self.handle_colon_space_error(chars, prev_token)?;
                             chars.next(); // consume second '-'
-                            // Consume the rest of the line as comment
+                                          // Consume the rest of the line as comment
                             let _comment = self.tokenize_single_line_comment(chars);
                             *location = chars.location();
                             return self.next_token(location, chars, prev_token, true);
@@ -1351,7 +1434,7 @@ impl<'a> Tokenizer<'a> {
                     Some('/') if dialect_of!(self is SnowflakeDialect) => {
                         self.handle_colon_space_error(chars, prev_token)?;
                         chars.next(); // consume the second '/', starting a snowflake single-line comment
-                        // Consume the rest of the line as comment
+                                      // Consume the rest of the line as comment
                         let _comment = self.tokenize_single_line_comment(chars);
                         *location = chars.location();
                         self.next_token(location, chars, prev_token, true)
@@ -1556,7 +1639,7 @@ impl<'a> Tokenizer<'a> {
                 {
                     self.handle_colon_space_error(chars, prev_token)?;
                     chars.next(); // consume the '#', starting a snowflake single-line comment
-                    // Consume the rest of the line as comment
+                                  // Consume the rest of the line as comment
                     let _comment = self.tokenize_single_line_comment(chars);
                     *location = chars.location();
                     self.next_token(location, chars, prev_token, true)
@@ -1871,7 +1954,7 @@ impl<'a> Tokenizer<'a> {
     fn tokenize_word(&self, first_chars: impl Into<String>, chars: &mut State) -> String {
         let mut s = first_chars.into();
         s.push_str(&peeking_take_while(chars, |ch| {
-            self.dialect.is_identifier_part(ch)
+            self.dialect.is_identifier_part(ch) || ch == '-' && self.dialect.is_identifier_part('-')
         }));
         s
     }
@@ -2703,10 +2786,7 @@ mod tests {
         let dialect = GenericDialect {};
         let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
         // println!("tokens: {:#?}", tokens);
-        let expected = vec![
-            Token::Char('💝'),
-            Token::make_word("مصطفىh", None),
-        ];
+        let expected = vec![Token::Char('💝'), Token::make_word("مصطفىh", None)];
         compare(expected, tokens);
     }

@@ -2992,9 +3072,7 @@ mod tests {
             ),
             (
                 String::from("0--this is a comment\r1"),
-                vec![
-                    Token::Number("0".to_string(), false),
-                ],
+                vec![Token::Number("0".to_string(), false)],
             ),
             (
                 String::from("0--this is a comment\r\n1"),
@@ -3715,49 +3793,25 @@ mod tests {
         );

         all_dialects_where(|dialect| dialect.requires_single_line_comment_whitespace())
-            .tokenizes_to(
-                "SELECT -- 'abc'",
-                vec![
-                    Token::make_keyword("SELECT"),
-                ],
-            );
+            .tokenizes_to("SELECT -- 'abc'", vec![Token::make_keyword("SELECT")]);

         all_dialects_where(|dialect| dialect.requires_single_line_comment_whitespace())
             .tokenizes_to(
                 "SELECT --",
-                vec![
-                    Token::make_keyword("SELECT"),
-                    Token::Minus,
-                    Token::Minus,
-                ],
+                vec![Token::make_keyword("SELECT"), Token::Minus, Token::Minus],
             );
     }

     #[test]
     fn test_whitespace_not_required_after_single_line_comment() {
         all_dialects_where(|dialect| !dialect.requires_single_line_comment_whitespace())
-            .tokenizes_to(
-                "SELECT --'abc'",
-                vec![
-                    Token::make_keyword("SELECT"),
-                ],
-            );
+            .tokenizes_to("SELECT --'abc'", vec![Token::make_keyword("SELECT")]);

         all_dialects_where(|dialect| !dialect.requires_single_line_comment_whitespace())
-            .tokenizes_to(
-                "SELECT -- 'abc'",
-                vec![
-                    Token::make_keyword("SELECT"),
-                ],
-            );
+            .tokenizes_to("SELECT -- 'abc'", vec![Token::make_keyword("SELECT")]);

         all_dialects_where(|dialect| !dialect.requires_single_line_comment_whitespace())
-            .tokenizes_to(
-                "SELECT --",
-                vec![
-                    Token::make_keyword("SELECT"),
-                ],
-            );
+
.tokenizes_to("SELECT --", vec![Token::make_keyword("SELECT")]); } #[test] From 93ea5d2458566251225ab47f7a32a0e86059f9d3 Mon Sep 17 00:00:00 2001 From: Luca Date: Wed, 29 Oct 2025 09:35:54 +0100 Subject: [PATCH 05/10] Extended CSV STDIN tests and resolved more corner cases in tokenizer --- src/ast/mod.rs | 2 +- src/dialect/bigquery.rs | 6 +- src/dialect/mod.rs | 5 + src/parser/mod.rs | 155 ++-- src/test_utils.rs | 1 + src/tokenizer.rs | 1387 ++++++++++++++++++----------------- tests/sqlparser_common.rs | 1 + tests/sqlparser_postgres.rs | 52 +- 8 files changed, 834 insertions(+), 775 deletions(-) diff --git a/src/ast/mod.rs b/src/ast/mod.rs index 184560a96..6ddf32819 100644 --- a/src/ast/mod.rs +++ b/src/ast/mod.rs @@ -4649,7 +4649,7 @@ impl fmt::Display for Statement { let data = String::from_utf8(writer.into_inner().map_err(|_| fmt::Error)?) .map_err(|_| fmt::Error)?; write!(f, "{}", data)?; - write!(f, "\n\\.")?; + write!(f, "\\.")?; } Ok(()) } diff --git a/src/dialect/bigquery.rs b/src/dialect/bigquery.rs index 78b830fc9..c8a50dd66 100644 --- a/src/dialect/bigquery.rs +++ b/src/dialect/bigquery.rs @@ -83,7 +83,11 @@ impl Dialect for BigQueryDialect { } fn is_identifier_part(&self, ch: char) -> bool { - ch.is_ascii_lowercase() || ch.is_ascii_uppercase() || ch.is_ascii_digit() || ch == '_' || ch == '-' + ch.is_ascii_lowercase() || ch.is_ascii_uppercase() || ch.is_ascii_digit() || ch == '_' + } + + fn supports_hyphenated_identifiers(&self) -> bool { + true } /// See [doc](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals) diff --git a/src/dialect/mod.rs b/src/dialect/mod.rs index ef4e1cdde..abc8291d7 100644 --- a/src/dialect/mod.rs +++ b/src/dialect/mod.rs @@ -178,6 +178,11 @@ pub trait Dialect: Debug + Any { /// Determine if a character is a valid unquoted identifier character fn is_identifier_part(&self, ch: char) -> bool; + /// Returns whether the dialect supports hyphenated identifiers + fn supports_hyphenated_identifiers(&self) -> bool { + false + } + /// Most dialects do not have custom operators. Override this method to provide custom operators. fn is_custom_operator_part(&self, _ch: char) -> bool { false diff --git a/src/parser/mod.rs b/src/parser/mod.rs index 42dc758fb..90c52bb87 100644 --- a/src/parser/mod.rs +++ b/src/parser/mod.rs @@ -9539,13 +9539,11 @@ impl<'a> Parser<'a> { legacy_options: &[CopyLegacyOption], ) -> Result>>, ParserError> { let Token::CopyFromStdin(body) = self.next_token().token else { - return self.expected( - "COPY ... FROM STDIN with CSV body", - self.peek_token(), - ); + return self.expected("COPY ... FROM STDIN with CSV body", self.peek_token()); }; let mut reader_builder = csv::ReaderBuilder::new(); + reader_builder.has_headers(false); let mut null_symbol = "\\N"; @@ -11336,80 +11334,69 @@ impl<'a> Parser<'a> { /// Return a tuple of the identifier and a boolean indicating it ends with a period. 
fn parse_unquoted_hyphenated_identifier(&mut self) -> Result<(Ident, bool), ParserError> { match self.peek_token().token { - Token::UnquotedDashStringLiteral(lit) => { - let span = self.next_token().span; - Ok(( - Ident { - value: lit, - quote_style: None, - span, - }, - false, - )) - } - Token::Word(w) => { - let quote_style_is_none = w.quote_style.is_none(); - let mut requires_whitespace = false; - let mut ident = w.into_ident(self.next_token().span); - if quote_style_is_none { - while matches!(self.peek_token().token, Token::Minus) { - unreachable!("Something went wrong in the tokenizer!"); - // self.next_token(); - // ident.value.push('-'); - - // let token = self - // .next_token_no_skip() - // .cloned() - // .unwrap_or(TokenWithSpan::wrap(Token::EOF)); - // requires_whitespace = match token.token { - // Token::Word(next_word) if next_word.quote_style.is_none() => { - // ident.value.push_str(&next_word.value); - // false - // } - // Token::Number(s, false) => { - // // A number token can represent a decimal value ending with a period, e.g., `Number('123.')`. - // // However, for an [ObjectName], it is part of a hyphenated identifier, e.g., `foo-123.bar`. - // // - // // If a number token is followed by a period, it is part of an [ObjectName]. - // // Return the identifier with `true` if the number token is followed by a period, indicating that - // // parsing should continue for the next part of the hyphenated identifier. - // if s.ends_with('.') { - // let Some(s) = s.split('.').next().filter(|s| { - // !s.is_empty() && s.chars().all(|c| c.is_ascii_digit()) - // }) else { - // return self.expected( - // "continuation of hyphenated identifier", - // TokenWithSpan::new(Token::Number(s, false), token.span), - // ); - // }; - // ident.value.push_str(s); - // return Ok((ident, true)); - // } else { - // ident.value.push_str(&s); - // } - // // If next token is period, then it is part of an ObjectName and we don't expect whitespace - // // after the number. - // !matches!(self.peek_token().token, Token::Period) - // } - // _ => { - // return self - // .expected("continuation of hyphenated identifier", token); - // } - // } - } - - // If the last segment was a number, we must check that it's followed by whitespace, - // otherwise foo-123a will be parsed as `foo-123` with the alias `a`. - if requires_whitespace { - let token = self.next_token(); - if !matches!(token.token, Token::EOF) { - return self - .expected("whitespace following hyphenated identifier", token); - } - } - } - Ok((ident, false)) - } + // Token::Word(w) => { + // let quote_style_is_none = w.quote_style.is_none(); + // let mut requires_whitespace = false; + // let mut ident = w.into_ident(self.next_token().span); + // if quote_style_is_none { + // while matches!(self.peek_token().token, Token::Minus) { + // unreachable!("Something went wrong in the tokenizer!"); + // // self.next_token(); + // // ident.value.push('-'); + + // // let token = self + // // .next_token_no_skip() + // // .cloned() + // // .unwrap_or(TokenWithSpan::wrap(Token::EOF)); + // // requires_whitespace = match token.token { + // // Token::Word(next_word) if next_word.quote_style.is_none() => { + // // ident.value.push_str(&next_word.value); + // // false + // // } + // // Token::Number(s, false) => { + // // // A number token can represent a decimal value ending with a period, e.g., `Number('123.')`. + // // // However, for an [ObjectName], it is part of a hyphenated identifier, e.g., `foo-123.bar`. 
+            // //             //
+            // //             // If a number token is followed by a period, it is part of an [ObjectName].
+            // //             // Return the identifier with `true` if the number token is followed by a period, indicating that
+            // //             // parsing should continue for the next part of the hyphenated identifier.
+            // //             if s.ends_with('.') {
+            // //                 let Some(s) = s.split('.').next().filter(|s| {
+            // //                     !s.is_empty() && s.chars().all(|c| c.is_ascii_digit())
+            // //                 }) else {
+            // //                     return self.expected(
+            // //                         "continuation of hyphenated identifier",
+            // //                         TokenWithSpan::new(Token::Number(s, false), token.span),
+            // //                     );
+            // //                 };
+            // //                 ident.value.push_str(s);
+            // //                 return Ok((ident, true));
+            // //             } else {
+            // //                 ident.value.push_str(&s);
+            // //             }
+            // //             // If next token is period, then it is part of an ObjectName and we don't expect whitespace
+            // //             // after the number.
+            // //             !matches!(self.peek_token().token, Token::Period)
+            // //         }
+            // //         _ => {
+            // //             return self
+            // //                 .expected("continuation of hyphenated identifier", token);
+            // //         }
+            // //     }
+            // }

+            // // If the last segment was a number, we must check that it's followed by whitespace,
+            // // otherwise foo-123a will be parsed as `foo-123` with the alias `a`.
+            // if requires_whitespace {
+            //     let token = self.next_token();
+            //     if !matches!(token.token, Token::EOF) {
+            //         return self
+            //             .expected("whitespace following hyphenated identifier", token);
+            //     }
+            // }
+            // }
+            // Ok((ident, false))
+            // }
             _ => Ok((self.parse_identifier()?, false)),
         }
     }
@@ -18530,9 +18517,17 @@ mod tests {

     #[test]
     fn test_placeholder_invalid_whitespace() {
-        for w in [" ", "  ", "/*invalid*/", "\n", "\t", "\r\n", "--comment\n"] {
+        for w in [
+            " ",
+            "/*invalid*/",
+            "\n",
+            "\t\t",
+            "\r\n",
+            "--comment\n",
+            "/* multi\nline\ncomment */",
+        ] {
             let sql = format!("\nSELECT\n  :{w}fooBar");
-            assert!(Parser::parse_sql(&GenericDialect, &sql).is_err());
+            assert!(Parser::parse_sql(&GenericDialect, &sql).is_err(), "Failed to error when inserting the whitespace {w:?} within the placeholder SQL: `{sql}`");
         }
     }
 }
diff --git a/src/test_utils.rs b/src/test_utils.rs
index a8c8afd59..978447d96 100644
--- a/src/test_utils.rs
+++ b/src/test_utils.rs
@@ -154,6 +154,7 @@ impl TestedDialects {
     ///
     /// For multiple statements, use [`statements_parse_to`].
     pub fn one_statement_parses_to(&self, sql: &str, canonical: &str) -> Statement {
+        println!("Testing SQL: {}", sql);
         let mut statements = self.parse_sql_statements(sql).expect(sql);
         assert_eq!(statements.len(), 1);
         if !canonical.is_empty() && sql != canonical {
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index 82415e056..f49468fe3 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -96,9 +96,6 @@ pub enum Token {
     /// Triple double quoted literal with raw string prefix. Example `R"""abc"""`
     /// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
     TripleDoubleQuotedRawStringLiteral(String),
-    /// An unquoted string literal containing dashes, i.e: 'first-second',
-    /// which is allowed in some BigQuery contexts
-    UnquotedDashStringLiteral(String),
     /// A CSV body from a `COPY ...
FROM STDIN` statement
     CopyFromStdin(String),
     /// "National" string literal: i.e: N'string'
@@ -306,7 +303,6 @@ impl fmt::Display for Token {
             Token::DoubleQuotedRawStringLiteral(ref s) => write!(f, "R\"{s}\""),
             Token::TripleSingleQuotedRawStringLiteral(ref s) => write!(f, "R'''{s}'''"),
             Token::TripleDoubleQuotedRawStringLiteral(ref s) => write!(f, "R\"\"\"{s}\"\"\""),
-            Token::UnquotedDashStringLiteral(ref s) => write!(f, "{s}"),
             Token::CopyFromStdin(ref s) => write!(f, "{s}\\."),
             Token::Comma => f.write_str(","),
             Token::DoubleEq => f.write_str("=="),
@@ -400,9 +396,6 @@ impl Token {

     pub fn make_word<S: Into<String>>(word: S, quote_style: Option<char>) -> Self {
         let word = word.into();
-        if quote_style.is_none() && word.contains('-') {
-            return Token::UnquotedDashStringLiteral(word);
-        }
         let word_uppercase = word.to_uppercase();
         Token::Word(Word {
             value: word,
@@ -835,6 +828,7 @@ impl CopyStdinHandler {
         let mut last_character_was_cr = false;
         while let Some(ch) = state.next() {
             if last_character_was_cr && ch == '\\' && state.peek() == Some(&'.') {
+                state.next(); // consume the '.'
                 break;
             }
             last_character_was_cr = ch == '\n';
@@ -937,6 +931,7 @@ impl<'a> Tokenizer<'a> {
             line: 1,
             col: 1,
         };
+        let mut prev_keyword = None;
         let mut cs_handler = CopyStdinHandler::default();

         let mut location = state.location();
@@ -944,8 +939,15 @@ impl<'a> Tokenizer<'a> {
             &mut location,
             &mut state,
             buf.last().map(|t| &t.token),
+            prev_keyword,
             false,
         )? {
+            if let Token::Word(Word { keyword, .. }) = &token {
+                if *keyword != Keyword::NoKeyword {
+                    prev_keyword = Some(*keyword);
+                }
+            }
+
             let span = location.span_to(state.location());
             cs_handler.update(&token);
             buf.push(TokenWithSpan { token, span });
@@ -969,10 +971,11 @@ impl<'a> Tokenizer<'a> {
         &self,
         ch: impl IntoIterator<Item = char>,
         chars: &mut State,
+        prev_keyword: Option<Keyword>,
     ) -> Result<Option<Token>, TokenizerError> {
         chars.next(); // consume the first char
         let ch: String = ch.into_iter().collect();
-        let word = self.tokenize_word(ch, chars);
+        let word = self.tokenize_word(ch, chars, prev_keyword)?;

         // TODO: implement parsing of exponent here
         if word.chars().all(|x| x.is_ascii_digit() || x == '.') {
@@ -996,7 +999,7 @@ impl<'a> Tokenizer<'a> {
         &self,
         chars: &State,
         prev_token: Option<&Token>,
-    ) -> Result<Option<Token>, TokenizerError> {
+        preceded_by_whitespace: bool,
+    ) -> Result<(), TokenizerError> {
+        if !preceded_by_whitespace {
+            return Ok(());
+        }
         if let Some(Token::Colon) = prev_token {
             return Err(TokenizerError {
                 message: "Unexpected whitespace after ':'; did you mean ':placeholder' or '::'?"
@@ -1004,7 +1011,7 @@ impl<'a> Tokenizer<'a> { location: chars.location(), }); } - Ok(None) + Ok(()) } /// Get the next token or return None @@ -1013,773 +1020,774 @@ impl<'a> Tokenizer<'a> { location: &mut Location, chars: &mut State, prev_token: Option<&Token>, + prev_keyword: Option, preceded_by_whitespace: bool, ) -> Result, TokenizerError> { - match chars.peek() { - Some(&ch) => match ch { - ' ' | '\t' | '\n' | '\r' => { - self.handle_colon_space_error(chars, prev_token)?; - chars.next(); // consume - *location = chars.location(); - self.next_token(location, chars, prev_token, true) - } - // BigQuery and MySQL use b or B for byte string literal, Postgres for bit strings - b @ 'B' | b @ 'b' if dialect_of!(self is BigQueryDialect | PostgreSqlDialect | MySqlDialect | GenericDialect) => - { - chars.next(); // consume - match chars.peek() { - Some('\'') => { - if self.dialect.supports_triple_quoted_string() { - return self - .tokenize_single_or_triple_quoted_string:: Token>( - chars, - '\'', - false, - Token::SingleQuotedByteStringLiteral, - Token::TripleSingleQuotedByteStringLiteral, - ); - } - let s = self.tokenize_single_quoted_string(chars, '\'', false)?; - Ok(Some(Token::SingleQuotedByteStringLiteral(s))) - } - Some('\"') => { - if self.dialect.supports_triple_quoted_string() { - return self - .tokenize_single_or_triple_quoted_string:: Token>( - chars, - '"', - false, - Token::DoubleQuotedByteStringLiteral, - Token::TripleDoubleQuotedByteStringLiteral, - ); - } - let s = self.tokenize_single_quoted_string(chars, '\"', false)?; - Ok(Some(Token::DoubleQuotedByteStringLiteral(s))) - } - _ => { - // regular identifier starting with an "b" or "B" - let s = self.tokenize_word(b, chars); - Ok(Some(Token::make_word(s, None))) + let Some(&ch) = chars.peek() else { + return Ok(None); + }; + match ch { + ' ' | '\t' | '\n' | '\r' => { + self.handle_colon_space_error( + chars, + prev_token, + preceded_by_whitespace || ch == '\n', + )?; + chars.next(); // consume + *location = chars.location(); + self.next_token(location, chars, prev_token, prev_keyword, true) + } + // BigQuery and MySQL use b or B for byte string literal, Postgres for bit strings + b @ 'B' | b @ 'b' if dialect_of!(self is BigQueryDialect | PostgreSqlDialect | MySqlDialect | GenericDialect) => + { + chars.next(); // consume + match chars.peek() { + Some('\'') => { + if self.dialect.supports_triple_quoted_string() { + return self + .tokenize_single_or_triple_quoted_string:: Token>( + chars, + '\'', + false, + Token::SingleQuotedByteStringLiteral, + Token::TripleSingleQuotedByteStringLiteral, + ); } + let s = self.tokenize_single_quoted_string(chars, '\'', false)?; + Ok(Some(Token::SingleQuotedByteStringLiteral(s))) } - } - // BigQuery uses r or R for raw string literal - b @ 'R' | b @ 'r' if dialect_of!(self is BigQueryDialect | GenericDialect) => { - chars.next(); // consume - match chars.peek() { - Some('\'') => self - .tokenize_single_or_triple_quoted_string:: Token>( - chars, - '\'', - false, - Token::SingleQuotedRawStringLiteral, - Token::TripleSingleQuotedRawStringLiteral, - ), - Some('\"') => self - .tokenize_single_or_triple_quoted_string:: Token>( - chars, - '"', - false, - Token::DoubleQuotedRawStringLiteral, - Token::TripleDoubleQuotedRawStringLiteral, - ), - _ => { - // regular identifier starting with an "r" or "R" - let s = self.tokenize_word(b, chars); - Ok(Some(Token::make_word(s, None))) + Some('\"') => { + if self.dialect.supports_triple_quoted_string() { + return self + .tokenize_single_or_triple_quoted_string:: 
Token>( + chars, + '"', + false, + Token::DoubleQuotedByteStringLiteral, + Token::TripleDoubleQuotedByteStringLiteral, + ); } + let s = self.tokenize_single_quoted_string(chars, '\"', false)?; + Ok(Some(Token::DoubleQuotedByteStringLiteral(s))) + } + _ => { + // regular identifier starting with an "b" or "B" + let s = self.tokenize_word(b, chars, prev_keyword)?; + Ok(Some(Token::make_word(s, None))) } } - // Redshift uses lower case n for national string literal - n @ 'N' | n @ 'n' => { - chars.next(); // consume, to check the next char - match chars.peek() { - Some('\'') => { - // N'...' - a - let backslash_escape = - self.dialect.supports_string_literal_backslash_escape(); - let s = - self.tokenize_single_quoted_string(chars, '\'', backslash_escape)?; - Ok(Some(Token::NationalStringLiteral(s))) - } - _ => { - // regular identifier starting with an "N" - let s = self.tokenize_word(n, chars); - Ok(Some(Token::make_word(s, None))) - } + } + // BigQuery uses r or R for raw string literal + b @ 'R' | b @ 'r' if dialect_of!(self is BigQueryDialect | GenericDialect) => { + chars.next(); // consume + match chars.peek() { + Some('\'') => self + .tokenize_single_or_triple_quoted_string:: Token>( + chars, + '\'', + false, + Token::SingleQuotedRawStringLiteral, + Token::TripleSingleQuotedRawStringLiteral, + ), + Some('\"') => self + .tokenize_single_or_triple_quoted_string:: Token>( + chars, + '"', + false, + Token::DoubleQuotedRawStringLiteral, + Token::TripleDoubleQuotedRawStringLiteral, + ), + _ => { + // regular identifier starting with an "r" or "R" + let s = self.tokenize_word(b, chars, prev_keyword)?; + Ok(Some(Token::make_word(s, None))) } } - // PostgreSQL accepts "escape" string constants, which are an extension to the SQL standard. - x @ 'e' | x @ 'E' if self.dialect.supports_string_escape_constant() => { - let starting_loc = chars.location(); - chars.next(); // consume, to check the next char - match chars.peek() { - Some('\'') => { - let s = - self.tokenize_escaped_single_quoted_string(starting_loc, chars)?; - Ok(Some(Token::EscapedStringLiteral(s))) - } - _ => { - // regular identifier starting with an "E" or "e" - let s = self.tokenize_word(x, chars); - Ok(Some(Token::make_word(s, None))) - } + } + // Redshift uses lower case n for national string literal + n @ 'N' | n @ 'n' => { + chars.next(); // consume, to check the next char + match chars.peek() { + Some('\'') => { + // N'...' 
- a + let backslash_escape = + self.dialect.supports_string_literal_backslash_escape(); + let s = + self.tokenize_single_quoted_string(chars, '\'', backslash_escape)?; + Ok(Some(Token::NationalStringLiteral(s))) + } + _ => { + // regular identifier starting with an "N" + let s = self.tokenize_word(n, chars, None)?; + Ok(Some(Token::make_word(s, None))) } } - // Unicode string literals like U&'first \000A second' are supported in some dialects, including PostgreSQL - x @ 'u' | x @ 'U' if self.dialect.supports_unicode_string_literal() => { - chars.next(); // consume, to check the next char - if chars.peek() == Some(&'&') { - // we cannot advance the iterator here, as we need to consume the '&' later if the 'u' was an identifier - let mut chars_clone = chars.peekable.clone(); - chars_clone.next(); // consume the '&' in the clone - if chars_clone.peek() == Some(&'\'') { - chars.next(); // consume the '&' in the original iterator - let s = unescape_unicode_single_quoted_string(chars)?; - return Ok(Some(Token::UnicodeStringLiteral(s))); - } + } + // PostgreSQL accepts "escape" string constants, which are an extension to the SQL standard. + x @ 'e' | x @ 'E' if self.dialect.supports_string_escape_constant() => { + let starting_loc = chars.location(); + chars.next(); // consume, to check the next char + match chars.peek() { + Some('\'') => { + let s = self.tokenize_escaped_single_quoted_string(starting_loc, chars)?; + Ok(Some(Token::EscapedStringLiteral(s))) + } + _ => { + // regular identifier starting with an "E" or "e" + let s = self.tokenize_word(x, chars, prev_keyword)?; + Ok(Some(Token::make_word(s, None))) } - // regular identifier starting with an "U" or "u" - let s = self.tokenize_word(x, chars); - Ok(Some(Token::make_word(s, None))) } - // The spec only allows an uppercase 'X' to introduce a hex - // string, but PostgreSQL, at least, allows a lowercase 'x' too. - x @ 'x' | x @ 'X' => { - chars.next(); // consume, to check the next char - match chars.peek() { - Some('\'') => { - // X'...' - a - let s = self.tokenize_single_quoted_string(chars, '\'', true)?; - Ok(Some(Token::HexStringLiteral(s))) - } - _ => { - // regular identifier starting with an "X" - let s = self.tokenize_word(x, chars); - Ok(Some(Token::make_word(s, None))) - } + } + // Unicode string literals like U&'first \000A second' are supported in some dialects, including PostgreSQL + x @ 'u' | x @ 'U' if self.dialect.supports_unicode_string_literal() => { + chars.next(); // consume, to check the next char + if chars.peek() == Some(&'&') { + // we cannot advance the iterator here, as we need to consume the '&' later if the 'u' was an identifier + let mut chars_clone = chars.peekable.clone(); + chars_clone.next(); // consume the '&' in the clone + if chars_clone.peek() == Some(&'\'') { + chars.next(); // consume the '&' in the original iterator + let s = unescape_unicode_single_quoted_string(chars)?; + return Ok(Some(Token::UnicodeStringLiteral(s))); } } - // single quoted string - '\'' => { - if self.dialect.supports_triple_quoted_string() { - return self - .tokenize_single_or_triple_quoted_string:: Token>( - chars, - '\'', - self.dialect.supports_string_literal_backslash_escape(), - Token::SingleQuotedString, - Token::TripleSingleQuotedString, - ); + // regular identifier starting with an "U" or "u" + let s = self.tokenize_word(x, chars, prev_keyword)?; + Ok(Some(Token::make_word(s, None))) + } + // The spec only allows an uppercase 'X' to introduce a hex + // string, but PostgreSQL, at least, allows a lowercase 'x' too. 
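+            // e.g. both `X'1f'` and `x'1f'` tokenize to `Token::HexStringLiteral("1f")`.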
+ x @ 'x' | x @ 'X' => { + chars.next(); // consume, to check the next char + match chars.peek() { + Some('\'') => { + // X'...' - a + let s = self.tokenize_single_quoted_string(chars, '\'', true)?; + Ok(Some(Token::HexStringLiteral(s))) + } + _ => { + // regular identifier starting with an "X" + let s = self.tokenize_word(x, chars, prev_keyword)?; + Ok(Some(Token::make_word(s, None))) } - let s = self.tokenize_single_quoted_string( + } + } + // single quoted string + '\'' => { + if self.dialect.supports_triple_quoted_string() { + return self.tokenize_single_or_triple_quoted_string:: Token>( chars, '\'', self.dialect.supports_string_literal_backslash_escape(), - )?; - - Ok(Some(Token::SingleQuotedString(s))) + Token::SingleQuotedString, + Token::TripleSingleQuotedString, + ); } - // double quoted string - '\"' if !self.dialect.is_delimited_identifier_start(ch) - && !self.dialect.is_identifier_start(ch) => - { - if self.dialect.supports_triple_quoted_string() { - return self - .tokenize_single_or_triple_quoted_string:: Token>( - chars, - '"', - self.dialect.supports_string_literal_backslash_escape(), - Token::DoubleQuotedString, - Token::TripleDoubleQuotedString, - ); - } - let s = self.tokenize_single_quoted_string( + let s = self.tokenize_single_quoted_string( + chars, + '\'', + self.dialect.supports_string_literal_backslash_escape(), + )?; + + Ok(Some(Token::SingleQuotedString(s))) + } + // double quoted string + '\"' if !self.dialect.is_delimited_identifier_start(ch) + && !self.dialect.is_identifier_start(ch) => + { + if self.dialect.supports_triple_quoted_string() { + return self.tokenize_single_or_triple_quoted_string:: Token>( chars, '"', self.dialect.supports_string_literal_backslash_escape(), - )?; - - Ok(Some(Token::DoubleQuotedString(s))) - } - // delimited (quoted) identifier - quote_start if self.dialect.is_delimited_identifier_start(ch) => { - let word = self.tokenize_quoted_identifier(quote_start, chars)?; - Ok(Some(Token::make_word(word, Some(quote_start)))) + Token::DoubleQuotedString, + Token::TripleDoubleQuotedString, + ); } - // Potentially nested delimited (quoted) identifier - quote_start - if self - .dialect - .is_nested_delimited_identifier_start(quote_start) - && self - .dialect - .peek_nested_delimited_identifier_quotes(chars.peekable.clone()) - .is_some() => - { - let Some((quote_start, nested_quote_start)) = self + let s = self.tokenize_single_quoted_string( + chars, + '"', + self.dialect.supports_string_literal_backslash_escape(), + )?; + + Ok(Some(Token::DoubleQuotedString(s))) + } + // delimited (quoted) identifier + quote_start if self.dialect.is_delimited_identifier_start(ch) => { + let word = self.tokenize_quoted_identifier(quote_start, chars)?; + Ok(Some(Token::make_word(word, Some(quote_start)))) + } + // Potentially nested delimited (quoted) identifier + quote_start + if self + .dialect + .is_nested_delimited_identifier_start(quote_start) + && self .dialect .peek_nested_delimited_identifier_quotes(chars.peekable.clone()) - else { - return self.tokenizer_error( - chars.location(), - format!("Expected nested delimiter '{quote_start}' before EOF."), - ); - }; + .is_some() => + { + let Some((quote_start, nested_quote_start)) = self + .dialect + .peek_nested_delimited_identifier_quotes(chars.peekable.clone()) + else { + return self.tokenizer_error( + chars.location(), + format!("Expected nested delimiter '{quote_start}' before EOF."), + ); + }; - let Some(nested_quote_start) = nested_quote_start else { - let word = self.tokenize_quoted_identifier(quote_start, 
chars)?; - return Ok(Some(Token::make_word(word, Some(quote_start)))); - }; + let Some(nested_quote_start) = nested_quote_start else { + let word = self.tokenize_quoted_identifier(quote_start, chars)?; + return Ok(Some(Token::make_word(word, Some(quote_start)))); + }; - let mut word = vec![]; - let quote_end = Word::matching_end_quote(quote_start); - let nested_quote_end = Word::matching_end_quote(nested_quote_start); - let error_loc = chars.location(); + let mut word = vec![]; + let quote_end = Word::matching_end_quote(quote_start); + let nested_quote_end = Word::matching_end_quote(nested_quote_start); + let error_loc = chars.location(); + + chars.next(); // skip the first delimiter + peeking_take_while(chars, |ch| ch.is_whitespace()); + if chars.peek() != Some(&nested_quote_start) { + return self.tokenizer_error( + error_loc, + format!("Expected nested delimiter '{nested_quote_start}' before EOF."), + ); + } + word.push(nested_quote_start.into()); + word.push(self.tokenize_quoted_identifier(nested_quote_end, chars)?); + word.push(nested_quote_end.into()); + peeking_take_while(chars, |ch| ch.is_whitespace()); + if chars.peek() != Some("e_end) { + return self.tokenizer_error( + error_loc, + format!("Expected close delimiter '{quote_end}' before EOF."), + ); + } + chars.next(); // skip close delimiter - chars.next(); // skip the first delimiter - peeking_take_while(chars, |ch| ch.is_whitespace()); - if chars.peek() != Some(&nested_quote_start) { - return self.tokenizer_error( - error_loc, - format!("Expected nested delimiter '{nested_quote_start}' before EOF."), - ); - } - word.push(nested_quote_start.into()); - word.push(self.tokenize_quoted_identifier(nested_quote_end, chars)?); - word.push(nested_quote_end.into()); - peeking_take_while(chars, |ch| ch.is_whitespace()); - if chars.peek() != Some("e_end) { - return self.tokenizer_error( - error_loc, - format!("Expected close delimiter '{quote_end}' before EOF."), - ); + Ok(Some(Token::make_word(word.concat(), Some(quote_start)))) + } + // numbers and period + '0'..='9' | '.' => { + // special case where if ._ is encountered after a word then that word + // is a table and the _ is the start of the col name. + // if the prev token is not a word, then this is not a valid sql + // word or number. + if ch == '.' && chars.peekable.clone().nth(1) == Some('_') { + if !preceded_by_whitespace + && !matches!(prev_token, Some(Token::Plus | Token::Minus)) + { + chars.next(); + return Ok(Some(Token::Period)); } - chars.next(); // skip close delimiter - Ok(Some(Token::make_word(word.concat(), Some(quote_start)))) + return self + .tokenizer_error(chars.location(), "Unexpected character '_'".to_string()); } - // numbers and period - '0'..='9' | '.' => { - // special case where if ._ is encountered after a word then that word - // is a table and the _ is the start of the col name. - // if the prev token is not a word, then this is not a valid sql - // word or number. - if ch == '.' 
&& chars.peekable.clone().nth(1) == Some('_') { - if !preceded_by_whitespace { - chars.next(); - return Ok(Some(Token::Period)); - } - return self.tokenizer_error( - chars.location(), - "Unexpected character '_'".to_string(), - ); - } + // Some dialects support underscore as number separator + // There can only be one at a time and it must be followed by another digit + let is_number_separator = |ch: char, next_char: Option| { + self.dialect.supports_numeric_literal_underscores() + && ch == '_' + && next_char.is_some_and(|next_ch| next_ch.is_ascii_hexdigit()) + }; - // Some dialects support underscore as number separator - // There can only be one at a time and it must be followed by another digit - let is_number_separator = |ch: char, next_char: Option| { - self.dialect.supports_numeric_literal_underscores() - && ch == '_' - && next_char.is_some_and(|next_ch| next_ch.is_ascii_hexdigit()) - }; + let mut s = peeking_next_take_while(chars, |ch, next_ch| { + ch.is_ascii_digit() || is_number_separator(ch, next_ch) + }); - let mut s = peeking_next_take_while(chars, |ch, next_ch| { - ch.is_ascii_digit() || is_number_separator(ch, next_ch) + // match binary literal that starts with 0x + if s == "0" && chars.peek() == Some(&'x') { + chars.next(); + let s2 = peeking_next_take_while(chars, |ch, next_ch| { + ch.is_ascii_hexdigit() || is_number_separator(ch, next_ch) }); + return Ok(Some(Token::HexStringLiteral(s2))); + } - // match binary literal that starts with 0x - if s == "0" && chars.peek() == Some(&'x') { - chars.next(); - let s2 = peeking_next_take_while(chars, |ch, next_ch| { - ch.is_ascii_hexdigit() || is_number_separator(ch, next_ch) - }); - return Ok(Some(Token::HexStringLiteral(s2))); - } - - // match one period - if let Some('.') = chars.peek() { - s.push('.'); - chars.next(); - } - - // If the dialect supports identifiers that start with a numeric prefix - // and we have now consumed a dot, check if the previous token was a Word. - // If so, what follows is definitely not part of a decimal number and - // we should yield the dot as a dedicated token so compound identifiers - // starting with digits can be parsed correctly. - if s == "." && self.dialect.supports_numeric_prefix() { - if !preceded_by_whitespace { - return Ok(Some(Token::Period)); - } - } - - // Consume fractional digits. - s += &peeking_next_take_while(chars, |ch, next_ch| { - ch.is_ascii_digit() || is_number_separator(ch, next_ch) - }); + // match one period + if let Some('.') = chars.peek() { + s.push('.'); + chars.next(); + } - // No fraction -> Token::Period - if s == "." { + // If the dialect supports identifiers that start with a numeric prefix + // and we have now consumed a dot, check if the previous token was a Word. + // If so, what follows is definitely not part of a decimal number and + // we should yield the dot as a dedicated token so compound identifiers + // starting with digits can be parsed correctly. + if s == "." && self.dialect.supports_numeric_prefix() { + if !preceded_by_whitespace + && !matches!(prev_token, Some(Token::Plus | Token::Minus)) + { return Ok(Some(Token::Period)); } + } - // Parse exponent as number - let mut exponent_part = String::new(); - if chars.peek() == Some(&'e') || chars.peek() == Some(&'E') { - let mut char_clone = chars.peekable.clone(); - exponent_part.push(char_clone.next().unwrap()); - - // Optional sign - match char_clone.peek() { - Some(&c) if matches!(c, '+' | '-') => { - exponent_part.push(c); - char_clone.next(); - } - _ => (), - } + // Consume fractional digits. 
+ s += &peeking_next_take_while(chars, |ch, next_ch| { + ch.is_ascii_digit() || is_number_separator(ch, next_ch) + }); - match char_clone.peek() { - // Definitely an exponent, get original iterator up to speed and use it - Some(&c) if c.is_ascii_digit() => { - for _ in 0..exponent_part.len() { - chars.next(); - } - exponent_part += - &peeking_take_while(chars, |ch| ch.is_ascii_digit()); - s += exponent_part.as_str(); - } - // Not an exponent, discard the work done - _ => (), + // No fraction -> Token::Period + if s == "." { + return Ok(Some(Token::Period)); + } + + // Parse exponent as number + let mut exponent_part = String::new(); + if chars.peek() == Some(&'e') || chars.peek() == Some(&'E') { + let mut char_clone = chars.peekable.clone(); + exponent_part.push(char_clone.next().unwrap()); + + // Optional sign + match char_clone.peek() { + Some(&c) if matches!(c, '+' | '-') => { + exponent_part.push(c); + char_clone.next(); } + _ => (), } - // If the dialect supports identifiers that start with a numeric prefix, - // we need to check if the value is in fact an identifier and must thus - // be tokenized as a word. - if self.dialect.supports_numeric_prefix() { - if exponent_part.is_empty() { - // If it is not a number with an exponent, it may be - // an identifier starting with digits. - let word = - peeking_take_while(chars, |ch| self.dialect.is_identifier_part(ch)); - - if !word.is_empty() { - s += word.as_str(); - return Ok(Some(Token::make_word(s, None))); + match char_clone.peek() { + // Definitely an exponent, get original iterator up to speed and use it + Some(&c) if c.is_ascii_digit() => { + for _ in 0..exponent_part.len() { + chars.next(); } - } else if prev_token == Some(&Token::Period) { - // If the previous token was a period, thus not belonging to a number, - // the value we have is part of an identifier. - return Ok(Some(Token::make_word(s, None))); + exponent_part += &peeking_take_while(chars, |ch| ch.is_ascii_digit()); + s += exponent_part.as_str(); } + // Not an exponent, discard the work done + _ => (), } - - let long = if chars.peek() == Some(&'L') { - chars.next(); - true - } else { - false - }; - Ok(Some(Token::Number(s, long))) } - // punctuation - '(' => self.consume_and_return(chars, Token::LParen), - ')' => self.consume_and_return(chars, Token::RParen), - ',' => self.consume_and_return(chars, Token::Comma), - // operators - '-' => { - chars.next(); // consume the '-' - - match chars.peek() { - Some('-') => { - let mut is_comment = true; - if self.dialect.requires_single_line_comment_whitespace() { - is_comment = Some(' ') == chars.peekable.clone().nth(1); - } - - if is_comment { - self.handle_colon_space_error(chars, prev_token)?; - chars.next(); // consume second '-' - // Consume the rest of the line as comment - let _comment = self.tokenize_single_line_comment(chars); - *location = chars.location(); - return self.next_token(location, chars, prev_token, true); - } - self.start_binop(chars, "-", Token::Minus) - } - Some('>') => { - chars.next(); - match chars.peek() { - Some('>') => self.consume_for_binop(chars, "->>", Token::LongArrow), - _ => self.start_binop(chars, "->", Token::Arrow), - } + // If the dialect supports identifiers that start with a numeric prefix, + // we need to check if the value is in fact an identifier and must thus + // be tokenized as a word. + if self.dialect.supports_numeric_prefix() { + if exponent_part.is_empty() { + // If it is not a number with an exponent, it may be + // an identifier starting with digits. 
+ let word = + peeking_take_while(chars, |ch| self.dialect.is_identifier_part(ch)); + + if !word.is_empty() { + s += word.as_str(); + return Ok(Some(Token::make_word(s, None))); } - // a regular '-' operator - _ => self.start_binop(chars, "-", Token::Minus), + } else if prev_token == Some(&Token::Period) { + // If the previous token was a period, thus not belonging to a number, + // the value we have is part of an identifier. + return Ok(Some(Token::make_word(s, None))); } } - '/' => { - chars.next(); // consume the '/' - match chars.peek() { - Some('*') => { - self.handle_colon_space_error(chars, prev_token)?; - chars.next(); // consume the '*', starting a multi-line comment - let _comment = self.consume_multiline_comment(chars)?; - *location = chars.location(); - self.next_token(location, chars, prev_token, true) + + let long = if chars.peek() == Some(&'L') { + chars.next(); + true + } else { + false + }; + Ok(Some(Token::Number(s, long))) + } + // punctuation + '(' => self.consume_and_return(chars, Token::LParen), + ')' => self.consume_and_return(chars, Token::RParen), + ',' => self.consume_and_return(chars, Token::Comma), + // operators + '-' => { + chars.next(); // consume the '-' + + match chars.peek() { + Some('-') => { + let mut is_comment = true; + if self.dialect.requires_single_line_comment_whitespace() { + is_comment = Some(' ') == chars.peekable.clone().nth(1); } - Some('/') if dialect_of!(self is SnowflakeDialect) => { - self.handle_colon_space_error(chars, prev_token)?; - chars.next(); // consume the second '/', starting a snowflake single-line comment + + if is_comment { + self.handle_colon_space_error(chars, prev_token, true)?; + chars.next(); // consume second '-' // Consume the rest of the line as comment let _comment = self.tokenize_single_line_comment(chars); *location = chars.location(); - self.next_token(location, chars, prev_token, true) + return self.next_token( + location, + chars, + prev_token, + prev_keyword, + true, + ); } - Some('/') if dialect_of!(self is DuckDbDialect | GenericDialect) => { - self.consume_and_return(chars, Token::DuckIntDiv) + + self.start_binop(chars, "-", Token::Minus) + } + Some('>') => { + chars.next(); + match chars.peek() { + Some('>') => self.consume_for_binop(chars, "->>", Token::LongArrow), + _ => self.start_binop(chars, "->", Token::Arrow), } - // a regular '/' operator - _ => Ok(Some(Token::Div)), } + // a regular '-' operator + _ => self.start_binop(chars, "-", Token::Minus), } - '+' => self.consume_and_return(chars, Token::Plus), - '*' => self.consume_and_return(chars, Token::Mul), - '%' => { - chars.next(); // advance past '%' - match chars.peek() { - Some(s) if s.is_whitespace() => Ok(Some(Token::Mod)), - Some(sch) if self.dialect.is_identifier_start('%') => { - self.tokenize_identifier_or_keyword([ch, *sch], chars) - } - _ => self.start_binop(chars, "%", Token::Mod), + } + '/' => { + chars.next(); // consume the '/' + match chars.peek() { + Some('*') => { + self.handle_colon_space_error(chars, prev_token, true)?; + chars.next(); // consume the '*', starting a multi-line comment + let _comment = self.consume_multiline_comment(chars)?; + *location = chars.location(); + self.next_token(location, chars, prev_token, prev_keyword, true) } + Some('/') if dialect_of!(self is SnowflakeDialect) => { + self.handle_colon_space_error(chars, prev_token, true)?; + chars.next(); // consume the second '/', starting a snowflake single-line comment + // Consume the rest of the line as comment + let _comment = 
self.tokenize_single_line_comment(chars); + *location = chars.location(); + self.next_token(location, chars, prev_token, prev_keyword, true) + } + Some('/') if dialect_of!(self is DuckDbDialect | GenericDialect) => { + self.consume_and_return(chars, Token::DuckIntDiv) + } + // a regular '/' operator + _ => Ok(Some(Token::Div)), } - '|' => { - chars.next(); // consume the '|' - match chars.peek() { - Some('/') => self.consume_for_binop(chars, "|/", Token::PGSquareRoot), - Some('|') => { - chars.next(); // consume the second '|' - match chars.peek() { - Some('/') => { - self.consume_for_binop(chars, "||/", Token::PGCubeRoot) - } - _ => self.start_binop(chars, "||", Token::StringConcat), - } + } + '+' => self.consume_and_return(chars, Token::Plus), + '*' => self.consume_and_return(chars, Token::Mul), + '%' => { + chars.next(); // advance past '%' + match chars.peek() { + Some(s) if s.is_whitespace() => Ok(Some(Token::Mod)), + Some(sch) if self.dialect.is_identifier_start('%') => { + self.tokenize_identifier_or_keyword([ch, *sch], chars, prev_keyword) + } + _ => self.start_binop(chars, "%", Token::Mod), + } + } + '|' => { + chars.next(); // consume the '|' + match chars.peek() { + Some('/') => self.consume_for_binop(chars, "|/", Token::PGSquareRoot), + Some('|') => { + chars.next(); // consume the second '|' + match chars.peek() { + Some('/') => self.consume_for_binop(chars, "||/", Token::PGCubeRoot), + _ => self.start_binop(chars, "||", Token::StringConcat), } - Some('&') if self.dialect.supports_geometric_types() => { - chars.next(); // consume - match chars.peek() { - Some('>') => self.consume_for_binop( - chars, - "|&>", - Token::VerticalBarAmpersandRightAngleBracket, - ), - _ => self.start_binop_opt(chars, "|&", None), - } + } + Some('&') if self.dialect.supports_geometric_types() => { + chars.next(); // consume + match chars.peek() { + Some('>') => self.consume_for_binop( + chars, + "|&>", + Token::VerticalBarAmpersandRightAngleBracket, + ), + _ => self.start_binop_opt(chars, "|&", None), } - Some('>') if self.dialect.supports_geometric_types() => { - chars.next(); // consume - match chars.peek() { - Some('>') => self.consume_for_binop( - chars, - "|>>", - Token::VerticalBarShiftRight, - ), - _ => self.start_binop_opt(chars, "|>", None), + } + Some('>') if self.dialect.supports_geometric_types() => { + chars.next(); // consume + match chars.peek() { + Some('>') => { + self.consume_for_binop(chars, "|>>", Token::VerticalBarShiftRight) } + _ => self.start_binop_opt(chars, "|>", None), } - Some('>') if self.dialect.supports_pipe_operator() => { - self.consume_for_binop(chars, "|>", Token::VerticalBarRightAngleBracket) - } - // Bitshift '|' operator - _ => self.start_binop(chars, "|", Token::Pipe), } - } - '=' => { - chars.next(); // consume - match chars.peek() { - Some('>') => self.consume_and_return(chars, Token::RArrow), - Some('=') => self.consume_and_return(chars, Token::DoubleEq), - _ => Ok(Some(Token::Eq)), + Some('>') if self.dialect.supports_pipe_operator() => { + self.consume_for_binop(chars, "|>", Token::VerticalBarRightAngleBracket) } + // Bitshift '|' operator + _ => self.start_binop(chars, "|", Token::Pipe), } - '!' 
=> { - chars.next(); // consume - match chars.peek() { - Some('=') => self.consume_and_return(chars, Token::Neq), - Some('!') => self.consume_and_return(chars, Token::DoubleExclamationMark), - Some('~') => { - chars.next(); - match chars.peek() { - Some('*') => self - .consume_and_return(chars, Token::ExclamationMarkTildeAsterisk), - Some('~') => { - chars.next(); - match chars.peek() { - Some('*') => self.consume_and_return( - chars, - Token::ExclamationMarkDoubleTildeAsterisk, - ), - _ => Ok(Some(Token::ExclamationMarkDoubleTilde)), - } - } - _ => Ok(Some(Token::ExclamationMarkTilde)), - } - } - _ => Ok(Some(Token::ExclamationMark)), - } + } + '=' => { + chars.next(); // consume + match chars.peek() { + Some('>') => self.consume_and_return(chars, Token::RArrow), + Some('=') => self.consume_and_return(chars, Token::DoubleEq), + _ => Ok(Some(Token::Eq)), } - '<' => { - chars.next(); // consume - match chars.peek() { - Some('=') => { - chars.next(); - match chars.peek() { - Some('>') => self.consume_for_binop(chars, "<=>", Token::Spaceship), - _ => self.start_binop(chars, "<=", Token::LtEq), - } - } - Some('|') if self.dialect.supports_geometric_types() => { - self.consume_for_binop(chars, "<<|", Token::ShiftLeftVerticalBar) - } - Some('>') => self.consume_for_binop(chars, "<>", Token::Neq), - Some('<') if self.dialect.supports_geometric_types() => { - chars.next(); // consume - match chars.peek() { - Some('|') => self.consume_for_binop( - chars, - "<<|", - Token::ShiftLeftVerticalBar, - ), - _ => self.start_binop(chars, "<<", Token::ShiftLeft), + } + '!' => { + chars.next(); // consume + match chars.peek() { + Some('=') => self.consume_and_return(chars, Token::Neq), + Some('!') => self.consume_and_return(chars, Token::DoubleExclamationMark), + Some('~') => { + chars.next(); + match chars.peek() { + Some('*') => { + self.consume_and_return(chars, Token::ExclamationMarkTildeAsterisk) } - } - Some('<') => self.consume_for_binop(chars, "<<", Token::ShiftLeft), - Some('-') if self.dialect.supports_geometric_types() => { - chars.next(); // consume - match chars.peek() { - Some('>') => { - self.consume_for_binop(chars, "<->", Token::TwoWayArrow) + Some('~') => { + chars.next(); + match chars.peek() { + Some('*') => self.consume_and_return( + chars, + Token::ExclamationMarkDoubleTildeAsterisk, + ), + _ => Ok(Some(Token::ExclamationMarkDoubleTilde)), } - _ => self.start_binop_opt(chars, "<-", None), } + _ => Ok(Some(Token::ExclamationMarkTilde)), } - Some('^') if self.dialect.supports_geometric_types() => { - self.consume_for_binop(chars, "<^", Token::LeftAngleBracketCaret) - } - Some('@') => self.consume_for_binop(chars, "<@", Token::ArrowAt), - _ => self.start_binop(chars, "<", Token::Lt), } + _ => Ok(Some(Token::ExclamationMark)), } - '>' => { - chars.next(); // consume - match chars.peek() { - Some('=') => self.consume_for_binop(chars, ">=", Token::GtEq), - Some('>') => self.consume_for_binop(chars, ">>", Token::ShiftRight), - Some('^') if self.dialect.supports_geometric_types() => { - self.consume_for_binop(chars, ">^", Token::RightAngleBracketCaret) + } + '<' => { + chars.next(); // consume + match chars.peek() { + Some('=') => { + chars.next(); + match chars.peek() { + Some('>') => self.consume_for_binop(chars, "<=>", Token::Spaceship), + _ => self.start_binop(chars, "<=", Token::LtEq), } - _ => self.start_binop(chars, ">", Token::Gt), } - } - ':' => { - chars.next(); - match chars.peek() { - Some(':') => self.consume_and_return(chars, Token::DoubleColon), - Some('=') => 
self.consume_and_return(chars, Token::Assignment), - _ => Ok(Some(Token::Colon)), + Some('|') if self.dialect.supports_geometric_types() => { + self.consume_for_binop(chars, "<<|", Token::ShiftLeftVerticalBar) } - } - ';' => self.consume_and_return(chars, Token::SemiColon), - '\\' => self.consume_and_return(chars, Token::Backslash), - '[' => self.consume_and_return(chars, Token::LBracket), - ']' => self.consume_and_return(chars, Token::RBracket), - '&' => { - chars.next(); // consume the '&' - match chars.peek() { - Some('>') if self.dialect.supports_geometric_types() => { - chars.next(); - self.consume_and_return(chars, Token::AmpersandRightAngleBracket) - } - Some('<') if self.dialect.supports_geometric_types() => { - chars.next(); // consume - match chars.peek() { - Some('|') => self.consume_and_return( - chars, - Token::AmpersandLeftAngleBracketVerticalBar, - ), - _ => { - self.start_binop(chars, "&<", Token::AmpersandLeftAngleBracket) - } + Some('>') => self.consume_for_binop(chars, "<>", Token::Neq), + Some('<') if self.dialect.supports_geometric_types() => { + chars.next(); // consume + match chars.peek() { + Some('|') => { + self.consume_for_binop(chars, "<<|", Token::ShiftLeftVerticalBar) } + _ => self.start_binop(chars, "<<", Token::ShiftLeft), } - Some('&') => { - chars.next(); // consume the second '&' - self.start_binop(chars, "&&", Token::Overlap) + } + Some('<') => self.consume_for_binop(chars, "<<", Token::ShiftLeft), + Some('-') if self.dialect.supports_geometric_types() => { + chars.next(); // consume + match chars.peek() { + Some('>') => self.consume_for_binop(chars, "<->", Token::TwoWayArrow), + _ => self.start_binop_opt(chars, "<-", None), } - // Bitshift '&' operator - _ => self.start_binop(chars, "&", Token::Ampersand), } + Some('^') if self.dialect.supports_geometric_types() => { + self.consume_for_binop(chars, "<^", Token::LeftAngleBracketCaret) + } + Some('@') => self.consume_for_binop(chars, "<@", Token::ArrowAt), + _ => self.start_binop(chars, "<", Token::Lt), } - '^' => { - chars.next(); // consume the '^' - match chars.peek() { - Some('@') => self.consume_and_return(chars, Token::CaretAt), - _ => Ok(Some(Token::Caret)), + } + '>' => { + chars.next(); // consume + match chars.peek() { + Some('=') => self.consume_for_binop(chars, ">=", Token::GtEq), + Some('>') => self.consume_for_binop(chars, ">>", Token::ShiftRight), + Some('^') if self.dialect.supports_geometric_types() => { + self.consume_for_binop(chars, ">^", Token::RightAngleBracketCaret) } + _ => self.start_binop(chars, ">", Token::Gt), } - '{' => self.consume_and_return(chars, Token::LBrace), - '}' => self.consume_and_return(chars, Token::RBrace), - '#' if dialect_of!(self is SnowflakeDialect | BigQueryDialect | MySqlDialect | HiveDialect) => - { - self.handle_colon_space_error(chars, prev_token)?; - chars.next(); // consume the '#', starting a snowflake single-line comment - // Consume the rest of the line as comment - let _comment = self.tokenize_single_line_comment(chars); - *location = chars.location(); - self.next_token(location, chars, prev_token, true) + } + ':' => { + chars.next(); + match chars.peek() { + Some(':') => self.consume_and_return(chars, Token::DoubleColon), + Some('=') => self.consume_and_return(chars, Token::Assignment), + _ => Ok(Some(Token::Colon)), } - '~' => { - chars.next(); // consume - match chars.peek() { - Some('*') => self.consume_for_binop(chars, "~*", Token::TildeAsterisk), - Some('=') if self.dialect.supports_geometric_types() => { - self.consume_for_binop(chars, "~=", 
Token::TildeEqual) - } - Some('~') => { - chars.next(); - match chars.peek() { - Some('*') => { - self.consume_for_binop(chars, "~~*", Token::DoubleTildeAsterisk) - } - _ => self.start_binop(chars, "~~", Token::DoubleTilde), - } + } + ';' => self.consume_and_return(chars, Token::SemiColon), + '\\' => self.consume_and_return(chars, Token::Backslash), + '[' => self.consume_and_return(chars, Token::LBracket), + ']' => self.consume_and_return(chars, Token::RBracket), + '&' => { + chars.next(); // consume the '&' + match chars.peek() { + Some('>') if self.dialect.supports_geometric_types() => { + chars.next(); + self.consume_and_return(chars, Token::AmpersandRightAngleBracket) + } + Some('<') if self.dialect.supports_geometric_types() => { + chars.next(); // consume + match chars.peek() { + Some('|') => self.consume_and_return( + chars, + Token::AmpersandLeftAngleBracketVerticalBar, + ), + _ => self.start_binop(chars, "&<", Token::AmpersandLeftAngleBracket), } - _ => self.start_binop(chars, "~", Token::Tilde), } + Some('&') => { + chars.next(); // consume the second '&' + self.start_binop(chars, "&&", Token::Overlap) + } + // Bitshift '&' operator + _ => self.start_binop(chars, "&", Token::Ampersand), } - '#' => { - chars.next(); - match chars.peek() { - Some('-') => self.consume_for_binop(chars, "#-", Token::HashMinus), - Some('>') => { - chars.next(); - match chars.peek() { - Some('>') => { - self.consume_for_binop(chars, "#>>", Token::HashLongArrow) - } - _ => self.start_binop(chars, "#>", Token::HashArrow), + } + '^' => { + chars.next(); // consume the '^' + match chars.peek() { + Some('@') => self.consume_and_return(chars, Token::CaretAt), + _ => Ok(Some(Token::Caret)), + } + } + '{' => self.consume_and_return(chars, Token::LBrace), + '}' => self.consume_and_return(chars, Token::RBrace), + '#' if dialect_of!(self is SnowflakeDialect | BigQueryDialect | MySqlDialect | HiveDialect) => + { + self.handle_colon_space_error(chars, prev_token, true)?; + chars.next(); // consume the '#', starting a snowflake single-line comment + // Consume the rest of the line as comment + let _comment = self.tokenize_single_line_comment(chars); + *location = chars.location(); + self.next_token(location, chars, prev_token, prev_keyword, true) + } + '~' => { + chars.next(); // consume + match chars.peek() { + Some('*') => self.consume_for_binop(chars, "~*", Token::TildeAsterisk), + Some('=') if self.dialect.supports_geometric_types() => { + self.consume_for_binop(chars, "~=", Token::TildeEqual) + } + Some('~') => { + chars.next(); + match chars.peek() { + Some('*') => { + self.consume_for_binop(chars, "~~*", Token::DoubleTildeAsterisk) } + _ => self.start_binop(chars, "~~", Token::DoubleTilde), } - Some(' ') => Ok(Some(Token::Sharp)), - Some('#') if self.dialect.supports_geometric_types() => { - self.consume_for_binop(chars, "##", Token::DoubleSharp) - } - Some(sch) if self.dialect.is_identifier_start('#') => { - self.tokenize_identifier_or_keyword([ch, *sch], chars) - } - _ => self.start_binop(chars, "#", Token::Sharp), } + _ => self.start_binop(chars, "~", Token::Tilde), } - '@' => { - chars.next(); - match chars.peek() { - Some('@') if self.dialect.supports_geometric_types() => { - self.consume_and_return(chars, Token::AtAt) - } - Some('-') if self.dialect.supports_geometric_types() => { - chars.next(); - match chars.peek() { - Some('@') => self.consume_and_return(chars, Token::AtDashAt), - _ => self.start_binop_opt(chars, "@-", None), - } - } - Some('>') => self.consume_and_return(chars, Token::AtArrow), - 
Some('?') => self.consume_and_return(chars, Token::AtQuestion), - Some('@') => { - chars.next(); - match chars.peek() { - Some(' ') => Ok(Some(Token::AtAt)), - Some(tch) if self.dialect.is_identifier_start('@') => { - self.tokenize_identifier_or_keyword([ch, '@', *tch], chars) - } - _ => Ok(Some(Token::AtAt)), - } - } - Some(' ') => Ok(Some(Token::AtSign)), - // We break on quotes here, because no dialect allows identifiers starting - // with @ and containing quotation marks (e.g. `@'foo'`) unless they are - // quoted, which is tokenized as a quoted string, not here (e.g. - // `"@'foo'"`). Further, at least two dialects parse `@` followed by a - // quoted string as two separate tokens, which this allows. For example, - // Postgres parses `@'1'` as the absolute value of '1' which is implicitly - // cast to a numeric type. And when parsing MySQL-style grantees (e.g. - // `GRANT ALL ON *.* to 'root'@'localhost'`), we also want separate tokens - // for the user, the `@`, and the host. - Some('\'') => Ok(Some(Token::AtSign)), - Some('\"') => Ok(Some(Token::AtSign)), - Some('`') => Ok(Some(Token::AtSign)), - Some(sch) if self.dialect.is_identifier_start('@') => { - self.tokenize_identifier_or_keyword([ch, *sch], chars) + } + '#' => { + chars.next(); + match chars.peek() { + Some('-') => self.consume_for_binop(chars, "#-", Token::HashMinus), + Some('>') => { + chars.next(); + match chars.peek() { + Some('>') => self.consume_for_binop(chars, "#>>", Token::HashLongArrow), + _ => self.start_binop(chars, "#>", Token::HashArrow), } - _ => Ok(Some(Token::AtSign)), } + Some(' ') => Ok(Some(Token::Sharp)), + Some('#') if self.dialect.supports_geometric_types() => { + self.consume_for_binop(chars, "##", Token::DoubleSharp) + } + Some(sch) if self.dialect.is_identifier_start('#') => { + self.tokenize_identifier_or_keyword([ch, *sch], chars, prev_keyword) + } + _ => self.start_binop(chars, "#", Token::Sharp), } - // Postgres uses ? for jsonb operators, not prepared statements - '?' if self.dialect.supports_geometric_types() => { - chars.next(); // consume - match chars.peek() { - Some('|') => { - chars.next(); - match chars.peek() { - Some('|') => self.consume_and_return( + } + '@' => { + chars.next(); + match chars.peek() { + Some('@') if self.dialect.supports_geometric_types() => { + self.consume_and_return(chars, Token::AtAt) + } + Some('-') if self.dialect.supports_geometric_types() => { + chars.next(); + match chars.peek() { + Some('@') => self.consume_and_return(chars, Token::AtDashAt), + _ => self.start_binop_opt(chars, "@-", None), + } + } + Some('>') => self.consume_and_return(chars, Token::AtArrow), + Some('?') => self.consume_and_return(chars, Token::AtQuestion), + Some('@') => { + chars.next(); + match chars.peek() { + Some(' ') => Ok(Some(Token::AtAt)), + Some(tch) if self.dialect.is_identifier_start('@') => self + .tokenize_identifier_or_keyword( + [ch, '@', *tch], chars, - Token::QuestionMarkDoubleVerticalBar, + prev_keyword, ), - _ => Ok(Some(Token::QuestionPipe)), + _ => Ok(Some(Token::AtAt)), + } + } + Some(' ') => Ok(Some(Token::AtSign)), + // We break on quotes here, because no dialect allows identifiers starting + // with @ and containing quotation marks (e.g. `@'foo'`) unless they are + // quoted, which is tokenized as a quoted string, not here (e.g. + // `"@'foo'"`). Further, at least two dialects parse `@` followed by a + // quoted string as two separate tokens, which this allows. 
For example, + // Postgres parses `@'1'` as the absolute value of '1' which is implicitly + // cast to a numeric type. And when parsing MySQL-style grantees (e.g. + // `GRANT ALL ON *.* to 'root'@'localhost'`), we also want separate tokens + // for the user, the `@`, and the host. + Some('\'') => Ok(Some(Token::AtSign)), + Some('\"') => Ok(Some(Token::AtSign)), + Some('`') => Ok(Some(Token::AtSign)), + Some(sch) if self.dialect.is_identifier_start('@') => { + self.tokenize_identifier_or_keyword([ch, *sch], chars, prev_keyword) + } + _ => Ok(Some(Token::AtSign)), + } + } + // Postgres uses ? for jsonb operators, not prepared statements + '?' if self.dialect.supports_geometric_types() => { + chars.next(); // consume + match chars.peek() { + Some('|') => { + chars.next(); + match chars.peek() { + Some('|') => { + self.consume_and_return(chars, Token::QuestionMarkDoubleVerticalBar) } + _ => Ok(Some(Token::QuestionPipe)), } + } - Some('&') => self.consume_and_return(chars, Token::QuestionAnd), - Some('-') => { - chars.next(); // consume - match chars.peek() { - Some('|') => self - .consume_and_return(chars, Token::QuestionMarkDashVerticalBar), - _ => Ok(Some(Token::QuestionMarkDash)), + Some('&') => self.consume_and_return(chars, Token::QuestionAnd), + Some('-') => { + chars.next(); // consume + match chars.peek() { + Some('|') => { + self.consume_and_return(chars, Token::QuestionMarkDashVerticalBar) } + _ => Ok(Some(Token::QuestionMarkDash)), } - Some('#') => self.consume_and_return(chars, Token::QuestionMarkSharp), - _ => self.consume_and_return(chars, Token::Question), } + Some('#') => self.consume_and_return(chars, Token::QuestionMarkSharp), + _ => self.consume_and_return(chars, Token::Question), } - '?' => { - chars.next(); - let s = peeking_take_while(chars, |ch| ch.is_numeric()); - Ok(Some(Token::Placeholder(String::from("?") + &s))) - } - - // identifier or keyword - ch if self.dialect.is_identifier_start(ch) => { - self.tokenize_identifier_or_keyword([ch], chars) - } - '$' => Ok(Some(self.tokenize_dollar_preceded_value(chars)?)), + } + '?' => { + chars.next(); + let s = peeking_take_while(chars, |ch| ch.is_numeric()); + Ok(Some(Token::Placeholder(String::from("?") + &s))) + } - // whitespace check (including unicode chars) should be last as it covers some of the chars above - ch if ch.is_whitespace() => { - self.handle_colon_space_error(chars, prev_token)?; - chars.next(); // consume - *location = chars.location(); - self.next_token(location, chars, prev_token, true) - } - other => self.consume_and_return(chars, Token::Char(other)), - }, - None => Ok(None), + // identifier or keyword + ch if self.dialect.is_identifier_start(ch) => { + self.tokenize_identifier_or_keyword([ch], chars, prev_keyword) + } + '$' => Ok(Some(self.tokenize_dollar_preceded_value(chars)?)), + + // whitespace check (including unicode chars) should be last as it covers some of the chars above + ch if ch.is_whitespace() => { + self.handle_colon_space_error(chars, prev_token, preceded_by_whitespace)?; + chars.next(); // consume + *location = chars.location(); + self.next_token(location, chars, prev_token, prev_keyword, true) + } + other => self.consume_and_return(chars, Token::Char(other)), } } @@ -1951,12 +1959,47 @@ impl<'a> Tokenizer<'a> { } /// Tokenize an identifier or keyword, after the first char is already consumed. 
-    fn tokenize_word(&self, first_chars: impl Into<String>, chars: &mut State) -> String {
+    fn tokenize_word(
+        &self,
+        first_chars: impl Into<String>,
+        chars: &mut State,
+        prev_keyword: Option<Keyword>,
+    ) -> Result<String, TokenizerError> {
         let mut s = first_chars.into();
         s.push_str(&peeking_take_while(chars, |ch| {
-            self.dialect.is_identifier_part(ch) || ch == '-' && self.dialect.is_identifier_part('-')
+            self.dialect.is_identifier_part(ch)
         }));
-        s
+        if !matches!(prev_keyword, Some(Keyword::SELECT))
+            && self.dialect.supports_hyphenated_identifiers()
+        {
+            while chars.peek() == Some(&'-') {
+                chars.next(); // consume the '-'
+                let mut alphabetic_characters = false;
+                let mut new_identifier = String::new();
+                new_identifier.push_str(&peeking_take_while(chars, |ch| {
+                    alphabetic_characters |= ch.is_alphabetic();
+                    self.dialect.is_identifier_part(ch)
+                }));
+
+                if let Some(ch) = new_identifier.chars().next() {
+                    if ch.is_numeric() && alphabetic_characters {
+                        return self.tokenizer_error(
+                            chars.location(),
+                            "Identifier cannot start with a digit and contain alphabetic characters after hyphen",
+                        );
+                    }
+                } else {
+                    // No characters after the hyphen, meaning it's not a valid identifier.
+                    return self
+                        .tokenizer_error(chars.location(), "Identifier cannot end with a hyphen");
+                }
+
+                s.push('-');
+                s.push_str(&new_identifier);
+            }
+        }
+
+        Ok(s)
     }
 
     /// Read a quoted identifier
diff --git a/tests/sqlparser_common.rs b/tests/sqlparser_common.rs
index 99b7ac3fa..8aa99dcae 100644
--- a/tests/sqlparser_common.rs
+++ b/tests/sqlparser_common.rs
@@ -3589,6 +3589,7 @@ fn test_double_value() {
 
     for (input, expected) in test_cases {
         for (i, expr) in input.iter().enumerate() {
+            println!("Testing expression: {}", expr);
             if let Statement::Query(query) =
                 dialects.one_statement_parses_to(&format!("SELECT {expr}"), "")
             {
diff --git a/tests/sqlparser_postgres.rs b/tests/sqlparser_postgres.rs
index bcc154287..bd337a96d 100644
--- a/tests/sqlparser_postgres.rs
+++ b/tests/sqlparser_postgres.rs
@@ -1014,27 +1014,37 @@ fn parse_drop_schema_if_exists() {
 
 #[test]
 fn parse_copy_from_stdin() {
-    let sql = r#"COPY public.actor (actor_id, first_name, last_name, last_update, value) FROM stdin;
-1 PENELOPE GUINESS 2006-02-15 09:34:33 0.11111
-2 NICK WAHLBERG 2006-02-15 09:34:33 0.22222
-3 ED CHASE 2006-02-15 09:34:33 0.312323
-4 JENNIFER DAVIS 2006-02-15 09:34:33 0.3232
-5 JOHNNY LOLLOBRIGIDA 2006-02-15 09:34:33 1.343
-6 BETTE NICHOLSON 2006-02-15 09:34:33 5.0
-7 GRACE MOSTEL 2006-02-15 09:34:33 6.0
-8 MATTHEW JOHANSSON 2006-02-15 09:34:33 7.0
-9 JOE SWANK 2006-02-15 09:34:33 8.0
-10 CHRISTIAN GABLE 2006-02-15 09:34:33 9.1
-11 ZERO CAGE 2006-02-15 09:34:33 10.001
-12 KARL BERRY 2017-11-02 19:15:42.308637+08 11.001
-A Fateful Reflection of a Waitress And a Boat who must Discover a Sumo Wrestler in Ancient China
-Kwara & Kogi
-{"Deleted Scenes","Behind the Scenes"}
-'awe':5 'awe-inspir':4 'barbarella':1 'cat':13 'conquer':16 'dog':18 'feminist':10 'inspir':6 'monasteri':21 'must':15 'stori':7 'streetcar':2
-PHP ₱ USD $
-\N Some other value
-\\."#;
-    pg_and_generic().one_statement_parses_to(sql, "");
+    let sql = r#"COPY public.actor (actor_id, first_name, last_name, last_update, value) FROM STDIN;
+1 PENELOPE GUINESS 2006-02-15 09:34:33 0.11111
+2 NICK WAHLBERG 2006-02-15 09:34:33 0.22222
+3 ED CHASE 2006-02-15 09:34:33 0.312323
+4 JENNIFER DAVIS 2006-02-15 09:34:33 0.3232
+5 JOHNNY LOLLOBRIGIDA 2006-02-15 09:34:33 1.343
+6 BETTE NICHOLSON 2006-02-15 09:34:33 5.0
+7 GRACE MOSTEL 2006-02-15 09:34:33 6.0
+8 MATTHEW JOHANSSON 2006-02-15 09:34:33 7.0
+9 JOE SWANK 2006-02-15 09:34:33 8.0
+10 CHRISTIAN GABLE 2006-02-15 09:34:33 9.1
+11 ZERO CAGE 2006-02-15 09:34:33 10.001
+12 KARL BERRY 2017-11-02 19:15:42.308637+08 11.001
+\."#;
+    pg_and_generic().verified_stmt(sql);
+
+    let sql_comma_separated = r#"COPY public.actor (actor_id, first_name, last_name, last_update, value) FROM STDIN (FORMAT csv, DELIMITER ',');
+1,PENELOPE,GUINESS,2006-02-15 09:34:33,0.11111
+2,NICK,WAHLBERG,2006-02-15 09:34:33,0.22222
+3,ED,CHASE,2006-02-15 09:34:33,0.312323
+4,JENNIFER,DAVIS,2006-02-15 09:34:33,0.3232
+5,JOHNNY,"LOLLO,BRIGIDA",2006-02-15 09:34:33,1.343
+6,BETTE,NICHOLSON,2006-02-15 09:34:33,5.0
+7,GRACE,MOSTEL,2006-02-15 09:34:33,6.0
+8,MATTHEW,JOHANSSON,2006-02-15 09:34:33,7.0
+9,JOE,SWANK,2006-02-15 09:34:33,8.0
+10,CHRISTIAN,GABLE,2006-02-15 09:34:33,9.1
+11,ZERO,CAGE,2006-02-15 09:34:33,10.001
+12,KARL,BERRY,2017-11-02 19:15:42.308637+08,11.001
+\."#;
+    pg_and_generic().verified_stmt(sql_comma_separated);
 }
 
 #[test]
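A standalone sketch of the hyphen rule the reworked `tokenize_word` above enforces (illustrative only — the function name and structure here are mine, not the crate's): every hyphen-separated continuation must be non-empty, and a continuation that starts with a digit must not also contain letters, so `foo-123` lexes as one identifier while `foo-123a` and `foo-` are rejected.

    // Illustrative stand-in for the tokenizer's hyphen-continuation checks.
    fn validate_hyphen_continuations(ident: &str) -> Result<(), String> {
        let mut segments = ident.split('-');
        segments.next(); // the first segment is an ordinary identifier start
        for segment in segments {
            if segment.is_empty() {
                return Err("Identifier cannot end with a hyphen".into());
            }
            let starts_with_digit = segment.starts_with(|c: char| c.is_numeric());
            let contains_alphabetic = segment.chars().any(|c| c.is_alphabetic());
            if starts_with_digit && contains_alphabetic {
                return Err(
                    "Identifier cannot start with a digit and contain alphabetic characters after hyphen"
                        .into(),
                );
            }
        }
        Ok(())
    }

    fn main() {
        assert!(validate_hyphen_continuations("foo-123").is_ok());
        assert!(validate_hyphen_continuations("foo-bar-7").is_ok());
        assert!(validate_hyphen_continuations("foo-123a").is_err()); // digit + letters
        assert!(validate_hyphen_continuations("foo-").is_err()); // trailing hyphen
    }
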
From 819c0958d6f59120794fb9f15d6a910c659b53e2 Mon Sep 17 00:00:00 2001
From: Luca
Date: Wed, 29 Oct 2025 10:16:19 +0100
Subject: [PATCH 06/10] Tentatively added support for path identifiers

---
 src/dialect/mod.rs       |  5 +++++
 src/dialect/snowflake.rs | 22 ++++++++++++++++++++--
 src/tokenizer.rs         | 24 ++++++++++++++++++++++++
 3 files changed, 49 insertions(+), 2 deletions(-)

diff --git a/src/dialect/mod.rs b/src/dialect/mod.rs
index abc8291d7..df19a598a 100644
--- a/src/dialect/mod.rs
+++ b/src/dialect/mod.rs
@@ -183,6 +183,11 @@ pub trait Dialect: Debug + Any {
         false
     }
 
+    /// Returns whether the dialect supports path-like identifiers
+    fn supports_path_like_identifiers(&self) -> bool {
+        false
+    }
+
     /// Most dialects do not have custom operators. Override this method to provide custom operators.
     fn is_custom_operator_part(&self, _ch: char) -> bool {
         false
diff --git a/src/dialect/snowflake.rs b/src/dialect/snowflake.rs
index 6b40125e3..ba370b34c 100644
--- a/src/dialect/snowflake.rs
+++ b/src/dialect/snowflake.rs
@@ -158,6 +158,10 @@ impl Dialect for SnowflakeDialect {
             || ch == '_'
     }
 
+    fn supports_path_like_identifiers(&self) -> bool {
+        true
+    }
+
     // See https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#escape_sequences
     fn supports_string_literal_backslash_escape(&self) -> bool {
         true
@@ -1067,8 +1071,22 @@ pub fn parse_stage_name_identifier(parser: &mut Parser) -> Result<Ident, ParserError> {
             Token::Plus => ident.push('+'),
             Token::Minus => ident.push('-'),
             Token::Number(n, _) => ident.push_str(n),
-            Token::Word(w) => ident.push_str(&w.to_string()),
-            _ => return parser.expected("stage name identifier", parser.peek_token()),
+            Token::Word(w) => {
+                if matches!(w.keyword, Keyword::NoKeyword) {
+                    ident.push_str(w.to_string().as_str());
+                } else {
+                    parser.prev_token();
+                    break;
+                }
+            }
+            token => {
+                return {
+                    println!(
+                        "Unexpected token {token:?} while parsing stage name identifier {ident:?}"
+                    );
+                    parser.expected("stage name identifier", parser.peek_token())
+                }
+            }
         }
     }
     Ok(Ident::new(ident))
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index f49468fe3..9b1094f8b 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -1669,6 +1669,11 @@ impl<'a> Tokenizer<'a> {
                             _ => self.start_binop(chars, "~~", Token::DoubleTilde),
                         }
                     }
+                    Some('/') if self.dialect.supports_path_like_identifiers() => {
+                        // path-like identifier starting with `~`, e.g. `~/staged/file.csv`
+                        let s = self.tokenize_word("~", chars, prev_keyword)?;
+                        Ok(Some(Token::make_word(s, None)))
+                    }
                     _ => self.start_binop(chars, "~", Token::Tilde),
                 }
             }
@@ -1969,6 +1974,25 @@
         s.push_str(&peeking_take_while(chars, |ch| {
             self.dialect.is_identifier_part(ch)
         }));
+
+        while !matches!(prev_keyword, Some(Keyword::SELECT))
+            && self.dialect.supports_path_like_identifiers()
+            && chars.peek().map(|&ch| ch == '/').unwrap_or(false)
+            && chars
+                .peekable
+                .clone()
+                .nth(1)
+                .map(|ch| ch.is_alphabetic())
+                .unwrap_or(false)
+        {
+            s.push('/');
+            chars.next(); // consume the '/'
+
+            s.push_str(&peeking_take_while(chars, |ch| {
+                self.dialect.is_identifier_part(ch)
+            }));
+        }
+
         if !matches!(prev_keyword, Some(Keyword::SELECT))
             && self.dialect.supports_hyphenated_identifiers()
         {

From 7ea97462defecdf9b8100ffbb3e8943b00496904 Mon Sep 17 00:00:00 2001
From: Luca
Date: Wed, 29 Oct 2025 10:37:32 +0100
Subject: [PATCH 07/10] Tentatively fixed snowflake ident in @-prefixed paths

---
 src/dialect/snowflake.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/dialect/snowflake.rs b/src/dialect/snowflake.rs
index ba370b34c..4e91bf8d3 100644
--- a/src/dialect/snowflake.rs
+++ b/src/dialect/snowflake.rs
@@ -1072,7 +1072,7 @@ pub fn parse_stage_name_identifier(parser: &mut Parser) -> Result<Ident, ParserError> {
             Token::Minus => ident.push('-'),
             Token::Number(n, _) => ident.push_str(n),
             Token::Word(w) => {
-                if matches!(w.keyword, Keyword::NoKeyword) {
+                if matches!(w.keyword, Keyword::NoKeyword) || ident.ends_with("@") {
                     ident.push_str(w.to_string().as_str());
                 } else {
                     parser.prev_token();

From c6c391c114987265d0f9cbeb8a3d9de12ea763b9 Mon Sep 17 00:00:00 2001
From: Luca
Date: Wed, 29 Oct 2025 10:38:42 +0100
Subject: [PATCH 08/10] Fixed broken doc test

---
 src/tokenizer.rs | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index 9b1094f8b..3fa46a48c 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -851,7 +851,7 @@ impl<'a> Tokenizer<'a> {
     /// Create a new SQL tokenizer for the specified SQL statement
     ///
     /// ```
-    /// # use sqlparser::tokenizer::{Token, Whitespace, Tokenizer};
+    /// # use sqlparser::tokenizer::{Token, Tokenizer};
     /// # use sqlparser::dialect::GenericDialect;
     /// # let dialect = GenericDialect{};
     /// let query = r#"SELECT 'foo'"#;
@@ -861,7 +861,6 @@
     ///
     /// assert_eq!(tokens, vec![
     ///     Token::make_word("SELECT", None),
-    ///     Token::Whitespace(Whitespace::Space),
     ///     Token::SingleQuotedString("foo".to_string()),
     /// ]);
     pub fn new(dialect: &'a dyn Dialect, query: &'a str) -> Self {
@@ -1673,7 +1672,7 @@ impl<'a> Tokenizer<'a> {
                         // path-like identifier starting with `~`, e.g. `~/staged/file.csv`
                         let s = self.tokenize_word("~", chars, prev_keyword)?;
                         Ok(Some(Token::make_word(s, None)))
-                    }
+                    }
                     _ => self.start_binop(chars, "~", Token::Tilde),
                 }
             }
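Assuming the path support above works as intended, a Snowflake stage reference should now reach the parser as a handful of coarse tokens rather than a long run of operator punctuation. A hedged sketch against the public tokenizer API (the exact token stream produced on this branch is an assumption, hence a print instead of asserts):

    use sqlparser::dialect::SnowflakeDialect;
    use sqlparser::tokenizer::Tokenizer;

    fn main() {
        // `@~` is the current user's stage; with path-like identifiers
        // enabled, the `~/staged/...` tail should tokenize as words rather
        // than as the `~` operator followed by division.
        let sql = "LIST @~/staged/data.csv";
        let tokens = Tokenizer::new(&SnowflakeDialect {}, sql).tokenize().unwrap();
        println!("{tokens:?}");
    }
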
From 5120d8c051122163b9a414d64b03a377a5f28076 Mon Sep 17 00:00:00 2001
From: Luca
Date: Wed, 29 Oct 2025 11:22:07 +0100
Subject: [PATCH 09/10] Fixed code smells

---
 src/ast/ddl.rs           |  2 +-
 src/ast/mod.rs           |  9 +---
 src/dialect/snowflake.rs |  9 +---
 src/parser/mod.rs        | 99 ++++------------------------------------
 src/test_utils.rs        |  1 -
 src/tokenizer.rs         | 13 +++---
 6 files changed, 20 insertions(+), 113 deletions(-)

diff --git a/src/ast/ddl.rs b/src/ast/ddl.rs
index fd481213f..5b74e65be 100644
--- a/src/ast/ddl.rs
+++ b/src/ast/ddl.rs
@@ -19,7 +19,7 @@
 //! (commonly referred to as Data Definition Language, or DDL)
 
 #[cfg(not(feature = "std"))]
-use alloc::{boxed::Box, format, string::String, vec, vec::Vec};
+use alloc::{boxed::Box, format, string::String, vec::Vec};
 use core::fmt::{self, Display, Write};
 
 #[cfg(feature = "serde")]
diff --git a/src/ast/mod.rs b/src/ast/mod.rs
index 6ddf32819..2e4898a38 100644
--- a/src/ast/mod.rs
+++ b/src/ast/mod.rs
@@ -2787,10 +2787,11 @@ impl fmt::Display for Declare {
 }
 
 /// Sql options of a `CREATE TABLE` statement.
-#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
+#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash, Default)]
 #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
 #[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
 pub enum CreateTableOptions {
+    #[default]
     None,
     /// Options specified using the `WITH` keyword.
     /// e.g. `WITH (description = "123")`
@@ -2819,12 +2820,6 @@ pub enum CreateTableOptions {
     TableProperties(Vec<SqlOption>),
 }
 
-impl Default for CreateTableOptions {
-    fn default() -> Self {
-        Self::None
-    }
-}
-
 impl fmt::Display for CreateTableOptions {
     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
         match self {
diff --git a/src/dialect/snowflake.rs b/src/dialect/snowflake.rs
index 4e91bf8d3..91d041757 100644
--- a/src/dialect/snowflake.rs
+++ b/src/dialect/snowflake.rs
@@ -1079,14 +1079,7 @@ pub fn parse_stage_name_identifier(parser: &mut Parser) -> Result<Ident, ParserError> {
                     break;
                 }
             }
-            token => {
-                return {
-                    println!(
-                        "Unexpected token {token:?} while parsing stage name identifier {ident:?}"
-                    );
-                    parser.expected("stage name identifier", parser.peek_token())
-                }
-            }
+            _ => return parser.expected("stage name identifier", parser.peek_token()),
         }
     }
     Ok(Ident::new(ident))
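The `CreateTableOptions` change above swaps a hand-written `Default` impl for the derived one with a `#[default]` variant marker (stable since Rust 1.62). The pattern in isolation, on a stand-in enum:

    // `#[derive(Default)]` on an enum needs exactly one variant tagged
    // `#[default]`; `Default::default()` then returns that variant.
    #[derive(Debug, Default, PartialEq)]
    enum Options {
        #[default]
        None,
        With(Vec<String>),
    }

    fn main() {
        assert_eq!(Options::default(), Options::None);
        assert_ne!(Options::With(vec!["k = v".into()]), Options::default());
    }
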
diff --git a/src/parser/mod.rs b/src/parser/mod.rs
index 90c52bb87..6225681fb 100644
--- a/src/parser/mod.rs
+++ b/src/parser/mod.rs
@@ -4031,13 +4031,13 @@ impl<'a> Parser<'a> {
     /// See [`Self::peek_token`] for an example.
     pub fn peek_tokens_with_location<const N: usize>(&self) -> [TokenWithSpan; N] {
         let mut index = self.index;
-        core::array::from_fn(|_| loop {
+        core::array::from_fn(|_| {
             let token = self.tokens.get(index);
             index += 1;
-            break token.cloned().unwrap_or(TokenWithSpan {
+            token.cloned().unwrap_or(TokenWithSpan {
                 token: Token::EOF,
                 span: Span::empty(),
-            });
+            })
         })
     }
 
@@ -4047,10 +4047,10 @@
     /// See [`Self::peek_tokens`] for an example.
     pub fn peek_tokens_ref<const N: usize>(&self) -> [&TokenWithSpan; N] {
         let mut index = self.index;
-        core::array::from_fn(|_| loop {
+        core::array::from_fn(|_| {
             let token = self.tokens.get(index);
             index += 1;
-            break token.unwrap_or(&EOF_TOKEN);
+            token.unwrap_or(&EOF_TOKEN)
         })
     }
 
@@ -8546,7 +8546,7 @@ impl<'a> Parser<'a> {
                 return self.expected(
                     "FULLTEXT or SPATIAL option without constraint name",
                     TokenWithSpan {
-                        token: Token::make_keyword(&name.to_string()),
+                        token: Token::make_keyword(name.to_string()),
                         span: next_token.span,
                     },
                 );
@@ -11125,9 +11125,9 @@ impl<'a> Parser<'a> {
         let mut parts = vec![];
         if dialect_of!(self is BigQueryDialect) && in_table_clause {
             loop {
-                let (ident, end_with_period) = self.parse_unquoted_hyphenated_identifier()?;
+                let ident = self.parse_identifier()?;
                 parts.push(ObjectNamePart::Identifier(ident));
-                if !self.consume_token(&Token::Period) && !end_with_period {
+                if !self.consume_token(&Token::Period) {
                     break;
                 }
             }
@@ -11141,9 +11141,9 @@
                     span,
                 }));
             } else if dialect_of!(self is BigQueryDialect) && in_table_clause {
-                let (ident, end_with_period) = self.parse_unquoted_hyphenated_identifier()?;
+                let ident = self.parse_identifier()?;
                 parts.push(ObjectNamePart::Identifier(ident));
-                if !self.consume_token(&Token::Period) && !end_with_period {
+                if !self.consume_token(&Token::Period) {
                     break;
                 }
             } else if self.dialect.supports_object_name_double_dot_notation()
@@ -11322,85 +11322,6 @@ impl<'a> Parser<'a> {
         }
     }
 
-    /// On BigQuery, hyphens are permitted in unquoted identifiers inside of a FROM or
-    /// TABLE clause.
-    ///
-    /// The first segment must be an ordinary unquoted identifier, e.g. it must not start
-    /// with a digit. Subsequent segments are either must either be valid identifiers or
-    /// integers, e.g. foo-123 is allowed, but foo-123a is not.
-    ///
-    /// [BigQuery-lexical](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical)
-    ///
-    /// Return a tuple of the identifier and a boolean indicating it ends with a period.
-    fn parse_unquoted_hyphenated_identifier(&mut self) -> Result<(Ident, bool), ParserError> {
-        match self.peek_token().token {
-            // Token::Word(w) => {
-            //     let quote_style_is_none = w.quote_style.is_none();
-            //     let mut requires_whitespace = false;
-            //     let mut ident = w.into_ident(self.next_token().span);
-            //     if quote_style_is_none {
-            //         while matches!(self.peek_token().token, Token::Minus) {
-            //             unreachable!("Something went wrong in the tokenizer!");
-            //             // self.next_token();
-            //             // ident.value.push('-');
-
-            //             // let token = self
-            //             //     .next_token_no_skip()
-            //             //     .cloned()
-            //             //     .unwrap_or(TokenWithSpan::wrap(Token::EOF));
-            //             // requires_whitespace = match token.token {
-            //             //     Token::Word(next_word) if next_word.quote_style.is_none() => {
-            //             //         ident.value.push_str(&next_word.value);
-            //             //         false
-            //             //     }
-            //             //     Token::Number(s, false) => {
-            //             //         // A number token can represent a decimal value ending with a period, e.g., `Number('123.')`.
-            //             //         // However, for an [ObjectName], it is part of a hyphenated identifier, e.g., `foo-123.bar`.
-            //             //         //
-            //             //         // If a number token is followed by a period, it is part of an [ObjectName].
-            //             //         // Return the identifier with `true` if the number token is followed by a period, indicating that
-            //             //         // parsing should continue for the next part of the hyphenated identifier.
-            //             //         if s.ends_with('.') {
-            //             //             let Some(s) = s.split('.').next().filter(|s| {
-            //             //                 !s.is_empty() && s.chars().all(|c| c.is_ascii_digit())
-            //             //             }) else {
-            //             //                 return self.expected(
-            //             //                     "continuation of hyphenated identifier",
-            //             //                     TokenWithSpan::new(Token::Number(s, false), token.span),
-            //             //                 );
-            //             //             };
-            //             //             ident.value.push_str(s);
-            //             //             return Ok((ident, true));
-            //             //         } else {
-            //             //             ident.value.push_str(&s);
-            //             //         }
-            //             //         // If next token is period, then it is part of an ObjectName and we don't expect whitespace
-            //             //         // after the number.
-            //             //         !matches!(self.peek_token().token, Token::Period)
-            //             //     }
-            //             //     _ => {
-            //             //         return self
-            //             //             .expected("continuation of hyphenated identifier", token);
-            //             //     }
-            //             // }
-            //         }
-
-            //         // If the last segment was a number, we must check that it's followed by whitespace,
-            //         // otherwise foo-123a will be parsed as `foo-123` with the alias `a`.
-            //         if requires_whitespace {
-            //             let token = self.next_token();
-            //             if !matches!(token.token, Token::EOF) {
-            //                 return self
-            //                     .expected("whitespace following hyphenated identifier", token);
-            //             }
-            //         }
-            //     }
-            //     Ok((ident, false))
-            // }
-            _ => Ok((self.parse_identifier()?, false)),
-        }
-    }
-
     /// Parses a parenthesized, comma-separated list of column definitions within a view.
     fn parse_view_columns(&mut self) -> Result<Vec<ViewColumnDef>, ParserError> {
         if self.consume_token(&Token::LParen) {
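With whitespace tokens gone from the buffer, the peek helpers simplified above are plain array windows over `self.tokens`. A usage sketch against the public API (the token shapes asserted here are assumptions about this branch's behavior):

    use sqlparser::dialect::GenericDialect;
    use sqlparser::parser::Parser;
    use sqlparser::tokenizer::Token;

    fn main() {
        let parser = Parser::new(&GenericDialect {})
            .try_with_sql("SELECT a FROM t")
            .unwrap();
        // Peek two tokens without consuming either; with no whitespace
        // tokens these are simply tokens[index] and tokens[index + 1].
        let [first, second] = parser.peek_tokens();
        assert!(matches!(first, Token::Word(_))); // SELECT
        assert!(matches!(second, Token::Word(_))); // a
    }
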
diff --git a/src/test_utils.rs b/src/test_utils.rs
index 978447d96..a8c8afd59 100644
--- a/src/test_utils.rs
+++ b/src/test_utils.rs
@@ -154,7 +154,6 @@ impl TestedDialects {
     ///
     /// For multiple statements, use [`statements_parse_to`].
     pub fn one_statement_parses_to(&self, sql: &str, canonical: &str) -> Statement {
-        println!("Testing SQL: {}", sql);
         let mut statements = self.parse_sql_statements(sql).expect(sql);
         assert_eq!(statements.len(), 1);
         if !canonical.is_empty() && sql != canonical {
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index 3fa46a48c..b5d7a67f2 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -23,7 +23,6 @@
 
 #[cfg(not(feature = "std"))]
 use alloc::{
-    borrow::ToOwned,
     format,
     string::{String, ToString},
     vec,
@@ -1319,12 +1318,12 @@ impl<'a> Tokenizer<'a> {
                 // If so, what follows is definitely not part of a decimal number and
                 // we should yield the dot as a dedicated token so compound identifiers
                 // starting with digits can be parsed correctly.
-                if s == "." && self.dialect.supports_numeric_prefix() {
-                    if !preceded_by_whitespace
-                        && !matches!(prev_token, Some(Token::Plus | Token::Minus))
-                    {
-                        return Ok(Some(Token::Period));
-                    }
+                if s == "."
+                    && self.dialect.supports_numeric_prefix()
+                    && !preceded_by_whitespace
+                    && !matches!(prev_token, Some(Token::Plus | Token::Minus))
+                {
+                    return Ok(Some(Token::Period));
                 }
 
                 // Consume fractional digits.
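The flattened condition above preserves the earlier behavior: in a `supports_numeric_prefix` dialect (e.g. MySQL), a dot that directly follows another token with no whitespace and no leading sign is emitted as `Token::Period`, which is what keeps digit-leading compound identifiers parseable. A hedged end-to-end sketch (whether this exact statement round-trips is an assumption):

    use sqlparser::dialect::MySqlDialect;
    use sqlparser::parser::Parser;

    fn main() {
        // `1ea10.1a20` should come out as the compound identifier
        // `1ea10`.`1a20`, because the dot is lexed as a Period here;
        // `SELECT 1.5` still lexes 1.5 as a single number.
        let ast = Parser::parse_sql(&MySqlDialect {}, "SELECT 1ea10.1a20").unwrap();
        println!("{ast:?}");
    }
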
From 07a828f661df0384e42c7823417ddb59e0da12db Mon Sep 17 00:00:00 2001
From: Luca
Date: Wed, 29 Oct 2025 18:42:41 +0100
Subject: [PATCH 10/10] Replaced the csv crate with a custom CSV parser

---
 Cargo.toml                  |   1 -
 src/ast/mod.rs              |  68 ++++++++++------
 src/parser/mod.rs           | 149 +++++++++++++++++++++++++++---------
 tests/sqlparser_postgres.rs |  24 ++++++
 4 files changed, 182 insertions(+), 60 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index 005cb4567..ed94bbbdd 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -47,7 +47,6 @@ visitor = ["sqlparser_derive"]
 [dependencies]
 bigdecimal = { version = "0.4.1", features = ["serde"], optional = true }
 log = "0.4"
-csv = "1.4.0"
 recursive = { version = "0.1.1", optional = true}
 
 serde = { version = "1.0", default-features = false, features = ["derive", "alloc"], optional = true }
diff --git a/src/ast/mod.rs b/src/ast/mod.rs
index 2e4898a38..0b4402386 100644
--- a/src/ast/mod.rs
+++ b/src/ast/mod.rs
@@ -4576,19 +4576,21 @@ impl fmt::Display for Statement {
                 }
 
                 let mut null_symbol = "\\N";
-                let mut writer_builder = csv::WriterBuilder::new();
+                let mut delimiter = '\t';
+                let mut quote = '"';
+                let mut escape = '\\';
 
                 // Apply options
                 for option in options {
                     match option {
                         CopyOption::Delimiter(c) => {
-                            writer_builder.delimiter(*c as u8);
+                            delimiter = *c;
                         }
                         CopyOption::Quote(c) => {
-                            writer_builder.quote(*c as u8);
+                            quote = *c;
                         }
                         CopyOption::Escape(c) => {
-                            writer_builder.escape(*c as u8);
+                            escape = *c;
                         }
                         CopyOption::Null(null) => {
                             null_symbol = null;
@@ -4601,10 +4603,7 @@
                 for option in legacy_options {
                     match option {
                         CopyLegacyOption::Delimiter(c) => {
-                            writer_builder.delimiter(*c as u8);
-                        }
-                        CopyLegacyOption::Header => {
-                            writer_builder.has_headers(true);
+                            delimiter = *c;
                         }
                         CopyLegacyOption::Null(null) => {
                             null_symbol = null;
@@ -4612,14 +4611,11 @@
                         CopyLegacyOption::Csv(csv_options) => {
                             for csv_option in csv_options {
                                 match csv_option {
-                                    CopyLegacyCsvOption::Header => {
-                                        writer_builder.has_headers(true);
-                                    }
                                     CopyLegacyCsvOption::Quote(c) => {
-                                        writer_builder.quote(*c as u8);
+                                        quote = *c;
                                     }
                                     CopyLegacyCsvOption::Escape(c) => {
-                                        writer_builder.escape(*c as u8);
+                                        escape = *c;
                                     }
                                     _ => {}
                                 }
@@ -4631,19 +4627,43 @@
 
                 if !values.is_empty() {
                     writeln!(f, ";")?;
-                    let mut writer = writer_builder.from_writer(vec![]);
+
+                    // Simple CSV writer
                     for row in values {
-                        writer
-                            .write_record(
-                                row.iter()
-                                    .map(|column| column.as_deref().unwrap_or(null_symbol)),
-                            )
-                            .map_err(|_| fmt::Error)?
+                        for (idx, column) in row.iter().enumerate() {
+                            if idx > 0 {
+                                write!(f, "{}", delimiter)?;
+                            }
+
+                            let field_value = column.as_deref().unwrap_or(null_symbol);
+
+                            // Check if field needs quoting
+                            let needs_quoting = field_value.contains(delimiter)
+                                || field_value.contains(quote)
+                                || field_value.contains('\n')
+                                || field_value.contains('\r');
+
+                            if needs_quoting {
+                                write!(f, "{}", quote)?;
+                                for ch in field_value.chars() {
+                                    if ch == quote {
+                                        // Escape quote by doubling it
+                                        write!(f, "{}{}", quote, quote)?;
+                                    } else if ch == escape {
+                                        // Escape escape character
+                                        write!(f, "{}{}", escape, escape)?;
+                                    } else {
+                                        write!(f, "{}", ch)?;
+                                    }
+                                }
+                                write!(f, "{}", quote)?;
+                            } else {
+                                write!(f, "{}", field_value)?;
+                            }
+                        }
+                        writeln!(f)?;
                    }
-                    writer.flush().map_err(|_| fmt::Error)?;
-                    let data = String::from_utf8(writer.into_inner().map_err(|_| fmt::Error)?)
-                        .map_err(|_| fmt::Error)?;
-                    write!(f, "{}", data)?;
+
+                    write!(f, "\\.")?;
                 }
 
                 Ok(())
             }
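Before the parser-side counterpart below, the writer's quoting rule distilled into a standalone helper (illustrative only; the real code above writes through a `fmt::Formatter`, and this sketch folds quote- and escape-doubling into one branch):

    // Quote a field when it contains the delimiter, the quote char, or a
    // newline; double any embedded quote/escape characters, as above.
    fn write_field(out: &mut String, field: &str, delimiter: char, quote: char, escape: char) {
        let needs_quoting = field.contains(delimiter)
            || field.contains(quote)
            || field.contains('\n')
            || field.contains('\r');
        if !needs_quoting {
            out.push_str(field);
            return;
        }
        out.push(quote);
        for ch in field.chars() {
            if ch == quote || ch == escape {
                out.push(ch); // write it twice
            }
            out.push(ch);
        }
        out.push(quote);
    }

    fn main() {
        let mut line = String::new();
        write_field(&mut line, "LOLLO,BRIGIDA", ',', '"', '\\');
        assert_eq!(line, "\"LOLLO,BRIGIDA\"");
    }
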
diff --git a/src/parser/mod.rs b/src/parser/mod.rs
index 6225681fb..9feb2a776 100644
--- a/src/parser/mod.rs
+++ b/src/parser/mod.rs
@@ -9542,25 +9542,22 @@ impl<'a> Parser<'a> {
             return self.expected("COPY ... FROM STDIN with CSV body", self.peek_token());
         };
 
-        let mut reader_builder = csv::ReaderBuilder::new();
-        reader_builder.has_headers(false);
-
+        let mut delimiter = '\t';
+        let mut quote = '"';
+        let mut escape = '\\';
         let mut null_symbol = "\\N";
 
         // Apply options
         for option in options {
             match option {
                 CopyOption::Delimiter(c) => {
-                    reader_builder.delimiter(*c as u8);
-                }
-                CopyOption::Header(has_header) => {
-                    reader_builder.has_headers(*has_header);
+                    delimiter = *c;
                 }
                 CopyOption::Quote(c) => {
-                    reader_builder.quote(*c as u8);
+                    quote = *c;
                 }
                 CopyOption::Escape(c) => {
-                    reader_builder.escape(Some(*c as u8));
+                    escape = *c;
                 }
                 CopyOption::Null(null) => {
                     null_symbol = null;
@@ -9573,10 +9570,7 @@
         for option in legacy_options {
             match option {
                 CopyLegacyOption::Delimiter(c) => {
-                    reader_builder.delimiter(*c as u8);
-                }
-                CopyLegacyOption::Header => {
-                    reader_builder.has_headers(true);
+                    delimiter = *c;
                 }
                 CopyLegacyOption::Null(null) => {
                     null_symbol = null;
@@ -9584,14 +9578,11 @@
                 CopyLegacyOption::Csv(csv_options) => {
                     for csv_option in csv_options {
                         match csv_option {
-                            CopyLegacyCsvOption::Header => {
-                                reader_builder.has_headers(true);
-                            }
                             CopyLegacyCsvOption::Quote(c) => {
-                                reader_builder.quote(*c as u8);
+                                quote = *c;
                             }
                             CopyLegacyCsvOption::Escape(c) => {
-                                reader_builder.escape(Some(*c as u8));
+                                escape = *c;
                             }
                             _ => {}
                         }
@@ -9601,28 +9592,116 @@
             }
         }
 
+        // Simple CSV parser
         let mut result = vec![];
-        let mut reader = reader_builder.from_reader(body.as_bytes());
-        for record in reader.records() {
-            let record = match record {
-                Ok(rec) => rec,
-                Err(e) => {
-                    return Err(ParserError::ParserError(format!(
-                        "Error parsing CSV data: {}",
-                        e
-                    )))
-                }
-            };
-            let mut row = vec![];
-            for field in record.iter() {
-                if field == null_symbol {
-                    row.push(None);
+        let mut current_row = vec![];
+        let mut current_field = String::new();
+        let mut in_quotes = false;
+        let mut chars = body.chars().peekable();
+        let mut expected_column_count: Option<usize> = None;
+        let mut row_number = 0;
+
+        while let Some(ch) = chars.next() {
+            if in_quotes {
+                if ch == quote {
+                    // Check if it's an escaped quote
+                    if let Some(&next_ch) = chars.peek() {
+                        if next_ch == quote {
+                            // Escaped quote
+                            current_field.push(quote);
+                            chars.next();
+                        } else {
+                            // End of quoted field
+                            in_quotes = false;
+                        }
+                    } else {
+                        // End of quoted field at end of input
+                        in_quotes = false;
+                    }
+                } else if ch == escape {
+                    // Escape character
+                    if let Some(next_ch) = chars.next() {
+                        current_field.push(next_ch);
+                    }
+                } else {
+                    current_field.push(ch);
+                }
+            } else if ch == quote {
+                in_quotes = true;
+            } else if ch == delimiter {
+                // End of field
+                if current_field == null_symbol {
+                    current_row.push(None);
                 } else {
-                    row.push(Some(field.to_string()));
+                    current_row.push(Some(current_field.clone()));
+                }
+                current_field.clear();
+            } else if ch == '\n' || ch == '\r' {
+                // End of record
+                if ch == '\r' {
+                    // Skip \n if it follows \r
+                    if let Some(&'\n') = chars.peek() {
+                        chars.next();
+                    }
+                }
+                if !current_field.is_empty() || !current_row.is_empty() {
+                    if current_field == null_symbol {
+                        current_row.push(None);
+                    } else {
+                        current_row.push(Some(current_field.clone()));
+                    }
current_field.clear(); + + // Validate column count + row_number += 1; + if let Some(expected) = expected_column_count { + if current_row.len() != expected { + return Err(ParserError::ParserError(format!( + "CSV row {} has {} columns, but expected {} columns based on first row", + row_number, + current_row.len(), + expected + ))); + } + } else { + // First row establishes the expected column count + expected_column_count = Some(current_row.len()); + } + + result.push(current_row.clone()); + current_row.clear(); + } + } else { + current_field.push(ch); + } + } + + // Handle remaining field/row + if !current_field.is_empty() || !current_row.is_empty() { + if current_field == null_symbol { + current_row.push(None); + } else { + current_row.push(Some(current_field)); + } + + // Validate column count for last row + row_number += 1; + if let Some(expected) = expected_column_count { + if current_row.len() != expected { + return Err(ParserError::ParserError(format!( + "CSV row {} has {} columns, but expected {} columns based on first row", + row_number, + current_row.len(), + expected + ))); } } - result.push(row); + // Note: if this is the first and only row, we don't need to set expected_column_count + // since there's nothing to validate against + + result.push(current_row); } + Ok(result) } diff --git a/tests/sqlparser_postgres.rs b/tests/sqlparser_postgres.rs index bd337a96d..bfdbabb1c 100644 --- a/tests/sqlparser_postgres.rs +++ b/tests/sqlparser_postgres.rs @@ -1045,6 +1045,30 @@ fn parse_copy_from_stdin() { 12,KARL,BERRY,2017-11-02 19:15:42.308637+08,11.001 \."#; pg_and_generic().verified_stmt(sql_comma_separated); + + let incorrect_csv_sql = r#"COPY public.actor (actor_id, first_name, last_name, last_update, value) FROM STDIN (FORMAT csv, DELIMITER ','); +1,PENELOPE,GUINESS,2006-02-15 09:34:33,0.11111 +2,NICK,WAHLBERG,2006-02-15 09:34:33 +\."#; + let parsed = pg_and_generic().parse_sql_statements(incorrect_csv_sql); + assert_eq!( + parsed.unwrap_err(), + ParserError::ParserError( + "CSV row 2 has 4 columns, but expected 5 columns based on first row".to_string() + ) + ); + + let mixed_incorrect_separators = r#"COPY public.actor (actor_id, first_name, last_name, last_update, value) FROM STDIN (FORMAT csv, DELIMITER ','); +1,PENELOPE,GUINESS,2006-02-15 09:34:33,0.11111 +2 NICK WAHLBERG 2006-02-15 09:34:33,0.22222 +\."#; + let parsed = pg_and_generic().parse_sql_statements(mixed_incorrect_separators); + assert_eq!( + parsed.unwrap_err(), + ParserError::ParserError( + "CSV row 2 has 2 columns, but expected 5 columns based on first row".to_string() + ) + ); } #[test]