From 4f1f91c5c166a2f15ce6e7c499a810c04e62e7e3 Mon Sep 17 00:00:00 2001 From: Mykhailo Bondarenko <70747718+michael-2956@users.noreply.github.com> Date: Sat, 25 Jun 2022 20:17:32 +0300 Subject: [PATCH 1/3] Count characters instead of bytes --- src/tokenizer.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/tokenizer.rs b/src/tokenizer.rs index 91cb16a80..1f1ec6d8f 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -351,11 +351,11 @@ impl<'a> Tokenizer<'a> { } Token::Whitespace(Whitespace::Tab) => self.col += 4, - Token::Word(w) if w.quote_style == None => self.col += w.value.len() as u64, - Token::Word(w) if w.quote_style != None => self.col += w.value.len() as u64 + 2, - Token::Number(s, _) => self.col += s.len() as u64, - Token::SingleQuotedString(s) => self.col += s.len() as u64, - Token::Placeholder(s) => self.col += s.len() as u64, + Token::Word(w) if w.quote_style == None => self.col += w.value.chars().count() as u64, + Token::Word(w) if w.quote_style != None => self.col += w.value.chars().count() as u64 + 2, + Token::Number(s, _) => self.col += s.chars().count() as u64, + Token::SingleQuotedString(s) => self.col += s.chars().count() as u64, + Token::Placeholder(s) => self.col += s.chars().count() as u64, _ => self.col += 1, } From f668e2522fe2c52cdd7ecd52bfd822c1d1ac7d7b Mon Sep 17 00:00:00 2001 From: michael-2956 Date: Thu, 7 Jul 2022 16:22:43 +0300 Subject: [PATCH 2/3] cargo fmt --- src/tokenizer.rs | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/tokenizer.rs b/src/tokenizer.rs index 1f1ec6d8f..5cd95af12 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -351,8 +351,12 @@ impl<'a> Tokenizer<'a> { } Token::Whitespace(Whitespace::Tab) => self.col += 4, - Token::Word(w) if w.quote_style == None => self.col += w.value.chars().count() as u64, - Token::Word(w) if w.quote_style != None => self.col += w.value.chars().count() as u64 + 2, + Token::Word(w) if w.quote_style == None => { + self.col += w.value.chars().count() as u64 + } + Token::Word(w) if w.quote_style != None => { + self.col += w.value.chars().count() as u64 + 2 + } Token::Number(s, _) => self.col += s.chars().count() as u64, Token::SingleQuotedString(s) => self.col += s.chars().count() as u64, Token::Placeholder(s) => self.col += s.chars().count() as u64, From 6d32cca36a9c3576ccac0b94817d68d93c883372 Mon Sep 17 00:00:00 2001 From: michael-2956 Date: Thu, 7 Jul 2022 16:29:44 +0300 Subject: [PATCH 3/3] add tests to PR #529 --- src/tokenizer.rs | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/src/tokenizer.rs b/src/tokenizer.rs index 5cd95af12..73c41f82b 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -1212,6 +1212,22 @@ mod tests { ); } + #[test] + fn tokenize_unterminated_string_literal_utf8() { + let sql = String::from("SELECT \"なにか\" FROM Y WHERE \"なにか\" = 'test;"); + + let dialect = GenericDialect {}; + let mut tokenizer = Tokenizer::new(&dialect, &sql); + assert_eq!( + tokenizer.tokenize(), + Err(TokenizerError { + message: "Unterminated string literal".to_string(), + line: 1, + col: 35 + }) + ); + } + #[test] fn tokenize_invalid_string_cols() { let sql = String::from("\n\nSELECT * FROM table\tمصطفىh");