Skip to content

Commit f8c666c

Browse files
committed
Implement zero-copy tokenization for Word, SingleQuotedString, and Whitespace
Convert token string fields to use `Cow<'a, str>` to enable zero-copy tokenization for commonly used tokens:

- `Word.value`: regular identifiers and keywords now borrow from the source.
- `SingleQuotedString`: string literals borrow from the source when no escape processing is needed.
- `Whitespace`: single-line and multi-line comments borrow from the source.

Also adds a benchmark example (`examples/benchmark_test.rs`) that uses dhat allocation profiling to help measure the performance impact of this commit.
1 parent 9d5f00b commit f8c666c

File tree

8 files changed

+353
-182
lines changed

8 files changed

+353
-182
lines changed

.gitignore

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,4 +18,7 @@ Cargo.lock
1818

1919
*.swp
2020

21-
.DS_store
21+
.DS_store
22+
23+
# dhat profiler output files
24+
dhat*.json

Cargo.toml

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,11 @@ sqlparser_derive = { version = "0.4.0", path = "derive", optional = true }
6060
simple_logger = "5.0"
6161
matches = "0.1"
6262
pretty_assertions = "1"
63+
sysinfo = "0.30"
64+
dhat = "0.3.3"
6365

6466
[package.metadata.docs.rs]
6567
# Document these features on docs.rs
66-
features = ["serde", "visitor"]
68+
features = ["serde", "visitor"]
69+
70+

src/dialect/snowflake.rs

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1251,7 +1251,7 @@ pub fn parse_copy_into(parser: &Parser) -> Result<Statement, ParserError> {
12511251
continue_loop = false;
12521252
let next_token = parser.next_token();
12531253
match next_token.token {
1254-
BorrowedToken::SingleQuotedString(s) => files.push(s),
1254+
BorrowedToken::SingleQuotedString(s) => files.push(s.into_owned()),
12551255
_ => parser.expected("file token", next_token)?,
12561256
};
12571257
if parser.next_token().token.eq(&BorrowedToken::Comma) {
@@ -1266,7 +1266,7 @@ pub fn parse_copy_into(parser: &Parser) -> Result<Statement, ParserError> {
12661266
parser.expect_token(&BorrowedToken::Eq)?;
12671267
let next_token = parser.next_token();
12681268
pattern = Some(match next_token.token {
1269-
BorrowedToken::SingleQuotedString(s) => s,
1269+
BorrowedToken::SingleQuotedString(s) => s.into_owned(),
12701270
_ => parser.expected("pattern", next_token)?,
12711271
});
12721272
// VALIDATION MODE
@@ -1417,7 +1417,7 @@ fn parse_stage_params(parser: &Parser) -> Result<StageParamsObject, ParserError>
14171417
if parser.parse_keyword(Keyword::URL) {
14181418
parser.expect_token(&BorrowedToken::Eq)?;
14191419
url = Some(match parser.next_token().token {
1420-
BorrowedToken::SingleQuotedString(word) => Ok(word),
1420+
BorrowedToken::SingleQuotedString(word) => Ok(word.into_owned()),
14211421
_ => parser.expected("a URL statement", parser.peek_token()),
14221422
}?)
14231423
}
@@ -1432,7 +1432,7 @@ fn parse_stage_params(parser: &Parser) -> Result<StageParamsObject, ParserError>
14321432
if parser.parse_keyword(Keyword::ENDPOINT) {
14331433
parser.expect_token(&BorrowedToken::Eq)?;
14341434
endpoint = Some(match parser.next_token().token {
1435-
BorrowedToken::SingleQuotedString(word) => Ok(word),
1435+
BorrowedToken::SingleQuotedString(word) => Ok(word.into_owned()),
14361436
_ => parser.expected("an endpoint statement", parser.peek_token()),
14371437
}?)
14381438
}
@@ -1486,7 +1486,7 @@ fn parse_session_options(parser: &Parser, set: bool) -> Result<Vec<KeyValueOptio
14861486
options.push(option);
14871487
} else {
14881488
options.push(KeyValueOption {
1489-
option_name: key.value,
1489+
option_name: key.value.to_string(),
14901490
option_value: KeyValueOptionKind::Single(Value::Placeholder(empty())),
14911491
});
14921492
}

src/parser/mod.rs

Lines changed: 74 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414

1515
#[cfg(not(feature = "std"))]
1616
use alloc::{
17+
borrow::Cow,
1718
boxed::Box,
1819
format,
1920
string::{String, ToString},
@@ -25,6 +26,8 @@ use core::{
2526
str::FromStr,
2627
};
2728
use helpers::attached_token::AttachedToken;
29+
#[cfg(feature = "std")]
30+
use std::borrow::Cow;
2831

2932
use log::debug;
3033

@@ -1794,7 +1797,7 @@ impl<'a> Parser<'a> {
17941797
}
17951798
BorrowedToken::SingleQuotedString(s) => {
17961799
let expr =
1797-
Expr::Identifier(Ident::with_quote_and_span('\'', next_token.span, s));
1800+
Expr::Identifier(Ident::with_quote_and_span('\'', next_token.span, s.as_ref()));
17981801
chain.push(AccessExpr::Dot(expr));
17991802
self.advance_token(); // The consumed string
18001803
}
@@ -3893,7 +3896,7 @@ impl<'a> Parser<'a> {
38933896
// any keyword here unquoted.
38943897
keyword: _,
38953898
}) => Ok(JsonPathElem::Dot {
3896-
key: value,
3899+
key: value.to_string(),
38973900
quoted: quote_style.is_some(),
38983901
}),
38993902

@@ -7744,7 +7747,7 @@ impl<'a> Parser<'a> {
77447747
if dialect_of!(self is HiveDialect) && self.parse_keyword(Keyword::COMMENT) {
77457748
let next_token = self.next_token();
77467749
match next_token.token {
7747-
BorrowedToken::SingleQuotedString(str) => Some(CommentDef::WithoutEq(str)),
7750+
BorrowedToken::SingleQuotedString(str) => Some(CommentDef::WithoutEq(str.into_owned())),
77487751
_ => self.expected("comment", next_token)?,
77497752
}
77507753
} else {
@@ -7965,10 +7968,10 @@ impl<'a> Parser<'a> {
79657968

79667969
let comment = match (has_eq, value.token) {
79677970
(true, BorrowedToken::SingleQuotedString(s)) => {
7968-
Ok(Some(SqlOption::Comment(CommentDef::WithEq(s))))
7971+
Ok(Some(SqlOption::Comment(CommentDef::WithEq(s.into_owned()))))
79697972
}
79707973
(false, BorrowedToken::SingleQuotedString(s)) => {
7971-
Ok(Some(SqlOption::Comment(CommentDef::WithoutEq(s))))
7974+
Ok(Some(SqlOption::Comment(CommentDef::WithoutEq(s.into_owned()))))
79727975
}
79737976
(_, token) => self.expected(
79747977
"BorrowedToken::SingleQuotedString",
@@ -8014,8 +8017,8 @@ impl<'a> Parser<'a> {
80148017
let value = self.next_token();
80158018

80168019
let tablespace = match value.token {
8017-
BorrowedToken::Word(Word { value: name, .. })
8018-
| BorrowedToken::SingleQuotedString(name) => {
8020+
BorrowedToken::Word(Word { value: name, .. }) => {
8021+
let name = name.to_string();
80198022
let storage = match self.parse_keyword(Keyword::STORAGE) {
80208023
true => {
80218024
let _ = self.consume_token(&BorrowedToken::Eq);
@@ -8038,6 +8041,28 @@ impl<'a> Parser<'a> {
80388041
storage,
80398042
})))
80408043
}
8044+
BorrowedToken::SingleQuotedString(name) => {
8045+
let storage = match self.parse_keyword(Keyword::STORAGE) {
8046+
true => {
8047+
let _ = self.consume_token(&BorrowedToken::Eq);
8048+
let storage_token = self.next_token();
8049+
match &storage_token.token {
8050+
BorrowedToken::Word(w) => match w.value.to_uppercase().as_str() {
8051+
"DISK" => Some(StorageType::Disk),
8052+
"MEMORY" => Some(StorageType::Memory),
8053+
_ => self.expected("DISK or MEMORY", storage_token)?,
8054+
},
8055+
_ => self.expected("BorrowedToken::Word", storage_token)?,
8056+
}
8057+
}
8058+
false => None,
8059+
};
8060+
8061+
Ok(Some(SqlOption::TableSpace(TablespaceOption {
8062+
name: name.into_owned(),
8063+
storage,
8064+
})))
8065+
}
80418066
_ => {
80428067
return self.expected("BorrowedToken::Word", value)?;
80438068
}
@@ -8176,7 +8201,7 @@ impl<'a> Parser<'a> {
81768201
pub fn parse_comment_value(&self) -> Result<String, ParserError> {
81778202
let next_token = self.next_token();
81788203
let value = match next_token.token {
8179-
BorrowedToken::SingleQuotedString(str) => str,
8204+
BorrowedToken::SingleQuotedString(str) => str.into_owned(),
81808205
BorrowedToken::DollarQuotedString(str) => str.value,
81818206
_ => self.expected("string literal", next_token)?,
81828207
};
@@ -10381,8 +10406,8 @@ impl<'a> Parser<'a> {
1038110406
}
1038210407
Keyword::NULL => ok_value(Value::Null),
1038310408
Keyword::NoKeyword if w.quote_style.is_some() => match w.quote_style {
10384-
Some('"') => ok_value(Value::DoubleQuotedString(w.value)),
10385-
Some('\'') => ok_value(Value::SingleQuotedString(w.value)),
10409+
Some('"') => ok_value(Value::DoubleQuotedString(w.value.into_owned())),
10410+
Some('\'') => ok_value(Value::SingleQuotedString(w.value.into_owned())),
1038610411
_ => self.expected(
1038710412
"A value?",
1038810413
TokenWithSpan {
@@ -10484,11 +10509,18 @@ impl<'a> Parser<'a> {
1048410509

1048510510
fn maybe_concat_string_literal(&self, mut str: String) -> String {
1048610511
if self.dialect.supports_string_literal_concatenation() {
10487-
while let BorrowedToken::SingleQuotedString(ref s)
10488-
| BorrowedToken::DoubleQuotedString(ref s) = self.peek_token_ref().token
10489-
{
10490-
str.push_str(s.clone().as_str());
10491-
self.advance_token();
10512+
loop {
10513+
match &self.peek_token_ref().token {
10514+
BorrowedToken::SingleQuotedString(s) => {
10515+
str.push_str(s.as_ref());
10516+
self.advance_token();
10517+
}
10518+
BorrowedToken::DoubleQuotedString(s) => {
10519+
str.push_str(s);
10520+
self.advance_token();
10521+
}
10522+
_ => break,
10523+
}
1049210524
}
1049310525
}
1049410526
str
@@ -10584,8 +10616,8 @@ impl<'a> Parser<'a> {
1058410616
value,
1058510617
keyword: Keyword::NoKeyword,
1058610618
..
10587-
}) => Ok(value),
10588-
BorrowedToken::SingleQuotedString(s) => Ok(s),
10619+
}) => Ok(value.into_owned()),
10620+
BorrowedToken::SingleQuotedString(s) => Ok(s.into_owned()),
1058910621
BorrowedToken::DoubleQuotedString(s) => Ok(s),
1059010622
BorrowedToken::EscapedStringLiteral(s) if dialect_of!(self is PostgreSqlDialect | GenericDialect) => {
1059110623
Ok(s)
@@ -11100,7 +11132,7 @@ impl<'a> Parser<'a> {
1110011132
loop {
1110111133
let next_token = self.next_token();
1110211134
match next_token.token {
11103-
BorrowedToken::SingleQuotedString(value) => values.push(value),
11135+
BorrowedToken::SingleQuotedString(value) => values.push(value.into_owned()),
1110411136
_ => self.expected("a string", next_token)?,
1110511137
}
1110611138
let next_token = self.next_token();
@@ -12125,7 +12157,7 @@ impl<'a> Parser<'a> {
1212512157
match next_token.token {
1212612158
BorrowedToken::Word(w) => modifiers.push(w.to_string()),
1212712159
BorrowedToken::Number(n, _) => modifiers.push(n),
12128-
BorrowedToken::SingleQuotedString(s) => modifiers.push(s),
12160+
BorrowedToken::SingleQuotedString(s) => modifiers.push(s.into_owned()),
1212912161

1213012162
BorrowedToken::Comma => {
1213112163
continue;
@@ -13261,15 +13293,15 @@ impl<'a> Parser<'a> {
1326113293
if token2 == BorrowedToken::Period {
1326213294
match token1.token {
1326313295
BorrowedToken::Word(w) => {
13264-
schema_name = w.value;
13296+
schema_name = w.value.to_string();
1326513297
}
1326613298
_ => {
1326713299
return self.expected("Schema name", token1);
1326813300
}
1326913301
}
1327013302
match token3.token {
1327113303
BorrowedToken::Word(w) => {
13272-
table_name = w.value;
13304+
table_name = w.value.to_string();
1327313305
}
1327413306
_ => {
1327513307
return self.expected("Table name", token3);
@@ -13282,7 +13314,7 @@ impl<'a> Parser<'a> {
1328213314
} else {
1328313315
match token1.token {
1328413316
BorrowedToken::Word(w) => {
13285-
table_name = w.value;
13317+
table_name = w.value.to_string();
1328613318
}
1328713319
_ => {
1328813320
return self.expected("Table name", token1);
@@ -14408,7 +14440,9 @@ impl<'a> Parser<'a> {
1440814440
None => {
1440914441
let next_token = self.next_token();
1441014442
if let BorrowedToken::Word(w) = next_token.token {
14411-
Expr::Value(Value::Placeholder(w.value).with_span(next_token.span))
14443+
Expr::Value(
14444+
Value::Placeholder(w.value.into_owned()).with_span(next_token.span),
14445+
)
1441214446
} else {
1441314447
return parser_err!(
1441414448
"Expecting number or byte length e.g. 100M",
@@ -14962,7 +14996,7 @@ impl<'a> Parser<'a> {
1496214996
let r#type = self.parse_data_type()?;
1496314997
let path = if let BorrowedToken::SingleQuotedString(path) = self.peek_token().token {
1496414998
self.next_token();
14965-
Some(path)
14999+
Some(path.into_owned())
1496615000
} else {
1496715001
None
1496815002
};
@@ -16491,7 +16525,7 @@ impl<'a> Parser<'a> {
1649116525
let opt_ilike = if self.parse_keyword(Keyword::ILIKE) {
1649216526
let next_token = self.next_token();
1649316527
let pattern = match next_token.token {
16494-
BorrowedToken::SingleQuotedString(s) => s,
16528+
BorrowedToken::SingleQuotedString(s) => s.into_owned(),
1649516529
_ => return self.expected("ilike pattern", next_token),
1649616530
};
1649716531
Some(IlikeSelectItem { pattern })
@@ -17128,7 +17162,11 @@ impl<'a> Parser<'a> {
1712817162
(true, _) => BorrowedToken::RParen,
1712917163
(false, BorrowedToken::EOF) => BorrowedToken::EOF,
1713017164
(false, BorrowedToken::Word(w)) if end_kws.contains(&w.keyword) => {
17131-
BorrowedToken::Word(w)
17165+
BorrowedToken::Word(Word {
17166+
value: Cow::Owned(w.value.into_owned()),
17167+
quote_style: w.quote_style,
17168+
keyword: w.keyword,
17169+
})
1713217170
}
1713317171
(false, _) => BorrowedToken::SemiColon,
1713417172
};
@@ -18327,27 +18365,27 @@ impl<'a> Parser<'a> {
1832718365
self.expect_token(&BorrowedToken::Eq)?;
1832818366
match self.peek_token().token {
1832918367
BorrowedToken::SingleQuotedString(_) => Ok(KeyValueOption {
18330-
option_name: key.value.clone(),
18368+
option_name: key.value.to_string(),
1833118369
option_value: KeyValueOptionKind::Single(self.parse_value()?.into()),
1833218370
}),
1833318371
BorrowedToken::Word(word)
1833418372
if word.keyword == Keyword::TRUE || word.keyword == Keyword::FALSE =>
1833518373
{
1833618374
Ok(KeyValueOption {
18337-
option_name: key.value.clone(),
18375+
option_name: key.value.to_string(),
1833818376
option_value: KeyValueOptionKind::Single(self.parse_value()?.into()),
1833918377
})
1834018378
}
1834118379
BorrowedToken::Number(..) => Ok(KeyValueOption {
18342-
option_name: key.value.clone(),
18380+
option_name: key.value.to_string(),
1834318381
option_value: KeyValueOptionKind::Single(self.parse_value()?.into()),
1834418382
}),
1834518383
BorrowedToken::Word(word) => {
1834618384
self.next_token();
1834718385
Ok(KeyValueOption {
18348-
option_name: key.value.clone(),
18386+
option_name: key.value.to_string(),
1834918387
option_value: KeyValueOptionKind::Single(Value::Placeholder(
18350-
word.value.clone(),
18388+
word.value.to_string(),
1835118389
)),
1835218390
})
1835318391
}
@@ -18365,12 +18403,12 @@ impl<'a> Parser<'a> {
1836518403
Some(values) => {
1836618404
let values = values.into_iter().map(|v| v.value).collect();
1836718405
Ok(KeyValueOption {
18368-
option_name: key.value.clone(),
18406+
option_name: key.value.to_string(),
1836918407
option_value: KeyValueOptionKind::Multi(values),
1837018408
})
1837118409
}
1837218410
None => Ok(KeyValueOption {
18373-
option_name: key.value.clone(),
18411+
option_name: key.value.to_string(),
1837418412
option_value: KeyValueOptionKind::KeyValueOptions(Box::new(
1837518413
self.parse_key_value_options(true, &[])?,
1837618414
)),
@@ -18405,11 +18443,11 @@ fn maybe_prefixed_expr(expr: Expr, prefix: Option<Ident>) -> Expr {
1840518443
}
1840618444
}
1840718445

18408-
impl Word {
18446+
impl Word<'_> {
1840918447
#[deprecated(since = "0.54.0", note = "please use `into_ident` instead")]
1841018448
pub fn to_ident(&self, span: Span) -> Ident {
1841118449
Ident {
18412-
value: self.value.clone(),
18450+
value: self.value.to_string(),
1841318451
quote_style: self.quote_style,
1841418452
span,
1841518453
}
@@ -18418,7 +18456,7 @@ impl Word {
1841818456
/// Convert this word into an [`Ident`] identifier
1841918457
pub fn into_ident(self, span: Span) -> Ident {
1842018458
Ident {
18421-
value: self.value,
18459+
value: self.value.into_owned(),
1842218460
quote_style: self.quote_style,
1842318461
span,
1842418462
}

0 commit comments

Comments
 (0)