From fb225ea38ddfee2cc1405c4ed0ad6933993db832 Mon Sep 17 00:00:00 2001 From: klondikedragon <88254524+klondikedragon@users.noreply.github.com> Date: Wed, 26 Oct 2022 17:48:18 -0600 Subject: [PATCH] Fix gen lexer word boundary, case insensitive, and literal matching cases (plus conformance tests) (#274) * Allow gen conformance test to be run individually * Conformance test for word boundary \b * Fix case insensitive matching in gen lexer * Fix match literal at end of string in gen lexer * Fix word boundary case in gen lexer If \b is used at start of pattern, it could match right before the current position, rather than right after. Check both cases. --- cmd/participle/gen_lexer_cmd.go | 32 +++- .../internal/conformance/conformance_test.go | 146 ++++++++++++++---- 2 files changed, 138 insertions(+), 40 deletions(-) diff --git a/cmd/participle/gen_lexer_cmd.go b/cmd/participle/gen_lexer_cmd.go index 368b3e2c..97f83647 100644 --- a/cmd/participle/gen_lexer_cmd.go +++ b/cmd/participle/gen_lexer_cmd.go @@ -188,10 +188,14 @@ func generateRegexMatch(w io.Writer, lexerName, name, pattern string) error { // Fast-path a single literal. if len(flattened) == 1 && re.Op == syntax.OpLiteral { n := utf8.RuneCountInString(string(re.Rune)) - if n == 1 { - fmt.Fprintf(w, "if p < len(s) && s[p] == %q {\n", re.Rune[0]) + if re.Flags&syntax.FoldCase != 0 { + fmt.Fprintf(w, "if p+%d <= len(s) && strings.EqualFold(s[p:p+%d], %q) {\n", n, n, string(re.Rune)) } else { - fmt.Fprintf(w, "if p+%d < len(s) && s[p:p+%d] == %q {\n", n, n, string(re.Rune)) + if n == 1 { + fmt.Fprintf(w, "if p < len(s) && s[p] == %q {\n", re.Rune[0]) + } else { + fmt.Fprintf(w, "if p+%d <= len(s) && s[p:p+%d] == %q {\n", n, n, string(re.Rune)) + } } fmt.Fprintf(w, "groups[0] = p\n") fmt.Fprintf(w, "groups[1] = p + %d\n", n) @@ -219,10 +223,14 @@ func generateRegexMatch(w io.Writer, lexerName, name, pattern string) error { case syntax.OpLiteral: // matches Runes sequence n := utf8.RuneCountInString(string(re.Rune)) - if n == 1 { - fmt.Fprintf(w, "if p < len(s) && s[p] == %q { return p+1 }\n", re.Rune[0]) + if re.Flags&syntax.FoldCase != 0 { + fmt.Fprintf(w, "if p+%d <= len(s) && strings.EqualFold(s[p:p+%d], %q) { return p+%d }\n", n, n, string(re.Rune), n) } else { - fmt.Fprintf(w, "if p+%d < len(s) && s[p:p+%d] == %q { return p+%d }\n", n, n, string(re.Rune), n) + if n == 1 { + fmt.Fprintf(w, "if p < len(s) && s[p] == %q { return p+1 }\n", re.Rune[0]) + } else { + fmt.Fprintf(w, "if p+%d <= len(s) && s[p:p+%d] == %q { return p+%d }\n", n, n, string(re.Rune), n) + } } fmt.Fprintf(w, "return -1\n") @@ -284,11 +292,13 @@ func generateRegexMatch(w io.Writer, lexerName, name, pattern string) error { syntax.OpBeginText, syntax.OpEndText, syntax.OpBeginLine, syntax.OpEndLine: fmt.Fprintf(w, "var l, u rune = -1, -1\n") + fmt.Fprintf(w, "var checkPrevChar = false\n") fmt.Fprintf(w, "if p == 0 {\n") decodeRune(w, "0", "u", "_") fmt.Fprintf(w, "} else if p == len(s) {\n") fmt.Fprintf(w, " l, _ = utf8.DecodeLastRuneInString(s)\n") fmt.Fprintf(w, "} else {\n") + fmt.Fprintf(w, " checkPrevChar = true\n") fmt.Fprintf(w, " var ln int\n") decodeRune(w, "p", "l", "ln") fmt.Fprintf(w, " if p+ln <= len(s) {\n") @@ -305,6 +315,16 @@ func generateRegexMatch(w io.Writer, lexerName, name, pattern string) error { syntax.OpEndLine: "EmptyEndLine", } fmt.Fprintf(w, "if op & syntax.%s != 0 { return p }\n", lut[re.Op]) + // If this isn't the start or end of the string, we also have to check if we match + // the preceding character (zero length op could have matched right before) + fmt.Fprintf(w, "if checkPrevChar {\n") + // decode the character immediately previous to this one (conditional logic above + // guarantees that p is > 0) + fmt.Fprintf(w, " l, _ = utf8.DecodeLastRuneInString(s[0:p])\n") + decodeRune(w, "p", "u", "_") + fmt.Fprintf(w, " op := syntax.EmptyOpContext(l, u)\n") + fmt.Fprintf(w, " if op & syntax.%s != 0 { return p }\n", lut[re.Op]) + fmt.Fprintf(w, "}\n") fmt.Fprintf(w, "return -1\n") case syntax.OpCapture: // capturing subexpression with index Cap, optional name Name diff --git a/lexer/internal/conformance/conformance_test.go b/lexer/internal/conformance/conformance_test.go index aa8dc66c..c39a7521 100644 --- a/lexer/internal/conformance/conformance_test.go +++ b/lexer/internal/conformance/conformance_test.go @@ -16,31 +16,55 @@ import ( var conformanceLexer = lexer.MustStateful(lexer.Rules{ "Root": { - {"String", `"`, lexer.Push("String")}, - // {"Heredoc", `<<(\w+)`, lexer.Push("Heredoc")}, + {"ExprTest", `EXPRTEST:`, lexer.Push("ExprTest")}, + {"LiteralTest", `LITTEST:`, lexer.Push("LiteralTest")}, + {"CaseInsensitiveTest", `CITEST:`, lexer.Push("CaseInsensitiveTest")}, + {"WordBoundaryTest", `WBTEST:`, lexer.Push("WordBoundaryTest")}, }, - "String": { - {"Escaped", `\\.`, nil}, - {"StringEnd", `"`, lexer.Pop()}, + "ExprTest": { + {"ExprString", `"`, lexer.Push("ExprString")}, + // {"ExprHeredoc", `<<(\w+)`, lexer.Push("ExprHeredoc")}, + }, + "ExprString": { + {"ExprEscaped", `\\.`, nil}, + {"ExprStringEnd", `"`, lexer.Pop()}, {"Expr", `\${`, lexer.Push("Expr")}, - {"Char", `[^$"\\]+`, nil}, + {"ExprChar", `[^$"\\]+`, nil}, }, "Expr": { - lexer.Include("Root"), + lexer.Include("ExprTest"), {`Whitespace`, `\s+`, nil}, - {`Oper`, `[-+/*%]`, nil}, - {"Ident", `\w+`, lexer.Push("Reference")}, + {`ExprOper`, `[-+/*%]`, nil}, + {"Ident", `\w+`, lexer.Push("ExprReference")}, {"ExprEnd", `}`, lexer.Pop()}, }, - "Reference": { - {"Dot", `\.`, nil}, + "ExprReference": { + {"ExprDot", `\.`, nil}, {"Ident", `\w+`, nil}, lexer.Return(), }, - // "Heredoc": { - // {"End", `\1`, lexer.Pop()}, + // "ExprHeredoc": { + // {"ExprHeredocEnd", `\1`, lexer.Pop()}, // lexer.Include("Expr"), // }, + "LiteralTest": { + {`LITOne`, `ONE`, nil}, + {`LITKeyword`, `SELECT|FROM|WHERE|LIKE`, nil}, + {"Ident", `\w+`, nil}, + {"Whitespace", `\s+`, nil}, + }, + "CaseInsensitiveTest": { + {`ABCWord`, `[aA][bB][cC]`, nil}, + {`CIKeyword`, `(?i)(SELECT|from|WHERE|LIKE)`, nil}, + {"Ident", `\w+`, nil}, + {"Whitespace", `\s+`, nil}, + }, + "WordBoundaryTest": { + {`WBKeyword`, `\b(?:abc|xyz)\b`, nil}, + {"Slash", `/`, nil}, + {"Ident", `\w+`, nil}, + {"Whitespace", `\s+`, nil}, + }, }) type token struct { @@ -55,43 +79,97 @@ func testLexer(t *testing.T, lex lexer.Definition) { input string expected []token }{ - {"Push", `"${"Hello ${name + "!"}"}"`, []token{ - {"String", "\""}, + {"ExprPush", `EXPRTEST:"${"Hello ${name + "!"}"}"`, []token{ + {"ExprString", "\""}, {"Expr", "${"}, - {"String", "\""}, - {"Char", "Hello "}, + {"ExprString", "\""}, + {"ExprChar", "Hello "}, {"Expr", "${"}, {"Ident", "name"}, {"Whitespace", " "}, - {"Oper", "+"}, + {"ExprOper", "+"}, {"Whitespace", " "}, - {"String", "\""}, - {"Char", "!"}, - {"StringEnd", "\""}, + {"ExprString", "\""}, + {"ExprChar", "!"}, + {"ExprStringEnd", "\""}, {"ExprEnd", "}"}, - {"StringEnd", "\""}, + {"ExprStringEnd", "\""}, {"ExprEnd", "}"}, - {"StringEnd", "\""}, + {"ExprStringEnd", "\""}, }}, - {"Reference", `"${user.name}"`, []token{ - {"String", "\""}, + {"ExprReference", `EXPRTEST:"${user.name}"`, []token{ + {"ExprString", "\""}, {"Expr", "${"}, {"Ident", "user"}, - {"Dot", "."}, + {"ExprDot", "."}, {"Ident", "name"}, {"ExprEnd", "}"}, - {"StringEnd", "\""}, + {"ExprStringEnd", "\""}, }}, // TODO(alecthomas): Once backreferences are supported, this will work. - // {"Backref", `<