Fix gen lexer word boundary, case insensitive, and literal matching cases (plus conformance tests) (#274)

* Allow gen conformance test to be run individually
* Conformance test for word boundary \b
* Fix case insensitive matching in gen lexer
* Fix match literal at end of string in gen lexer
* Fix word boundary case in gen lexer

If \b is used at the start of a pattern, it could match right before the
current position rather than right after it. Check both cases.
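
As a minimal illustration of the failure mode (not part of the commit): Go's `regexp/syntax.EmptyOpContext` reports which zero-width assertions hold at the boundary between two runes, and before this fix the generated matcher only consulted the boundary at and after `p`, never the one formed with the preceding rune.

```go
package main

import (
	"fmt"
	"regexp/syntax"
)

func main() {
	// Lexing "WBTEST:xyz" positioned at 'x' (p == 7) against `\bxyz\b`.
	// Boundary the old generated code tested: between s[p] and s[p+1].
	op := syntax.EmptyOpContext('x', 'y')
	fmt.Println(op&syntax.EmptyWordBoundary != 0) // false: 'x'..'y' is inside a word

	// Boundary the fix also tests: between s[p-1] and s[p].
	op = syntax.EmptyOpContext(':', 'x')
	fmt.Println(op&syntax.EmptyWordBoundary != 0) // true: \b matches just before p
}
```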
klondikedragon committed Oct 26, 2022
1 parent 92cfb1a commit fb225ea
Showing 2 changed files with 138 additions and 40 deletions.
cmd/participle/gen_lexer_cmd.go (32 changes: 26 additions & 6 deletions)
```diff
@@ -188,10 +188,14 @@ func generateRegexMatch(w io.Writer, lexerName, name, pattern string) error {
 		// Fast-path a single literal.
 		if len(flattened) == 1 && re.Op == syntax.OpLiteral {
 			n := utf8.RuneCountInString(string(re.Rune))
-			if n == 1 {
-				fmt.Fprintf(w, "if p < len(s) && s[p] == %q {\n", re.Rune[0])
+			if re.Flags&syntax.FoldCase != 0 {
+				fmt.Fprintf(w, "if p+%d <= len(s) && strings.EqualFold(s[p:p+%d], %q) {\n", n, n, string(re.Rune))
 			} else {
-				fmt.Fprintf(w, "if p+%d < len(s) && s[p:p+%d] == %q {\n", n, n, string(re.Rune))
+				if n == 1 {
+					fmt.Fprintf(w, "if p < len(s) && s[p] == %q {\n", re.Rune[0])
+				} else {
+					fmt.Fprintf(w, "if p+%d <= len(s) && s[p:p+%d] == %q {\n", n, n, string(re.Rune))
+				}
 			}
 			fmt.Fprintf(w, "groups[0] = p\n")
 			fmt.Fprintf(w, "groups[1] = p + %d\n", n)
```
```diff
@@ -219,10 +223,14 @@ func generateRegexMatch(w io.Writer, lexerName, name, pattern string) error {
 
 	case syntax.OpLiteral: // matches Runes sequence
 		n := utf8.RuneCountInString(string(re.Rune))
-		if n == 1 {
-			fmt.Fprintf(w, "if p < len(s) && s[p] == %q { return p+1 }\n", re.Rune[0])
+		if re.Flags&syntax.FoldCase != 0 {
+			fmt.Fprintf(w, "if p+%d <= len(s) && strings.EqualFold(s[p:p+%d], %q) { return p+%d }\n", n, n, string(re.Rune), n)
 		} else {
-			fmt.Fprintf(w, "if p+%d < len(s) && s[p:p+%d] == %q { return p+%d }\n", n, n, string(re.Rune), n)
+			if n == 1 {
+				fmt.Fprintf(w, "if p < len(s) && s[p] == %q { return p+1 }\n", re.Rune[0])
+			} else {
+				fmt.Fprintf(w, "if p+%d <= len(s) && s[p:p+%d] == %q { return p+%d }\n", n, n, string(re.Rune), n)
+			}
 		}
 		fmt.Fprintf(w, "return -1\n")
 
```
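The `<` versus `<=` change in the emitted bound is what fixes matching a literal at the very end of the input; the new `OneLiteralAtEnd` and `KeywordLiteralAtEnd` conformance cases cover exactly this. A small illustration:

```go
package main

import "fmt"

func main() {
	s := "LITTEST:ONE"
	p, n := 8, 3 // "ONE" starts at index 8 and ends exactly at len(s)
	fmt.Println(p+n < len(s))  // false: the old bound rejected the match
	fmt.Println(p+n <= len(s)) // true:  the fixed bound accepts it
}
```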
```diff
@@ -284,11 +292,13 @@ func generateRegexMatch(w io.Writer, lexerName, name, pattern string) error {
 		syntax.OpBeginText, syntax.OpEndText,
 		syntax.OpBeginLine, syntax.OpEndLine:
 		fmt.Fprintf(w, "var l, u rune = -1, -1\n")
+		fmt.Fprintf(w, "var checkPrevChar = false\n")
 		fmt.Fprintf(w, "if p == 0 {\n")
 		decodeRune(w, "0", "u", "_")
 		fmt.Fprintf(w, "} else if p == len(s) {\n")
 		fmt.Fprintf(w, " l, _ = utf8.DecodeLastRuneInString(s)\n")
 		fmt.Fprintf(w, "} else {\n")
+		fmt.Fprintf(w, " checkPrevChar = true\n")
 		fmt.Fprintf(w, " var ln int\n")
 		decodeRune(w, "p", "l", "ln")
 		fmt.Fprintf(w, " if p+ln <= len(s) {\n")
@@ -305,6 +315,16 @@ func generateRegexMatch(w io.Writer, lexerName, name, pattern string) error {
 			syntax.OpEndLine: "EmptyEndLine",
 		}
 		fmt.Fprintf(w, "if op & syntax.%s != 0 { return p }\n", lut[re.Op])
+		// If this isn't the start or end of the string, we also have to check if we match
+		// the preceding character (zero length op could have matched right before)
+		fmt.Fprintf(w, "if checkPrevChar {\n")
+		// decode the character immediately previous to this one (conditional logic above
+		// guarantees that p is > 0)
+		fmt.Fprintf(w, " l, _ = utf8.DecodeLastRuneInString(s[0:p])\n")
+		decodeRune(w, "p", "u", "_")
+		fmt.Fprintf(w, " op := syntax.EmptyOpContext(l, u)\n")
+		fmt.Fprintf(w, " if op & syntax.%s != 0 { return p }\n", lut[re.Op])
+		fmt.Fprintf(w, "}\n")
 		fmt.Fprintf(w, "return -1\n")
 
 	case syntax.OpCapture: // capturing subexpression with index Cap, optional name Name
```
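
Put together, the zero-width matcher the generator now emits has roughly this shape (a hand-written sketch using the names from the diff, not actual generator output):

```go
package main

import (
	"fmt"
	"regexp/syntax"
	"unicode/utf8"
)

// matchWordBoundary sketches the emitted code for a zero-width \b.
func matchWordBoundary(s string, p int) int {
	var l, u rune = -1, -1
	checkPrevChar := false
	if p == 0 {
		u, _ = utf8.DecodeRuneInString(s)
	} else if p == len(s) {
		l, _ = utf8.DecodeLastRuneInString(s)
	} else {
		checkPrevChar = true
		var ln int
		l, ln = utf8.DecodeRuneInString(s[p:])
		if p+ln <= len(s) {
			u, _ = utf8.DecodeRuneInString(s[p+ln:])
		}
	}
	if syntax.EmptyOpContext(l, u)&syntax.EmptyWordBoundary != 0 {
		return p
	}
	if checkPrevChar {
		// The boundary may sit just before p; re-test against the
		// preceding rune (p > 0 is guaranteed by the branch above).
		l, _ = utf8.DecodeLastRuneInString(s[0:p])
		u, _ = utf8.DecodeRuneInString(s[p:])
		if syntax.EmptyOpContext(l, u)&syntax.EmptyWordBoundary != 0 {
			return p
		}
	}
	return -1
}

func main() {
	fmt.Println(matchWordBoundary("WBTEST:xyz", 7)) // 7: boundary between ':' and 'x'
	fmt.Println(matchWordBoundary("WBTEST:xyz", 8)) // -1: no boundary inside "xyz"
}
```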
lexer/internal/conformance/conformance_test.go (146 changes: 112 additions & 34 deletions)
```diff
@@ -16,31 +16,55 @@ import (
 
 var conformanceLexer = lexer.MustStateful(lexer.Rules{
 	"Root": {
-		{"String", `"`, lexer.Push("String")},
-		// {"Heredoc", `<<(\w+)`, lexer.Push("Heredoc")},
+		{"ExprTest", `EXPRTEST:`, lexer.Push("ExprTest")},
+		{"LiteralTest", `LITTEST:`, lexer.Push("LiteralTest")},
+		{"CaseInsensitiveTest", `CITEST:`, lexer.Push("CaseInsensitiveTest")},
+		{"WordBoundaryTest", `WBTEST:`, lexer.Push("WordBoundaryTest")},
 	},
-	"String": {
-		{"Escaped", `\\.`, nil},
-		{"StringEnd", `"`, lexer.Pop()},
+	"ExprTest": {
+		{"ExprString", `"`, lexer.Push("ExprString")},
+		// {"ExprHeredoc", `<<(\w+)`, lexer.Push("ExprHeredoc")},
+	},
+	"ExprString": {
+		{"ExprEscaped", `\\.`, nil},
+		{"ExprStringEnd", `"`, lexer.Pop()},
 		{"Expr", `\${`, lexer.Push("Expr")},
-		{"Char", `[^$"\\]+`, nil},
+		{"ExprChar", `[^$"\\]+`, nil},
 	},
 	"Expr": {
-		lexer.Include("Root"),
+		lexer.Include("ExprTest"),
 		{`Whitespace`, `\s+`, nil},
-		{`Oper`, `[-+/*%]`, nil},
-		{"Ident", `\w+`, lexer.Push("Reference")},
+		{`ExprOper`, `[-+/*%]`, nil},
+		{"Ident", `\w+`, lexer.Push("ExprReference")},
 		{"ExprEnd", `}`, lexer.Pop()},
 	},
-	"Reference": {
-		{"Dot", `\.`, nil},
+	"ExprReference": {
+		{"ExprDot", `\.`, nil},
 		{"Ident", `\w+`, nil},
 		lexer.Return(),
 	},
-	// "Heredoc": {
-	// 	{"End", `\1`, lexer.Pop()},
+	// "ExprHeredoc": {
+	// 	{"ExprHeredocEnd", `\1`, lexer.Pop()},
 	// 	lexer.Include("Expr"),
 	// },
+	"LiteralTest": {
+		{`LITOne`, `ONE`, nil},
+		{`LITKeyword`, `SELECT|FROM|WHERE|LIKE`, nil},
+		{"Ident", `\w+`, nil},
+		{"Whitespace", `\s+`, nil},
+	},
+	"CaseInsensitiveTest": {
+		{`ABCWord`, `[aA][bB][cC]`, nil},
+		{`CIKeyword`, `(?i)(SELECT|from|WHERE|LIKE)`, nil},
+		{"Ident", `\w+`, nil},
+		{"Whitespace", `\s+`, nil},
+	},
+	"WordBoundaryTest": {
+		{`WBKeyword`, `\b(?:abc|xyz)\b`, nil},
+		{"Slash", `/`, nil},
+		{"Ident", `\w+`, nil},
+		{"Whitespace", `\s+`, nil},
+	},
 })
 
 type token struct {
```
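
Each `Root` rule now routes a prefixed input into an isolated state, so a single lexer definition can exercise every fixed code path. A hypothetical driver inside this package (assuming participle v2's stateful lexer API) would look like the sketch below; the expected tokens match the `WordBoundarySlash` test case further down.

```go
// Hypothetical usage sketch, assuming conformanceLexer.LexString and
// lexer.ConsumeAll from participle v2 (as used by the tests below).
l, err := conformanceLexer.LexString("", `WBTEST:xyz/hello world`)
if err != nil {
	panic(err)
}
toks, err := lexer.ConsumeAll(l)
if err != nil {
	panic(err)
}
for _, t := range toks {
	fmt.Println(t.Type, t.Value)
}
// Prints the WordBoundaryTest "WBTEST:" prefix token first, then
// WBKeyword "xyz", Slash "/", Ident "hello", Whitespace " ", Ident "world", EOF.
```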
```diff
@@ -55,43 +79,97 @@ func testLexer(t *testing.T, lex lexer.Definition) {
 		input    string
 		expected []token
 	}{
-		{"Push", `"${"Hello ${name + "!"}"}"`, []token{
-			{"String", "\""},
+		{"ExprPush", `EXPRTEST:"${"Hello ${name + "!"}"}"`, []token{
+			{"ExprString", "\""},
 			{"Expr", "${"},
-			{"String", "\""},
-			{"Char", "Hello "},
+			{"ExprString", "\""},
+			{"ExprChar", "Hello "},
 			{"Expr", "${"},
 			{"Ident", "name"},
 			{"Whitespace", " "},
-			{"Oper", "+"},
+			{"ExprOper", "+"},
 			{"Whitespace", " "},
-			{"String", "\""},
-			{"Char", "!"},
-			{"StringEnd", "\""},
+			{"ExprString", "\""},
+			{"ExprChar", "!"},
+			{"ExprStringEnd", "\""},
 			{"ExprEnd", "}"},
-			{"StringEnd", "\""},
+			{"ExprStringEnd", "\""},
 			{"ExprEnd", "}"},
-			{"StringEnd", "\""},
+			{"ExprStringEnd", "\""},
 		}},
-		{"Reference", `"${user.name}"`, []token{
-			{"String", "\""},
+		{"ExprReference", `EXPRTEST:"${user.name}"`, []token{
+			{"ExprString", "\""},
 			{"Expr", "${"},
 			{"Ident", "user"},
-			{"Dot", "."},
+			{"ExprDot", "."},
 			{"Ident", "name"},
 			{"ExprEnd", "}"},
-			{"StringEnd", "\""},
+			{"ExprStringEnd", "\""},
 		}},
 		// TODO(alecthomas): Once backreferences are supported, this will work.
-		// {"Backref", `<<EOF
+		// {"Backref", `EXPRTEST:<<EOF
 		// heredoc
 		// EOF`, []token{
-		// 	{"Heredoc", "<<EOF"},
+		// 	{"ExprHeredoc", "<<EOF"},
 		// 	{"Whitespace", "\n"},
 		// 	{"Ident", "heredoc"},
 		// 	{"Whitespace", "\n"},
-		// 	{"End", "EOF"},
+		// 	{"ExprHeredocEnd", "EOF"},
 		// }},
+		{"CaseInsensitiveSimple", `CITEST:hello aBC world`, []token{
+			{"Ident", "hello"},
+			{"Whitespace", " "},
+			{"ABCWord", "aBC"},
+			{"Whitespace", " "},
+			{"Ident", "world"},
+		}},
+		{"CaseInsensitiveMixed", `CITEST:hello SeLeCt FROM world where END`, []token{
+			{"Ident", "hello"},
+			{"Whitespace", " "},
+			{"CIKeyword", "SeLeCt"},
+			{"Whitespace", " "},
+			{"CIKeyword", "FROM"},
+			{"Whitespace", " "},
+			{"Ident", "world"},
+			{"Whitespace", " "},
+			{"CIKeyword", "where"},
+			{"Whitespace", " "},
+			{"Ident", "END"},
+		}},
+		{"OneLiteralAtEnd", `LITTEST:ONE`, []token{
+			{"LITOne", "ONE"},
+		}},
+		{"KeywordLiteralAtEnd", `LITTEST:SELECT`, []token{
+			{"LITKeyword", "SELECT"},
+		}},
+		{"LiteralMixed", `LITTEST:hello ONE test LIKE world`, []token{
+			{"Ident", "hello"},
+			{"Whitespace", " "},
+			{"LITOne", "ONE"},
+			{"Whitespace", " "},
+			{"Ident", "test"},
+			{"Whitespace", " "},
+			{"LITKeyword", "LIKE"},
+			{"Whitespace", " "},
+			{"Ident", "world"},
+		}},
+		{"WordBoundarySlash", `WBTEST:xyz/hello world`, []token{
+			{"WBKeyword", "xyz"},
+			{"Slash", "/"},
+			{"Ident", "hello"},
+			{"Whitespace", " "},
+			{"Ident", "world"},
+		}},
+		{"WordBoundaryWhitespace", `WBTEST:abchello xyz world`, []token{
+			{"Ident", "abchello"},
+			{"Whitespace", " "},
+			{"WBKeyword", "xyz"},
+			{"Whitespace", " "},
+			{"Ident", "world"},
+		}},
+		{"WordBoundaryStartEnd", `WBTEST:xyz`, []token{
+			{"WBKeyword", "xyz"},
+		}},
 	}
 	symbols := lexer.SymbolsByRune(lex)
 	for _, test := range tests {
```
```diff
@@ -100,12 +178,12 @@ func testLexer(t *testing.T, lex lexer.Definition) {
 			assert.NoError(t, err)
 			tokens, err := lexer.ConsumeAll(l)
 			assert.NoError(t, err)
-			actual := make([]token, len(tokens)-1)
+			actual := make([]token, 0, len(tokens))
 			for i, t := range tokens {
-				if t.Type == lexer.EOF {
+				if (i == 0 && strings.HasSuffix(t.Value, "TEST:")) || t.Type == lexer.EOF {
 					continue
 				}
-				actual[i] = token{Type: symbols[t.Type], Value: t.Value}
+				actual = append(actual, token{Type: symbols[t.Type], Value: t.Value})
 			}
 			assert.Equal(t, test.expected, actual)
 		})
```
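
With the mode-prefix token also skipped, positional indexing into a pre-sized slice (`make([]token, len(tokens)-1)`) would mis-size the result and leave gaps, so the loop now appends. The first token is dropped whenever its value ends in `TEST:`, i.e. the `EXPRTEST:`/`LITTEST:`/`CITEST:`/`WBTEST:` prefix emitted by the `Root` rules, so each expectation lists only the tokens of the state under test.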
```diff
@@ -117,7 +195,7 @@ func TestLexerConformanceGenerated(t *testing.T) {
 	args := []string{"test", "-run", "TestLexerConformanceGeneratedInternal", "-tags", "generated"}
 	// Propagate test flags.
 	flag.CommandLine.VisitAll(func(f *flag.Flag) {
-		if f.Value.String() != f.DefValue {
+		if f.Value.String() != f.DefValue && f.Name != "test.run" {
 			args = append(args, fmt.Sprintf("-%s=%s", f.Name, f.Value.String()))
 		}
 	})
```
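
Excluding `test.run` from the propagated flags is what allows the generated conformance test to be run individually: the parent's `-run` filter selects `TestLexerConformanceGenerated`, and forwarding it would prevent the child `go test` process from matching the hardcoded `TestLexerConformanceGeneratedInternal`. An illustrative invocation, assuming the repository layout above, is `go test -run TestLexerConformanceGenerated ./lexer/internal/conformance`.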
