Fix gen lexer word boundary, case insensitive, and literal matching cases (plus conformance tests) (#274)

* Allow gen conformance test to be run individually
* Conformance test for word boundary \b
* Fix case insensitive matching in gen lexer
* Fix match literal at end of string in gen lexer
* Fix word boundary case in gen lexer

If \b is used at the start of a pattern, it could match right before the
current position rather than right after it. Check both cases.
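
As a minimal illustration of the failure mode (not part of the commit): Go's `regexp/syntax.EmptyOpContext` reports which zero-width assertions hold at the boundary between two runes, and before this fix the generated matcher only consulted the boundary at and after `p`, never the one formed with the preceding rune.

```go
package main

import (
	"fmt"
	"regexp/syntax"
)

func main() {
	// Lexing "WBTEST:xyz" positioned at 'x' (p == 7) against `\bxyz\b`.
	// Boundary the old generated code tested: between s[p] and s[p+1].
	op := syntax.EmptyOpContext('x', 'y')
	fmt.Println(op&syntax.EmptyWordBoundary != 0) // false: 'x'..'y' is inside a word

	// Boundary the fix also tests: between s[p-1] and s[p].
	op = syntax.EmptyOpContext(':', 'x')
	fmt.Println(op&syntax.EmptyWordBoundary != 0) // true: \b matches just before p
}
```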
klondikedragon committed Oct 26, 2022
1 parent 92cfb1a commit fb225ea
Showing 2 changed files with 138 additions and 40 deletions.
cmd/participle/gen_lexer_cmd.go (32 changes: 26 additions & 6 deletions)
```diff
@@ -188,10 +188,14 @@ func generateRegexMatch(w io.Writer, lexerName, name, pattern string) error {
 		// Fast-path a single literal.
 		if len(flattened) == 1 && re.Op == syntax.OpLiteral {
 			n := utf8.RuneCountInString(string(re.Rune))
-			if n == 1 {
-				fmt.Fprintf(w, "if p < len(s) && s[p] == %q {\n", re.Rune[0])
+			if re.Flags&syntax.FoldCase != 0 {
+				fmt.Fprintf(w, "if p+%d <= len(s) && strings.EqualFold(s[p:p+%d], %q) {\n", n, n, string(re.Rune))
 			} else {
-				fmt.Fprintf(w, "if p+%d < len(s) && s[p:p+%d] == %q {\n", n, n, string(re.Rune))
+				if n == 1 {
+					fmt.Fprintf(w, "if p < len(s) && s[p] == %q {\n", re.Rune[0])
+				} else {
+					fmt.Fprintf(w, "if p+%d <= len(s) && s[p:p+%d] == %q {\n", n, n, string(re.Rune))
+				}
 			}
 			fmt.Fprintf(w, "groups[0] = p\n")
 			fmt.Fprintf(w, "groups[1] = p + %d\n", n)
```
```diff
@@ -219,10 +223,14 @@ func generateRegexMatch(w io.Writer, lexerName, name, pattern string) error {
 
 	case syntax.OpLiteral: // matches Runes sequence
 		n := utf8.RuneCountInString(string(re.Rune))
-		if n == 1 {
-			fmt.Fprintf(w, "if p < len(s) && s[p] == %q { return p+1 }\n", re.Rune[0])
+		if re.Flags&syntax.FoldCase != 0 {
+			fmt.Fprintf(w, "if p+%d <= len(s) && strings.EqualFold(s[p:p+%d], %q) { return p+%d }\n", n, n, string(re.Rune), n)
 		} else {
-			fmt.Fprintf(w, "if p+%d < len(s) && s[p:p+%d] == %q { return p+%d }\n", n, n, string(re.Rune), n)
+			if n == 1 {
+				fmt.Fprintf(w, "if p < len(s) && s[p] == %q { return p+1 }\n", re.Rune[0])
+			} else {
+				fmt.Fprintf(w, "if p+%d <= len(s) && s[p:p+%d] == %q { return p+%d }\n", n, n, string(re.Rune), n)
+			}
 		}
 		fmt.Fprintf(w, "return -1\n")
 
```
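The `<` versus `<=` change in the emitted bound is what fixes matching a literal at the very end of the input; the new `OneLiteralAtEnd` and `KeywordLiteralAtEnd` conformance cases cover exactly this. A small illustration:

```go
package main

import "fmt"

func main() {
	s := "LITTEST:ONE"
	p, n := 8, 3 // "ONE" starts at index 8 and ends exactly at len(s)
	fmt.Println(p+n < len(s))  // false: the old bound rejected the match
	fmt.Println(p+n <= len(s)) // true:  the fixed bound accepts it
}
```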
```diff
@@ -284,11 +292,13 @@ func generateRegexMatch(w io.Writer, lexerName, name, pattern string) error {
 		syntax.OpBeginText, syntax.OpEndText,
 		syntax.OpBeginLine, syntax.OpEndLine:
 		fmt.Fprintf(w, "var l, u rune = -1, -1\n")
+		fmt.Fprintf(w, "var checkPrevChar = false\n")
 		fmt.Fprintf(w, "if p == 0 {\n")
 		decodeRune(w, "0", "u", "_")
 		fmt.Fprintf(w, "} else if p == len(s) {\n")
 		fmt.Fprintf(w, " l, _ = utf8.DecodeLastRuneInString(s)\n")
 		fmt.Fprintf(w, "} else {\n")
+		fmt.Fprintf(w, " checkPrevChar = true\n")
 		fmt.Fprintf(w, " var ln int\n")
 		decodeRune(w, "p", "l", "ln")
 		fmt.Fprintf(w, " if p+ln <= len(s) {\n")
@@ -305,6 +315,16 @@ func generateRegexMatch(w io.Writer, lexerName, name, pattern string) error {
 			syntax.OpEndLine: "EmptyEndLine",
 		}
 		fmt.Fprintf(w, "if op & syntax.%s != 0 { return p }\n", lut[re.Op])
+		// If this isn't the start or end of the string, we also have to check if we match
+		// the preceding character (zero length op could have matched right before)
+		fmt.Fprintf(w, "if checkPrevChar {\n")
+		// decode the character immediately previous to this one (conditional logic above
+		// guarantees that p is > 0)
+		fmt.Fprintf(w, " l, _ = utf8.DecodeLastRuneInString(s[0:p])\n")
+		decodeRune(w, "p", "u", "_")
+		fmt.Fprintf(w, " op := syntax.EmptyOpContext(l, u)\n")
+		fmt.Fprintf(w, " if op & syntax.%s != 0 { return p }\n", lut[re.Op])
+		fmt.Fprintf(w, "}\n")
 		fmt.Fprintf(w, "return -1\n")
 
 	case syntax.OpCapture: // capturing subexpression with index Cap, optional name Name
```
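
Put together, the zero-width matcher the generator now emits has roughly this shape (a hand-written sketch using the names from the diff, not actual generator output):

```go
package main

import (
	"fmt"
	"regexp/syntax"
	"unicode/utf8"
)

// matchWordBoundary sketches the emitted code for a zero-width \b.
func matchWordBoundary(s string, p int) int {
	var l, u rune = -1, -1
	checkPrevChar := false
	if p == 0 {
		u, _ = utf8.DecodeRuneInString(s)
	} else if p == len(s) {
		l, _ = utf8.DecodeLastRuneInString(s)
	} else {
		checkPrevChar = true
		var ln int
		l, ln = utf8.DecodeRuneInString(s[p:])
		if p+ln <= len(s) {
			u, _ = utf8.DecodeRuneInString(s[p+ln:])
		}
	}
	if syntax.EmptyOpContext(l, u)&syntax.EmptyWordBoundary != 0 {
		return p
	}
	if checkPrevChar {
		// The boundary may sit just before p; re-test against the
		// preceding rune (p > 0 is guaranteed by the branch above).
		l, _ = utf8.DecodeLastRuneInString(s[0:p])
		u, _ = utf8.DecodeRuneInString(s[p:])
		if syntax.EmptyOpContext(l, u)&syntax.EmptyWordBoundary != 0 {
			return p
		}
	}
	return -1
}

func main() {
	fmt.Println(matchWordBoundary("WBTEST:xyz", 7)) // 7: boundary between ':' and 'x'
	fmt.Println(matchWordBoundary("WBTEST:xyz", 8)) // -1: no boundary inside "xyz"
}
```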
lexer/internal/conformance/conformance_test.go (146 changes: 112 additions & 34 deletions)
```diff
@@ -16,31 +16,55 @@ import (
 
 var conformanceLexer = lexer.MustStateful(lexer.Rules{
 	"Root": {
-		{"String", `"`, lexer.Push("String")},
-		// {"Heredoc", `<<(\w+)`, lexer.Push("Heredoc")},
+		{"ExprTest", `EXPRTEST:`, lexer.Push("ExprTest")},
+		{"LiteralTest", `LITTEST:`, lexer.Push("LiteralTest")},
+		{"CaseInsensitiveTest", `CITEST:`, lexer.Push("CaseInsensitiveTest")},
+		{"WordBoundaryTest", `WBTEST:`, lexer.Push("WordBoundaryTest")},
 	},
-	"String": {
-		{"Escaped", `\\.`, nil},
-		{"StringEnd", `"`, lexer.Pop()},
+	"ExprTest": {
+		{"ExprString", `"`, lexer.Push("ExprString")},
+		// {"ExprHeredoc", `<<(\w+)`, lexer.Push("ExprHeredoc")},
+	},
+	"ExprString": {
+		{"ExprEscaped", `\\.`, nil},
+		{"ExprStringEnd", `"`, lexer.Pop()},
 		{"Expr", `\${`, lexer.Push("Expr")},
-		{"Char", `[^$"\\]+`, nil},
+		{"ExprChar", `[^$"\\]+`, nil},
 	},
 	"Expr": {
-		lexer.Include("Root"),
+		lexer.Include("ExprTest"),
 		{`Whitespace`, `\s+`, nil},
-		{`Oper`, `[-+/*%]`, nil},
-		{"Ident", `\w+`, lexer.Push("Reference")},
+		{`ExprOper`, `[-+/*%]`, nil},
+		{"Ident", `\w+`, lexer.Push("ExprReference")},
 		{"ExprEnd", `}`, lexer.Pop()},
 	},
-	"Reference": {
-		{"Dot", `\.`, nil},
+	"ExprReference": {
+		{"ExprDot", `\.`, nil},
 		{"Ident", `\w+`, nil},
 		lexer.Return(),
 	},
-	// "Heredoc": {
-	// 	{"End", `\1`, lexer.Pop()},
+	// "ExprHeredoc": {
+	// 	{"ExprHeredocEnd", `\1`, lexer.Pop()},
 	// 	lexer.Include("Expr"),
 	// },
+	"LiteralTest": {
+		{`LITOne`, `ONE`, nil},
+		{`LITKeyword`, `SELECT|FROM|WHERE|LIKE`, nil},
+		{"Ident", `\w+`, nil},
+		{"Whitespace", `\s+`, nil},
+	},
+	"CaseInsensitiveTest": {
+		{`ABCWord`, `[aA][bB][cC]`, nil},
+		{`CIKeyword`, `(?i)(SELECT|from|WHERE|LIKE)`, nil},
+		{"Ident", `\w+`, nil},
+		{"Whitespace", `\s+`, nil},
+	},
+	"WordBoundaryTest": {
+		{`WBKeyword`, `\b(?:abc|xyz)\b`, nil},
+		{"Slash", `/`, nil},
+		{"Ident", `\w+`, nil},
+		{"Whitespace", `\s+`, nil},
+	},
 })
 
 type token struct {
```
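
Each `Root` rule now routes a prefixed input into an isolated state, so a single lexer definition can exercise every fixed code path. A hypothetical driver inside this package (assuming participle v2's stateful lexer API) would look like the sketch below; the expected tokens match the `WordBoundarySlash` test case further down.

```go
// Hypothetical usage sketch, assuming conformanceLexer.LexString and
// lexer.ConsumeAll from participle v2 (as used by the tests below).
l, err := conformanceLexer.LexString("", `WBTEST:xyz/hello world`)
if err != nil {
	panic(err)
}
toks, err := lexer.ConsumeAll(l)
if err != nil {
	panic(err)
}
for _, t := range toks {
	fmt.Println(t.Type, t.Value)
}
// Prints the WordBoundaryTest "WBTEST:" prefix token first, then
// WBKeyword "xyz", Slash "/", Ident "hello", Whitespace " ", Ident "world", EOF.
```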
```diff
@@ -55,43 +79,97 @@ func testLexer(t *testing.T, lex lexer.Definition) {
 		input    string
 		expected []token
 	}{
-		{"Push", `"${"Hello ${name + "!"}"}"`, []token{
-			{"String", "\""},
+		{"ExprPush", `EXPRTEST:"${"Hello ${name + "!"}"}"`, []token{
+			{"ExprString", "\""},
 			{"Expr", "${"},
-			{"String", "\""},
-			{"Char", "Hello "},
+			{"ExprString", "\""},
+			{"ExprChar", "Hello "},
 			{"Expr", "${"},
 			{"Ident", "name"},
 			{"Whitespace", " "},
-			{"Oper", "+"},
+			{"ExprOper", "+"},
 			{"Whitespace", " "},
-			{"String", "\""},
-			{"Char", "!"},
-			{"StringEnd", "\""},
+			{"ExprString", "\""},
+			{"ExprChar", "!"},
+			{"ExprStringEnd", "\""},
 			{"ExprEnd", "}"},
-			{"StringEnd", "\""},
+			{"ExprStringEnd", "\""},
 			{"ExprEnd", "}"},
-			{"StringEnd", "\""},
+			{"ExprStringEnd", "\""},
 		}},
-		{"Reference", `"${user.name}"`, []token{
-			{"String", "\""},
+		{"ExprReference", `EXPRTEST:"${user.name}"`, []token{
+			{"ExprString", "\""},
 			{"Expr", "${"},
 			{"Ident", "user"},
-			{"Dot", "."},
+			{"ExprDot", "."},
 			{"Ident", "name"},
 			{"ExprEnd", "}"},
-			{"StringEnd", "\""},
+			{"ExprStringEnd", "\""},
 		}},
 		// TODO(alecthomas): Once backreferences are supported, this will work.
-		// {"Backref", `<<EOF
+		// {"Backref", `EXPRTEST:<<EOF
 		// heredoc
 		// EOF`, []token{
-		// 	{"Heredoc", "<<EOF"},
+		// 	{"ExprHeredoc", "<<EOF"},
 		// 	{"Whitespace", "\n"},
 		// 	{"Ident", "heredoc"},
 		// 	{"Whitespace", "\n"},
-		// 	{"End", "EOF"},
+		// 	{"ExprHeredocEnd", "EOF"},
 		// }},
+		{"CaseInsensitiveSimple", `CITEST:hello aBC world`, []token{
+			{"Ident", "hello"},
+			{"Whitespace", " "},
+			{"ABCWord", "aBC"},
+			{"Whitespace", " "},
+			{"Ident", "world"},
+		}},
+		{"CaseInsensitiveMixed", `CITEST:hello SeLeCt FROM world where END`, []token{
+			{"Ident", "hello"},
+			{"Whitespace", " "},
+			{"CIKeyword", "SeLeCt"},
+			{"Whitespace", " "},
+			{"CIKeyword", "FROM"},
+			{"Whitespace", " "},
+			{"Ident", "world"},
+			{"Whitespace", " "},
+			{"CIKeyword", "where"},
+			{"Whitespace", " "},
+			{"Ident", "END"},
+		}},
+		{"OneLiteralAtEnd", `LITTEST:ONE`, []token{
+			{"LITOne", "ONE"},
+		}},
+		{"KeywordLiteralAtEnd", `LITTEST:SELECT`, []token{
+			{"LITKeyword", "SELECT"},
+		}},
+		{"LiteralMixed", `LITTEST:hello ONE test LIKE world`, []token{
+			{"Ident", "hello"},
+			{"Whitespace", " "},
+			{"LITOne", "ONE"},
+			{"Whitespace", " "},
+			{"Ident", "test"},
+			{"Whitespace", " "},
+			{"LITKeyword", "LIKE"},
+			{"Whitespace", " "},
+			{"Ident", "world"},
+		}},
+		{"WordBoundarySlash", `WBTEST:xyz/hello world`, []token{
+			{"WBKeyword", "xyz"},
+			{"Slash", "/"},
+			{"Ident", "hello"},
+			{"Whitespace", " "},
+			{"Ident", "world"},
+		}},
+		{"WordBoundaryWhitespace", `WBTEST:abchello xyz world`, []token{
+			{"Ident", "abchello"},
+			{"Whitespace", " "},
+			{"WBKeyword", "xyz"},
+			{"Whitespace", " "},
+			{"Ident", "world"},
+		}},
+		{"WordBoundaryStartEnd", `WBTEST:xyz`, []token{
+			{"WBKeyword", "xyz"},
+		}},
 	}
 	symbols := lexer.SymbolsByRune(lex)
 	for _, test := range tests {
```
```diff
@@ -100,12 +178,12 @@ func testLexer(t *testing.T, lex lexer.Definition) {
 			assert.NoError(t, err)
 			tokens, err := lexer.ConsumeAll(l)
 			assert.NoError(t, err)
-			actual := make([]token, len(tokens)-1)
+			actual := make([]token, 0, len(tokens))
 			for i, t := range tokens {
-				if t.Type == lexer.EOF {
+				if (i == 0 && strings.HasSuffix(t.Value, "TEST:")) || t.Type == lexer.EOF {
 					continue
 				}
-				actual[i] = token{Type: symbols[t.Type], Value: t.Value}
+				actual = append(actual, token{Type: symbols[t.Type], Value: t.Value})
 			}
 			assert.Equal(t, test.expected, actual)
 		})
```
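
With the mode-prefix token also skipped, positional indexing into a pre-sized slice (`make([]token, len(tokens)-1)`) would mis-size the result and leave gaps, so the loop now appends. The first token is dropped whenever its value ends in `TEST:`, i.e. the `EXPRTEST:`/`LITTEST:`/`CITEST:`/`WBTEST:` prefix emitted by the `Root` rules, so each expectation lists only the tokens of the state under test.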
```diff
@@ -117,7 +195,7 @@ func TestLexerConformanceGenerated(t *testing.T) {
 	args := []string{"test", "-run", "TestLexerConformanceGeneratedInternal", "-tags", "generated"}
 	// Propagate test flags.
 	flag.CommandLine.VisitAll(func(f *flag.Flag) {
-		if f.Value.String() != f.DefValue {
+		if f.Value.String() != f.DefValue && f.Name != "test.run" {
 			args = append(args, fmt.Sprintf("-%s=%s", f.Name, f.Value.String()))
 		}
 	})
```
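
Excluding `test.run` from the propagated flags is what allows the generated conformance test to be run individually: the parent's `-run` filter selects `TestLexerConformanceGenerated`, and forwarding it would prevent the child `go test` process from matching the hardcoded `TestLexerConformanceGeneratedInternal`. An illustrative invocation, assuming the repository layout above, is `go test -run TestLexerConformanceGenerated ./lexer/internal/conformance`.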
