From fb225ea38ddfee2cc1405c4ed0ad6933993db832 Mon Sep 17 00:00:00 2001
From: klondikedragon <88254524+klondikedragon@users.noreply.github.com>
Date: Wed, 26 Oct 2022 17:48:18 -0600
Subject: [PATCH] Fix gen lexer word boundary, case insensitive, and literal
 matching cases (plus conformance tests) (#274)

* Allow gen conformance test to be run individually
* Conformance test for word boundary \b
* Fix case insensitive matching in gen lexer
* Fix match literal at end of string in gen lexer
* Fix word boundary case in gen lexer

If \b is used at the start of a pattern, it could match right before the
current position, rather than right after it. Check both cases.
---
 cmd/participle/gen_lexer_cmd.go               |  32 +++-
 .../internal/conformance/conformance_test.go  | 146 ++++++++++++++----
 2 files changed, 138 insertions(+), 40 deletions(-)

diff --git a/cmd/participle/gen_lexer_cmd.go b/cmd/participle/gen_lexer_cmd.go
index 368b3e2c..97f83647 100644
--- a/cmd/participle/gen_lexer_cmd.go
+++ b/cmd/participle/gen_lexer_cmd.go
@@ -188,10 +188,14 @@ func generateRegexMatch(w io.Writer, lexerName, name, pattern string) error {
 	// Fast-path a single literal.
 	if len(flattened) == 1 && re.Op == syntax.OpLiteral {
 		n := utf8.RuneCountInString(string(re.Rune))
-		if n == 1 {
-			fmt.Fprintf(w, "if p < len(s) && s[p] == %q {\n", re.Rune[0])
+		if re.Flags&syntax.FoldCase != 0 {
+			fmt.Fprintf(w, "if p+%d <= len(s) && strings.EqualFold(s[p:p+%d], %q) {\n", n, n, string(re.Rune))
 		} else {
-			fmt.Fprintf(w, "if p+%d < len(s) && s[p:p+%d] == %q {\n", n, n, string(re.Rune))
+			if n == 1 {
+				fmt.Fprintf(w, "if p < len(s) && s[p] == %q {\n", re.Rune[0])
+			} else {
+				fmt.Fprintf(w, "if p+%d <= len(s) && s[p:p+%d] == %q {\n", n, n, string(re.Rune))
+			}
 		}
 		fmt.Fprintf(w, "groups[0] = p\n")
 		fmt.Fprintf(w, "groups[1] = p + %d\n", n)
@@ -219,10 +223,14 @@ func generateRegexMatch(w io.Writer, lexerName, name, pattern string) error {
 
 		case syntax.OpLiteral: // matches Runes sequence
 			n := utf8.RuneCountInString(string(re.Rune))
-			if n == 1 {
-				fmt.Fprintf(w, "if p < len(s) && s[p] == %q { return p+1 }\n", re.Rune[0])
+			if re.Flags&syntax.FoldCase != 0 {
+				fmt.Fprintf(w, "if p+%d <= len(s) && strings.EqualFold(s[p:p+%d], %q) { return p+%d }\n", n, n, string(re.Rune), n)
 			} else {
-				fmt.Fprintf(w, "if p+%d < len(s) && s[p:p+%d] == %q { return p+%d }\n", n, n, string(re.Rune), n)
+				if n == 1 {
+					fmt.Fprintf(w, "if p < len(s) && s[p] == %q { return p+1 }\n", re.Rune[0])
+				} else {
+					fmt.Fprintf(w, "if p+%d <= len(s) && s[p:p+%d] == %q { return p+%d }\n", n, n, string(re.Rune), n)
+				}
 			}
 			fmt.Fprintf(w, "return -1\n")
 
@@ -284,11 +292,13 @@ func generateRegexMatch(w io.Writer, lexerName, name, pattern string) error {
 			syntax.OpBeginText, syntax.OpEndText,
 			syntax.OpBeginLine, syntax.OpEndLine:
 			fmt.Fprintf(w, "var l, u rune = -1, -1\n")
+			fmt.Fprintf(w, "var checkPrevChar = false\n")
 			fmt.Fprintf(w, "if p == 0 {\n")
 			decodeRune(w, "0", "u", "_")
 			fmt.Fprintf(w, "} else if p == len(s) {\n")
 			fmt.Fprintf(w, "  l, _ = utf8.DecodeLastRuneInString(s)\n")
 			fmt.Fprintf(w, "} else {\n")
+			fmt.Fprintf(w, "  checkPrevChar = true\n")
 			fmt.Fprintf(w, "  var ln int\n")
 			decodeRune(w, "p", "l", "ln")
 			fmt.Fprintf(w, "  if p+ln <= len(s) {\n")
@@ -305,6 +315,16 @@ func generateRegexMatch(w io.Writer, lexerName, name, pattern string) error {
 				syntax.OpEndLine:        "EmptyEndLine",
 			}
 			fmt.Fprintf(w, "if op & syntax.%s != 0 { return p }\n", lut[re.Op])
+			// If p is strictly inside the string, also test the boundary between the rune that ends
+			// at p and the rune that starts at p (a zero-length op could have matched right before p)
+			fmt.Fprintf(w, "if checkPrevChar {\n")
+			// decode the character immediately previous to this one (conditional logic above
+			// guarantees that p is > 0)
+			fmt.Fprintf(w, "  l, _ = utf8.DecodeLastRuneInString(s[0:p])\n")
+			decodeRune(w, "p", "u", "_")
+			fmt.Fprintf(w, "  op := syntax.EmptyOpContext(l, u)\n")
+			fmt.Fprintf(w, "  if op & syntax.%s != 0 { return p }\n", lut[re.Op])
+			fmt.Fprintf(w, "}\n")
 			fmt.Fprintf(w, "return -1\n")
 
 		case syntax.OpCapture: // capturing subexpression with index Cap, optional name Name
diff --git a/lexer/internal/conformance/conformance_test.go b/lexer/internal/conformance/conformance_test.go
index aa8dc66c..c39a7521 100644
--- a/lexer/internal/conformance/conformance_test.go
+++ b/lexer/internal/conformance/conformance_test.go
@@ -16,31 +16,55 @@ import (
 
 var conformanceLexer = lexer.MustStateful(lexer.Rules{
 	"Root": {
-		{"String", `"`, lexer.Push("String")},
-		// {"Heredoc", `<<(\w+)`, lexer.Push("Heredoc")},
+		{"ExprTest", `EXPRTEST:`, lexer.Push("ExprTest")},
+		{"LiteralTest", `LITTEST:`, lexer.Push("LiteralTest")},
+		{"CaseInsensitiveTest", `CITEST:`, lexer.Push("CaseInsensitiveTest")},
+		{"WordBoundaryTest", `WBTEST:`, lexer.Push("WordBoundaryTest")},
 	},
-	"String": {
-		{"Escaped", `\\.`, nil},
-		{"StringEnd", `"`, lexer.Pop()},
+	"ExprTest": {
+		{"ExprString", `"`, lexer.Push("ExprString")},
+		// {"ExprHeredoc", `<<(\w+)`, lexer.Push("ExprHeredoc")},
+	},
+	"ExprString": {
+		{"ExprEscaped", `\\.`, nil},
+		{"ExprStringEnd", `"`, lexer.Pop()},
 		{"Expr", `\${`, lexer.Push("Expr")},
-		{"Char", `[^$"\\]+`, nil},
+		{"ExprChar", `[^$"\\]+`, nil},
 	},
 	"Expr": {
-		lexer.Include("Root"),
+		lexer.Include("ExprTest"),
 		{`Whitespace`, `\s+`, nil},
-		{`Oper`, `[-+/*%]`, nil},
-		{"Ident", `\w+`, lexer.Push("Reference")},
+		{`ExprOper`, `[-+/*%]`, nil},
+		{"Ident", `\w+`, lexer.Push("ExprReference")},
 		{"ExprEnd", `}`, lexer.Pop()},
 	},
-	"Reference": {
-		{"Dot", `\.`, nil},
+	"ExprReference": {
+		{"ExprDot", `\.`, nil},
 		{"Ident", `\w+`, nil},
 		lexer.Return(),
 	},
-	// "Heredoc": {
-	// 	{"End", `\1`, lexer.Pop()},
+	// "ExprHeredoc": {
+	// 	{"ExprHeredocEnd", `\1`, lexer.Pop()},
 	// 	lexer.Include("Expr"),
 	// },
+	"LiteralTest": {
+		{`LITOne`, `ONE`, nil},
+		{`LITKeyword`, `SELECT|FROM|WHERE|LIKE`, nil},
+		{"Ident", `\w+`, nil},
+		{"Whitespace", `\s+`, nil},
+	},
+	"CaseInsensitiveTest": {
+		{`ABCWord`, `[aA][bB][cC]`, nil},
+		{`CIKeyword`, `(?i)(SELECT|from|WHERE|LIKE)`, nil},
+		{"Ident", `\w+`, nil},
+		{"Whitespace", `\s+`, nil},
+	},
+	"WordBoundaryTest": {
+		{`WBKeyword`, `\b(?:abc|xyz)\b`, nil},
+		{"Slash", `/`, nil},
+		{"Ident", `\w+`, nil},
+		{"Whitespace", `\s+`, nil},
+	},
 })
 
 type token struct {
@@ -55,43 +79,97 @@ func testLexer(t *testing.T, lex lexer.Definition) {
 		input    string
 		expected []token
 	}{
-		{"Push", `"${"Hello ${name + "!"}"}"`, []token{
-			{"String", "\""},
+		{"ExprPush", `EXPRTEST:"${"Hello ${name + "!"}"}"`, []token{
+			{"ExprString", "\""},
 			{"Expr", "${"},
-			{"String", "\""},
-			{"Char", "Hello "},
+			{"ExprString", "\""},
+			{"ExprChar", "Hello "},
 			{"Expr", "${"},
 			{"Ident", "name"},
 			{"Whitespace", " "},
-			{"Oper", "+"},
+			{"ExprOper", "+"},
 			{"Whitespace", " "},
-			{"String", "\""},
-			{"Char", "!"},
-			{"StringEnd", "\""},
+			{"ExprString", "\""},
+			{"ExprChar", "!"},
+			{"ExprStringEnd", "\""},
 			{"ExprEnd", "}"},
-			{"StringEnd", "\""},
+			{"ExprStringEnd", "\""},
 			{"ExprEnd", "}"},
-			{"StringEnd", "\""},
+			{"ExprStringEnd", "\""},
 		}},
-		{"Reference", `"${user.name}"`, []token{
-			{"String", "\""},
+		{"ExprReference", `EXPRTEST:"${user.name}"`, []token{
+			{"ExprString", "\""},
 			{"Expr", "${"},
 			{"Ident", "user"},
-			{"Dot", "."},
+			{"ExprDot", "."},
 			{"Ident", "name"},
 			{"ExprEnd", "}"},
-			{"StringEnd", "\""},
+			{"ExprStringEnd", "\""},
 		}},
 		// TODO(alecthomas): Once backreferences are supported, this will work.
-		// 		{"Backref", `<<EOF
+		// 		{"Backref", `EXPRTEST:<<EOF
 		// heredoc
 		// EOF`, []token{
-		// 			{"Heredoc", "<<EOF"},
+		// 			{"ExprHeredoc", "<<EOF"},
 		// 			{"Whitespace", "\n"},
 		// 			{"Ident", "heredoc"},
 		// 			{"Whitespace", "\n"},
-		// 			{"End", "EOF"},
+		// 			{"ExprHeredocEnd", "EOF"},
 		// 		}},
+		{"CaseInsensitiveSimple", `CITEST:hello aBC world`, []token{
+			{"Ident", "hello"},
+			{"Whitespace", " "},
+			{"ABCWord", "aBC"},
+			{"Whitespace", " "},
+			{"Ident", "world"},
+		}},
+		{"CaseInsensitiveMixed", `CITEST:hello SeLeCt FROM world where END`, []token{
+			{"Ident", "hello"},
+			{"Whitespace", " "},
+			{"CIKeyword", "SeLeCt"},
+			{"Whitespace", " "},
+			{"CIKeyword", "FROM"},
+			{"Whitespace", " "},
+			{"Ident", "world"},
+			{"Whitespace", " "},
+			{"CIKeyword", "where"},
+			{"Whitespace", " "},
+			{"Ident", "END"},
+		}},
+		{"OneLiteralAtEnd", `LITTEST:ONE`, []token{
+			{"LITOne", "ONE"},
+		}},
+		{"KeywordLiteralAtEnd", `LITTEST:SELECT`, []token{
+			{"LITKeyword", "SELECT"},
+		}},
+		{"LiteralMixed", `LITTEST:hello ONE test LIKE world`, []token{
+			{"Ident", "hello"},
+			{"Whitespace", " "},
+			{"LITOne", "ONE"},
+			{"Whitespace", " "},
+			{"Ident", "test"},
+			{"Whitespace", " "},
+			{"LITKeyword", "LIKE"},
+			{"Whitespace", " "},
+			{"Ident", "world"},
+		}},
+		{"WordBoundarySlash", `WBTEST:xyz/hello world`, []token{
+			{"WBKeyword", "xyz"},
+			{"Slash", "/"},
+			{"Ident", "hello"},
+			{"Whitespace", " "},
+			{"Ident", "world"},
+		}},
+		{"WordBoundaryWhitespace", `WBTEST:abchello xyz world`, []token{
+			{"Ident", "abchello"},
+			{"Whitespace", " "},
+			{"WBKeyword", "xyz"},
+			{"Whitespace", " "},
+			{"Ident", "world"},
+		}},
+		{"WordBoundaryStartEnd", `WBTEST:xyz`, []token{
+			{"WBKeyword", "xyz"},
+		}},
 	}
 	symbols := lexer.SymbolsByRune(lex)
 	for _, test := range tests {
@@ -100,12 +178,12 @@ func testLexer(t *testing.T, lex lexer.Definition) {
 			assert.NoError(t, err)
 			tokens, err := lexer.ConsumeAll(l)
 			assert.NoError(t, err)
-			actual := make([]token, len(tokens)-1)
+			actual := make([]token, 0, len(tokens))
 			for i, t := range tokens {
-				if t.Type == lexer.EOF {
+				if (i == 0 && strings.HasSuffix(t.Value, "TEST:")) || t.Type == lexer.EOF {
 					continue
 				}
-				actual[i] = token{Type: symbols[t.Type], Value: t.Value}
+				actual = append(actual, token{Type: symbols[t.Type], Value: t.Value})
 			}
 			assert.Equal(t, test.expected, actual)
 		})
@@ -117,7 +195,7 @@ func TestLexerConformanceGenerated(t *testing.T) {
 	args := []string{"test", "-run", "TestLexerConformanceGeneratedInternal", "-tags", "generated"}
 	// Propagate test flags.
 	flag.CommandLine.VisitAll(func(f *flag.Flag) {
-		if f.Value.String() != f.DefValue {
+		if f.Value.String() != f.DefValue && f.Name != "test.run" {
 			args = append(args, fmt.Sprintf("-%s=%s", f.Name, f.Value.String()))
 		}
 	})