Bug fixes, improvements, optimization & refactoring before parser gen…

…eration (#288) * Fix bad float conformation error message + add coverage * Remove duplicit information from int/float conversion errors Both strconv.ParseInt and strconv.ParseFloat already report the input. The word invalid isn't always true - it could also just be out of range. The type is contained within the Parse* function being called. So as the extra context added nothing useful, I decided to remove it altogether, making things much easier for the generated parser. * Fix possible panic when optional groups are used within a capture If an optional group within a capture matches no input (either because of an unexpected token or EOF), obtaining partial AST can cause `setField` to be called with no tokens, causing a panic. Fixed by not doing anything. * Improve mismatch error message for negation & negative lookahead group It didn't properly escape the token, the error would be malformed if the token contained single quotes. It also used a custom error instead of `UnexpectedTokenError` for no good reason. * Improve the choice of error reporting after optional branches The last error in the added test explains the difference. Without this change, it would have been `1:15: unexpected token "(" (expected ";")`, which would be very confusing as '(' was actually allowed after the class name. After this change, it's `1:20: unexpected token ")" (expected <ident>)`, pointing out the real problem - that an identifier was expected after the comma. * Optimize string field accumulation to do fewer allocations * Optimize finding case insensitive tokens to be done at parser construction Optimize finding case insensitive tokens to be done at parser construction Doing it before each parsing was unnecessary. One could worry the results would be different if `ParseFromLexer` was called with a different lexer than used to build the grammar, but then case-insentive tokens would be the least of our problems - literal & reference would check agaist the wrong TokenType. * Avoid an allocation in newParseContext Not very significant for the reflective parser, more significant for the generated one. * Include explicit disjunction in the union node The generated parser would be the second place that would construct a temporary disjunction in the implementation, I think it might as well contain the disjunction directly. * Remove unused optional and repetition grammar nodes They could never be created and were private, so this certainly can't break anything. * Change the receiver name of lookaheadGroup from n to l It was n probably because I copied it from negation and assumed n was for node. Naming it l is more consistent with the convention for other nodes. * Keep track of the number of usages of a strct node * Allow specifying UnexpectedTokenError expected sequence as a string Co-authored-by: Peter Dolak <peter.dolak@exponea.com> Co-authored-by: Peter Dolak <peter.dolak@bloomreach.com>
alecthomas · Dec 5, 2022 · 1f05da7 · 1f05da7
1 parent 5adbb7c
commit 1f05da7
Show file tree

Hide file tree

Showing 9 changed files with 128 additions and 140 deletions.
diff --git a/context.go b/context.go
@@ -29,8 +29,8 @@ type parseContext struct {
 	allowTrailing     bool
 }
 
-func newParseContext(lex *lexer.PeekingLexer, lookahead int, caseInsensitive map[lexer.TokenType]bool) *parseContext {
-	return &parseContext{
+func newParseContext(lex *lexer.PeekingLexer, lookahead int, caseInsensitive map[lexer.TokenType]bool) parseContext {
+	return parseContext{
 		PeekingLexer:    *lex,
 		caseInsensitive: caseInsensitive,
 		lookahead:       lookahead,
@@ -90,11 +90,14 @@ func (p *parseContext) MaybeUpdateError(err error) {
 
 // Stop returns true if parsing should terminate after the given "branch" failed to match.
 //
-// Additionally, "err" should be the branch error, if any. This will be tracked to
-// aid in error reporting under the assumption that the deepest occurring error is more
-// useful than errors further up.
+// Additionally, track the deepest error in the branch - the deeper the error, the more useful it usually is.
+// It could already be the deepest error in the branch (only if deeper than current parent context deepest),
+// or it could be "err", the latest error on the branch (even if same depth; the lexer holds the position).
 func (p *parseContext) Stop(err error, branch *parseContext) bool {
-	if branch.PeekingLexer.Cursor() >= p.deepestErrorDepth {
+	if branch.deepestErrorDepth > p.deepestErrorDepth {
+		p.deepestError = branch.deepestError
+		p.deepestErrorDepth = branch.deepestErrorDepth
+	} else if branch.PeekingLexer.Cursor() >= p.deepestErrorDepth {
 		p.deepestError = err
 		p.deepestErrorDepth = maxInt(branch.PeekingLexer.Cursor(), branch.deepestErrorDepth)
 	}

diff --git a/ebnf.go b/ebnf.go
@@ -62,7 +62,7 @@ func buildEBNF(root bool, n node, seen map[node]bool, p *ebnfp, outp *[]*ebnfp)
 		p = &ebnfp{name: name}
 		*outp = append(*outp, p)
 		seen[n] = true
-		for i, next := range n.nodeMembers {
+		for i, next := range n.disjunction.nodes {
 			if i > 0 {
 				p.out += " | "
 			}
@@ -111,14 +111,6 @@ func buildEBNF(root bool, n node, seen map[node]bool, p *ebnfp, outp *[]*ebnfp)
 	case *reference:
 		p.out += "<" + strings.ToLower(n.identifier) + ">"
 
-	case *optional:
-		buildEBNF(false, n.node, seen, p, outp)
-		p.out += "?"
-
-	case *repetition:
-		buildEBNF(false, n.node, seen, p, outp)
-		p.out += "*"
-
 	case *negation:
 		p.out += "~"
 		buildEBNF(false, n.node, seen, p, outp)

diff --git a/error.go b/error.go
@@ -42,15 +42,18 @@ func FormatError(err Error) string {
 // This is useful for composing parsers in order to detect when a sub-parser has terminated.
 type UnexpectedTokenError struct {
 	Unexpected lexer.Token
-	at         node
+	Expect     string
+	expectNode node // Usable instead of Expect, delays creating the string representation until necessary
 }
 
 func (u *UnexpectedTokenError) Error() string { return FormatError(u) }
 
 func (u *UnexpectedTokenError) Message() string { // nolint: golint
 	var expected string
-	if u.at != nil {
-		expected = fmt.Sprintf(" (expected %s)", u.at)
+	if u.expectNode != nil {
+		expected = fmt.Sprintf(" (expected %s)", u.expectNode)
+	} else if u.Expect != "" {
+		expected = fmt.Sprintf(" (expected %s)", u.Expect)
 	}
 	return fmt.Sprintf("unexpected token %q%s", u.Unexpected, expected)
 }

diff --git a/error_test.go b/error_test.go
@@ -5,14 +5,16 @@ import (
 	"testing"
 
 	require "github.com/alecthomas/assert/v2"
+
 	"github.com/alecthomas/participle/v2"
 	"github.com/alecthomas/participle/v2/lexer"
 )
 
 func TestErrorReporting(t *testing.T) {
 	type cls struct {
-		Visibility string `@"public"?`
-		Class      string `"class" @Ident`
+		Visibility string   `@"public"?`
+		Class      string   `"class" @Ident`
+		Bases      []string `('(' @Ident (',' @Ident)+ ')')?`
 	}
 	type union struct {
 		Visibility string `@"public"?`
@@ -27,17 +29,20 @@ func TestErrorReporting(t *testing.T) {
 	}
 	p := mustTestParser[grammar](t, participle.UseLookahead(5))
 
-	ast, err := p.ParseString("", `public class A;`)
+	ast, err := p.ParseString("", `public class A(B, C); class D; public union A;`)
 	require.NoError(t, err)
 	require.Equal(t, &grammar{Decls: []*decl{
-		{Class: &cls{Visibility: "public", Class: "A"}},
+		{Class: &cls{Visibility: "public", Class: "A", Bases: []string{"B", "C"}}},
+		{Class: &cls{Class: "D"}},
+		{Union: &union{Visibility: "public", Union: "A"}},
 	}}, ast)
-	_, err = p.ParseString("", `public union A;`)
-	require.NoError(t, err)
+
 	_, err = p.ParseString("", `public struct Bar;`)
 	require.EqualError(t, err, `1:8: unexpected token "struct" (expected "union" <ident>)`)
 	_, err = p.ParseString("", `public class 1;`)
-	require.EqualError(t, err, `1:14: unexpected token "1" (expected <ident>)`)
+	require.EqualError(t, err, `1:14: unexpected token "1" (expected <ident> ("(" <ident> ("," <ident>)+ ")")?)`)
+	_, err = p.ParseString("", `public class A(B,C,);`)
+	require.EqualError(t, err, `1:20: unexpected token ")" (expected <ident>)`)
 }
 
 func TestErrorWrap(t *testing.T) {

diff --git a/grammar.go b/grammar.go
@@ -30,7 +30,7 @@ func (g *generatorContext) addUnionDefs(defs []unionDef) error {
 		}
 		unionNode := &union{
 			unionDef:    def,
-			nodeMembers: make([]node, 0, len(def.members)),
+			disjunction: disjunction{nodes: make([]node, 0, len(def.members))},
 		}
 		g.typeNodes[def.typ], unionNodes[i] = unionNode, unionNode
 	}
@@ -41,7 +41,7 @@ func (g *generatorContext) addUnionDefs(defs []unionDef) error {
 			if err != nil {
 				return err
 			}
-			unionNode.nodeMembers = append(unionNode.nodeMembers, memberNode)
+			unionNode.disjunction.nodes = append(unionNode.disjunction.nodes, memberNode)
 		}
 	}
 	return nil
@@ -61,6 +61,9 @@ func (g *generatorContext) addCustomDefs(defs []customDef) error {
 func (g *generatorContext) parseType(t reflect.Type) (_ node, returnedError error) {
 	t = indirectType(t)
 	if n, ok := g.typeNodes[t]; ok {
+		if s, ok := n.(*strct); ok {
+			s.usages++
+		}
 		return n, nil
 	}
 	if t.Implements(parseableType) {

diff --git a/nodes.go b/nodes.go
@@ -97,16 +97,15 @@ func (c *custom) Parse(ctx *parseContext, parent reflect.Value) (out []reflect.V
 // @@ (for a union)
 type union struct {
 	unionDef
-	nodeMembers []node
+	disjunction disjunction
 }
 
 func (u *union) String() string   { return ebnf(u) }
 func (u *union) GoString() string { return u.typ.Name() }
 
 func (u *union) Parse(ctx *parseContext, parent reflect.Value) (out []reflect.Value, err error) {
 	defer ctx.printTrace(u)()
-	temp := disjunction{u.nodeMembers}
-	vals, err := temp.Parse(ctx, parent)
+	vals, err := u.disjunction.Parse(ctx, parent)
 	if err != nil {
 		return nil, err
 	}
@@ -123,11 +122,13 @@ type strct struct {
 	tokensFieldIndex []int
 	posFieldIndex    []int
 	endPosFieldIndex []int
+	usages           int
 }
 
 func newStrct(typ reflect.Type) *strct {
 	s := &strct{
-		typ: typ,
+		typ:    typ,
+		usages: 1,
 	}
 	field, ok := typ.FieldByName("Pos")
 	if ok && field.Type == positionType {
@@ -295,19 +296,18 @@ type lookaheadGroup struct {
 	negative bool
 }
 
-func (n *lookaheadGroup) String() string   { return ebnf(n) }
-func (n *lookaheadGroup) GoString() string { return "lookaheadGroup{}" }
+func (l *lookaheadGroup) String() string   { return ebnf(l) }
+func (l *lookaheadGroup) GoString() string { return "lookaheadGroup{}" }
 
-func (n *lookaheadGroup) Parse(ctx *parseContext, parent reflect.Value) (out []reflect.Value, err error) {
-	defer ctx.printTrace(n)()
+func (l *lookaheadGroup) Parse(ctx *parseContext, parent reflect.Value) (out []reflect.Value, err error) {
+	defer ctx.printTrace(l)()
 	// Create a branch to avoid advancing the parser as any match will be discarded
 	branch := ctx.Branch()
-	out, err = n.expr.Parse(branch, parent)
+	out, err = l.expr.Parse(branch, parent)
 	matchedLookahead := err == nil && out != nil
-	expectingMatch := !n.negative
+	expectingMatch := !l.negative
 	if matchedLookahead != expectingMatch {
-		peek := ctx.Peek()
-		return nil, Errorf(peek.Pos, "unexpected '%s'", peek.Value)
+		return nil, &UnexpectedTokenError{Unexpected: *ctx.Peek()}
 	}
 	return []reflect.Value{}, nil // Empty match slice means a match, unlike nil
 }
@@ -382,7 +382,7 @@ func (s *sequence) Parse(ctx *parseContext, parent reflect.Value) (out []reflect
 				return nil, nil
 			}
 			token := ctx.Peek()
-			return out, &UnexpectedTokenError{Unexpected: *token, at: n}
+			return out, &UnexpectedTokenError{Unexpected: *token, expectNode: n}
 		}
 		// Special-case for when children return an empty match.
 		// Appending an empty, non-nil slice to a nil slice returns a nil slice.
@@ -440,72 +440,6 @@ func (r *reference) Parse(ctx *parseContext, parent reflect.Value) (out []reflec
 	return []reflect.Value{reflect.ValueOf(token.Value)}, nil
 }
 
-// [ <expr> ] <sequence>
-type optional struct {
-	node node
-}
-
-func (o *optional) String() string   { return ebnf(o) }
-func (o *optional) GoString() string { return "optional{}" }
-
-func (o *optional) Parse(ctx *parseContext, parent reflect.Value) (out []reflect.Value, err error) {
-	defer ctx.printTrace(o)()
-	branch := ctx.Branch()
-	out, err = o.node.Parse(branch, parent)
-	if err != nil {
-		// Optional part failed to match.
-		if ctx.Stop(err, branch) {
-			return out, err
-		}
-	} else {
-		ctx.Accept(branch)
-	}
-	if out == nil {
-		out = []reflect.Value{}
-	}
-	return out, nil
-}
-
-// { <expr> } <sequence>
-type repetition struct {
-	node node
-}
-
-func (r *repetition) String() string   { return ebnf(r) }
-func (r *repetition) GoString() string { return "repetition{}" }
-
-// Parse a repetition. Once a repetition is encountered it will always match, so grammars
-// should ensure that branches are differentiated prior to the repetition.
-func (r *repetition) Parse(ctx *parseContext, parent reflect.Value) (out []reflect.Value, err error) {
-	defer ctx.printTrace(r)()
-	i := 0
-	for ; i < MaxIterations; i++ {
-		branch := ctx.Branch()
-		v, err := r.node.Parse(branch, parent)
-		out = append(out, v...)
-		if err != nil {
-			// Optional part failed to match.
-			if ctx.Stop(err, branch) {
-				return out, err
-			}
-			break
-		} else {
-			ctx.Accept(branch)
-		}
-		if v == nil {
-			break
-		}
-	}
-	if i >= MaxIterations {
-		t := ctx.Peek()
-		return nil, Errorf(t.Pos, "too many iterations of %s (> %d)", r, MaxIterations)
-	}
-	if out == nil {
-		out = []reflect.Value{}
-	}
-	return out, nil
-}
-
 // Match a token literal exactly "..."[:<type>].
 type literal struct {
 	s  string
@@ -556,7 +490,7 @@ func (n *negation) Parse(ctx *parseContext, parent reflect.Value) (out []reflect
 	out, err = n.node.Parse(branch, parent)
 	if out != nil && err == nil {
 		// out being non-nil means that what we don't want is actually here, so we report nomatch
-		return nil, Errorf(notEOF.Pos, "unexpected '%s'", notEOF.Value)
+		return nil, &UnexpectedTokenError{Unexpected: *notEOF}
 	}
 
 	// Just give the next token
@@ -591,15 +525,15 @@ func conform(t reflect.Type, values []reflect.Value) (out []reflect.Value, err e
 		case reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64:
 			n, err := strconv.ParseInt(v.String(), 0, sizeOfKind(kind))
 			if err != nil {
-				return nil, fmt.Errorf("invalid integer %q: %s", v.String(), err)
+				return nil, err
 			}
 			v = reflect.New(t).Elem()
 			v.SetInt(n)
 
 		case reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64:
 			n, err := strconv.ParseUint(v.String(), 0, sizeOfKind(kind))
 			if err != nil {
-				return nil, fmt.Errorf("invalid integer %q: %s", v.String(), err)
+				return nil, err
 			}
 			v = reflect.New(t).Elem()
 			v.SetUint(n)
@@ -610,7 +544,7 @@ func conform(t reflect.Type, values []reflect.Value) (out []reflect.Value, err e
 		case reflect.Float32, reflect.Float64:
 			n, err := strconv.ParseFloat(v.String(), sizeOfKind(kind))
 			if err != nil {
-				return nil, fmt.Errorf("invalid integer %q: %s", v.String(), err)
+				return nil, err
 			}
 			v = reflect.New(t).Elem()
 			v.SetFloat(n)
@@ -735,9 +669,14 @@ func setField(tokens []lexer.Token, strct reflect.Value, field structLexerField,
 		if err != nil {
 			return err
 		}
+		if len(fieldValue) == 0 {
+			return nil
+		}
+		accumulated := f.String()
 		for _, v := range fieldValue {
-			f.Set(reflect.ValueOf(f.String() + v.String()).Convert(f.Type()))
+			accumulated += v.String()
 		}
+		f.SetString(accumulated)
 		return nil
 	}
 
@@ -755,6 +694,9 @@ func setField(tokens []lexer.Token, strct reflect.Value, field structLexerField,
 	if err != nil {
 		return err
 	}
+	if len(fieldValue) == 0 {
+		return nil // Nothing to capture, can happen when trying to get a partial parse tree
+	}
 
 	fv := fieldValue[0]