From 1f05da7649a87e23c458fb4861fe82b40e5c4f98 Mon Sep 17 00:00:00 2001 From: Petee Date: Mon, 5 Dec 2022 11:14:28 +0100 Subject: [PATCH] Bug fixes, improvements, optimization & refactoring before parser generation (#288) * Fix bad float conformation error message + add coverage * Remove duplicit information from int/float conversion errors Both strconv.ParseInt and strconv.ParseFloat already report the input. The word invalid isn't always true - it could also just be out of range. The type is contained within the Parse* function being called. So as the extra context added nothing useful, I decided to remove it altogether, making things much easier for the generated parser. * Fix possible panic when optional groups are used within a capture If an optional group within a capture matches no input (either because of an unexpected token or EOF), obtaining partial AST can cause `setField` to be called with no tokens, causing a panic. Fixed by not doing anything. * Improve mismatch error message for negation & negative lookahead group It didn't properly escape the token, the error would be malformed if the token contained single quotes. It also used a custom error instead of `UnexpectedTokenError` for no good reason. * Improve the choice of error reporting after optional branches The last error in the added test explains the difference. Without this change, it would have been `1:15: unexpected token "(" (expected ";")`, which would be very confusing as '(' was actually allowed after the class name. After this change, it's `1:20: unexpected token ")" (expected )`, pointing out the real problem - that an identifier was expected after the comma. * Optimize string field accumulation to do fewer allocations * Optimize finding case insensitive tokens to be done at parser construction Optimize finding case insensitive tokens to be done at parser construction Doing it before each parsing was unnecessary. One could worry the results would be different if `ParseFromLexer` was called with a different lexer than used to build the grammar, but then case-insentive tokens would be the least of our problems - literal & reference would check agaist the wrong TokenType. * Avoid an allocation in newParseContext Not very significant for the reflective parser, more significant for the generated one. * Include explicit disjunction in the union node The generated parser would be the second place that would construct a temporary disjunction in the implementation, I think it might as well contain the disjunction directly. * Remove unused optional and repetition grammar nodes They could never be created and were private, so this certainly can't break anything. * Change the receiver name of lookaheadGroup from n to l It was n probably because I copied it from negation and assumed n was for node. Naming it l is more consistent with the convention for other nodes. * Keep track of the number of usages of a strct node * Allow specifying UnexpectedTokenError expected sequence as a string Co-authored-by: Peter Dolak Co-authored-by: Peter Dolak --- context.go | 15 ++++--- ebnf.go | 10 +---- error.go | 9 ++-- error_test.go | 19 +++++---- grammar.go | 7 +++- nodes.go | 110 ++++++++++++------------------------------------- parser.go | 43 ++++++++++--------- parser_test.go | 49 +++++++++++++++++++--- visit.go | 6 +-- 9 files changed, 128 insertions(+), 140 deletions(-) diff --git a/context.go b/context.go index 034f799e..c5397847 100644 --- a/context.go +++ b/context.go @@ -29,8 +29,8 @@ type parseContext struct { allowTrailing bool } -func newParseContext(lex *lexer.PeekingLexer, lookahead int, caseInsensitive map[lexer.TokenType]bool) *parseContext { - return &parseContext{ +func newParseContext(lex *lexer.PeekingLexer, lookahead int, caseInsensitive map[lexer.TokenType]bool) parseContext { + return parseContext{ PeekingLexer: *lex, caseInsensitive: caseInsensitive, lookahead: lookahead, @@ -90,11 +90,14 @@ func (p *parseContext) MaybeUpdateError(err error) { // Stop returns true if parsing should terminate after the given "branch" failed to match. // -// Additionally, "err" should be the branch error, if any. This will be tracked to -// aid in error reporting under the assumption that the deepest occurring error is more -// useful than errors further up. +// Additionally, track the deepest error in the branch - the deeper the error, the more useful it usually is. +// It could already be the deepest error in the branch (only if deeper than current parent context deepest), +// or it could be "err", the latest error on the branch (even if same depth; the lexer holds the position). func (p *parseContext) Stop(err error, branch *parseContext) bool { - if branch.PeekingLexer.Cursor() >= p.deepestErrorDepth { + if branch.deepestErrorDepth > p.deepestErrorDepth { + p.deepestError = branch.deepestError + p.deepestErrorDepth = branch.deepestErrorDepth + } else if branch.PeekingLexer.Cursor() >= p.deepestErrorDepth { p.deepestError = err p.deepestErrorDepth = maxInt(branch.PeekingLexer.Cursor(), branch.deepestErrorDepth) } diff --git a/ebnf.go b/ebnf.go index 1698f753..957e0440 100644 --- a/ebnf.go +++ b/ebnf.go @@ -62,7 +62,7 @@ func buildEBNF(root bool, n node, seen map[node]bool, p *ebnfp, outp *[]*ebnfp) p = &ebnfp{name: name} *outp = append(*outp, p) seen[n] = true - for i, next := range n.nodeMembers { + for i, next := range n.disjunction.nodes { if i > 0 { p.out += " | " } @@ -111,14 +111,6 @@ func buildEBNF(root bool, n node, seen map[node]bool, p *ebnfp, outp *[]*ebnfp) case *reference: p.out += "<" + strings.ToLower(n.identifier) + ">" - case *optional: - buildEBNF(false, n.node, seen, p, outp) - p.out += "?" - - case *repetition: - buildEBNF(false, n.node, seen, p, outp) - p.out += "*" - case *negation: p.out += "~" buildEBNF(false, n.node, seen, p, outp) diff --git a/error.go b/error.go index 33827380..bcd62a7d 100644 --- a/error.go +++ b/error.go @@ -42,15 +42,18 @@ func FormatError(err Error) string { // This is useful for composing parsers in order to detect when a sub-parser has terminated. type UnexpectedTokenError struct { Unexpected lexer.Token - at node + Expect string + expectNode node // Usable instead of Expect, delays creating the string representation until necessary } func (u *UnexpectedTokenError) Error() string { return FormatError(u) } func (u *UnexpectedTokenError) Message() string { // nolint: golint var expected string - if u.at != nil { - expected = fmt.Sprintf(" (expected %s)", u.at) + if u.expectNode != nil { + expected = fmt.Sprintf(" (expected %s)", u.expectNode) + } else if u.Expect != "" { + expected = fmt.Sprintf(" (expected %s)", u.Expect) } return fmt.Sprintf("unexpected token %q%s", u.Unexpected, expected) } diff --git a/error_test.go b/error_test.go index 71b62e97..c076363e 100644 --- a/error_test.go +++ b/error_test.go @@ -5,14 +5,16 @@ import ( "testing" require "github.com/alecthomas/assert/v2" + "github.com/alecthomas/participle/v2" "github.com/alecthomas/participle/v2/lexer" ) func TestErrorReporting(t *testing.T) { type cls struct { - Visibility string `@"public"?` - Class string `"class" @Ident` + Visibility string `@"public"?` + Class string `"class" @Ident` + Bases []string `('(' @Ident (',' @Ident)+ ')')?` } type union struct { Visibility string `@"public"?` @@ -27,17 +29,20 @@ func TestErrorReporting(t *testing.T) { } p := mustTestParser[grammar](t, participle.UseLookahead(5)) - ast, err := p.ParseString("", `public class A;`) + ast, err := p.ParseString("", `public class A(B, C); class D; public union A;`) require.NoError(t, err) require.Equal(t, &grammar{Decls: []*decl{ - {Class: &cls{Visibility: "public", Class: "A"}}, + {Class: &cls{Visibility: "public", Class: "A", Bases: []string{"B", "C"}}}, + {Class: &cls{Class: "D"}}, + {Union: &union{Visibility: "public", Union: "A"}}, }}, ast) - _, err = p.ParseString("", `public union A;`) - require.NoError(t, err) + _, err = p.ParseString("", `public struct Bar;`) require.EqualError(t, err, `1:8: unexpected token "struct" (expected "union" )`) _, err = p.ParseString("", `public class 1;`) - require.EqualError(t, err, `1:14: unexpected token "1" (expected )`) + require.EqualError(t, err, `1:14: unexpected token "1" (expected ("(" ("," )+ ")")?)`) + _, err = p.ParseString("", `public class A(B,C,);`) + require.EqualError(t, err, `1:20: unexpected token ")" (expected )`) } func TestErrorWrap(t *testing.T) { diff --git a/grammar.go b/grammar.go index d534f831..7eebee0d 100644 --- a/grammar.go +++ b/grammar.go @@ -30,7 +30,7 @@ func (g *generatorContext) addUnionDefs(defs []unionDef) error { } unionNode := &union{ unionDef: def, - nodeMembers: make([]node, 0, len(def.members)), + disjunction: disjunction{nodes: make([]node, 0, len(def.members))}, } g.typeNodes[def.typ], unionNodes[i] = unionNode, unionNode } @@ -41,7 +41,7 @@ func (g *generatorContext) addUnionDefs(defs []unionDef) error { if err != nil { return err } - unionNode.nodeMembers = append(unionNode.nodeMembers, memberNode) + unionNode.disjunction.nodes = append(unionNode.disjunction.nodes, memberNode) } } return nil @@ -61,6 +61,9 @@ func (g *generatorContext) addCustomDefs(defs []customDef) error { func (g *generatorContext) parseType(t reflect.Type) (_ node, returnedError error) { t = indirectType(t) if n, ok := g.typeNodes[t]; ok { + if s, ok := n.(*strct); ok { + s.usages++ + } return n, nil } if t.Implements(parseableType) { diff --git a/nodes.go b/nodes.go index cc2b3b1e..c291918b 100644 --- a/nodes.go +++ b/nodes.go @@ -97,7 +97,7 @@ func (c *custom) Parse(ctx *parseContext, parent reflect.Value) (out []reflect.V // @@ (for a union) type union struct { unionDef - nodeMembers []node + disjunction disjunction } func (u *union) String() string { return ebnf(u) } @@ -105,8 +105,7 @@ func (u *union) GoString() string { return u.typ.Name() } func (u *union) Parse(ctx *parseContext, parent reflect.Value) (out []reflect.Value, err error) { defer ctx.printTrace(u)() - temp := disjunction{u.nodeMembers} - vals, err := temp.Parse(ctx, parent) + vals, err := u.disjunction.Parse(ctx, parent) if err != nil { return nil, err } @@ -123,11 +122,13 @@ type strct struct { tokensFieldIndex []int posFieldIndex []int endPosFieldIndex []int + usages int } func newStrct(typ reflect.Type) *strct { s := &strct{ - typ: typ, + typ: typ, + usages: 1, } field, ok := typ.FieldByName("Pos") if ok && field.Type == positionType { @@ -295,19 +296,18 @@ type lookaheadGroup struct { negative bool } -func (n *lookaheadGroup) String() string { return ebnf(n) } -func (n *lookaheadGroup) GoString() string { return "lookaheadGroup{}" } +func (l *lookaheadGroup) String() string { return ebnf(l) } +func (l *lookaheadGroup) GoString() string { return "lookaheadGroup{}" } -func (n *lookaheadGroup) Parse(ctx *parseContext, parent reflect.Value) (out []reflect.Value, err error) { - defer ctx.printTrace(n)() +func (l *lookaheadGroup) Parse(ctx *parseContext, parent reflect.Value) (out []reflect.Value, err error) { + defer ctx.printTrace(l)() // Create a branch to avoid advancing the parser as any match will be discarded branch := ctx.Branch() - out, err = n.expr.Parse(branch, parent) + out, err = l.expr.Parse(branch, parent) matchedLookahead := err == nil && out != nil - expectingMatch := !n.negative + expectingMatch := !l.negative if matchedLookahead != expectingMatch { - peek := ctx.Peek() - return nil, Errorf(peek.Pos, "unexpected '%s'", peek.Value) + return nil, &UnexpectedTokenError{Unexpected: *ctx.Peek()} } return []reflect.Value{}, nil // Empty match slice means a match, unlike nil } @@ -382,7 +382,7 @@ func (s *sequence) Parse(ctx *parseContext, parent reflect.Value) (out []reflect return nil, nil } token := ctx.Peek() - return out, &UnexpectedTokenError{Unexpected: *token, at: n} + return out, &UnexpectedTokenError{Unexpected: *token, expectNode: n} } // Special-case for when children return an empty match. // Appending an empty, non-nil slice to a nil slice returns a nil slice. @@ -440,72 +440,6 @@ func (r *reference) Parse(ctx *parseContext, parent reflect.Value) (out []reflec return []reflect.Value{reflect.ValueOf(token.Value)}, nil } -// [ ] -type optional struct { - node node -} - -func (o *optional) String() string { return ebnf(o) } -func (o *optional) GoString() string { return "optional{}" } - -func (o *optional) Parse(ctx *parseContext, parent reflect.Value) (out []reflect.Value, err error) { - defer ctx.printTrace(o)() - branch := ctx.Branch() - out, err = o.node.Parse(branch, parent) - if err != nil { - // Optional part failed to match. - if ctx.Stop(err, branch) { - return out, err - } - } else { - ctx.Accept(branch) - } - if out == nil { - out = []reflect.Value{} - } - return out, nil -} - -// { } -type repetition struct { - node node -} - -func (r *repetition) String() string { return ebnf(r) } -func (r *repetition) GoString() string { return "repetition{}" } - -// Parse a repetition. Once a repetition is encountered it will always match, so grammars -// should ensure that branches are differentiated prior to the repetition. -func (r *repetition) Parse(ctx *parseContext, parent reflect.Value) (out []reflect.Value, err error) { - defer ctx.printTrace(r)() - i := 0 - for ; i < MaxIterations; i++ { - branch := ctx.Branch() - v, err := r.node.Parse(branch, parent) - out = append(out, v...) - if err != nil { - // Optional part failed to match. - if ctx.Stop(err, branch) { - return out, err - } - break - } else { - ctx.Accept(branch) - } - if v == nil { - break - } - } - if i >= MaxIterations { - t := ctx.Peek() - return nil, Errorf(t.Pos, "too many iterations of %s (> %d)", r, MaxIterations) - } - if out == nil { - out = []reflect.Value{} - } - return out, nil -} - // Match a token literal exactly "..."[:]. type literal struct { s string @@ -556,7 +490,7 @@ func (n *negation) Parse(ctx *parseContext, parent reflect.Value) (out []reflect out, err = n.node.Parse(branch, parent) if out != nil && err == nil { // out being non-nil means that what we don't want is actually here, so we report nomatch - return nil, Errorf(notEOF.Pos, "unexpected '%s'", notEOF.Value) + return nil, &UnexpectedTokenError{Unexpected: *notEOF} } // Just give the next token @@ -591,7 +525,7 @@ func conform(t reflect.Type, values []reflect.Value) (out []reflect.Value, err e case reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64: n, err := strconv.ParseInt(v.String(), 0, sizeOfKind(kind)) if err != nil { - return nil, fmt.Errorf("invalid integer %q: %s", v.String(), err) + return nil, err } v = reflect.New(t).Elem() v.SetInt(n) @@ -599,7 +533,7 @@ func conform(t reflect.Type, values []reflect.Value) (out []reflect.Value, err e case reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64: n, err := strconv.ParseUint(v.String(), 0, sizeOfKind(kind)) if err != nil { - return nil, fmt.Errorf("invalid integer %q: %s", v.String(), err) + return nil, err } v = reflect.New(t).Elem() v.SetUint(n) @@ -610,7 +544,7 @@ func conform(t reflect.Type, values []reflect.Value) (out []reflect.Value, err e case reflect.Float32, reflect.Float64: n, err := strconv.ParseFloat(v.String(), sizeOfKind(kind)) if err != nil { - return nil, fmt.Errorf("invalid integer %q: %s", v.String(), err) + return nil, err } v = reflect.New(t).Elem() v.SetFloat(n) @@ -735,9 +669,14 @@ func setField(tokens []lexer.Token, strct reflect.Value, field structLexerField, if err != nil { return err } + if len(fieldValue) == 0 { + return nil + } + accumulated := f.String() for _, v := range fieldValue { - f.Set(reflect.ValueOf(f.String() + v.String()).Convert(f.Type())) + accumulated += v.String() } + f.SetString(accumulated) return nil } @@ -755,6 +694,9 @@ func setField(tokens []lexer.Token, strct reflect.Value, field structLexerField, if err != nil { return err } + if len(fieldValue) == 0 { + return nil // Nothing to capture, can happen when trying to get a partial parse tree + } fv := fieldValue[0] diff --git a/parser.go b/parser.go index 4952afd5..46927569 100644 --- a/parser.go +++ b/parser.go @@ -21,15 +21,16 @@ type customDef struct { } type parserOptions struct { - lex lexer.Definition - rootType reflect.Type - typeNodes map[reflect.Type]node - useLookahead int - caseInsensitive map[string]bool - mappers []mapperByToken - unionDefs []unionDef - customDefs []customDef - elide []string + lex lexer.Definition + rootType reflect.Type + typeNodes map[reflect.Type]node + useLookahead int + caseInsensitive map[string]bool + caseInsensitiveTokens map[lexer.TokenType]bool + mappers []mapperByToken + unionDefs []unionDef + customDefs []customDef + elide []string } // A Parser for a particular grammar and lexer. @@ -132,6 +133,7 @@ func Build[G any](options ...Option) (parser *Parser[G], err error) { } p.typeNodes = context.typeNodes p.typeNodes[p.rootType] = rootNode + p.setCaseInsensitiveTokens() return p, nil } @@ -162,22 +164,25 @@ func (p *Parser[G]) ParseFromLexer(lex *lexer.PeekingLexer, options ...ParseOpti if err != nil { return nil, err } - caseInsensitive := map[lexer.TokenType]bool{} - for sym, tt := range p.lex.Symbols() { - if p.caseInsensitive[sym] { - caseInsensitive[tt] = true - } - } - ctx := newParseContext(lex, p.useLookahead, caseInsensitive) + ctx := newParseContext(lex, p.useLookahead, p.caseInsensitiveTokens) defer func() { *lex = ctx.PeekingLexer }() for _, option := range options { - option(ctx) + option(&ctx) } // If the grammar implements Parseable, use it. if parseable, ok := any(v).(Parseable); ok { - return v, p.rootParseable(ctx, parseable) + return v, p.rootParseable(&ctx, parseable) + } + return v, p.parseOne(&ctx, parseNode, rv) +} + +func (p *Parser[G]) setCaseInsensitiveTokens() { + p.caseInsensitiveTokens = map[lexer.TokenType]bool{} + for sym, tt := range p.lex.Symbols() { + if p.caseInsensitive[sym] { + p.caseInsensitiveTokens[tt] = true + } } - return v, p.parseOne(ctx, parseNode, rv) } func (p *Parser[G]) parse(lex lexer.Lexer, options ...ParseOption) (v *G, err error) { diff --git a/parser_test.go b/parser_test.go index 64a031fc..d69af184 100644 --- a/parser_test.go +++ b/parser_test.go @@ -12,6 +12,7 @@ import ( "text/scanner" require "github.com/alecthomas/assert/v2" + "github.com/alecthomas/participle/v2" "github.com/alecthomas/participle/v2/lexer" ) @@ -200,8 +201,9 @@ func TestRepetitionAcrossFields(t *testing.T) { } func TestAccumulateString(t *testing.T) { + type customString string type testAccumulateString struct { - A string `@"."+` + A customString `@"."+` } parser := mustTestParser[testAccumulateString](t) @@ -741,13 +743,18 @@ func TestEmptyStructErrorsNotPanicsIssue21(t *testing.T) { func TestMultipleTokensIntoScalar(t *testing.T) { type grammar struct { - Field int `@("-" Int)` + Field int `@("-"? Int)` } p, err := participle.Build[grammar]() require.NoError(t, err) actual, err := p.ParseString("", `- 10`) require.NoError(t, err) require.Equal(t, -10, actual.Field) + + _, err = p.ParseString("", `x`) + require.EqualError(t, err, `1:1: unexpected token "x" (expected )`) + _, err = p.ParseString("", ` `) + require.EqualError(t, err, `1:2: unexpected token "" (expected )`) } type posMixin struct { @@ -1220,6 +1227,20 @@ func TestNegationWithDisjunction(t *testing.T) { require.Equal(t, &[]string{"hello", "world", ","}, ast.EverythingMoreComplex) } +func TestNegationLookaheadError(t *testing.T) { + type grammar struct { + Stuff []string `@Ident @!('.' | '#') @Ident` + } + p := mustTestParser[grammar](t) + + ast, err := p.ParseString("", `hello, world`) + require.NoError(t, err) + require.Equal(t, []string{"hello", ",", "world"}, ast.Stuff) + + _, err = p.ParseString("", `hello . world`) + require.EqualError(t, err, `1:7: unexpected token "."`) +} + func TestLookaheadGroup_Positive_SingleToken(t *testing.T) { type val struct { Str string ` @String` @@ -1247,10 +1268,10 @@ func TestLookaheadGroup_Positive_SingleToken(t *testing.T) { require.NoError(t, err) require.Equal(t, &sum{Left: val{Int: 1}, Ops: []op{{"*", val{Int: 2}}, {"*", val{Int: 3}}}}, ast) - _, err = p.ParseString("", `"a" * "x" + "b"`) - require.EqualError(t, err, `1:7: unexpected '"x"'`) + _, err = p.ParseString("", `"a" * 'x' + "b"`) + require.EqualError(t, err, `1:7: unexpected token "'x'"`) _, err = p.ParseString("", `4 * 2 + 0 * "b"`) - require.EqualError(t, err, `1:13: unexpected '"b"'`) + require.EqualError(t, err, `1:13: unexpected token "\"b\""`) } func TestLookaheadGroup_Negative_SingleToken(t *testing.T) { @@ -1864,3 +1885,21 @@ func TestIssue255(t *testing.T) { require.NoError(t, err) require.Equal(t, &I255Grammar{Union: &I255String{Value: `"Hello, World!"`}}, g) } + +func TestParseNumbers(t *testing.T) { + type grammar struct { + Int int8 `@('-'? Int)` + Uint uint16 `@('-'? Int)` + Float float64 `@Ident` + } + parser := participle.MustBuild[grammar]() + _, err := parser.ParseString("", `300 0 x`) + require.EqualError(t, err, `grammar.Int: strconv.ParseInt: parsing "300": value out of range`) + _, err = parser.ParseString("", `-2 -2 x`) + require.EqualError(t, err, `grammar.Uint: strconv.ParseUint: parsing "-2": invalid syntax`) + _, err = parser.ParseString("", `0 0 nope`) + require.EqualError(t, err, `grammar.Float: strconv.ParseFloat: parsing "nope": invalid syntax`) + result, err := parser.ParseString("", `-30 3000 Inf`) + require.NoError(t, err) + require.Equal(t, grammar{Int: -30, Uint: 3000, Float: math.Inf(1)}, *result) +} diff --git a/visit.go b/visit.go index e4186b18..8b46c13c 100644 --- a/visit.go +++ b/visit.go @@ -20,7 +20,7 @@ func visit(n node, visitor func(n node, next func() error) error) error { case *custom: return nil case *union: - for _, member := range n.nodeMembers { + for _, member := range n.disjunction.nodes { if err := visit(member, visitor); err != nil { return err } @@ -40,10 +40,6 @@ func visit(n node, visitor func(n node, next func() error) error) error { return visit(n.node, visitor) case *reference: return nil - case *optional: - return visit(n.node, visitor) - case *repetition: - return visit(n.node, visitor) case *negation: return visit(n.node, visitor) case *literal: