From f9c3ae43257286625f88c66c1225d0056edc178b Mon Sep 17 00:00:00 2001
From: Alec Thomas
Date: Sun, 20 Sep 2020 15:50:25 +1000
Subject: [PATCH] Support capturing all tokens into the AST.

This includes tokens elided by Elide(), but not tokens elided by the Lexer.

See #108.
---
 v2/README.md          |  2 ++
 v2/lexer/peek.go      | 45 +++++++++++++++++++++++++--------
 v2/lexer/peek_test.go |  5 +++-
 v2/map.go             |  7 ++---
 v2/nodes.go           | 59 +++++++++++++++++++++++++------------
 v2/parser.go          | 22 +++++++++++++---
 v2/parser_test.go     | 37 +++++++++++++++++++++++++++
 7 files changed, 134 insertions(+), 43 deletions(-)

diff --git a/v2/README.md b/v2/README.md
index 6dc5bb5d..e1664f8f 100644
--- a/v2/README.md
+++ b/v2/README.md
@@ -370,6 +370,8 @@ There are a few areas where Participle can provide useful feedback to users of y
    populated from the nearest matching token.
 4. Any node in the AST containing a field `EndPos lexer.Position` will be automatically
    populated from the token at the end of the node.
+5. Any node in the AST containing a field `Tokens []lexer.Token` will be automatically
+   populated with _all_ tokens captured by the node, _including_ elided tokens.
 
 These related pieces of information can be combined to provide fairly comprehensive
 error reporting.
diff --git a/v2/lexer/peek.go b/v2/lexer/peek.go
index bccf9632..b5cfdd89 100644
--- a/v2/lexer/peek.go
+++ b/v2/lexer/peek.go
@@ -5,13 +5,21 @@ type PeekingLexer struct {
     cursor int
     eof    Token
     tokens []Token
+    elide  map[rune]bool
 }
 
 var _ Lexer = &PeekingLexer{}
 
 // Upgrade a Lexer to a PeekingLexer with arbitrary lookahead.
-func Upgrade(lex Lexer) (*PeekingLexer, error) {
-    r := &PeekingLexer{}
+//
+// "elide" is a slice of token types to elide from processing.
+func Upgrade(lex Lexer, elide ...rune) (*PeekingLexer, error) {
+    r := &PeekingLexer{
+        elide: make(map[rune]bool, len(elide)),
+    }
+    for _, rn := range elide {
+        r.elide[rn] = true
+    }
     for {
         t, err := lex.Next()
         if err != nil {
@@ -26,27 +34,42 @@ func Upgrade(lex Lexer) (*PeekingLexer, error) {
     return r, nil
 }
 
-// Cursor position in tokens.
+// Range returns the slice of tokens between the two cursor points.
+func (p *PeekingLexer) Range(start, end int) []Token {
+    return p.tokens[start:end]
+}
+
+// Cursor position in tokens (includes elided tokens).
 func (p *PeekingLexer) Cursor() int {
     return p.cursor
 }
 
 // Next consumes and returns the next token.
 func (p *PeekingLexer) Next() (Token, error) {
-    if p.cursor >= len(p.tokens) {
-        return p.eof, nil
+    for p.cursor < len(p.tokens) {
+        t := p.tokens[p.cursor]
+        p.cursor++
+        if p.elide[t.Type] {
+            continue
+        }
+        return p.tokens[p.cursor-1], nil
     }
-    p.cursor++
-    return p.tokens[p.cursor-1], nil
+    return p.eof, nil
 }
 
 // Peek ahead at the n+1 token. ie. Peek(0) will peek at the next token.
 func (p *PeekingLexer) Peek(n int) (Token, error) {
-    i := p.cursor + n
-    if i >= len(p.tokens) {
-        return p.eof, nil
+    for i := p.cursor; i < len(p.tokens); i++ {
+        t := p.tokens[i]
+        if p.elide[t.Type] {
+            continue
+        }
+        if n == 0 {
+            return t, nil
+        }
+        n--
     }
-    return p.tokens[i], nil
+    return p.eof, nil
 }
 
 // Clone creates a clone of this PeekingLexer at its current token.
diff --git a/v2/lexer/peek_test.go b/v2/lexer/peek_test.go
index 178324ac..88988ed2 100644
--- a/v2/lexer/peek_test.go
+++ b/v2/lexer/peek_test.go
@@ -21,8 +21,10 @@ func (s *staticLexer) Next() (Token, error) {
 
 func TestUpgrade(t *testing.T) {
     t0 := Token{Type: 1, Value: "moo"}
+    ts := Token{Type: 3, Value: " "}
     t1 := Token{Type: 2, Value: "blah"}
-    l, err := Upgrade(&staticLexer{tokens: []Token{t0, t1}})
+    tokens := []Token{t0, ts, t1}
+    l, err := Upgrade(&staticLexer{tokens: tokens}, 3)
     require.NoError(t, err)
     require.Equal(t, t0, mustPeek(t, l, 0))
     require.Equal(t, t0, mustPeek(t, l, 0))
@@ -30,6 +32,7 @@ func TestUpgrade(t *testing.T) {
     require.Equal(t, t1, mustPeek(t, l, 1))
     require.True(t, mustPeek(t, l, 2).EOF())
     require.True(t, mustPeek(t, l, 3).EOF())
+    require.Equal(t, tokens, l.Range(0, 3))
 }
 
 func mustPeek(t *testing.T, lexer *PeekingLexer, n int) Token {
diff --git a/v2/map.go b/v2/map.go
index 43d5ea1f..4a2b14cd 100644
--- a/v2/map.go
+++ b/v2/map.go
@@ -79,9 +79,10 @@ func Upper(types ...string) Option {
 
 // Elide drops tokens of the specified types.
 func Elide(types ...string) Option {
-    return Map(func(token lexer.Token) (lexer.Token, error) {
-        return lexer.Token{}, DropToken
-    }, types...)
+    return func(p *Parser) error {
+        p.elide = append(p.elide, types...)
+        return nil
+    }
 }
 
 // Apply a Mapping to all tokens coming out of a Lexer.
diff --git a/v2/nodes.go b/v2/nodes.go
index 2df1f212..fb7e5b4b 100644
--- a/v2/nodes.go
+++ b/v2/nodes.go
@@ -16,6 +16,7 @@ var (
     MaxIterations = 1000000
 
     positionType        = reflect.TypeOf(lexer.Position{})
+    tokensType          = reflect.TypeOf([]lexer.Token{})
     captureType         = reflect.TypeOf((*Capture)(nil)).Elem()
     textUnmarshalerType = reflect.TypeOf((*encoding.TextUnmarshaler)(nil)).Elem()
     parseableType       = reflect.TypeOf((*Parseable)(nil)).Elem()
@@ -71,48 +72,35 @@ func (p *parseable) Parse(ctx *parseContext, parent reflect.Value) (out []reflec
 type strct struct {
     typ              reflect.Type
     expr             node
+    tokensFieldIndex []int
     posFieldIndex    []int
     endPosFieldIndex []int
 }
 
 func newStrct(typ reflect.Type) *strct {
-    var (
-        posFieldIndex    []int
-        endPosFieldIndex []int
-    )
+    s := &strct{
+        typ: typ,
+    }
     field, ok := typ.FieldByName("Pos")
     if ok && field.Type == positionType {
-        posFieldIndex = field.Index
+        s.posFieldIndex = field.Index
     }
     field, ok = typ.FieldByName("EndPos")
     if ok && field.Type == positionType {
-        endPosFieldIndex = field.Index
+        s.endPosFieldIndex = field.Index
     }
-    return &strct{
-        typ:              typ,
-        posFieldIndex:    posFieldIndex,
-        endPosFieldIndex: endPosFieldIndex,
+    field, ok = typ.FieldByName("Tokens")
+    if ok && field.Type == tokensType {
+        s.tokensFieldIndex = field.Index
     }
+    return s
 }
 
 func (s *strct) String() string { return stringer(s) }
 
-func (s *strct) maybeInjectStartToken(token lexer.Token, v reflect.Value) {
-    if s.posFieldIndex == nil {
-        return
-    }
-    v.FieldByIndex(s.posFieldIndex).Set(reflect.ValueOf(token.Pos))
-}
-
-func (s *strct) maybeInjectEndToken(token lexer.Token, v reflect.Value) {
-    if s.endPosFieldIndex == nil {
-        return
-    }
-    v.FieldByIndex(s.endPosFieldIndex).Set(reflect.ValueOf(token.Pos))
-}
-
 func (s *strct) Parse(ctx *parseContext, parent reflect.Value) (out []reflect.Value, err error) {
     sv := reflect.New(s.typ).Elem()
+    start := ctx.Cursor()
     t, err := ctx.Peek(0)
     if err != nil {
         return nil, err
@@ -125,11 +113,34 @@ func (s *strct) Parse(ctx *parseContext, parent reflect.Value) (out []reflect.Va
     } else if out == nil {
         return nil, nil
     }
+    end := ctx.Cursor()
     t, _ = ctx.Peek(0)
     s.maybeInjectEndToken(t, sv)
+    s.maybeInjectTokens(ctx.Range(start, end), sv)
     return []reflect.Value{sv}, ctx.Apply()
 }
 
+func (s *strct) maybeInjectStartToken(token lexer.Token, v reflect.Value) {
+    if s.posFieldIndex == nil {
+        return
+    }
+    v.FieldByIndex(s.posFieldIndex).Set(reflect.ValueOf(token.Pos))
+}
+
+func (s *strct) maybeInjectEndToken(token lexer.Token, v reflect.Value) {
+    if s.endPosFieldIndex == nil {
+        return
+    }
+    v.FieldByIndex(s.endPosFieldIndex).Set(reflect.ValueOf(token.Pos))
+}
+
+func (s *strct) maybeInjectTokens(tokens []lexer.Token, v reflect.Value) {
+    if s.tokensFieldIndex == nil {
+        return
+    }
+    v.FieldByIndex(s.tokensFieldIndex).Set(reflect.ValueOf(tokens))
+}
+
 type groupMatchMode int
 
 const (
diff --git a/v2/parser.go b/v2/parser.go
index 9168b253..cc9abb04 100644
--- a/v2/parser.go
+++ b/v2/parser.go
@@ -16,6 +16,7 @@ type Parser struct {
     useLookahead    int
     caseInsensitive map[string]bool
     mappers         []mapperByToken
+    elide           []string
 }
 
 // MustBuild calls Build(grammar, options...) and panics if an error occurs.
@@ -46,9 +47,9 @@ func Build(grammar interface{}, options ...Option) (parser *Parser, err error) {
         }
     }
 
+    symbols := p.lex.Symbols()
     if len(p.mappers) > 0 {
         mappers := map[rune][]Mapper{}
-        symbols := p.lex.Symbols()
         for _, mapper := range p.mappers {
             if len(mapper.symbols) == 0 {
                 mappers[lexer.EOF] = append(mappers[lexer.EOF], mapper.mapper)
@@ -161,7 +162,7 @@ func (p *Parser) ParseReader(filename string, r io.Reader, v interface{}, option
     if err != nil {
         return err
     }
-    peeker, err := lexer.Upgrade(lex)
+    peeker, err := lexer.Upgrade(lex, p.getElidedTypes()...)
     if err != nil {
         return err
     }
@@ -177,7 +178,7 @@ func (p *Parser) ParseString(filename string, s string, v interface{}, options .
     if err != nil {
         return err
     }
-    peeker, err := lexer.Upgrade(lex)
+    peeker, err := lexer.Upgrade(lex, p.getElidedTypes()...)
     if err != nil {
         return err
     }
@@ -193,7 +194,7 @@ func (p *Parser) ParseBytes(filename string, b []byte, v interface{}, options ..
     if err != nil {
         return err
     }
-    peeker, err := lexer.Upgrade(lex)
+    peeker, err := lexer.Upgrade(lex, p.getElidedTypes()...)
     if err != nil {
         return err
     }
@@ -266,3 +267,16 @@ func (p *Parser) rootParseable(ctx *parseContext, parseable Parseable) error {
     }
     return nil
 }
+
+func (p *Parser) getElidedTypes() []rune {
+    symbols := p.lex.Symbols()
+    elideTypes := make([]rune, 0, len(p.elide))
+    for _, elide := range p.elide {
+        rn, ok := symbols[elide]
+        if !ok {
+            panic(fmt.Errorf("Elide() uses unknown token %q", elide))
+        }
+        elideTypes = append(elideTypes, rn)
+    }
+    return elideTypes
+}
diff --git a/v2/parser_test.go b/v2/parser_test.go
index 20e28f7b..d26696de 100644
--- a/v2/parser_test.go
+++ b/v2/parser_test.go
@@ -1273,3 +1273,40 @@ func TestNegationWithDisjunction(t *testing.T) {
 
     require.Equal(t, &[]string{"hello", "world", ","}, ast.EverythingMoreComplex)
 }
+
+func TestASTTokens(t *testing.T) {
+    type subject struct {
+        Tokens []lexer.Token
+
+        Word string `@Ident`
+    }
+
+    type hello struct {
+        Tokens []lexer.Token
+
+        Subject subject `"hello" @@`
+    }
+
+    p := mustTestParser(t, &hello{},
+        participle.Elide("Whitespace"),
+        participle.Lexer(lexer.Must(stateful.NewSimple([]stateful.Rule{
+            {"Ident", `\w+`, nil},
+            {"Whitespace", `\s+`, nil},
+        }))))
+    actual := &hello{}
+    err := p.ParseString("", "hello world", actual)
+    require.NoError(t, err)
+    tokens := []lexer.Token{
+        {-2, "hello", lexer.Position{Line: 1, Column: 1}},
+        {-3, " ", lexer.Position{Offset: 5, Line: 1, Column: 6}},
+        {-2, "world", lexer.Position{Offset: 6, Line: 1, Column: 7}},
+    }
+    expected := &hello{
+        Tokens: tokens,
+        Subject: subject{
+            Tokens: tokens[1:],
+            Word:   "world",
+        },
+    }
+    require.Equal(t, expected, actual)
+}
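
Example usage (not part of the patch): a minimal sketch of how a grammar opts in to
token capture once this lands, modelled on TestASTTokens above. The import paths and
the Greeting grammar are illustrative assumptions, not code from this change.

    package main

    import (
        "fmt"
        "log"

        "github.com/alecthomas/participle/v2"
        "github.com/alecthomas/participle/v2/lexer"
        "github.com/alecthomas/participle/v2/lexer/stateful"
    )

    // Greeting opts in to token capture via the magic "Tokens" field.
    type Greeting struct {
        Tokens []lexer.Token // Populated with every token the node matched, including Elide()d ones.

        Name string `"hello" @Ident`
    }

    func main() {
        parser := participle.MustBuild(&Greeting{},
            participle.Lexer(lexer.Must(stateful.NewSimple([]stateful.Rule{
                {"Ident", `\w+`, nil},
                {"Whitespace", `\s+`, nil},
            }))),
            participle.Elide("Whitespace"),
        )

        greeting := &Greeting{}
        if err := parser.ParseString("", "hello world", greeting); err != nil {
            log.Fatal(err)
        }
        // The elided whitespace token appears alongside "hello" and "world".
        for _, tok := range greeting.Tokens {
            fmt.Printf("%d:%d %q\n", tok.Pos.Line, tok.Pos.Column, tok.Value)
        }
    }

Because Elide()d tokens are retained in Tokens, error reporting can quote a node's
source more faithfully than the captured fields alone allow.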