Initial refactor of lexing.
Extended lexer.Definition to support directly lexing strings and []byte
slices. Removed the ebnf and regex lexers.

An adapter has been added for v0 lexers.
alecthomas committed Sep 18, 2020
1 parent 2403858 commit 1444519
Showing 35 changed files with 202 additions and 1,875 deletions.
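Judging from the adapter added in lexer/adapters.go below, the extended lexer.Definition gains string and []byte entry points alongside the reader-based one. A sketch of the implied interface (method names are taken from the adapter code; the exact declaration in the commit may differ):

package lexer

import "io"

// Definition, as implied by this commit: Symbols() as before, plus three
// lexing entry points in place of the single v0 Lex(io.Reader).
type Definition interface {
	Symbols() map[string]rune
	LexReader(io.Reader) (Lexer, error)
	LexString(string) (Lexer, error)
	LexBytes([]byte) (Lexer, error)
}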
25 changes: 10 additions & 15 deletions _examples/basic/main.go
@@ -8,30 +8,25 @@ import (

"github.com/alecthomas/participle"
"github.com/alecthomas/participle/lexer"
"github.com/alecthomas/participle/lexer/ebnf"
"github.com/alecthomas/participle/lexer/stateful"
)

var (
basicLexer = lexer.Must(ebnf.New(`
Comment = ("REM" | "rem" ) { "\u0000"…"\uffff"-"\n"-"\r" } .
Ident = (alpha | "_") { "_" | alpha | digit } .
String = "\"" { "\u0000"…"\uffff"-"\""-"\\" | "\\" any } "\"" .
Number = [ "-" | "+" ] ("." | digit) { "." | digit } .
Punct = "!"…"/" | ":"…"@" | "["…` + "\"`\"" + ` | "{"…"~" .
EOL = ( "\n" | "\r" ) { "\n" | "\r" }.
Whitespace = ( " " | "\t" ) { " " | "\t" } .
alpha = "a"…"z" | "A"…"Z" .
digit = "0"…"9" .
any = "\u0000"…"\uffff" .
`))
basicLexer = lexer.Must(stateful.NewSimple([]stateful.Rule{
{"Comment", `(?i)rem[^\n]*`, nil},
{"String", `"(\\"|[^"])*"`, nil},
{"Number", `[-+]?(\d*\.)?\d+`, nil},
{"Ident", `[a-zA-Z_]\w*`, nil},
{"Punct", `[-[!@#$%^&*()+_={}\|:;"'<,>.?/]|]`, nil},
{"EOL", `[\n\r]+`, nil},
{"whitespace", `[ \t]+`, nil},
}))

basicParser = participle.MustBuild(&Program{},
participle.Lexer(basicLexer),
participle.CaseInsensitive("Ident"),
participle.Unquote("String"),
participle.UseLookahead(2),
participle.Elide("Whitespace"),
)

cli struct {
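For context, a minimal sketch of driving the converted lexer directly, assuming the stateful definition satisfies the extended Definition and that lexer.EOF still marks end of input as in v0:

package main

import (
	"fmt"

	"github.com/alecthomas/participle/lexer"
	"github.com/alecthomas/participle/lexer/stateful"
)

func main() {
	def := lexer.Must(stateful.NewSimple([]stateful.Rule{
		{"Ident", `[a-zA-Z_]\w*`, nil},
		{"Number", `[-+]?(\d*\.)?\d+`, nil},
		{"whitespace", `[ \t]+`, nil},
	}))
	// LexString is one of the entry points this commit adds to Definition,
	// avoiding the strings.NewReader round trip of v0.
	lex, err := def.LexString("print 42")
	if err != nil {
		panic(err)
	}
	for {
		tok, err := lex.Next()
		if err != nil {
			panic(err)
		}
		if tok.Type == lexer.EOF {
			break
		}
		fmt.Printf("%d %q\n", tok.Type, tok.Value)
	}
}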
2 changes: 1 addition & 1 deletion _examples/go.mod
@@ -4,7 +4,7 @@ go 1.14

require (
github.com/alecthomas/go-thrift v0.0.0-20170109061633-7914173639b2
github.com/alecthomas/kong v0.2.8
github.com/alecthomas/kong v0.2.11
github.com/alecthomas/participle v0.4.1
github.com/alecthomas/repr v0.0.0-20200325044227-4184120f674c
github.com/alecthomas/template v0.0.0-20190718012654-fb15b899a751 // indirect
2 changes: 2 additions & 0 deletions _examples/go.sum
@@ -3,6 +3,8 @@ github.com/alecthomas/go-thrift v0.0.0-20170109061633-7914173639b2/go.mod h1:CxC
github.com/alecthomas/kong v0.2.1/go.mod h1:+inYUSluD+p4L8KdviBSgzcqEjUQOfC5fQDRFuc36lI=
github.com/alecthomas/kong v0.2.8 h1:VSWWkD1TZij2967FcfVwgRwlp3khCA0liZIkUI9hTdU=
github.com/alecthomas/kong v0.2.8/go.mod h1:kQOmtJgV+Lb4aj+I2LEn40cbtawdWJ9Y8QLq+lElKxE=
github.com/alecthomas/kong v0.2.11 h1:RKeJXXWfg9N47RYfMm0+igkxBCTF4bzbneAxaqid0c4=
github.com/alecthomas/kong v0.2.11/go.mod h1:kQOmtJgV+Lb4aj+I2LEn40cbtawdWJ9Y8QLq+lElKxE=
github.com/alecthomas/participle v0.4.1 h1:P2PJWzwrSpuCWXKnzqvw0b0phSfH1kJo4p2HvLynVsI=
github.com/alecthomas/participle v0.4.1/go.mod h1:T8u4bQOSMwrkTWOSyt8/jSFPEnRtd0FKFMjVfYBlqPs=
github.com/alecthomas/repr v0.0.0-20181024024818-d37bc2a10ba1/go.mod h1:xTS7Pm1pD1mvyM075QCDSRqH6qRLXylzS24ZTpRiSzQ=
20 changes: 8 additions & 12 deletions _examples/graphql/main.go
@@ -9,7 +9,7 @@ import (

"github.com/alecthomas/participle"
"github.com/alecthomas/participle/lexer"
"github.com/alecthomas/participle/lexer/ebnf"
"github.com/alecthomas/participle/lexer/stateful"
)

type File struct {
@@ -62,17 +62,13 @@ type Value struct {
}

var (
graphQLLexer = lexer.Must(ebnf.New(`
Comment = ("#" | "//") { "\u0000"…"\uffff"-"\n" } .
Ident = (alpha | "_") { "_" | alpha | digit } .
Number = ("." | digit) {"." | digit} .
Whitespace = " " | "\t" | "\n" | "\r" .
Punct = "!"…"/" | ":"…"@" | "["…` + "\"`\"" + ` | "{"…"~" .
alpha = "a"…"z" | "A"…"Z" .
digit = "0"…"9" .
`))

graphQLLexer = lexer.Must(stateful.NewSimple([]stateful.Rule{
{"Comment", `(?:#|//)[^\n]*\n?`, nil},
{"Ident", `[a-zA-Z]\w*`, nil},
{"Number", `(?:\d*\.)?\d+`, nil},
{"Punct", `[-[!@#$%^&*()+_={}\|:;"'<,>.?/]|]`, nil},
{"Whitespace", `[ \t\n\r]+`, nil},
}))
parser = participle.MustBuild(&File{},
participle.Lexer(graphQLLexer),
participle.Elide("Comment", "Whitespace"),
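Two whitespace-handling styles appear in these conversions: this file keeps the rule exported ("Whitespace") and elides it at the parser level, while basic/main.go names its rule "whitespace" so the lexer drops those tokens itself. The lowercase convention is an assumption carried over from the regex lexer, where unnamed groups are skipped:

// Dropped by the lexer itself (name starts with a lowercase letter):
{"whitespace", `[ \t\n\r]+`, nil},

// Emitted by the lexer, skipped by the parser:
{"Whitespace", `[ \t\n\r]+`, nil},
// ... and in the parser options:
participle.Elide("Comment", "Whitespace"),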
19 changes: 10 additions & 9 deletions _examples/ini/main.go
@@ -5,20 +5,21 @@ import (

"github.com/alecthomas/participle"
"github.com/alecthomas/participle/lexer"
"github.com/alecthomas/participle/lexer/stateful"

"github.com/alecthomas/repr"
)

// A custom lexer for INI files. This illustrates a relatively complex Regexp lexer, as well
// as use of the Unquote filter, which unquotes string tokens.
var iniLexer = lexer.Must(lexer.Regexp(
`(?m)` +
`(\s+)` +
`|(^[#;].*$)` +
`|(?P<Ident>[a-zA-Z][a-zA-Z_\d]*)` +
`|(?P<String>"(?:\\.|[^"])*")` +
`|(?P<Float>\d+(?:\.\d+)?)` +
`|(?P<Punct>[][=])`,
))
var iniLexer = lexer.Must(stateful.NewSimple([]stateful.Rule{
{`Ident`, `[a-zA-Z][a-zA-Z_\d]*`, nil},
{`String`, `"(?:\\.|[^"])*"`, nil},
{`Float`, `\d+(?:\.\d+)?`, nil},
{`Punct`, `[][=]`, nil},
{"comment", `[#;][^\n]*`, nil},
{"whitespace", `\s+`, nil},
}))

type INI struct {
Properties []*Property `@@*`
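A minimal usage sketch of the Unquote option mentioned in the comment above. The KV grammar here is hypothetical (deliberately smaller than the INI grammar in this file) and reuses the iniLexer defined above:

// KV parses a single `key = "value"` property.
type KV struct {
	Key   string `@Ident "="`
	Value string `@String`
}

func parseKV(input string) (*KV, error) {
	parser := participle.MustBuild(&KV{},
		participle.Lexer(iniLexer),
		// Unquote strips the surrounding quotes from (and unescapes)
		// String tokens before they are captured into the grammar.
		participle.Unquote("String"),
	)
	kv := &KV{}
	return kv, parser.ParseString(input, kv)
}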
18 changes: 11 additions & 7 deletions _examples/sql/main.go
@@ -3,8 +3,11 @@ package main

import (
"github.com/alecthomas/kong"

"github.com/alecthomas/participle"
"github.com/alecthomas/participle/lexer"
"github.com/alecthomas/participle/lexer/stateful"

"github.com/alecthomas/repr"
)

@@ -156,13 +159,14 @@ var (
SQL string `arg:"" required:"" help:"SQL to parse."`
}

sqlLexer = lexer.Must(lexer.Regexp(`(\s+)` +
`|(?P<Keyword>(?i)SELECT|FROM|TOP|DISTINCT|ALL|WHERE|GROUP|BY|HAVING|UNION|MINUS|EXCEPT|INTERSECT|ORDER|LIMIT|OFFSET|TRUE|FALSE|NULL|IS|NOT|ANY|SOME|BETWEEN|AND|OR|LIKE|AS|IN)` +
`|(?P<Ident>[a-zA-Z_][a-zA-Z0-9_]*)` +
`|(?P<Number>[-+]?\d*\.?\d+([eE][-+]?\d+)?)` +
`|(?P<String>'[^']*'|"[^"]*")` +
`|(?P<Operators><>|!=|<=|>=|[-+*/%,.()=<>])`,
))
sqlLexer = lexer.Must(stateful.NewSimple([]stateful.Rule{
{`Keyword`, `(?i)SELECT|FROM|TOP|DISTINCT|ALL|WHERE|GROUP|BY|HAVING|UNION|MINUS|EXCEPT|INTERSECT|ORDER|LIMIT|OFFSET|TRUE|FALSE|NULL|IS|NOT|ANY|SOME|BETWEEN|AND|OR|LIKE|AS|IN`, nil},
{`Ident`, `[a-zA-Z_][a-zA-Z0-9_]*`, nil},
{`Number`, `[-+]?\d*\.?\d+([eE][-+]?\d+)?`, nil},
{`String`, `'[^']*'|"[^"]*"`, nil},
{`Operators`, `<>|!=|<=|>=|[-+*/%,.()=<>]`, nil},
{"whitespace", `\s+`, nil},
}))
sqlParser = participle.MustBuild(
&Select{},
participle.Lexer(sqlLexer),
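A short usage sketch, reusing sqlParser and the Select AST from this file, and assuming Parser.ParseString keeps its v0 signature through this refactor. Note that rule order matters in the slice above: assuming rules are tried first-match-wins in declaration order (which the Keyword-before-Ident placement suggests), swapping those two rules would lex every keyword as a plain Ident:

sel := &Select{}
if err := sqlParser.ParseString(`SELECT name FROM users`, sel); err != nil {
	panic(err)
}
repr.Println(sel)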
41 changes: 18 additions & 23 deletions _examples/toml/main.go
@@ -4,9 +4,11 @@ import (
"os"

"github.com/alecthomas/kong"

"github.com/alecthomas/participle"
"github.com/alecthomas/participle/lexer"
"github.com/alecthomas/participle/lexer/ebnf"
"github.com/alecthomas/participle/lexer/stateful"

"github.com/alecthomas/repr"
)

@@ -32,8 +34,7 @@ type Value struct {
Date *string `| @Date`
Time *string `| @Time`
Bool *bool `| (@"true" | "false")`
Integer *int64 `| @Int`
Float *float64 `| @Float`
Number *float64 `| @Number`
List []*Value `| "[" [ @@ { "," @@ } ] "]"`
}

@@ -43,28 +44,22 @@ type Section struct {
}

var (
tomlLexer = lexer.Must(ebnf.New(`
Comment = "#" { "\u0000"…"\uffff"-"\n" } .
DateTime = date "T" time [ "-" digit digit ":" digit digit ].
Date = date .
Time = time .
Ident = (alpha | "_") { "_" | alpha | digit } .
String = "\"" { "\u0000"…"\uffff"-"\""-"\\" | "\\" any } "\"" .
Int = [ "-" | "+" ] digit { digit } .
Float = ("." | digit) {"." | digit} .
Punct = "!"…"/" | ":"…"@" | "["…` + "\"`\"" + ` | "{"…"~" .
Whitespace = " " | "\t" | "\n" | "\r" .
alpha = "a"…"z" | "A"…"Z" .
digit = "0"…"9" .
any = "\u0000"…"\uffff" .
date = digit digit digit digit "-" digit digit "-" digit digit .
time = digit digit ":" digit digit ":" digit digit [ "." { digit } ] .
`))
tomlLexer = lexer.Must(stateful.NewSimple([]stateful.Rule{
{"DateTime", `\d\d\d\d-\d\d-\d\dT\d\d:\d\d:\d\d(\.\d+)?(-\d\d:\d\d)?`, nil},
{"Date", `\d\d\d\d-\d\d-\d\d`, nil},
{"Time", `\d\d:\d\d:\d\d(\.\d+)?`, nil},
{"Ident", `[a-zA-Z_][a-zA-Z_0-9]*`, nil},
{"String", `"[^"]*"`, nil},
{"Number", `[-+]?[.0-9]+\b`, nil},
{"Punct", `\[|]|[-!()+/*=,]`, nil},
{"comment", `#[^\n]+`, nil},
{"whitespace", `\s+`, nil},
}))
tomlParser = participle.MustBuild(&TOML{},
participle.Lexer(tomlLexer),
participle.Lexer(
tomlLexer,
),
participle.Unquote("String"),
participle.Elide("Whitespace", "Comment"),
)

cli struct {
2 changes: 1 addition & 1 deletion ebnf_test.go
@@ -1,4 +1,4 @@
package participle
package participle_test

import (
"strings"
6 changes: 4 additions & 2 deletions error_test.go
@@ -1,9 +1,11 @@
package participle
package participle_test

import (
"testing"

"github.com/stretchr/testify/assert"

"github.com/alecthomas/participle"
)

func TestErrorReporting(t *testing.T) {
@@ -22,7 +24,7 @@ func TestErrorReporting(t *testing.T) {
type grammar struct {
Decls []*decl `( @@ ";" )*`
}
p := mustTestParser(t, &grammar{}, UseLookahead(5))
p := mustTestParser(t, &grammar{}, participle.UseLookahead(5))

var err error
ast := &grammar{}
46 changes: 46 additions & 0 deletions lexer/adapters.go
@@ -0,0 +1,46 @@
package lexer

import (
"bytes"
"io"
"strings"
)

type legacy struct {
legacy interface {
Lex(io.Reader) (Lexer, error)
Symbols() map[string]rune
}
}

func (l legacy) LexReader(r io.Reader) (Lexer, error) { return l.legacy.Lex(r) }
func (l legacy) LexString(s string) (Lexer, error) { return l.legacy.Lex(strings.NewReader(s)) }
func (l legacy) LexBytes(b []byte) (Lexer, error) { return l.legacy.Lex(bytes.NewReader(b)) }
func (l legacy) Symbols() map[string]rune { return l.legacy.Symbols() }

// Legacy is a shim for Participle v0 lexer definitions.
func Legacy(def interface {
Lex(io.Reader) (Lexer, error)
Symbols() map[string]rune
}) Definition {
return legacy{def}
}

// Simple upgrades a lexer that only implements LexReader() by using
// strings/bytes.NewReader().
func Simple(def interface {
Symbols() map[string]rune
LexReader(io.Reader) (Lexer, error)
}) Definition {
return simple{def}
}

type simplei interface {
Symbols() map[string]rune
LexReader(io.Reader) (Lexer, error)
}

type simple struct{ simplei }

func (s simple) LexString(str string) (Lexer, error) { return s.LexReader(strings.NewReader(str)) }
func (s simple) LexBytes(b []byte) (Lexer, error) { return s.LexReader(bytes.NewReader(b)) }
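A minimal sketch of the Legacy shim in use. fakeV0 is a hypothetical v0-style definition written only for illustration; it implements the old Lex(io.Reader) entry point and nothing else:

package main

import (
	"fmt"
	"io"
	"io/ioutil"

	"github.com/alecthomas/participle/lexer"
)

// fakeV0 is a stand-in for a real v0 lexer definition.
type fakeV0 struct{}

func (fakeV0) Symbols() map[string]rune { return map[string]rune{"EOF": lexer.EOF} }

func (fakeV0) Lex(r io.Reader) (lexer.Lexer, error) {
	data, err := ioutil.ReadAll(r)
	if err != nil {
		return nil, err
	}
	fmt.Printf("lexing %d bytes\n", len(data))
	return nil, nil // a real implementation would return a token stream
}

func main() {
	// Legacy upgrades the old shape to the new Definition, synthesising
	// LexString and LexBytes via strings.NewReader and bytes.NewReader.
	def := lexer.Legacy(fakeV0{})
	def.LexString("a = 1")        // reaches fakeV0.Lex via strings.NewReader
	def.LexBytes([]byte("a = 1")) // reaches fakeV0.Lex via bytes.NewReader
}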
18 changes: 2 additions & 16 deletions lexer/doc.go
@@ -1,19 +1,5 @@
// Package lexer defines interfaces and implementations used by Participle to perform lexing.
//
// The primary interfaces are Definition and Lexer. There are three implementations of these
// interfaces:
//
// TextScannerLexer is based on text/scanner. This is the fastest, but least flexible, in that
// tokens are restricted to those supported by that package. It can scan about 5M tokens/second on a
// late 2013 15" MacBook Pro.
//
// The second lexer is constructed via the Regexp() function, mapping regexp capture groups
// to tokens. The complete input source is read into memory, so it is unsuitable for large inputs.
//
// The final lexer provided accepts a lexical grammar in EBNF. Each capitalised production is a
// lexical token supported by the resulting Lexer. This is very flexible, but a bit slower, scanning
// around 730K tokens/second on the same machine, though it is currently completely unoptimised.
// This could/should be converted to a table-based lexer.
//
// Lexer implementations must use Panic/Panicf to report errors.
// The primary interfaces are Definition and Lexer. There is one concrete implementation included,
// the stateful lexer.
package lexer
