Permalink
Browse files

Speed up lexer by avoiding UTF-8 decode

Also add lexer benchmarks

benchmark                   old ns/op     new ns/op     delta
BenchmarkProgram-8          3180          2563          -19.40%
BenchmarkNames-8            5520          4901          -11.21%
BenchmarkKeywords-8         6338          5689          -10.24%
BenchmarkSimpleTokens-8     1480          902           -39.05%
BenchmarkChoiceTokens-8     3620          2065          -42.96%
BenchmarkNumbers-8          5770          4749          -17.69%
BenchmarkStrings-8          6552          5079          -22.48%
BenchmarkRegex-8            5516          4183          -24.17%
  • Loading branch information...
benhoyt committed Sep 7, 2018
1 parent e45e209 commit 0fa32f929b27bc55bcb8d68507853f1083d8ae02
Showing with 68 additions and 10 deletions.
  1. +4 −10 lexer/lexer.go
  2. +64 −0 lexer/lexer_test.go
@@ -10,7 +10,6 @@ package lexer

import (
"fmt"
"unicode/utf8"
)

// Lexer tokenizes a byte string of AWK source code. Use NewLexer to
@@ -335,24 +334,19 @@ func (l *Lexer) scanRegex() (Position, Token, string) {

func (l *Lexer) next() {
l.pos = l.nextPos
ch, size := utf8.DecodeRune(l.src[l.offset:])
if size == 0 {
if l.offset >= len(l.src) {
l.ch = -1
return
}
if ch == utf8.RuneError {
l.ch = -1
l.errorMsg = fmt.Sprintf("invalid UTF-8 byte 0x%02x", l.src[l.offset])
return
}
ch := l.src[l.offset]
if ch == '\n' {
l.nextPos.Line++
l.nextPos.Column = 1
} else {
l.nextPos.Column++
}
l.ch = ch
l.offset += size
l.ch = rune(ch)
l.offset++
}

func isNameStart(ch rune) bool {
@@ -174,6 +174,70 @@ func TestAllTokens(t *testing.T) {
}
}

// benchmarkLexer times a full tokenization pass over source, repeated
// `repeat` times (newline-separated) so the input isn't trivially small.
// Input construction happens before the timer starts; each iteration
// scans until EOF (or an ILLEGAL token) and discards the results.
func benchmarkLexer(b *testing.B, repeat int, source string) {
	src := []byte(strings.Repeat(source+"\n", repeat))
	b.ResetTimer()
	for n := 0; n < b.N; n++ {
		lex := NewLexer(src)
		for {
			_, tok, _ := lex.Scan()
			if tok == ILLEGAL || tok == EOF {
				break
			}
		}
	}
}

// BenchmarkProgram measures lexing a small but realistic AWK program.
func BenchmarkProgram(b *testing.B) {
benchmarkLexer(b, 5, `{ print $1, ($3+$4)*$5 }`)
}

// BenchmarkNames measures lexing identifiers of varying lengths.
func BenchmarkNames(b *testing.B) {
benchmarkLexer(b, 5, `x y i foobar abcdefghij0123456789 _`)
}

// BenchmarkKeywords measures lexing of AWK keywords and builtin names.
func BenchmarkKeywords(b *testing.B) {
benchmarkLexer(b, 5, `BEGIN END print sub if length`)
}

// BenchmarkSimpleTokens measures lexing of single-character tokens.
func BenchmarkSimpleTokens(b *testing.B) {
benchmarkLexer(b, 5, "\n : , { [ ( } ] ) ~ ? ; $")
}

// BenchmarkChoiceTokens measures lexing of operators that require
// lookahead to disambiguate (e.g. * vs ** vs **= vs *=).
func BenchmarkChoiceTokens(b *testing.B) {
benchmarkLexer(b, 5, `/ /= % %= + ++ += * ** **= *= = == ^ ^= ! != !~ < <= > >= >> && | ||`)
}

// BenchmarkNumbers measures lexing of numeric literals, including
// decimals and exponent notation.
func BenchmarkNumbers(b *testing.B) {
benchmarkLexer(b, 5, `0 1 .5 1234 1234567890 1234.56789e-50`)
}

// BenchmarkStrings measures lexing of string literals, including
// escape sequences.
func BenchmarkStrings(b *testing.B) {
benchmarkLexer(b, 5, `"x" "y" "xyz" "foo" "foo bar baz" "foo\tbar\rbaz\n"`)
}

// BenchmarkRegex measures lexing of regex literals. Regexes can't go
// through the generic benchmarkLexer helper because the lexer needs the
// parser to call ScanRegex explicitly after each / or /= token.
func BenchmarkRegex(b *testing.B) {
	const source = `/x/ /./ /foo/ /bar/ /=equals=/ /\/\/\/\//`
	src := []byte(strings.Repeat(source+" ", 5))
	b.ResetTimer()
	for n := 0; n < b.N; n++ {
		lex := NewLexer(src)
		for {
			_, tok, _ := lex.Scan()
			if tok == EOF {
				break
			}
			// Every token in the source must start a regex literal.
			if tok != DIV && tok != DIV_ASSIGN {
				b.Fatalf("expected / or /=, got %s", tok)
			}
			if _, tok, _ = lex.ScanRegex(); tok != REGEX {
				b.Fatalf("expected regex, got %s", tok)
			}
		}
	}
}

func Example() {
lexer := NewLexer([]byte(`$0 { print $1 }`))
for {

0 comments on commit 0fa32f9

Please sign in to comment.