Skip to content

Commit

Permalink
Use char instead of byte indexes for length, substr, index, and match (#83)
Browse files Browse the repository at this point in the history

I've done this as a runtime config setting, Config.Bytes, and the
default is character mode. So this is a backwards-incompatible change,
but I think it's 1) warranted given GoAWK tries to conform to POSIX,
and 2) won't break most scripts, even ones that use non-ASCII, unless
they use constant indexes for substr() on non-ASCII strings.

This definitely affects performance, as some operations that were O(1)
are now O(N) in the length of the string. Here are the results for the
relevant functions (though obviously it'll depend on the string length):

name             old time/op  new time/op  delta
BuiltinLength-8   615ns ± 3%  4179ns ± 3%  +579.68%  (p=0.008 n=5+5)
BuiltinSubstr-8   985ns ± 0%  3874ns ± 0%  +293.12%  (p=0.008 n=5+5)
BuiltinIndex-8   1.24µs ± 0%  4.70µs ± 6%  +278.85%  (p=0.008 n=5+5)
BuiltinMatch-8   2.90µs ± 0%  3.30µs ± 3%   +13.70%  (p=0.008 n=5+5)
[Geo mean]       2.35µs       3.98µs        +69.30%

Fixes #35
  • Loading branch information
benhoyt committed Jan 3, 2022
1 parent 9174a5b commit b7ec795
Show file tree
Hide file tree
Showing 6 changed files with 354 additions and 226 deletions.
328 changes: 164 additions & 164 deletions benchmarks.txt

Large diffs are not rendered by default.

7 changes: 6 additions & 1 deletion goawk.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ import (
)

const (
version = "v1.10.0"
version = "v1.11.0"
copyright = "GoAWK " + version + " - Copyright (c) 2021 Ben Hoyt"
shortUsage = "usage: goawk [-F fs] [-v var=value] [-f progfile | 'prog'] [file ...]"
longUsage = `Standard AWK arguments:
Expand All @@ -56,6 +56,7 @@ const (
load AWK source from progfile (multiple allowed)
Additional GoAWK arguments:
-b use byte indexes for index(), length(), match(), and substr()
-cpuprofile file
write CPU profile to file
-d debug mode (print parsed AST to stderr)
Expand All @@ -77,6 +78,7 @@ func main() {
debug := false
debugTypes := false
memprofile := ""
useBytes := false

var i int
for i = 1; i < len(os.Args); i++ {
Expand Down Expand Up @@ -109,6 +111,8 @@ func main() {
}
i++
vars = append(vars, os.Args[i])
case "-b":
useBytes = true
case "-cpuprofile":
if i+1 >= len(os.Args) {
errorExitf("flag needs an argument: -cpuprofile")
Expand Down Expand Up @@ -212,6 +216,7 @@ func main() {
config := &interp.Config{
Argv0: filepath.Base(os.Args[0]),
Args: expandWildcardsOnWindows(args),
Bytes: useBytes,
Vars: []string{"FS", fieldSep},
}
for _, v := range vars {
Expand Down
9 changes: 9 additions & 0 deletions goawk_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -351,6 +351,10 @@ func TestCommandLine(t *testing.T) {
{[]string{"-v", "RS=;", `$0`}, "a b;c\nd;e", "a b\nc\nd\ne\n", ""},
{[]string{"-vRS=;", `$0`}, "a b;c\nd;e", "a b\nc\nd\ne\n", ""},

// Byte index vs character index mode
{[]string{`{ print length } # !windows-gawk`}, "絵\n", "1\n", ""},
{[]string{"-b", `{ print length }`}, "絵\n", "3\n", ""},

// ARGV/ARGC handling
{[]string{`
BEGIN {
Expand Down Expand Up @@ -440,6 +444,11 @@ func runGoAWK(args []string, stdin string) (stdout, stderr string, err error) {
}

func runAWKs(t *testing.T, testArgs []string, testStdin, testOutput, testError string) {
for _, arg := range testArgs {
if strings.Contains(arg, "!"+runtime.GOOS+"-"+awkExe) {
t.Skipf("skipping on %s under %s", runtime.GOOS, awkExe)
}
}
cmd := exec.Command(awkExe, testArgs...)
if testStdin != "" {
cmd.Stdin = bytes.NewReader([]byte(testStdin))
Expand Down
108 changes: 83 additions & 25 deletions interp/functions.go
Original file line number Diff line number Diff line change
Expand Up @@ -98,49 +98,98 @@ func (p *interp) callBuiltin(op Token, argExprs []Expr) (value, error) {
// Then switch on the function for the ordinary functions
switch op {
case F_LENGTH:
switch len(args) {
case 0:
return num(float64(len(p.line))), nil
default:
return num(float64(len(p.toString(args[0])))), nil
var s string
if len(args) > 0 {
s = p.toString(args[0])
} else {
s = p.line
}
var n int
if p.bytes {
n = len(s)
} else {
n = utf8.RuneCountInString(s)
}
return num(float64(n)), nil

case F_MATCH:
re, err := p.compileRegex(p.toString(args[1]))
if err != nil {
return null(), err
}
loc := re.FindStringIndex(p.toString(args[0]))
s := p.toString(args[0])
loc := re.FindStringIndex(s)
if loc == nil {
p.matchStart = 0
p.matchLength = -1
return num(0), nil
}
p.matchStart = loc[0] + 1
p.matchLength = loc[1] - loc[0]
if p.bytes {
p.matchStart = loc[0] + 1
p.matchLength = loc[1] - loc[0]
} else {
p.matchStart = utf8.RuneCountInString(s[:loc[0]]) + 1
p.matchLength = utf8.RuneCountInString(s[loc[0]:loc[1]])
}
return num(float64(p.matchStart)), nil

case F_SUBSTR:
s := p.toString(args[0])
pos := int(args[1].num())
if pos > len(s) {
pos = len(s) + 1
}
if pos < 1 {
pos = 1
}
maxLength := len(s) - pos + 1
length := maxLength
if len(args) == 3 {
length = int(args[2].num())
if length < 0 {
length = 0
if p.bytes {
if pos > len(s) {
pos = len(s) + 1
}
if length > maxLength {
length = maxLength
if pos < 1 {
pos = 1
}
maxLength := len(s) - pos + 1
length := maxLength
if len(args) == 3 {
length = int(args[2].num())
if length < 0 {
length = 0
}
if length > maxLength {
length = maxLength
}
}
return str(s[pos-1 : pos-1+length]), nil
} else {
// Count characters till we get to pos.
chars := 1
start := 0
for start = range s {
chars++
if chars > pos {
break
}
}
if pos >= chars {
start = len(s)
}

// Count characters from start till we reach length.
var end int
if len(args) == 3 {
length := int(args[2].num())
chars = 0
for end = range s[start:] {
chars++
if chars > length {
break
}
}
if length >= chars {
end = len(s)
} else {
end += start
}
} else {
end = len(s)
}
return str(s[start:end]), nil
}
return str(s[pos-1 : pos-1+length]), nil

case F_SPRINTF:
s, err := p.sprintf(p.toString(args[0]), args[1:])
Expand All @@ -152,7 +201,16 @@ func (p *interp) callBuiltin(op Token, argExprs []Expr) (value, error) {
case F_INDEX:
s := p.toString(args[0])
substr := p.toString(args[1])
return num(float64(strings.Index(s, substr) + 1)), nil
index := strings.Index(s, substr)
if p.bytes {
return num(float64(index + 1)), nil
} else {
if index < 0 {
return num(float64(0)), nil
}
index = utf8.RuneCountInString(s[:index])
return num(float64(index + 1)), nil
}

case F_TOLOWER:
return str(strings.ToLower(p.toString(args[0]))), nil
Expand Down Expand Up @@ -495,7 +553,7 @@ func (p *interp) initNativeFuncs(funcs map[string]interface{}) error {
var errorType = reflect.TypeOf((*error)(nil)).Elem()

// Check that native function with given name is okay to call from
// AWK, return a *interp.Error if not. This checks that f is actually
// AWK, return an *interp.Error if not. This checks that f is actually
// a function, and that its parameter and return types are good.
func checkNativeFunc(name string, f interface{}) error {
if KeywordToken(name) != ILLEGAL {
Expand Down
7 changes: 7 additions & 0 deletions interp/interp.go
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,7 @@ type interp struct {
exitStatus int
regexCache map[string]*regexp.Regexp
formatCache map[string]cachedFormat
bytes bool
}

// Various const configuration. Could make these part of Config if
Expand Down Expand Up @@ -201,6 +202,11 @@ type Config struct {
// array, for example []string{"USER", "bob", "HOME", "/home/bob"}.
// If nil (the default), values from os.Environ() are used.
Environ []string

// Set to true to use byte indexes instead of character indexes for
// the index, length, match, and substr functions. Note: the default
// was changed from bytes to characters in GoAWK version 1.11.
Bytes bool
}

// ExecProgram executes the parsed program using the given interpreter
Expand Down Expand Up @@ -241,6 +247,7 @@ func ExecProgram(program *Program, config *Config) (int, error) {
p.noExec = config.NoExec
p.noFileWrites = config.NoFileWrites
p.noFileReads = config.NoFileReads
p.bytes = config.Bytes
err := p.initNativeFuncs(config.Funcs)
if err != nil {
return 0, err
Expand Down
121 changes: 85 additions & 36 deletions interp/interp_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -406,25 +406,29 @@ BEGIN {
{`BEGIN { print match("x food y", "fo"), RSTART, RLENGTH }`, "", "3 3 2\n", "", ""},
{`BEGIN { print match("x food y", "fox"), RSTART, RLENGTH }`, "", "0 0 -1\n", "", ""},
{`BEGIN { print match("x food y", /[fod]+/), RSTART, RLENGTH }`, "", "3 3 4\n", "", ""},
{`BEGIN { print match("絵 fööd y", /[föd]+/), RSTART, RLENGTH }`, "", "3 3 4\n", "", ""},
{`{ print length, length(), length("buzz"), length("") }`, "foo bar", "7 7 4 0\n", "", ""},
{`BEGIN { print length("a"), length("絵") } # !awk`, "", "1 1\n", "", ""},
{`BEGIN { print index("foo", "f"), index("foo0", 0), index("foo", "o"), index("foo", "x") }`, "", "1 4 2 0\n", "", ""},
{`BEGIN { print index("föö", "f"), index("föö0", 0), index("föö", "ö"), index("föö", "x") }`, "", "1 4 2 0\n", "", ""},
{`BEGIN { print atan2(1, 0.5), atan2(-1, 0) }`, "", "1.10715 -1.5708\n", "", ""},
{`BEGIN { print sprintf("%3d", 42) }`, "", " 42\n", "", ""},
{`BEGIN { print sprintf("%d", 12, 34) }`, "", "12\n", "", ""},
{`BEGIN { print sprintf("%d") }`, "", "", "format error: got 0 args, expected 1", "not enough arg"},
{`BEGIN { print sprintf("%d", 12, 34) }`, "", "12\n", "", ""},
{`BEGIN { print sprintf("% 5d", 42) }`, "", " 42\n", "", ""},
{`BEGIN { print substr("food", 1) }`, "", "food\n", "", ""},
{`BEGIN { print substr("food", 1, 2) }`, "", "fo\n", "", ""},
{`BEGIN { print substr("food", 1, 4) }`, "", "food\n", "", ""},
{`BEGIN { print substr("food", 1, 8) }`, "", "food\n", "", ""},
{`BEGIN { print substr("food", 2) }`, "", "ood\n", "", ""},
{`BEGIN { print substr("food", 2, 2) }`, "", "oo\n", "", ""},
{`BEGIN { print substr("food", 2, 3) }`, "", "ood\n", "", ""},
{`BEGIN { print substr("food", 2, 8) }`, "", "ood\n", "", ""},
{`BEGIN { print substr("food", 0, 8) }`, "", "food\n", "", ""},
{`BEGIN { print substr("food", -1, 8) }`, "", "food\n", "", ""},
{`BEGIN { print substr("food", 5, 8) }`, "", "\n", "", ""},
{`BEGIN { print substr("food", 1), substr("fööd", 1) } # !windows-gawk`, "", "food fööd\n", "", ""},
{`BEGIN { print substr("food", 1, 2), substr("fööd", 1, 2) } # !windows-gawk`, "", "fo fö\n", "", ""},
{`BEGIN { print substr("food", 1, 4), substr("fööd", 1, 4) } # !windows-gawk`, "", "food fööd\n", "", ""},
{`BEGIN { print substr("food", 1, 8), substr("fööd", 1, 8) } # !windows-gawk`, "", "food fööd\n", "", ""},
{`BEGIN { print substr("food", 2), substr("fööd", 2) } # !windows-gawk`, "", "ood ööd\n", "", ""},
{`BEGIN { print substr("food", 2, 2), substr("fööd", 2, 2) } # !windows-gawk`, "", "oo öö\n", "", ""},
{`BEGIN { print substr("food", 2, 3), substr("fööd", 2, 3) } # !windows-gawk`, "", "ood ööd\n", "", ""},
{`BEGIN { print substr("food", 2, 8), substr("fööd", 2, 8) } # !windows-gawk`, "", "ood ööd\n", "", ""},
{`BEGIN { print substr("food", 0, 8), substr("fööd", 0, 8) } # !windows-gawk`, "", "food fööd\n", "", ""},
{`BEGIN { print substr("food", -1, 8), substr("fööd", -1, 8) } # !windows-gawk`, "", "food fööd\n", "", ""},
{`BEGIN { print substr("food", 5, 8), substr("fööd", 5, 8) }`, "", " \n", "", ""},
{`BEGIN { print substr("food", 2, -3), substr("fööd", 2, -3) }`, "", " \n", "", ""},
{`BEGIN { n = split("ab c d ", a); for (i=1; i<=n; i++) print a[i] }`, "", "ab\nc\nd\n", "", ""},
{`BEGIN { n = split("ab,c,d,", a, ","); for (i=1; i<=n; i++) print a[i] }`, "", "ab\nc\nd\n\n", "", ""},
{`BEGIN { n = split("ab,c.d,", a, /[,.]/); for (i=1; i<=n; i++) print a[i] }`, "", "ab\nc\nd\n\n", "", ""},
Expand Down Expand Up @@ -653,33 +657,37 @@ func TestInterp(t *testing.T) {
testName = testName[:70]
}

if awkExe != "" && !strings.Contains(test.src, "!"+awkExe) {
// Run it through external awk program first
t.Run("awk_"+testName, func(t *testing.T) {
cmd := exec.Command(awkExe, test.src, "-")
if test.in != "" {
cmd.Stdin = strings.NewReader(test.in)
}
out, err := cmd.CombinedOutput()
if err != nil {
if test.awkErr != "" {
if strings.Contains(string(out), test.awkErr) {
return
}
t.Fatalf("expected error %q, got:\n%s", test.awkErr, out)
} else {
t.Fatalf("error running %s: %v:\n%s", awkExe, err, out)
}
}
// Run it through external awk program first
t.Run("awk_"+testName, func(t *testing.T) {
if awkExe != "" && strings.Contains(test.src, "!"+awkExe) {
t.Skipf("skipping under %s", awkExe)
}
if strings.Contains(test.src, "!"+runtime.GOOS+"-"+awkExe) {
t.Skipf("skipping on %s under %s", runtime.GOOS, awkExe)
}
cmd := exec.Command(awkExe, test.src, "-")
if test.in != "" {
cmd.Stdin = strings.NewReader(test.in)
}
out, err := cmd.CombinedOutput()
if err != nil {
if test.awkErr != "" {
t.Fatalf(`expected error %q, got ""`, test.awkErr)
}
normalized := normalizeNewlines(string(out))
if normalized != test.out {
t.Fatalf("expected %q, got %q", test.out, normalized)
if strings.Contains(string(out), test.awkErr) {
return
}
t.Fatalf("expected error %q, got:\n%s", test.awkErr, out)
} else {
t.Fatalf("error running %s: %v:\n%s", awkExe, err, out)
}
})
}
}
if test.awkErr != "" {
t.Fatalf(`expected error %q, got ""`, test.awkErr)
}
normalized := normalizeNewlines(string(out))
if normalized != test.out {
t.Fatalf("expected %q, got %q", test.out, normalized)
}
})

// Then test it in GoAWK
t.Run(testName, func(t *testing.T) {
Expand Down Expand Up @@ -1028,6 +1036,47 @@ func TestSafeMode(t *testing.T) {
}
}

func TestBytesMode(t *testing.T) {
tests := []struct {
src string
in string
out string
}{
{`BEGIN { print match("food", "foo"), RSTART, RLENGTH }`, "", "1 1 3\n"},
{`BEGIN { print match("x food y", "fo"), RSTART, RLENGTH }`, "", "3 3 2\n"},
{`BEGIN { print match("x food y", "fox"), RSTART, RLENGTH }`, "", "0 0 -1\n"},
{`BEGIN { print match("x food y", /[fod]+/), RSTART, RLENGTH }`, "", "3 3 4\n"},
{`BEGIN { print match("絵 fööd y", /[föd]+/), RSTART, RLENGTH }`, "", "5 5 6\n"},
{`{ print length, length(), length("buzz"), length("") }`, "foo bar", "7 7 4 0\n"},
{`BEGIN { print length("a"), length("絵") } # !awk`, "", "1 3\n"},
{`BEGIN { print index("foo", "f"), index("foo0", 0), index("foo", "o"), index("foo", "x") }`, "", "1 4 2 0\n"},
{`BEGIN { print index("föö", "f"), index("föö0", 0), index("föö", "ö"), index("föö", "x") }`, "", "1 6 2 0\n"},
{`BEGIN { print substr("food", 1), substr("fööd", 1) }`, "", "food fööd\n"},
{`BEGIN { print substr("food", 1, 2), substr("fööd", 1, 2) }`, "", "fo f\xc3\n"},
{`BEGIN { print substr("food", 1, 4), substr("fööd", 1, 4) }`, "", "food fö\xc3\n"},
{`BEGIN { print substr("food", 1, 8), substr("fööd", 1, 8) }`, "", "food fööd\n"},
{`BEGIN { print substr("food", 2), substr("fööd", 2) }`, "", "ood ööd\n"},
{`BEGIN { print substr("food", 2, 2), substr("fööd", 2, 2) }`, "", "oo ö\n"},
{`BEGIN { print substr("food", 2, 3), substr("fööd", 2, 3) }`, "", "ood ö\xc3\n"},
{`BEGIN { print substr("food", 2, 8), substr("fööd", 2, 8) }`, "", "ood ööd\n"},
{`BEGIN { print substr("food", 0, 8), substr("fööd", 0, 8) }`, "", "food fööd\n"},
{`BEGIN { print substr("food", -1, 8), substr("fööd", -1, 8) }`, "", "food fööd\n"},
{`BEGIN { print substr("food", 5, 8), substr("fööd", 5, 8) }`, "", " \xb6d\n"},
{`BEGIN { print substr("food", 2, -3), substr("fööd", 2, -3) }`, "", " \n"},
}
for _, test := range tests {
testName := test.src
if len(testName) > 70 {
testName = testName[:70]
}
t.Run(testName, func(t *testing.T) {
testGoAWK(t, test.src, test.in, test.out, "", nil, func(config *interp.Config) {
config.Bytes = true
})
})
}
}

func TestConfigVarsCorrect(t *testing.T) {
prog, err := parser.ParseProgram([]byte(`BEGIN { print x }`), nil)
if err != nil {
Expand Down

0 comments on commit b7ec795

Please sign in to comment.