Skip to content

Commit

Permalink
Make two-argument split() respect CSV input mode (#198)
Browse files Browse the repository at this point in the history
This matches how awk and gawk's upcoming --csv feature works, so we want to
be consistent. Note that this is a breaking change (hopefully a minor
one, as it only affects people using CSV input mode).
  • Loading branch information
benhoyt committed Jul 14, 2023
1 parent d18c07a commit ec42ddc
Show file tree
Hide file tree
Showing 5 changed files with 56 additions and 7 deletions.
2 changes: 2 additions & 0 deletions docs/csv.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ Links to sections:

When in CSV input mode, GoAWK ignores the regular field and record separators (`FS` and `RS`), instead parsing input into records and fields using the CSV or TSV format. Fields can be accessed using the standard AWK numbered field syntax (for example, `$1` or `$5`), or using the GoAWK-specific [named field syntax](#named-field-syntax).

In addition, in CSV input mode the two-argument form of `split()` uses CSV (or TSV) field splitting and ignores `FS`. For example, `split("x,\"y,z\"", a)` sets `a[1] = "x"` and `a[2] = "y,z"`. The three-argument form of `split()` operates as usual.

To enable CSV input mode when using the `goawk` program, use the `-i mode` command line argument (`mode` must be quoted if it has spaces in it). You can also enable CSV input mode by setting the `INPUTMODE` special variable in the `BEGIN` block, or by using the [Go API](#go-api). The full syntax of `mode` is as follows:

```
Expand Down
30 changes: 25 additions & 5 deletions interp/functions.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
package interp

import (
"bufio"
"bytes"
"errors"
"fmt"
Expand Down Expand Up @@ -242,15 +243,34 @@ func validNativeType(typ reflect.Type) bool {
}

// Guts of the split() function
func (p *interp) split(s string, scope resolver.Scope, index int, fs string) (int, error) {
func (p *interp) split(s string, scope resolver.Scope, index int, fs string, mode IOMode) (int, error) {
var parts []string
if fs == " " {
switch {
case mode == CSVMode || mode == TSVMode:
// Set up for parsing a CSV/TSV record
splitter := csvSplitter{
separator: p.csvInputConfig.Separator,
sepLen: utf8.RuneLen(p.csvInputConfig.Separator),
comment: p.csvInputConfig.Comment,
fields: &parts,
}
scanner := bufio.NewScanner(strings.NewReader(s))
scanner.Split(splitter.scan)
if p.splitBuffer == nil {
p.splitBuffer = make([]byte, inputBufSize)
}
scanner.Buffer(p.splitBuffer, maxRecordLength)

// Parse one record. Errors shouldn't happen, but if there is one,
// len(parts) will be 0.
scanner.Scan()
case fs == " ":
parts = strings.Fields(s)
} else if s == "" {
case s == "":
// Leave parts 0 length on empty string
} else if utf8.RuneCountInString(fs) <= 1 {
case utf8.RuneCountInString(fs) <= 1:
parts = strings.Split(s, fs)
} else {
default:
re, err := p.compileRegex(fs)
if err != nil {
return 0, err
Expand Down
1 change: 1 addition & 0 deletions interp/interp.go
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@ type interp struct {
shellCommand []string
csvOutput *bufio.Writer
noArgVars bool
splitBuffer []byte

// Scalars, arrays, and function state
globals []value
Expand Down
25 changes: 25 additions & 0 deletions interp/interp_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1746,6 +1746,31 @@ var csvTests = []csvTest{
// Ignores UTF-8 byte order mark (BOM) at start of CSV file
{`BEGIN { INPUTMODE="csv" } { print $1=="foo" }`, "\ufefffoo,bar\n\ufefffoo,bar", "1\n0\n", "", nil},

// Two-argument split() parses in CSV mode if input mode is CSV
{`
BEGIN {
INPUTMODE = "csv"
split("foo,\"bar baz\",,x", a)
for (i=1; i in a; i++) print i ": " a[i]
split("a,b c,d", a)
for (i=1; i in a; i++) print i ": " a[i]
}`, "", "1: foo\n2: bar baz\n3: \n4: x\n1: a\n2: b c\n3: d\n", "", nil},
{`
BEGIN {
INPUTMODE = "tsv"
split("foo\tbar baz\t\tx", a)
for (i=1; i in a; i++) print i ": " a[i]
split("a\tb c\td", a)
for (i=1; i in a; i++) print i ": " a[i]
}`, "", "1: foo\n2: bar baz\n3: \n4: x\n1: a\n2: b c\n3: d\n", "", nil},
// Three-argument split() does not parse in CSV mode
{`
BEGIN {
INPUTMODE = "csv"
split("foo,\"bar baz\",,x", a, " ")
for (i=1; i in a; i++) print i ": " a[i]
}`, "", "1: foo,\"bar\n2: baz\",,x\n", "", nil},

// Error handling when parsing INPUTMODE and OUTPUTMODE
{`BEGIN { INPUTMODE="xyz" }`, "", "", `invalid input mode "xyz"`, nil},
{`BEGIN { INPUTMODE="csv separator=foo" }`, "", "", `invalid CSV/TSV separator "foo"`, nil},
Expand Down
5 changes: 3 additions & 2 deletions interp/vm.go
Original file line number Diff line number Diff line change
Expand Up @@ -652,7 +652,7 @@ func (p *interp) execute(code []compiler.Opcode) error {
arrayIndex := code[ip+1]
ip += 2
s := p.toString(p.peekTop())
n, err := p.split(s, resolver.Scope(arrayScope), int(arrayIndex), p.fieldSep)
n, err := p.split(s, resolver.Scope(arrayScope), int(arrayIndex), p.fieldSep, p.inputMode)
if err != nil {
return err
}
Expand All @@ -663,7 +663,8 @@ func (p *interp) execute(code []compiler.Opcode) error {
arrayIndex := code[ip+1]
ip += 2
s, fieldSep := p.peekPop()
n, err := p.split(p.toString(s), resolver.Scope(arrayScope), int(arrayIndex), p.toString(fieldSep))
// 3-argument form of split() ignores input mode
n, err := p.split(p.toString(s), resolver.Scope(arrayScope), int(arrayIndex), p.toString(fieldSep), DefaultMode)
if err != nil {
return err
}
Expand Down

0 comments on commit ec42ddc

Please sign in to comment.