From 06c8481f79060373c453811792ce4192f1e2227f Mon Sep 17 00:00:00 2001 From: Matthew Nibecker Date: Fri, 5 Jun 2026 11:32:22 -0700 Subject: [PATCH] Auto defuse outputs The commit makes it so values written in every format except BSUP and CSUP are automatically defused. The -fusion flag is added for users that want to view fusion values in the default SUP format. Also changes error message in the csv error writer when multiple records are encountered advising the user to now use the 'blend' operator instead of 'fuse'. --- .linkspector.yml | 1 + book/src/super-sql/operators/fuse.md | 6 +- cli/outputflags/flags.go | 1 + db/ztests/meta.yaml | 2 +- mdtest/mdtest.go | 4 ++ mdtest/test.go | 8 ++- runtime/ztests/expr/array-expr.yaml | 2 + runtime/ztests/expr/function/nullif.yaml | 2 + runtime/ztests/expr/function/upcast.yaml | 2 + runtime/ztests/expr/fusion-all.yaml | 2 + runtime/ztests/expr/search-glob.yaml | 2 +- .../expr/search-nested-field-regexp.yaml | 2 +- runtime/ztests/expr/search-nested-field.yaml | 4 +- runtime/ztests/expr/search-primitives.yaml | 2 +- runtime/ztests/op/aggregate/collect.yaml | 2 + runtime/ztests/op/fuse.yaml | 18 ++++++ runtime/ztests/op/unnest.yaml | 2 + service/ztests/csv-error.yaml | 2 +- service/ztests/query-csv-error.yaml | 2 +- sio/anyio/writer.go | 63 ++++++++++++++----- sio/csvio/writer.go | 2 +- sio/csvio/ztests/fusion.yaml | 8 ++- 22 files changed, 106 insertions(+), 33 deletions(-) diff --git a/.linkspector.yml b/.linkspector.yml index 8c2cd991ca..4c32850c40 100644 --- a/.linkspector.yml +++ b/.linkspector.yml @@ -15,3 +15,4 @@ ignorePatterns: - pattern: '^https://dl.acm.org/doi/pdf/10.1145/984549.984551$' - pattern: '^https://www.researchgate.net/publication/221325979_Union_Types_for_Semistructured_Data$' - pattern: '^https://db.cs.cmu.edu/papers/2024/whatgoesaround-sigmodrec2024.pdf$' + - pattern: '^https://openproceedings.org/2017/conf/edbt/paper-62.pdf$' diff --git a/book/src/super-sql/operators/fuse.md b/book/src/super-sql/operators/fuse.md index eddd4016f7..72e65b29fe 100644 --- a/book/src/super-sql/operators/fuse.md +++ b/book/src/super-sql/operators/fuse.md @@ -29,7 +29,7 @@ Because all values of the input must be read to compute the fused type, --- _Fuse two records_ -```mdtest-spq +```mdtest-spq fusion # spq fuse # input @@ -43,7 +43,7 @@ fusion({a?:_::int64,b?:2},<{b:int64}>) --- _Fuse records with type variation_ -```mdtest-spq +```mdtest-spq fusion # spq fuse # input @@ -57,7 +57,7 @@ fuse --- _Fuse records with complex type variation_ -```mdtest-spq {data-layout="stacked"} +```mdtest-spq fusion {data-layout="stacked"} # spq fuse # input diff --git a/cli/outputflags/flags.go b/cli/outputflags/flags.go index 0beeeddcf7..5604c86843 100644 --- a/cli/outputflags/flags.go +++ b/cli/outputflags/flags.go @@ -73,6 +73,7 @@ func (f *Flags) SetFormatFlags(fs *flag.FlagSet) { fs.BoolVar(&f.forceBinary, "B", false, "allow Super Binary to be sent to a terminal output") fs.BoolVar(&f.jsonPretty, "J", false, "use formatted JSON output independent of -f option") fs.BoolVar(&f.jsonShortcut, "j", false, "use line-oriented JSON output independent of -f option") + fs.BoolVar(&f.SUPFusion, "fusion", false, "display fusion values (fusion values are otherwise auto-defused)") fs.BoolVar(&f.supPretty, "S", false, "use formatted Super JSON output independent of -f option") fs.BoolVar(&f.supShortcut, "s", false, "use line-oriented Super JSON output independent of -f option") } diff --git a/db/ztests/meta.yaml b/db/ztests/meta.yaml index f861036dbb..6bfdc9c0a0 100644 --- a/db/ztests/meta.yaml +++ b/db/ztests/meta.yaml @@ -7,7 +7,7 @@ script: | super db load -q -use poolB b.sup super db -S -c 'from :pools | drop id | sort name | drop ts' echo === - super db -S -c 'from poolA@main:objects | {nameof:nameof(this),...this} | drop id' + super db -fusion -S -c 'from poolA@main:objects | {nameof:nameof(this),...this} | drop id' super db -S -c 'from poolA:log | cut nameof(this)' inputs: diff --git a/mdtest/mdtest.go b/mdtest/mdtest.go index 037d61d493..4cb3b73d4d 100644 --- a/mdtest/mdtest.go +++ b/mdtest/mdtest.go @@ -233,11 +233,14 @@ func parseMarkdown(source []byte) (map[string]string, []*Test, error) { }) case "mdtest-spq": var fails bool + var fusion bool var runtime string for _, word := range fcbInfoWords(fcb, source)[1:] { switch { case word == "fails": fails = true + case word == "fusion": + fusion = true case strings.HasPrefix(word, "runtime="): runtime = strings.TrimPrefix(word, "runtime=") if runtime != "vam" && runtime != "sam" { @@ -263,6 +266,7 @@ func parseMarkdown(source []byte) (map[string]string, []*Test, error) { Input: sections[2], SPQ: sections[1], Runtime: runtime, + Fusion: fusion, }) } return ast.WalkContinue, nil diff --git a/mdtest/test.go b/mdtest/test.go index 487b25f890..8091de7e0d 100644 --- a/mdtest/test.go +++ b/mdtest/test.go @@ -23,8 +23,9 @@ type Test struct { Runtime string // "sam", "vam", or "" for both // For SPQ tests - Input string - SPQ string + Input string + SPQ string + Fusion bool // If true do not defuse output } // Run runs the test, returning nil on success. @@ -52,6 +53,9 @@ func (t *Test) run(runtime string) error { var c *exec.Cmd if t.SPQ != "" { c = exec.Command("super", "-s", "-c", t.SPQ) + if t.Fusion { + c.Args = append(c.Args, "-fusion") + } if s := t.Input; strings.TrimSpace(s) != "" { c.Args = append(c.Args, "-") c.Stdin = strings.NewReader(s) diff --git a/runtime/ztests/expr/array-expr.yaml b/runtime/ztests/expr/array-expr.yaml index 07f9e7238a..1b5bbadadf 100644 --- a/runtime/ztests/expr/array-expr.yaml +++ b/runtime/ztests/expr/array-expr.yaml @@ -12,6 +12,8 @@ input: | {a:0,b:"1"::(int64|string),c:null::(int64|string|null)} {a:0,b:1,c:fusion([2]::(null|[int64]),<[int64]>)} +output-flags: -fusion + output: | [error("missing"),error("quiet"),null] [[1,2],null,[3,4]] diff --git a/runtime/ztests/expr/function/nullif.yaml b/runtime/ztests/expr/function/nullif.yaml index 8e757e762d..97086549eb 100644 --- a/runtime/ztests/expr/function/nullif.yaml +++ b/runtime/ztests/expr/function/nullif.yaml @@ -17,6 +17,8 @@ input: | [error(0), null] [error(0), error(1)] +output-flags: -fusion + output: | null null diff --git a/runtime/ztests/expr/function/upcast.yaml b/runtime/ztests/expr/function/upcast.yaml index 7c2952d8e8..9ad531e5b7 100644 --- a/runtime/ztests/expr/function/upcast.yaml +++ b/runtime/ztests/expr/function/upcast.yaml @@ -59,6 +59,8 @@ input: | type n101=int64 [1::n101,] +output-flags: -fusion + output: | error({message:"upcast: value not a subtype of [int8|string]",on:[1,"a"]}) [1::int8,"a"] diff --git a/runtime/ztests/expr/fusion-all.yaml b/runtime/ztests/expr/fusion-all.yaml index e44435bd66..ed455b250f 100644 --- a/runtime/ztests/expr/fusion-all.yaml +++ b/runtime/ztests/expr/fusion-all.yaml @@ -9,6 +9,8 @@ input: | {x:1} +output-flags: -fusion + output: | fusion(0x02::all,) fusion(0x666f6f::all,) diff --git a/runtime/ztests/expr/search-glob.yaml b/runtime/ztests/expr/search-glob.yaml index 1a3a4708a2..61932a5eca 100644 --- a/runtime/ztests/expr/search-glob.yaml +++ b/runtime/ztests/expr/search-glob.yaml @@ -11,4 +11,4 @@ output: | {a:"foox",b:"there"} {a:"hello",b:"foox"} {a:"",b:"foo"} - fusion({a?:_::string,b?:"fool"},<{b:string}>) + {b:"fool"} diff --git a/runtime/ztests/expr/search-nested-field-regexp.yaml b/runtime/ztests/expr/search-nested-field-regexp.yaml index 830cc50c61..b5ac239f62 100644 --- a/runtime/ztests/expr/search-nested-field-regexp.yaml +++ b/runtime/ztests/expr/search-nested-field-regexp.yaml @@ -12,5 +12,5 @@ input: | output: | {a:[{bar:"foo"}]} {a:[{car:"foo"}]} - fusion({car:"foo"}::(null|{car:string}),<{car:string}>) + {car:"foo"} {a:[]::[{bar:null}]} diff --git a/runtime/ztests/expr/search-nested-field.yaml b/runtime/ztests/expr/search-nested-field.yaml index 9a4823b6b8..1059b93128 100644 --- a/runtime/ztests/expr/search-nested-field.yaml +++ b/runtime/ztests/expr/search-nested-field.yaml @@ -13,6 +13,6 @@ input: | output: | {a:[{b:"foo"}]} {a:[{c:"foo"},{b:1}]} - fusion({a:1,b?:_::string},<{a:int64}>) - fusion({a:2,b?:"foo"},<{a:int64,b:string}>) + {a:1} + {a:2,b:"foo"} {a:[]::[{b:null}]} diff --git a/runtime/ztests/expr/search-primitives.yaml b/runtime/ztests/expr/search-primitives.yaml index 13e11c3a95..2095ef90fa 100644 --- a/runtime/ztests/expr/search-primitives.yaml +++ b/runtime/ztests/expr/search-primitives.yaml @@ -9,4 +9,4 @@ input: | output: | "foo" "foo" - fusion("foo"::(int64|string),) + "foo" diff --git a/runtime/ztests/op/aggregate/collect.yaml b/runtime/ztests/op/aggregate/collect.yaml index b7f6fa67cf..57b2778d84 100644 --- a/runtime/ztests/op/aggregate/collect.yaml +++ b/runtime/ztests/op/aggregate/collect.yaml @@ -11,6 +11,8 @@ input: | 1::foo fusion(1::(int64|string),) +output-flags: -fusion + output: | type foo=int64 [{a:1},{a:2},{b:1.5},error("missing"),1::foo,fusion(1::(int64|string),)] diff --git a/runtime/ztests/op/fuse.yaml b/runtime/ztests/op/fuse.yaml index 910ea849f2..05faf8cd45 100644 --- a/runtime/ztests/op/fuse.yaml +++ b/runtime/ztests/op/fuse.yaml @@ -6,6 +6,8 @@ input: | {a:"goodnight",b:123::int32} {a:null,b:null,c:null} +output-flags: -fusion + output: | fusion({a:fusion("hello"::(string|null|none),),b:fusion("world"::(int32|string|null),),c:fusion(_::(string|null|none),)},<{a:string,b:string}>) fusion({a:fusion(_::(string|null|none),),b:fusion("goodnight"::(int32|string|null),),c:fusion("gracie"::(string|null|none),)},<{b:string,c:string}>) @@ -24,6 +26,8 @@ input: | [{a:3,b:3}] [{a:null,b:null}] +output-flags: -fusion + output: | fusion([fusion({a:fusion(1::(int64|null|none),),b:fusion(_::(int64|null|none),)},<{a:int64}>)],<[{a:int64}]>) fusion([fusion({a:fusion(_::(int64|null|none),),b:fusion(2::(int64|null|none),)},<{b:int64}>)],<[{b:int64}]>) @@ -44,6 +48,8 @@ input: | [1] ["s"] +output-flags: -fusion + output: | fusion({a:fusion(1::(int64|string),)}::(int64|string|{a:fusion(int64|string)}|[fusion(int64|string)]),<{a:int64}>) fusion({a:fusion("s"::(int64|string),)}::(int64|string|{a:fusion(int64|string)}|[fusion(int64|string)]),<{a:string}>) @@ -62,6 +68,8 @@ input: | {r:{y:4::int32,z:5::int32},s:"world",r2:{x:6::int32}} {a:null,r:{x:null,y:null,z:null},s:null,r2:null} +output-flags: -fusion + output: | fusion({a:fusion("hello"::(string|null|none),),r:fusion({x:fusion(1::int32::(int32|null|none),),y:fusion(2::int32::(int32|null),),z:fusion(_::(int32|null|none),)},<{x:int32,y:int32}>),s:fusion(_::(string|null|none),),r2:fusion(_::(null|none|{x:int32}),)},<{a:string,r:{x:int32,y:int32}}>) fusion({a:fusion(_::(string|null|none),),r:fusion({x:fusion(_::(int32|null|none),),y:fusion(4::int32::(int32|null),),z:fusion(5::int32::(int32|null|none),)},<{y:int32,z:int32}>),s:fusion("world"::(string|null|none),),r2:fusion({x:6::int32}::(null|none|{x:int32}),<{x:int32}>)},<{r:{y:int32,z:int32},s:string,r2:{x:int32}}>) @@ -75,6 +83,8 @@ input: | error(1) error("s") +output-flags: -fusion + output: | fusion(error(fusion(1::(int64|string),)),) fusion(error(fusion("s"::(int64|string),)),) @@ -88,6 +98,8 @@ input: | ["foo"] [null] +output-flags: -fusion + output: | fusion([fusion(1::(int64|string|null),),fusion(2::(int64|string|null),)],<[int64]>) fusion([fusion("foo"::(int64|string|null),)],<[string]>) @@ -102,6 +114,8 @@ input: | {a:["foo"]} {a:[null]} +output-flags: -fusion + output: | {a:fusion([fusion(1::(int64|string|null),),fusion(2::(int64|string|null),)],<[int64]>)} {a:fusion([fusion("foo"::(int64|string|null),)],<[string]>)} @@ -137,6 +151,8 @@ input: | type er2=error(er1) error(1)::er2 +output-flags: -fusion + output: | type a1=int64 type a2=[a1] @@ -175,6 +191,8 @@ input: | [fusion({x?:1,y?:_::int64},<{x:int64}>),fusion({x?:_::int64,y?:2},<{y:int64}>)] [1,fusion(2::(int64|null),)] +output-flags: -fusion + output: | {x:1} fusion({x?:1,y?:_::int64},<{x:int64}>) diff --git a/runtime/ztests/op/unnest.yaml b/runtime/ztests/op/unnest.yaml index 8f873d9d18..fe36b62f3d 100644 --- a/runtime/ztests/op/unnest.yaml +++ b/runtime/ztests/op/unnest.yaml @@ -9,6 +9,8 @@ input: | [fusion(12::(int64|string),),"13"] null +output-flags: -fusion + output: | 1 2 diff --git a/service/ztests/csv-error.yaml b/service/ztests/csv-error.yaml index 112135bf36..9f96ef960e 100644 --- a/service/ztests/csv-error.yaml +++ b/service/ztests/csv-error.yaml @@ -18,4 +18,4 @@ outputs: hello - name: stderr data: | - CSV output requires uniform records but multiple types encountered (consider 'fuse') + CSV output requires uniform records but multiple types encountered (consider 'blend') diff --git a/service/ztests/query-csv-error.yaml b/service/ztests/query-csv-error.yaml index 13f87a3912..643e27d99c 100644 --- a/service/ztests/query-csv-error.yaml +++ b/service/ztests/query-csv-error.yaml @@ -19,4 +19,4 @@ outputs: hello - name: stderr data: | - CSV output requires uniform records but multiple types encountered (consider 'fuse') + CSV output requires uniform records but multiple types encountered (consider 'blend') diff --git a/sio/anyio/writer.go b/sio/anyio/writer.go index fd90725d62..3b4179744b 100644 --- a/sio/anyio/writer.go +++ b/sio/anyio/writer.go @@ -4,7 +4,10 @@ import ( "fmt" "io" + "github.com/brimdata/super" "github.com/brimdata/super/csup" + "github.com/brimdata/super/runtime/vam/expr" + "github.com/brimdata/super/runtime/vam/expr/function" "github.com/brimdata/super/sio/arrowio" "github.com/brimdata/super/sio/bsupio" "github.com/brimdata/super/sio/csvio" @@ -20,18 +23,19 @@ import ( ) type WriterOpts struct { - Format string - BSUP *bsupio.WriterOpts // Nil means use defaults via bsupio.NewWriter. - CSV csvio.WriterOpts - DB dbio.WriterOpts - JSON jsonio.WriterOpts - SUP supio.WriterOpts + Format string + SUPFusion bool + BSUP *bsupio.WriterOpts // Nil means use defaults via bsupio.NewWriter. + CSV csvio.WriterOpts + DB dbio.WriterOpts + JSON jsonio.WriterOpts + SUP supio.WriterOpts } func NewWriter(w io.WriteCloser, opts WriterOpts) (vio.PushCloser, error) { switch opts.Format { case "arrows": - return arrowio.NewWriter(w), nil + return newDefuser(arrowio.NewWriter(w)), nil case "bsup": if opts.BSUP == nil { return bsupio.NewWriter(w), nil @@ -40,31 +44,58 @@ func NewWriter(w io.WriteCloser, opts WriterOpts) (vio.PushCloser, error) { case "csup": return csup.NewSerializer(w), nil case "csv": - return csvio.NewWriter(w, opts.CSV), nil + return newDefuser(csvio.NewWriter(w, opts.CSV)), nil case "db": - return dbio.NewWriter(w, opts.DB), nil + return newDefuser(dbio.NewWriter(w, opts.DB)), nil case "json": - return jsonio.NewWriter(w, opts.JSON), nil + return newDefuser(jsonio.NewWriter(w, opts.JSON)), nil case "line": - return lineio.NewWriter(w), nil + return newDefuser(lineio.NewWriter(w)), nil case "null": return &nullWriter{}, nil case "parquet": - return parquetio.NewWriter(w), nil + return newDefuser(parquetio.NewWriter(w)), nil case "sup", "": - return supio.NewWriter(w, opts.SUP), nil + w := vio.PushCloser(supio.NewWriter(w, opts.SUP)) + if !opts.SUPFusion { + w = newDefuser(w) + } + return w, nil case "table": - return tableio.NewWriter(w), nil + return newDefuser(tableio.NewWriter(w)), nil case "tsv": opts.CSV.Delim = '\t' - return csvio.NewWriter(w, opts.CSV), nil + return newDefuser(csvio.NewWriter(w, opts.CSV)), nil case "zeek": - return zeekio.NewWriter(w), nil + return newDefuser(zeekio.NewWriter(w)), nil default: return nil, fmt.Errorf("unknown format: %s", opts.Format) } } +type defuser struct { + vio.PushCloser + defuse expr.Function +} + +func newDefuser(w vio.PushCloser) vio.PushCloser { + return &defuser{PushCloser: w, defuse: function.NewDefuse(super.NewContext())} +} + +func (d *defuser) Push(vec vector.Any) error { + label, ok := vec.(*vector.Labeled) + if ok { + vec = label.Any + } + if vec != nil { + vec = vector.Apply(vector.ApplyNone, d.defuse.Call, vec) + } + if ok { + vec = &vector.Labeled{Any: vec, Label: label.Label} + } + return d.PushCloser.Push(vec) +} + type nullWriter struct{} func (*nullWriter) Push(vector.Any) error { diff --git a/sio/csvio/writer.go b/sio/csvio/writer.go index 5fc16f9068..f1c3aa0eff 100644 --- a/sio/csvio/writer.go +++ b/sio/csvio/writer.go @@ -16,7 +16,7 @@ import ( "github.com/brimdata/super/vector" ) -var ErrNotDataFrame = errors.New("CSV output requires uniform records but multiple types encountered (consider 'fuse')") +var ErrNotDataFrame = errors.New("CSV output requires uniform records but multiple types encountered (consider 'blend')") type Writer struct { writer io.WriteCloser diff --git a/sio/csvio/ztests/fusion.yaml b/sio/csvio/ztests/fusion.yaml index aad131a629..b30ecd0c79 100644 --- a/sio/csvio/ztests/fusion.yaml +++ b/sio/csvio/ztests/fusion.yaml @@ -7,6 +7,8 @@ input: | output-flags: -f csv output: | - x,y - 1, - ,foo + x + 1 + +error: | + CSV output requires uniform records but multiple types encountered (consider 'blend')