Skip to content

Commit 9492274

Browse files
authored
feat(metadata): validated SortOrders constructor, validations & tests (#571)
1. adds a NewSortOrder constructor & makes SortOrder fields private to prevent mutation 2. NewSortOrder validates fields SortField, i.e. Transform / NullOrder / Direction are now mandatory fields 3. reuse sort-orders 4. checkCompatibility of sort-orders against schema 5. sort-orders & partition-specs are mandatory fields for format-version > 1 6. update tests & add new ones
1 parent 941d301 commit 9492274

16 files changed

+401
-168
lines changed

catalog/glue/glue_test.go

Lines changed: 5 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -180,15 +180,12 @@ var testSchema = iceberg.NewSchemaWithIdentifiers(0, []int{},
180180
var testPartitionSpec = iceberg.NewPartitionSpec(
181181
iceberg.PartitionField{SourceID: 2, FieldID: 1000, Transform: iceberg.IdentityTransform{}, Name: "bar"})
182182

183-
var testSortOrder = table.SortOrder{
184-
OrderID: 1,
185-
Fields: []table.SortField{
186-
{
187-
SourceID: 1, Transform: iceberg.IdentityTransform{},
188-
Direction: table.SortASC, NullOrder: table.NullsLast,
189-
},
183+
var testSortOrder, _ = table.NewSortOrder(1, []table.SortField{
184+
{
185+
SourceID: 1, Transform: iceberg.IdentityTransform{},
186+
Direction: table.SortASC, NullOrder: table.NullsLast,
190187
},
191-
}
188+
})
192189

193190
func TestGlueGetTable(t *testing.T) {
194191
assert := require.New(t)

catalog/rest/rest.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -750,7 +750,7 @@ func (r *Catalog) CreateTable(ctx context.Context, identifier table.Identifier,
750750
o(&cfg)
751751
}
752752

753-
if cfg.SortOrder.Fields == nil && cfg.SortOrder.OrderID == 0 {
753+
if cfg.SortOrder.Fields() == nil && cfg.SortOrder.OrderID() == 0 {
754754
cfg.SortOrder = table.UnsortedSortOrder
755755
}
756756

catalog/sql/sql_test.go

Lines changed: 17 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -368,7 +368,7 @@ func (s *SqliteCatalogTestSuite) TestCreateTableDefaultSortOrder() {
368368

369369
s.FileExists(strings.TrimPrefix(tbl.MetadataLocation(), "file://"))
370370

371-
s.Equal(0, tbl.SortOrder().OrderID)
371+
s.Equal(0, tbl.SortOrder().OrderID())
372372
s.NoError(tt.cat.DropTable(context.Background(), tt.tblID))
373373
}
374374
}
@@ -390,7 +390,7 @@ func (s *SqliteCatalogTestSuite) TestCreateV1Table() {
390390
s.Require().NoError(err)
391391

392392
s.FileExists(strings.TrimPrefix(tbl.MetadataLocation(), "file://"))
393-
s.Equal(0, tbl.SortOrder().OrderID)
393+
s.Equal(0, tbl.SortOrder().OrderID())
394394
s.Equal(1, tbl.Metadata().Version())
395395
s.True(tbl.Spec().Equals(*iceberg.UnpartitionedSpec))
396396
s.NoError(tt.cat.DropTable(context.Background(), tt.tblID))
@@ -410,20 +410,26 @@ func (s *SqliteCatalogTestSuite) TestCreateTableCustomSortOrder() {
410410
ns := catalog.NamespaceFromIdent(tt.tblID)
411411
s.Require().NoError(tt.cat.CreateNamespace(context.Background(), ns, nil))
412412

413-
order := table.SortOrder{Fields: []table.SortField{
414-
{SourceID: 2, Transform: iceberg.IdentityTransform{}, NullOrder: table.NullsFirst},
415-
}}
416-
413+
order, err := table.NewSortOrder(1, []table.SortField{
414+
{SourceID: 2, Transform: iceberg.IdentityTransform{}, NullOrder: table.NullsFirst, Direction: table.SortASC},
415+
})
416+
s.Require().NoError(err)
417417
tbl, err := tt.cat.CreateTable(context.Background(), tt.tblID, tableSchemaNested,
418418
catalog.WithSortOrder(order))
419419
s.Require().NoError(err)
420420

421421
s.FileExists(strings.TrimPrefix(tbl.MetadataLocation(), "file://"))
422-
s.Equal(1, tbl.SortOrder().OrderID)
423-
s.Len(tbl.SortOrder().Fields, 1)
424-
s.Equal(table.SortASC, tbl.SortOrder().Fields[0].Direction)
425-
s.Equal(table.NullsFirst, tbl.SortOrder().Fields[0].NullOrder)
426-
s.Equal("identity", tbl.SortOrder().Fields[0].Transform.String())
422+
s.Equal(1, tbl.SortOrder().OrderID())
423+
s.Equal(tbl.SortOrder().Len(), 1)
424+
425+
for f := range tbl.SortOrder().Fields() {
426+
s.Equal(table.SortASC, f.Direction)
427+
s.Equal(table.NullsFirst, f.NullOrder)
428+
s.Equal("identity", f.Transform.String())
429+
430+
break
431+
}
432+
427433
s.NoError(tt.cat.DropTable(context.Background(), tt.tblID))
428434
}
429435
}

cmd/iceberg/utils.go

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -138,8 +138,8 @@ func parseSortOrder(sortStr string) (table.SortOrder, error) {
138138
return table.UnsortedSortOrder, nil
139139
}
140140

141-
return table.SortOrder{
142-
OrderID: table.InitialSortOrderID,
143-
Fields: sortFields,
144-
}, nil
141+
return table.NewSortOrder(
142+
table.InitialSortOrderID,
143+
sortFields,
144+
)
145145
}

cmd/iceberg/utils_test.go

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ import (
2323

2424
"github.com/apache/iceberg-go"
2525
"github.com/apache/iceberg-go/table"
26+
"github.com/stretchr/testify/require"
2627
)
2728

2829
func TestParseProperties(t *testing.T) {
@@ -210,29 +211,32 @@ func TestParseSortOrder(t *testing.T) {
210211
}
211212
if !tt.isErr {
212213
// For an empty string, we expect UnsortedSortOrder with OrderID 0
214+
require.NoError(t, err)
213215
if tt.input == "" {
214-
if got.OrderID != 0 {
215-
t.Errorf("parseSortOrder() for empty string should return OrderID 0, got %d", got.OrderID)
216+
if got.OrderID() != 0 {
217+
t.Errorf("parseSortOrder() for empty string should return OrderID 0, got %d", got.OrderID())
216218
}
217-
} else if got.OrderID == 0 {
219+
} else if got.OrderID() == 0 {
218220
t.Errorf("parseSortOrder() returned invalid sort order for valid input")
219221
}
220222

221223
// Validate the number of fields
222-
if len(got.Fields) != tt.expectedFieldsCount {
223-
t.Errorf("parseSortOrder() returned %d fields, expected %d", len(got.Fields), tt.expectedFieldsCount)
224+
if got.Len() != tt.expectedFieldsCount {
225+
t.Errorf("parseSortOrder() returned %d fields, expected %d", got.Len(), tt.expectedFieldsCount)
224226

225227
return
226228
}
227229

228230
// Validate sort directions and null orders
229-
for i, field := range got.Fields {
231+
i := 0
232+
for field := range got.Fields() {
230233
if i < len(tt.expectedDirections) && field.Direction != tt.expectedDirections[i] {
231234
t.Errorf("parseSortOrder() field %d direction = %v, expected %v", i, field.Direction, tt.expectedDirections[i])
232235
}
233236
if i < len(tt.expectedNullOrders) && field.NullOrder != tt.expectedNullOrders[i] {
234237
t.Errorf("parseSortOrder() field %d null order = %v, expected %v", i, field.NullOrder, tt.expectedNullOrders[i])
235238
}
239+
i++
236240
}
237241
}
238242
})

table/arrow_utils_internal_test.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,9 @@ func constructTestTable(t *testing.T, writeStats []string) (*metadata.FileMetaDa
8888
],
8989
"default-spec-id": 0,
9090
"partition-specs": [{"spec-id": 0, "fields": []}],
91-
"properties": {}
91+
"properties": {},
92+
"sort-orders": [{"order-id": 0, "fields": []}],
93+
"default-sort-order-id": 0
9294
}`)
9395
require.NoError(t, err)
9496

table/internal/parquet_files_test.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,8 @@ func constructTestTablePrimitiveTypes(t *testing.T) (*metadata.FileMetaData, tab
7171
"last-partition-id": 0,
7272
"last-updated-ms": -1,
7373
"default-spec-id": 0,
74+
"default-sort-order-id": 0,
75+
"sort-orders": [{"order-id": 0, "fields": []}],
7476
"partition-specs": [{"spec-id": 0, "fields": []}],
7577
"properties": {}
7678
}`)

table/metadata.go

Lines changed: 77 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -165,6 +165,7 @@ type MetadataBuilder struct {
165165
// update tracking
166166
lastAddedSchemaID *int
167167
lastAddedPartitionID *int
168+
lastAddedSortOrderID *int
168169
}
169170

170171
func NewMetadataBuilder() (*MetadataBuilder, error) {
@@ -384,29 +385,37 @@ func (b *MetadataBuilder) RemoveSnapshots(snapshotIds []int64) error {
384385
return nil
385386
}
386387

387-
func (b *MetadataBuilder) AddSortOrder(sortOrder *SortOrder, initial bool) error {
388+
func (b *MetadataBuilder) AddSortOrder(sortOrder *SortOrder) error {
388389
curSchema := b.CurrentSchema()
389390
if curSchema == nil {
390391
return errors.New("can't add sort order with no current schema")
391392
}
392393

393-
if err := sortOrder.CheckCompatibility(curSchema); err != nil {
394-
return fmt.Errorf("sort order %s is not compatible with current schema: %w", sortOrder, err)
394+
newOrderID := b.reuseOrCreateNewSortOrderID(sortOrder)
395+
if _, err := b.GetSortOrderByID(newOrderID); err == nil {
396+
if b.lastAddedSortOrderID != &newOrderID {
397+
b.lastAddedSortOrderID = &newOrderID
398+
sortOrder.orderID = newOrderID
399+
b.updates = append(b.updates, NewAddSortOrderUpdate(sortOrder))
400+
}
401+
402+
return nil
395403
}
404+
sortOrder.orderID = newOrderID
396405

397-
var sortOrders []SortOrder
398-
if !initial {
399-
sortOrders = append(sortOrders, b.sortOrderList...)
406+
sortOrders := b.sortOrderList
407+
if err := sortOrder.CheckCompatibility(curSchema); err != nil {
408+
return fmt.Errorf("sort order %s is not compatible with current schema: %w", sortOrder, err)
400409
}
401410

402411
for _, s := range sortOrders {
403-
if s.OrderID == sortOrder.OrderID {
404-
return fmt.Errorf("sort order with id %d already exists", sortOrder.OrderID)
412+
if s.OrderID() == sortOrder.OrderID() {
413+
return fmt.Errorf("sort order with id %d already exists", sortOrder.orderID)
405414
}
406415
}
407416

408417
b.sortOrderList = append(sortOrders, *sortOrder)
409-
b.updates = append(b.updates, NewAddSortOrderUpdate(sortOrder, initial))
418+
b.updates = append(b.updates, NewAddSortOrderUpdate(sortOrder))
410419

411420
return nil
412421
}
@@ -459,10 +468,10 @@ func (b *MetadataBuilder) SetCurrentSchemaID(currentSchemaID int) error {
459468
func (b *MetadataBuilder) SetDefaultSortOrderID(defaultSortOrderID int) error {
460469
if defaultSortOrderID == -1 {
461470
defaultSortOrderID = maxBy(b.sortOrderList, func(s SortOrder) int {
462-
return s.OrderID
471+
return s.OrderID()
463472
})
464473
if !slices.ContainsFunc(b.updates, func(u Update) bool {
465-
return u.Action() == UpdateAddSortOrder && u.(*addSortOrderUpdate).SortOrder.OrderID == defaultSortOrderID
474+
return u.Action() == UpdateAddSortOrder && u.(*addSortOrderUpdate).SortOrder.OrderID() == defaultSortOrderID
466475
}) {
467476
return errors.New("can't set default sort order to last added with no added sort orders")
468477
}
@@ -752,7 +761,7 @@ func (b *MetadataBuilder) GetSpecByID(id int) (*iceberg.PartitionSpec, error) {
752761

753762
func (b *MetadataBuilder) GetSortOrderByID(id int) (*SortOrder, error) {
754763
for _, s := range b.sortOrderList {
755-
if s.OrderID == id {
764+
if s.OrderID() == id {
756765
return &s, nil
757766
}
758767
}
@@ -847,6 +856,24 @@ func (b *MetadataBuilder) Build() (Metadata, error) {
847856
}
848857
}
849858

859+
func (b *MetadataBuilder) reuseOrCreateNewSortOrderID(newOrder *SortOrder) int {
860+
if newOrder.IsUnsorted() {
861+
return UnsortedSortOrder.OrderID()
862+
}
863+
864+
newOrderID := UnsortedSortOrderID + 1
865+
for _, order := range b.sortOrderList {
866+
if slices.Equal(order.fields, newOrder.fields) {
867+
return order.OrderID()
868+
}
869+
if order.OrderID() >= newOrderID {
870+
newOrderID = order.OrderID() + 1
871+
}
872+
}
873+
874+
return newOrderID
875+
}
876+
850877
func (b *MetadataBuilder) reuseOrCreateNewPartitionSpecID(newSpec iceberg.PartitionSpec) int {
851878
newSpecID := 0
852879
for _, spec := range b.specs {
@@ -1012,10 +1039,13 @@ type commonMetadata struct {
10121039

10131040
func initCommonMetadataForDeserialization() commonMetadata {
10141041
return commonMetadata{
1015-
LastUpdatedMS: -1,
1016-
LastColumnId: -1,
1017-
CurrentSchemaID: -1,
1018-
DefaultSpecID: -1,
1042+
LastUpdatedMS: -1,
1043+
LastColumnId: -1,
1044+
CurrentSchemaID: -1,
1045+
DefaultSpecID: -1,
1046+
DefaultSortOrderID: -1,
1047+
SortOrderList: nil,
1048+
Specs: nil,
10191049
}
10201050
}
10211051

@@ -1135,7 +1165,7 @@ func (c *commonMetadata) CurrentSnapshot() *Snapshot {
11351165
func (c *commonMetadata) SortOrders() []SortOrder { return c.SortOrderList }
11361166
func (c *commonMetadata) SortOrder() SortOrder {
11371167
for _, s := range c.SortOrderList {
1138-
if s.OrderID == c.DefaultSortOrderID {
1168+
if s.OrderID() == c.DefaultSortOrderID {
11391169
return s
11401170
}
11411171
}
@@ -1217,14 +1247,14 @@ func (c *commonMetadata) checkSortOrders() error {
12171247
}
12181248

12191249
for _, o := range c.SortOrderList {
1220-
if o.OrderID == c.DefaultSortOrderID {
1250+
if o.OrderID() == c.DefaultSortOrderID {
12211251
if err := o.CheckCompatibility(c.CurrentSchema()); err != nil {
1222-
return fmt.Errorf("default sort order %d is not compatible with current schema: %w", o.OrderID, err)
1252+
return fmt.Errorf("default sort order %d is not compatible with current schema: %w", o.OrderID(), err)
12231253
}
12241254

12251255
return nil
12261256
}
1227-
if o.OrderID == UnsortedSortOrderID && len(o.Fields) != 0 {
1257+
if o.OrderID() == UnsortedSortOrderID && o.Len() != 0 {
12281258
return fmt.Errorf("sort order ID %d is reserved for unsorted order", UnsortedSortOrderID)
12291259
}
12301260
}
@@ -1246,20 +1276,6 @@ func (c *commonMetadata) constructRefs() {
12461276
}
12471277

12481278
func (c *commonMetadata) validate() error {
1249-
if err := c.checkSchemas(); err != nil {
1250-
return err
1251-
}
1252-
1253-
if err := c.checkPartitionSpecs(); err != nil {
1254-
return err
1255-
}
1256-
1257-
if err := c.checkSortOrders(); err != nil {
1258-
return err
1259-
}
1260-
1261-
c.constructRefs()
1262-
12631279
switch {
12641280
case c.LastUpdatedMS == 0:
12651281
// last-updated-ms is required
@@ -1269,12 +1285,34 @@ func (c *commonMetadata) validate() error {
12691285
return fmt.Errorf("%w: missing last-column-id", ErrInvalidMetadata)
12701286
case c.CurrentSchemaID < 0:
12711287
return fmt.Errorf("%w: no valid schema configuration found in table metadata", ErrInvalidMetadata)
1288+
case c.SortOrderList == nil && c.FormatVersion > 1:
1289+
return fmt.Errorf("%w: missing sort-orders", ErrInvalidMetadata)
1290+
case c.Specs == nil && c.FormatVersion > 1:
1291+
return fmt.Errorf("%w: missing partition-specs", ErrInvalidMetadata)
1292+
case c.DefaultSortOrderID < 0 && c.FormatVersion > 1:
1293+
return fmt.Errorf("%w: default-sort-order-id must be set for FormatVersion > 1", ErrInvalidMetadata)
1294+
case c.DefaultPartitionSpec() < 0 && c.FormatVersion > 1:
1295+
return fmt.Errorf("%w: default-partition-spec-id must be set for FormatVersion > 1", ErrInvalidMetadata)
12721296
case c.LastPartitionID == nil:
12731297
if c.FormatVersion > 1 {
12741298
return fmt.Errorf("%w: last-partition-id must be set for FormatVersion > 1", ErrInvalidMetadata)
12751299
}
12761300
}
12771301

1302+
if err := c.checkSchemas(); err != nil {
1303+
return err
1304+
}
1305+
1306+
if err := c.checkPartitionSpecs(); err != nil {
1307+
return err
1308+
}
1309+
1310+
if err := c.checkSortOrders(); err != nil {
1311+
return err
1312+
}
1313+
1314+
c.constructRefs()
1315+
12781316
return nil
12791317
}
12801318

@@ -1299,9 +1337,12 @@ type metadataV1 struct {
12991337
}
13001338

13011339
func initMetadataV1Deser() *metadataV1 {
1302-
return &metadataV1{
1340+
meta := metadataV1{
13031341
commonMetadata: initCommonMetadataForDeserialization(),
13041342
}
1343+
meta.commonMetadata.DefaultSortOrderID = 0
1344+
1345+
return &meta
13051346
}
13061347

13071348
func (m *metadataV1) LastSequenceNumber() int64 { return 0 }
@@ -1488,7 +1529,7 @@ func NewMetadataWithUUID(sc *iceberg.Schema, partitions *iceberg.PartitionSpec,
14881529
return nil, err
14891530
}
14901531

1491-
if err = builder.AddSortOrder(&reassignedIds.sortOrder, true); err != nil {
1532+
if err = builder.AddSortOrder(&reassignedIds.sortOrder); err != nil {
14921533
return nil, err
14931534
}
14941535

0 commit comments

Comments
 (0)