Skip to content

Commit

Permalink
Make LineMerger read its input files in deterministic order
Browse files Browse the repository at this point in the history
Before this change, it was not clearly defined which input stream
is put at the top of the min-heap if multiple streams happen to
contain the same line. This would not be a problem for correctness,
but the non-deterministic processing order makes debugging difficult
in the presence of I/O problems.

#40
  • Loading branch information
brawer committed May 18, 2024
1 parent e3c3d6e commit a61c068
Showing 1 changed file with 20 additions and 1 deletion.
21 changes: 20 additions & 1 deletion cmd/qrank-builder/linemerger.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import (
"bytes"
"container/heap"
"fmt"
"strings"
)

// Merges the lines of multiple io.Readers whose content is in sorted order.
Expand Down Expand Up @@ -99,7 +100,25 @@ type lineMergerHeap []*mergee
// Len returns the number of mergees currently held in the heap.
func (h lineMergerHeap) Len() int {
	return len(h)
}

// Less reports whether h[i] must be ordered before h[j] in the heap.
//
// The current lines of the two scanners are compared bytewise. If both
// mergees currently hold the same line, the tie is broken by comparing
// their names, which imposes a total order and thereby makes the order
// in which the inputs are processed deterministic.
// https://github.com/brawer/wikidata-qrank/issues/40#issuecomment-2118675361
//
// Less logs and panics when both the current lines and the names of the
// two items are equal, because no deterministic order can be established.
func (h lineMergerHeap) Less(i, j int) bool {
	// Primary key: the line each scanner is currently positioned on.
	if c := bytes.Compare(h[i].scanner.Bytes(), h[j].scanner.Bytes()); c != 0 {
		return c < 0
	}

	// Secondary key: the name, so that equal lines are still totally ordered.
	if c := strings.Compare(h[i].name, h[j].name); c != 0 {
		return c < 0
	}

	// This should not happen in production.
	msg := fmt.Sprintf("LineMergerHeap.Less() called on equivalent items; i=%d, h[i]=%v, j=%d, h[j]=%v", i, h[i], j, h[j])
	logger.Println(msg)
	panic(msg)
}

func (h lineMergerHeap) Swap(i, j int) {
Expand Down

0 comments on commit a61c068

Please sign in to comment.