Merkle db Make Paths only refer to lists of nodes (#2143)
Signed-off-by: David Boehm <91908103+dboehm-avalabs@users.noreply.github.com>
Co-authored-by: Darioush Jalali <darioush.jalali@avalabs.org>
dboehm-avalabs and darioush committed Oct 19, 2023
1 parent 0faab95 commit a9c260b
Showing 27 changed files with 863 additions and 859 deletions.
162 changes: 81 additions & 81 deletions proto/pb/sync/sync.pb.go

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions proto/sync/sync.proto
@@ -139,7 +139,7 @@ message RangeProof {
}

message ProofNode {
Path key = 1;
Key key = 1;
MaybeBytes value_or_hash = 2;
map<uint32, bytes> children = 3;
}
@@ -149,7 +149,7 @@ message KeyChange {
MaybeBytes value = 2;
}

message Path {
message Key {
uint64 length = 1;
bytes value = 2;
}
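
For illustration, here is a minimal sketch of how the renamed message might be populated from Go. The field names and getters follow standard protoc-gen-go output for the definition above; the import path and `pb` alias are assumptions inferred from the generated file's location under proto/pb/sync, and, judging by the codec changes further down, `length` counts tokens (nibbles at branch factor 16) rather than bytes:

```go
package main

import (
	"fmt"

	// Assumed import path, inferred from proto/pb/sync/sync.pb.go.
	pb "github.com/ava-labs/avalanchego/proto/pb/sync"
)

func main() {
	// A 3-token key at branch factor 16: tokens 9, 1, 4 packed high-to-low
	// into two bytes, with the unused low nibble of the last byte zeroed.
	key := &pb.Key{
		Length: 3,
		Value:  []byte{0x91, 0x40},
	}
	fmt.Println(key.GetLength(), key.GetValue())
}
```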
38 changes: 19 additions & 19 deletions x/merkledb/README.md
@@ -21,9 +21,9 @@ To reduce the depth of nodes in the trie, a `Merkle Node` utilizes path compression
| Merkle Node |
| |
| ID: 0x0131 | an id representing the current node, derived from the node's value and all children ids
| Key: 0x91 | prefix of the key path, representing the location of the node in the trie
| Value: 0x00 | the value, if one exists, that is stored at the key path (pathPrefix + compressedPath)
| Children: | a map of children node ids for any nodes in the trie that have this node's key path as a prefix
| Key: 0x91 | prefix of the key, representing the location of the node in the trie
| Value: 0x00 | the value, if one exists, that is stored at the key (keyPrefix + compressedKey)
| Children: | a map of children node ids for any nodes in the trie that have this node's key as a prefix
| 0: [:0x00542F] | child 0 represents a node with key 0x910 with ID 0x00542F
| 1: [0x432:0xA0561C] | child 1 represents a node with key 0x911432 with ID 0xA0561C
| ... |
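
To make the layout above concrete, here is a minimal, self-contained Go sketch of a node with compressed child keys. It mirrors the diagram only; the package's real types (`dbNode` and `child` in x/merkledb) are unexported and store keys in a packed `Key` type rather than hex strings:

```go
package main

import "fmt"

// child records what a parent keeps for each branch: the tokens compressed
// between the parent's key and the child's key, the child's ID, and whether
// the child holds a value.
type child struct {
	compressedKey string // nibble tokens, written as hex digits for readability
	id            string // 32-byte node ID, truncated here
	hasValue      bool
}

// merkleNode mirrors the diagram: a key prefix, an optional value, and a map
// from child index (the next token after the prefix) to child record.
type merkleNode struct {
	key      string
	value    []byte
	children map[byte]child
}

func main() {
	n := merkleNode{
		key:   "91",
		value: []byte{0x00},
		children: map[byte]child{
			0x0: {compressedKey: "", id: "00542F"},    // full key 0x910
			0x1: {compressedKey: "432", id: "A0561C"}, // full key 0x911432
		},
	}
	fmt.Printf("%+v\n", n)
}
```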
@@ -52,19 +52,19 @@ The node serialization format is as follows:
+----------------------------------------------------+
| Child index (varint) |
+----------------------------------------------------+
| Child compressed path length (varint) |
| Child compressed key length (varint) |
+----------------------------------------------------+
| Child compressed path (variable length bytes) |
| Child compressed key (variable length bytes) |
+----------------------------------------------------+
| Child ID (32 bytes) |
+----------------------------------------------------+
| Child has value (1 bytes) |
+----------------------------------------------------+
| Child index (varint) |
+----------------------------------------------------+
| Child compressed path length (varint) |
| Child compressed key length (varint) |
+----------------------------------------------------+
| Child compressed path (variable length bytes) |
| Child compressed key (variable length bytes) |
+----------------------------------------------------+
| Child ID (32 bytes) |
+----------------------------------------------------+
@@ -80,8 +80,8 @@ Where:
* `Value` is the value, if it exists (i.e. if `Value existince flag` is `1`.) Otherwise not serialized.
* `Number of children` is the number of children this node has.
* `Child index` is the index of a child node within the list of the node's children.
* `Child compressed path length` is the length of the child node's compressed path.
* `Child compressed path` is the child node's compressed path.
* `Child compressed key length` is the length of the child node's compressed key.
* `Child compressed key` is the child node's compressed key.
* `Child ID` is the child node's ID.
* `Child has value` indicates if that child has a value.
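
A minimal Go sketch of the per-node layout listed above. The helper below is hypothetical and simplified; the real encoder is `codecImpl.encodeDBNode` in x/merkledb/codec.go, which also pools buffers and writes the compressed key's token count (not its byte count) as the length:

```go
package main

import (
	"bytes"
	"encoding/binary"
	"fmt"
)

type childEntry struct {
	index         byte
	tokenLength   int    // length of the compressed key in tokens
	compressedKey []byte // compressed key packed into bytes
	id            [32]byte
	hasValue      bool
}

// encodeNode lays out a node as described above: value flag, optional value,
// child count, then index / compressed key length / compressed key / ID /
// has-value for each child, with all integers written as unsigned varints.
func encodeNode(value []byte, hasValue bool, children []childEntry) []byte {
	buf := &bytes.Buffer{}
	writeBool(buf, hasValue)
	if hasValue {
		buf.Write(binary.AppendUvarint(nil, uint64(len(value))))
		buf.Write(value)
	}
	buf.Write(binary.AppendUvarint(nil, uint64(len(children))))
	for _, c := range children {
		buf.Write(binary.AppendUvarint(nil, uint64(c.index)))
		buf.Write(binary.AppendUvarint(nil, uint64(c.tokenLength)))
		buf.Write(c.compressedKey)
		buf.Write(c.id[:])
		writeBool(buf, c.hasValue)
	}
	return buf.Bytes()
}

func writeBool(buf *bytes.Buffer, b bool) {
	if b {
		buf.WriteByte(1)
	} else {
		buf.WriteByte(0)
	}
}

func main() {
	enc := encodeNode([]byte{0x02}, true, []childEntry{
		{index: 0, tokenLength: 2, compressedKey: []byte{0x10}},
	})
	fmt.Printf("%x\n", enc)
}
```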

@@ -91,9 +91,9 @@ For each child of the node, we have an additional:
+----------------------------------------------------+
| Child index (varint) |
+----------------------------------------------------+
| Child compressed path length (varint) |
| Child compressed key length (varint) |
+----------------------------------------------------+
| Child compressed path (variable length bytes) |
| Child compressed key (variable length bytes) |
+----------------------------------------------------+
| Child ID (32 bytes) |
+----------------------------------------------------+
@@ -114,8 +114,8 @@ Its byte representation (in hex) is: `0x01020204000210579EB3718A7E437D2DDCE931AC

The node's key is empty (its the root) and has value `0x02`.
It has two children.
The first is at child index `0`, has compressed path `0x01` and ID (in hex) `0x579eb3718a7e437d2ddce931ac7cc05a0bc695a9c2084f5df12fb96ad0fa3266`.
The second is at child index `14`, has compressed path `0x0F0F0F` and ID (in hex) `0x9845893c4f9d92c4e097fcf2589bc9d6882b1f18d1c2fc91d7df1d3fcbdb4238`.
The first is at child index `0`, has compressed key `0x01` and ID (in hex) `0x579eb3718a7e437d2ddce931ac7cc05a0bc695a9c2084f5df12fb96ad0fa3266`.
The second is at child index `14`, has compressed key `0x0F0F0F` and ID (in hex) `0x9845893c4f9d92c4e097fcf2589bc9d6882b1f18d1c2fc91d7df1d3fcbdb4238`.

```
+--------------------------------------------------------------------+
@@ -134,10 +134,10 @@ The second is at child index `14`, has compressed path `0x0F0F0F` and ID (in hex
| Child index (varint) |
| 0x00 |
+--------------------------------------------------------------------+
| Child compressed path length (varint) |
| Child compressed key length (varint) |
| 0x02 |
+--------------------------------------------------------------------+
| Child compressed path (variable length bytes) |
| Child compressed key (variable length bytes) |
| 0x10 |
+--------------------------------------------------------------------+
| Child ID (32 bytes) |
@@ -146,10 +146,10 @@ The second is at child index `14`, has compressed path `0x0F0F0F` and ID (in hex
| Child index (varint) |
| 0x0E |
+--------------------------------------------------------------------+
| Child compressed path length (varint) |
| Child compressed key length (varint) |
| 0x06 |
+--------------------------------------------------------------------+
| Child compressed path (variable length bytes) |
| Child compressed key (variable length bytes) |
| 0xFFF0 |
+--------------------------------------------------------------------+
| Child ID (32 bytes) |
@@ -204,7 +204,7 @@ Where:

Note that, as with the node serialization format, the `Child index` values aren't necessarily sequential, but they are unique and strictly increasing.
Also like the node serialization format, there can be up to 16 blocks of children data.
However, note that child compressed paths are not included in the node ID calculation.
However, note that child compressed keys are not included in the node ID calculation.

Once this is encoded, we `sha256` hash the resulting bytes to get the node's ID.
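
A quick sketch of that final step, assuming `encoded` already holds the hash-format bytes described above (produced in the real code by `codecImpl.encodeHashValues`, shown in the codec.go diff below):

```go
package main

import (
	"crypto/sha256"
	"fmt"
)

// nodeID performs the step described above: sha256 over the encoded
// hash-values bytes yields the node's 32-byte ID.
func nodeID(encoded []byte) [32]byte {
	return sha256.Sum256(encoded)
}

func main() {
	encoded := []byte{0x01, 0x02} // placeholder bytes, not a real encoding
	fmt.Printf("%x\n", nodeID(encoded))
}
```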

@@ -227,7 +227,7 @@ By splitting the nodes up by value, it allows better key/value iteration and a m

### Single node type

A `Merkle Node` holds the IDs of its children, its value, as well as any path extension. This simplifies some logic and allows all of the data about a node to be loaded in a single database read. This trades off a small amount of storage efficiency (some fields may be `nil` but are still stored for every node).
A `Merkle Node` holds the IDs of its children, its value, as well as any key extension. This simplifies some logic and allows all of the data about a node to be loaded in a single database read. This trades off a small amount of storage efficiency (some fields may be `nil` but are still stored for every node).

### Validity

64 changes: 32 additions & 32 deletions x/merkledb/codec.go
@@ -21,16 +21,16 @@ const (
falseByte = 0
minVarIntLen = 1
minMaybeByteSliceLen = boolLen
minPathLen = minVarIntLen
minKeyLen = minVarIntLen
minByteSliceLen = minVarIntLen
minDBNodeLen = minMaybeByteSliceLen + minVarIntLen
minChildLen = minVarIntLen + minPathLen + ids.IDLen + boolLen
minChildLen = minVarIntLen + minKeyLen + ids.IDLen + boolLen

estimatedKeyLen = 64
estimatedValueLen = 64
estimatedCompressedPathLen = 8
// Child index, child compressed path, child ID, child has value
estimatedNodeChildLen = minVarIntLen + estimatedCompressedPathLen + ids.IDLen + boolLen
estimatedKeyLen = 64
estimatedValueLen = 64
estimatedCompressedKeyLen = 8
// Child index, child compressed key, child ID, child has value
estimatedNodeChildLen = minVarIntLen + estimatedCompressedKeyLen + ids.IDLen + boolLen
// Child index, child ID
hashValuesChildLen = minVarIntLen + ids.IDLen
)
@@ -45,7 +45,7 @@ var (
errChildIndexTooLarge = errors.New("invalid child index. Must be less than branching factor")
errLeadingZeroes = errors.New("varint has leading zeroes")
errInvalidBool = errors.New("decoded bool is neither true nor false")
errNonZeroPathPadding = errors.New("path partial byte should be padded with 0s")
errNonZeroKeyPadding = errors.New("key partial byte should be padded with 0s")
errExtraSpace = errors.New("trailing buffer space")
errIntOverflow = errors.New("value overflows int")
)
@@ -102,7 +102,7 @@ func (c *codecImpl) encodeDBNode(n *dbNode, branchFactor BranchFactor) []byte {
for index := 0; BranchFactor(index) < branchFactor; index++ {
if entry, ok := n.children[byte(index)]; ok {
c.encodeUint(buf, uint64(index))
c.encodePath(buf, entry.compressedPath)
c.encodeKey(buf, entry.compressedKey)
_, _ = buf.Write(entry.id[:])
c.encodeBool(buf, entry.hasValue)
}
@@ -128,7 +128,7 @@ func (c *codecImpl) encodeHashValues(hv *hashValues) []byte {
}
}
c.encodeMaybeByteSlice(buf, hv.Value)
c.encodePath(buf, hv.Key)
c.encodeKey(buf, hv.Key)

return buf.Bytes()
}
@@ -168,7 +168,7 @@ func (c *codecImpl) decodeDBNode(b []byte, n *dbNode, branchFactor BranchFactor)
}
previousChild = index

compressedPath, err := c.decodePath(src, branchFactor)
compressedKey, err := c.decodeKey(src, branchFactor)
if err != nil {
return err
}
@@ -181,9 +181,9 @@ func (c *codecImpl) decodeDBNode(b []byte, n *dbNode, branchFactor BranchFactor)
return err
}
n.children[byte(index)] = child{
compressedPath: compressedPath,
id: childID,
hasValue: hasValue,
compressedKey: compressedKey,
id: childID,
hasValue: hasValue,
}
}
if src.Len() != 0 {
@@ -326,43 +326,43 @@ func (*codecImpl) decodeID(src *bytes.Reader) (ids.ID, error) {
return id, err
}

func (c *codecImpl) encodePath(dst *bytes.Buffer, p Path) {
c.encodeUint(dst, uint64(p.tokensLength))
_, _ = dst.Write(p.Bytes())
func (c *codecImpl) encodeKey(dst *bytes.Buffer, key Key) {
c.encodeUint(dst, uint64(key.tokenLength))
_, _ = dst.Write(key.Bytes())
}

func (c *codecImpl) decodePath(src *bytes.Reader, branchFactor BranchFactor) (Path, error) {
if minPathLen > src.Len() {
return Path{}, io.ErrUnexpectedEOF
func (c *codecImpl) decodeKey(src *bytes.Reader, branchFactor BranchFactor) (Key, error) {
if minKeyLen > src.Len() {
return Key{}, io.ErrUnexpectedEOF
}

length, err := c.decodeUint(src)
if err != nil {
return Path{}, err
return Key{}, err
}
if length > math.MaxInt {
return Path{}, errIntOverflow
return Key{}, errIntOverflow
}
result := emptyPath(branchFactor)
result.tokensLength = int(length)
pathBytesLen := result.bytesNeeded(result.tokensLength)
if pathBytesLen > src.Len() {
return Path{}, io.ErrUnexpectedEOF
result := emptyKey(branchFactor)
result.tokenLength = int(length)
keyBytesLen := result.bytesNeeded(result.tokenLength)
if keyBytesLen > src.Len() {
return Key{}, io.ErrUnexpectedEOF
}
buffer := make([]byte, pathBytesLen)
buffer := make([]byte, keyBytesLen)
if _, err := io.ReadFull(src, buffer); err != nil {
if err == io.EOF {
err = io.ErrUnexpectedEOF
}
return Path{}, err
return Key{}, err
}
if result.hasPartialByte() {
// Confirm that the padding bits in the partial byte are 0.
// We want to only look at the bits to the right of the last token, which is at index length-1.
// Generate a mask with (8-bitsToShift) 0s followed by bitsToShift 1s.
paddingMask := byte(0xFF >> (8 - result.bitsToShift(result.tokensLength-1)))
if buffer[pathBytesLen-1]&paddingMask != 0 {
return Path{}, errNonZeroPathPadding
paddingMask := byte(0xFF >> (8 - result.bitsToShift(result.tokenLength-1)))
if buffer[keyBytesLen-1]&paddingMask != 0 {
return Key{}, errNonZeroKeyPadding
}
}
result.value = string(buffer)
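
To illustrate the padding rule behind `errNonZeroKeyPadding`: at branch factor 16 tokens are nibbles packed high-to-low, so a key with an odd token count leaves the low nibble of its final byte as padding, and `decodeKey` rejects the key unless those bits are zero. A standalone sketch of the same check (the real code computes the mask generically via `bitsToShift`):

```go
package main

import "fmt"

// validPadding16 mirrors decodeKey's padding check for branch factor 16:
// with an odd number of 4-bit tokens, the low nibble of the last byte is
// padding and must be zero.
func validPadding16(packed []byte, tokenLength int) bool {
	if tokenLength%2 == 0 || len(packed) == 0 {
		return true // no partial byte, nothing to check
	}
	const paddingMask = 0x0F // equivalent to 0xFF >> (8 - 4)
	return packed[len(packed)-1]&paddingMask == 0
}

func main() {
	fmt.Println(validPadding16([]byte{0xFF, 0xFF, 0xF0}, 5)) // true: padding bits are zero
	fmt.Println(validPadding16([]byte{0xFF, 0xFF, 0xF1}, 5)) // false: would fail with errNonZeroKeyPadding
}
```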
32 changes: 16 additions & 16 deletions x/merkledb/codec_test.go
@@ -73,7 +73,7 @@ func FuzzCodecInt(f *testing.F) {
)
}

func FuzzCodecPath(f *testing.F) {
func FuzzCodecKey(f *testing.F) {
f.Fuzz(
func(
t *testing.T,
@@ -84,7 +84,7 @@ func FuzzCodecPath(f *testing.F) {
codec := codec.(*codecImpl)
reader := bytes.NewReader(b)
startLen := reader.Len()
got, err := codec.decodePath(reader, branchFactor)
got, err := codec.decodeKey(reader, branchFactor)
if err != nil {
t.SkipNow()
}
@@ -93,7 +93,7 @@ func FuzzCodecPath(f *testing.F) {

// Encoding [got] should be the same as [b].
var buf bytes.Buffer
codec.encodePath(&buf, got)
codec.encodeKey(&buf, got)
bufBytes := buf.Bytes()
require.Len(bufBytes, numRead)
require.Equal(b[:numRead], bufBytes)
@@ -155,12 +155,12 @@ func FuzzCodecDBNodeDeterministic(f *testing.F) {
var childID ids.ID
_, _ = r.Read(childID[:]) // #nosec G404

childPathBytes := make([]byte, r.Intn(32)) // #nosec G404
_, _ = r.Read(childPathBytes) // #nosec G404
childKeyBytes := make([]byte, r.Intn(32)) // #nosec G404
_, _ = r.Read(childKeyBytes) // #nosec G404

children[byte(i)] = child{
compressedPath: NewPath(childPathBytes, branchFactor),
id: childID,
compressedKey: ToKey(childKeyBytes, branchFactor),
id: childID,
}
}
node := dbNode{
@@ -225,14 +225,14 @@ func FuzzEncodeHashValues(f *testing.F) {
children := map[byte]child{}
numChildren := r.Intn(int(branchFactor)) // #nosec G404
for i := 0; i < numChildren; i++ {
compressedPathLen := r.Intn(32) // #nosec G404
compressedPathBytes := make([]byte, compressedPathLen)
_, _ = r.Read(compressedPathBytes) // #nosec G404
compressedKeyLen := r.Intn(32) // #nosec G404
compressedKeyBytes := make([]byte, compressedKeyLen)
_, _ = r.Read(compressedKeyBytes) // #nosec G404

children[byte(i)] = child{
compressedPath: NewPath(compressedPathBytes, branchFactor),
id: ids.GenerateTestID(),
hasValue: r.Intn(2) == 1, // #nosec G404
compressedKey: ToKey(compressedKeyBytes, branchFactor),
id: ids.GenerateTestID(),
hasValue: r.Intn(2) == 1, // #nosec G404
}
}

@@ -250,7 +250,7 @@ func FuzzEncodeHashValues(f *testing.F) {
hv := &hashValues{
Children: children,
Value: value,
Key: NewPath(key, branchFactor),
Key: ToKey(key, branchFactor),
}

// Serialize the *hashValues with both codecs
@@ -264,9 +264,9 @@ func FuzzEncodeHashValues(f *testing.F) {
)
}

func TestCodecDecodePathLengthOverflowRegression(t *testing.T) {
func TestCodecDecodeKeyLengthOverflowRegression(t *testing.T) {
codec := codec.(*codecImpl)
bytes := bytes.NewReader(binary.AppendUvarint(nil, math.MaxInt))
_, err := codec.decodePath(bytes, BranchFactor16)
_, err := codec.decodeKey(bytes, BranchFactor16)
require.ErrorIs(t, err, io.ErrUnexpectedEOF)
}
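
With the rename in place, the updated fuzz targets can be exercised with Go's built-in fuzzer, e.g. `go test -run=NONE -fuzz=FuzzCodecKey -fuzztime=30s ./x/merkledb` (standard `go test` fuzzing flags; the package path assumes the repository root as the working directory).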
