-
Notifications
You must be signed in to change notification settings - Fork 67
/
dagsplitter.go
223 lines (188 loc) · 5.82 KB
/
dagsplitter.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
//go:generate go run ./gen
package dagspliter
import (
"context"
"fmt"
"os"
"github.com/docker/go-units"
"github.com/ipfs/go-cid"
ipld "github.com/ipfs/go-ipld-format"
mdag "github.com/ipfs/go-merkledag"
"github.com/ipfs/go-unixfs"
uio "github.com/ipfs/go-unixfs/io"
"golang.org/x/xerrors"
)
/// Box is a way of packing together *partial* DAGs to achieve a certain size
/// while generating the associated CAR file containing them. It is an
/// alternative to actual re-chunking the DAG nodes which can be expensive for
/// very large DAGs. A *partial* DAG is generated by excluding certain sub-DAGs
/// from it.
type Box struct {
/// CID of the roots of the *partial* DAGs contained in this Box.
Roots []cid.Cid
/// CIDs of the roots of the sub-DAGs excluded from the original DAGs
/// (delimited by Roots). We don't keep track of which sub-DAG is being
/// trimmed from which full DAG in Roots, so to obtain the *partial* DAGs
/// one needs to walk each DAG in Roots checking if any of its links
/// are contained here.
External []cid.Cid
}
type Builder struct {
// Service to fetch the nodes in the DAGs and query its links.
dagService ipld.DAGService
// Maximum size allowed for each generated Box.
boxMaxSize uint64
// Minimum size of graph chunks to bother packing into boxes
minSubgraphSize uint64
// Generated boxes when packing a DAG.
boxes []*Box
// Used size of the current box we are packing (last one in the list). Since
// we only pack one box at a time and don't come back to a box once we're
// done with it we just track a single value here and not in each box.
boxUsedSize uint64
}
func NewBuilder(dserv ipld.DAGService, chunksize uint64, minSubgraphSize uint64) *Builder {
bb := &Builder{
dagService: dserv,
boxMaxSize: chunksize,
minSubgraphSize: minSubgraphSize,
boxes: make([]*Box, 0),
}
bb.newBox()
return bb
}
func getSingleNodeSize(node ipld.Node) uint64 {
// FIXME: How to check the size of the parent node without taking into
// account the children? The Node interface doesn't seem to account for
// that so we are going directly to the Block interface for now.
// We can probably get away with not accounting non-file data well, and
// just have some % overhead when accounting space (obviously that will
// break horribly with small files, but it should be good enough in the
// average case).
return uint64(len(node.RawData()))
}
func (b *Builder) Boxes() []*Box {
return b.boxes
}
// TODO: handle non-protobuf dags
func (b *Builder) getTreeSize(nd ipld.Node) (uint64, error) {
switch n := nd.(type) {
case *mdag.RawNode:
return uint64(len(n.RawData())), nil
case *mdag.ProtoNode:
fsNode, err := unixfs.FSNodeFromBytes(n.Data())
if err != nil {
return 0, xerrors.Errorf("loading unixfs node: %w", err)
}
switch fsNode.Type() {
case unixfs.TFile, unixfs.TRaw, unixfs.TDirectory, unixfs.THAMTShard:
return n.Size()
case unixfs.TMetadata:
/*if len(n.Links()) == 0 {
return nil, xerrors.New("incorrectly formatted metadata object")
}
child, err := n.Links()[0].GetNode(ctx, b.dagService)
if err != nil {
return nil, err
}
childpb, ok := child.(*mdag.ProtoNode)
if !ok {
return nil, mdag.ErrNotProtobuf
}*/
return 0, xerrors.Errorf("metadata object support todo")
case unixfs.TSymlink:
return 0, xerrors.Errorf("symlink object support todo")
default:
return 0, unixfs.ErrUnrecognizedType
}
default:
return 0, uio.ErrUnkownNodeType
}
}
func (b *Builder) Pack(ctx context.Context, root cid.Cid) error {
stack := []cid.Cid{root}
packed := cid.NewSet()
for len(stack) > 0 {
cur := stack[len(stack)-1]
stack = stack[:len(stack)-1]
if packed.Has(cur) {
// already packed this one in, skip it
continue
}
nd, err := b.dagService.Get(ctx, cur)
if err != nil {
return err
}
size, err := b.getTreeSize(nd)
if err != nil {
return err
}
if b.fits(uint64(size)) {
packed.Add(cur)
b.packRoot(cur)
b.addSize(uint64(size))
continue
} else if b.fits(uint64(len(nd.RawData()))) {
// this tree doesnt fit in the box, so lets add the node as 'raw' and recurse
// TODO: check if its a good candidate for going into its own new box
packed.Add(cur)
pref := cur.Prefix()
pref.Codec = cid.Raw
pref.Version = 1
ncid, err := pref.Sum(nd.RawData())
if err != nil {
return err
}
b.addExternalLink(ncid)
b.addSize(uint64(len(nd.RawData())))
// TODO: this means we traverse them in reverse order, not sure if thats the best way forward
for _, l := range nd.Links() {
stack = append(stack, l.Cid)
}
} else {
// need a new box, throw this one back on the stack and move on
stack = append(stack, cur)
b.newBox()
}
}
return nil
}
// Get current box we are packing into. By definition now this is always the
// last created box.
func (b *Builder) boxID() int {
return len(b.boxes) - 1
}
// Get current box we are packing into.
func (b *Builder) box() *Box {
return b.boxes[b.boxID()]
}
func (b *Builder) newBox() {
b.boxes = append(b.boxes, new(Box))
b.boxUsedSize = 0
}
// Remaining size in the current box.
func (b *Builder) boxRemainingSize() int64 {
return int64(b.boxMaxSize) - int64(b.used())
}
func (b *Builder) used() uint64 {
return b.boxUsedSize
}
func (b *Builder) print(msg string) {
status := fmt.Sprintf("[BOX %d] <%s>:",
b.boxID(), units.BytesSize(float64(b.used())))
fmt.Fprintf(os.Stderr, "%s %s\n", status, msg)
}
// Check this size fits in the current box.
func (b *Builder) fits(size uint64) bool {
return int64(size) <= b.boxRemainingSize()
}
func (b *Builder) addSize(size uint64) {
// FIXME: Maybe assert size (`fits`).
b.boxUsedSize += size
}
func (b *Builder) packRoot(c cid.Cid) {
b.box().Roots = append(b.box().Roots, c)
}
func (b *Builder) addExternalLink(node cid.Cid) {
b.box().External = append(b.box().External, node)
}