-
Notifications
You must be signed in to change notification settings - Fork 0
/
htree.go
216 lines (192 loc) · 5.66 KB
/
htree.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
// Package htree is a collection of tools for working with trees of html.Nodes.
package htree
import (
"bytes"
"io"
"strings"
"golang.org/x/net/html"
"golang.org/x/net/html/atom"
)
// Find searches the tree rooted at `node` depth-first
// and returns the first node for which `pred` reports true.
// `node` itself is tested before any of its descendants.
// The children of a text node are never visited.
// Find returns nil when no node satisfies `pred`.
func Find(node *html.Node, pred func(*html.Node) bool) *html.Node {
	switch {
	case pred(node):
		return node
	case node.Type == html.TextNode:
		// Text nodes are treated as leaves: do not descend.
		return nil
	}
	for c := node.FirstChild; c != nil; c = c.NextSibling {
		hit := Find(c, pred)
		if hit != nil {
			return hit
		}
	}
	return nil
}
// FindEl is like Find but considers only nodes of type `ElementNode`:
// it returns the first element node,
// in a depth-first search of the tree rooted at `node`,
// for which `pred` reports true, or nil if there is none.
// `pred` is never called on non-element nodes.
func FindEl(node *html.Node, pred func(*html.Node) bool) *html.Node {
	elementsOnly := elPred(pred)
	return Find(node, elementsOnly)
}
// Walk visits `node` and then, recursively, each of its children in order
// (a preorder, depth-first traversal), calling f on every node visited.
// The children of a text node are not visited.
// The first non-nil error returned by f stops the walk and is returned to the caller.
func Walk(node *html.Node, f func(*html.Node) error) error {
	if err := f(node); err != nil {
		return err
	}
	if node.Type == html.TextNode {
		// Text nodes are treated as leaves: do not descend.
		return nil
	}
	for c := node.FirstChild; c != nil; c = c.NextSibling {
		if err := Walk(c, f); err != nil {
			return err
		}
	}
	return nil
}
// FindAll performs a preorder, depth-first walk of the tree rooted at `node`,
// testing each node it reaches with `pred`.
// When a node passes the test, FindAll
// (a) calls `f` on that node, and
// (b) does not descend into that node's subtree.
//
// The first non-nil error returned by `f` aborts the walk and is returned.
//
// To keep searching below a matching node `n`,
// call FindAllChildren(n, pred, f) from within `f`.
func FindAll(node *html.Node, pred func(*html.Node) bool, f func(*html.Node) error) error {
	if !pred(node) {
		// No match here; keep looking among the children.
		return FindAllChildren(node, pred, f)
	}
	return f(node)
}
// FindAllChildren behaves like FindAll applied to each child of `node` in turn;
// `node` itself is neither tested with `pred` nor passed to `f`.
// The children of a text node are not visited.
// The first non-nil error stops the iteration and is returned.
func FindAllChildren(node *html.Node, pred func(*html.Node) bool, f func(*html.Node) error) error {
	if node.Type == html.TextNode {
		// Text nodes are treated as leaves: do not descend.
		return nil
	}
	for c := node.FirstChild; c != nil; c = c.NextSibling {
		if err := FindAll(c, pred, f); err != nil {
			return err
		}
	}
	return nil
}
// FindAllEls is FindAll restricted to nodes of type `ElementNode`:
// `pred` (and therefore `f`) is only ever invoked on element nodes.
//
// To keep searching below a matching node `n`,
// call FindAllChildEls(n, pred, f) from within `f`.
func FindAllEls(node *html.Node, pred func(*html.Node) bool, f func(*html.Node) error) error {
	elementsOnly := elPred(pred)
	return FindAll(node, elementsOnly, f)
}
// FindAllChildEls is FindAllChildren restricted to nodes of type `ElementNode`:
// it searches the children of `node`, but not `node` itself,
// and invokes `pred` only on element nodes.
func FindAllChildEls(node *html.Node, pred func(*html.Node) bool, f func(*html.Node) error) error {
	elementsOnly := elPred(pred)
	return FindAllChildren(node, elementsOnly, f)
}
// elPred wraps `pred` in a new predicate that reports true
// only for nodes of type `ElementNode` that also satisfy `pred`.
// `pred` is not called at all for nodes of any other type.
func elPred(pred func(*html.Node) bool) func(*html.Node) bool {
	return func(n *html.Node) bool {
		if n.Type != html.ElementNode {
			return false
		}
		return pred(n)
	}
}
// ElAttr looks up the attribute named `key` on `node` and returns its value.
// Only the attribute key is compared; the attribute namespace is ignored.
// If several attributes share the key, the first one wins.
// The result is the empty string when `node` has no such attribute.
func ElAttr(node *html.Node, key string) string {
	for i := range node.Attr {
		if a := &node.Attr[i]; a.Key == key {
			return a.Val
		}
	}
	return ""
}
// ElClassContains reports whether the whitespace-separated list of class names
// in `node`'s `class` attribute includes `probe`.
// The comparison is an exact, case-sensitive match against each class name.
func ElClassContains(node *html.Node, probe string) bool {
	for _, name := range strings.Fields(ElAttr(node, "class")) {
		if name == probe {
			return true
		}
	}
	return false
}
// WriteText converts the content of the tree rooted at `node` into plain text
// and writes it to `w`.
//
// Text nodes are written verbatim.
// The html package stores text-node data already unescaped
// (HTML entities in the source document were decoded by the parser),
// so no further entity decoding is applied here;
// decoding again would corrupt text such as a literal "&lt;".
//
// <script> and <style> elements are pruned together with their subtrees,
// and <br> elements are turned into newlines.
// Nodes of any other kind contribute only the text of their descendants.
//
// The first error returned by `w` aborts the traversal and is returned.
func WriteText(w io.Writer, node *html.Node) error {
	switch node.Type {
	case html.TextNode:
		// node.Data is already unescaped; write it as is.
		if _, err := io.WriteString(w, node.Data); err != nil {
			return err
		}
	case html.ElementNode:
		switch node.DataAtom {
		case atom.Br:
			_, err := io.WriteString(w, "\n")
			return err
		case atom.Script, atom.Style:
			// Script and style content is not document text.
			return nil
		}
	}
	for child := node.FirstChild; child != nil; child = child.NextSibling {
		if err := WriteText(w, child); err != nil {
			return err
		}
	}
	return nil
}
// Text renders the tree rooted at `node` as plain text
// by collecting the output of WriteText into a string.
// See WriteText for how the individual node types are rendered.
// If WriteText fails, Text returns the error
// together with whatever text was produced before the failure.
func Text(node *html.Node) (string, error) {
	var out bytes.Buffer
	if err := WriteText(&out, node); err != nil {
		return out.String(), err
	}
	return out.String(), nil
}
// Prune returns a copy of `node` and its children,
// minus any subnodes that cause the supplied predicate to return true.
// If `node` itself is pruned, the return value is nil.
//
// The original tree is left untouched.
// The returned tree is self-contained:
// every copied child's Parent points at its copied parent,
// sibling links connect only the surviving copies,
// and the returned root has no parent or siblings,
// so it can be attached elsewhere with AppendChild or InsertBefore.
// Each copy gets its own attribute slice,
// so editing attributes of the copy does not affect the original.
func Prune(node *html.Node, pred func(*html.Node) bool) *html.Node {
	if pred(node) {
		return nil
	}

	// Start from a shallow copy of the node's own fields,
	// then sever every link into the original tree.
	result := new(html.Node)
	*result = *node
	result.Parent = nil
	result.PrevSibling = nil
	result.NextSibling = nil
	result.FirstChild = nil
	result.LastChild = nil
	if len(node.Attr) > 0 {
		// Avoid sharing the attribute backing array with the original.
		result.Attr = append([]html.Attribute(nil), node.Attr...)
	}

	for child := node.FirstChild; child != nil; child = child.NextSibling {
		pruned := Prune(child, pred)
		if pruned == nil {
			continue
		}
		// pruned is a detached copy, so AppendChild cannot panic;
		// it sets Parent and the sibling links consistently.
		result.AppendChild(pruned)
	}
	return result
}