forked from gocolly/colly
-
Notifications
You must be signed in to change notification settings - Fork 0
/
colly.go
107 lines (91 loc) · 2.33 KB
/
colly.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
package main
import (
"bytes"
"fmt"
"log"
"os"
"strings"
"github.com/jawher/mow.cli"
)
// scraperHeadTemplate is the opening part of every generated scraper: the
// package clause, the import block, and the beginning of main, where a new
// collector is created and bound to c. Each piece below is one emitted line.
var scraperHeadTemplate = `package main` + "\n" +
	`import (` + "\n" +
	`"log"` + "\n" +
	`"github.com/gocolly/colly"` + "\n" +
	`)` + "\n" +
	`func main() {` + "\n" +
	`c := colly.NewCollector()` + "\n"
// scraperEndTemplate is the closing part of every generated scraper: a Visit
// call on a placeholder URL followed by the brace that ends main.
var scraperEndTemplate = "\n" +
	`c.Visit("https://yourdomain.com/")` + "\n" +
	`}` + "\n"
// htmlCallbackTemplate is the snippet emitted for the "html" callback: an
// OnHTML handler registered for a placeholder selector that logs the text of
// each matched element.
var htmlCallbackTemplate = "\n" +
	`c.OnHTML("element-selector", func(e *colly.HTMLElement) {` + "\n" +
	`log.Println(e.Text)` + "\n" +
	`})` + "\n"
// requestCallbackTemplate is the snippet emitted for the "request" callback:
// an OnRequest handler that logs the URL about to be visited.
//
// Collector.OnRequest accepts only the callback function. Unlike OnHTML it
// has no selector argument, so none is emitted here; passing one makes the
// generated scraper fail to compile.
var requestCallbackTemplate = `
c.OnRequest(func(r *colly.Request) {
log.Println("Visiting", r.URL)
})
`
// responseCallbackTemplate is the snippet emitted for the "response"
// callback: an OnResponse handler that logs the visited URL together with
// the response status code.
//
// Collector.OnResponse accepts only the callback function. Unlike OnHTML it
// has no selector argument, so none is emitted here; passing one makes the
// generated scraper fail to compile.
var responseCallbackTemplate = `
c.OnResponse(func(r *colly.Response) {
log.Println("Visited", r.Request.URL, r.StatusCode)
})
`
// errorCallbackTemplate is the snippet emitted for the "error" callback: an
// OnError handler that logs the failing URL together with the error.
//
// Collector.OnError accepts only the callback function. Unlike OnHTML it has
// no selector argument, so none is emitted here; passing one makes the
// generated scraper fail to compile.
var errorCallbackTemplate = `
c.OnError(func(r *colly.Response, err error) {
log.Printf("Error on %s: %s", r.Request.URL, err)
})
`
// main builds the "colly" command-line application and runs it with the
// process arguments.
//
// The only sub-command is "new", which assembles a scraper skeleton from the
// package-level templates and writes it to PATH, or to standard output when
// PATH is omitted. Any error from writing the output or from running the
// application terminates the process via log.Fatal.
func main() {
	app := cli.App("colly", "Scraping Framework for Gophers")

	app.Command("new", "Create new scraper", func(cmd *cli.Cmd) {
		var (
			callbacks = cmd.StringOpt("callbacks", "", "Add callbacks to the template. (E.g. '--callbacks=html,response,error')")
			hosts     = cmd.StringOpt("hosts", "", "Specify scraper's allowed hosts. (e.g. '--hosts=xy.com,abcd.com')")
			path      = cmd.StringArg("PATH", "", "Path of the new scraper")
		)

		cmd.Spec = "[--callbacks] [--hosts] [PATH]"

		cmd.Action = func() {
			// Render the whole source in memory before touching the
			// destination, then write it in a single call.
			src := buildScraper(*hosts, *callbacks)
			if err := writeScraper(*path, src); err != nil {
				log.Fatal(err)
			}
		}
	})

	if err := app.Run(os.Args); err != nil {
		log.Fatal(err)
	}
}

// buildScraper renders the source of a new scraper.
//
// hosts and callbacks are comma-separated lists as received from the command
// line. Each non-empty host is added to the collector's AllowedDomains; each
// recognised callback name ("html", "request", "response", "error") appends
// the matching snippet. Unknown callback names are reported on the log output
// and otherwise ignored.
//
// NOTE(review): scraperHeadTemplate always imports "log", but only the
// callback snippets use it, so a scraper generated without any callback will
// not compile until a callback is added or the import is removed.
func buildScraper(hosts, callbacks string) []byte {
	scraper := bytes.NewBufferString(scraperHeadTemplate)

	if domains := splitList(hosts); len(domains) > 0 {
		quoted := make([]string, len(domains))
		for i, d := range domains {
			// %q yields a valid, escaped Go string literal.
			quoted[i] = fmt.Sprintf("%q", d)
		}
		fmt.Fprintf(scraper, "\n c.AllowedDomains = []string{%s}\n", strings.Join(quoted, ", "))
	}

	for _, name := range splitList(callbacks) {
		switch name {
		case "html":
			scraper.WriteString(htmlCallbackTemplate)
		case "request":
			scraper.WriteString(requestCallbackTemplate)
		case "response":
			scraper.WriteString(responseCallbackTemplate)
		case "error":
			scraper.WriteString(errorCallbackTemplate)
		default:
			// Most likely a typo; tell the user instead of silently
			// producing a scraper that lacks the callback.
			log.Printf("ignoring unknown callback %q", name)
		}
	}

	scraper.WriteString(scraperEndTemplate)
	return scraper.Bytes()
}

// splitList splits a comma-separated option value, trimming surrounding
// whitespace from every entry and dropping entries that are empty.
func splitList(s string) []string {
	var items []string
	for _, item := range strings.Split(s, ",") {
		if item = strings.TrimSpace(item); item != "" {
			items = append(items, item)
		}
	}
	return items
}

// writeScraper writes src to the file at path, creating or truncating it, or
// to standard output when path is empty. It returns the first error
// encountered while creating, writing or closing the file.
func writeScraper(path string, src []byte) (err error) {
	if path == "" {
		_, err = os.Stdout.Write(src)
		return err
	}

	f, err := os.Create(path)
	if err != nil {
		return err
	}
	defer func() {
		// A failed Close can mean the data never reached the disk, so
		// report it unless an earlier error is already being returned.
		if cerr := f.Close(); err == nil {
			err = cerr
		}
	}()

	_, err = f.Write(src)
	return err
}