forked from wcong/ants-go
-
Notifications
You must be signed in to change notification settings - Fork 0
/
scraper.go
117 lines (107 loc) · 2.78 KB
/
scraper.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
package crawler
import (
"github.com/wcong/ants-go/ants/http"
"github.com/wcong/ants-go/ants/spiders"
"log"
"strconv"
"time"
)
// Scraper lifecycle states, checked by the Scrapy loop each iteration.
// NOTE(review): "STOPED"/"RUNING" are misspellings of "STOPPED"/"RUNNING",
// but the names are exported and may be referenced elsewhere in the
// package, so they are kept as-is for compatibility.
const (
	SCRAPY_STATUS_STOP = iota // a stop has been requested; the loop will exit
	SCRAPY_STATUS_STOPED // not running (initial state, and the stop acknowledgment)
	SCRAPY_STATUS_RUNING // actively popping responses and scraping
	SCRAPY_STATUS_PAUSE // loop keeps spinning but does no work until unpaused
)
// ScrapeResult is the outcome of parsing one crawled response; it is
// pushed onto the ResultQuene whether parsing succeeded or failed.
type ScrapeResult struct {
	Request *http.Request // the request whose response was scraped
	CrawlResult string // if success just empty string,or error reason
	ScrapedRequests []*http.Request // follow-up requests extracted by the spider's parser (nil when none)
}
// Scraper pops crawled responses off the ResponseQuene, runs the matching
// spider's parser on each one, and pushes a ScrapeResult onto the
// ResultQuene.
type Scraper struct {
	// Status is one of the SCRAPY_STATUS_* constants.
	// NOTE(review): it is read and written from multiple goroutines
	// (Start/Stop/Pause vs. the Scrapy loop) with no synchronization —
	// a data race under `go test -race`; confirm intended.
	Status int
	ResultQuene *ResultQuene // destination for ScrapeResults
	ResponseQuene *ResponseQuene // source of responses to scrape
	SpiderMap map[string]*spiders.Spider // spider name -> spider, used to look up parsers
}
// NewScraper wires a Scraper to the given result queue, response queue,
// and spider registry, starting it out in the stopped state.
func NewScraper(resultQuene *ResultQuene, responseQuene *ResponseQuene, spiderMap map[string]*spiders.Spider) *Scraper {
	scraper := &Scraper{
		Status:        SCRAPY_STATUS_STOPED,
		ResultQuene:   resultQuene,
		ResponseQuene: responseQuene,
		SpiderMap:     spiderMap,
	}
	return scraper
}
// Start transitions the scraper into the running state and enters the
// Scrapy loop (so it blocks until the scraper is stopped). If a previous
// run has not yet fully stopped, Start waits for it to acknowledge the
// stop first. Calling Start while already running is a no-op.
// NOTE(review): Status is polled without synchronization — data race.
func (this *Scraper) Start() {
	if this.Status == SCRAPY_STATUS_RUNING {
		return
	}
	// Wait for any prior Scrapy loop to flip Status to STOPED.
	for this.Status != SCRAPY_STATUS_STOPED {
		time.Sleep(1 * time.Second)
	}
	log.Println("start scraper")
	this.Status = SCRAPY_STATUS_RUNING
	this.Scrapy()
}
// Stop requests the Scrapy loop to exit; the loop acknowledges on its
// next iteration by setting Status to SCRAPY_STATUS_STOPED.
func (this *Scraper) Stop() {
	this.Status = SCRAPY_STATUS_STOP
}
// Pause suspends scraping: the Scrapy loop idles (sleeping each
// iteration) until UnPause is called. Has no effect unless the scraper
// is currently running.
func (this *Scraper) Pause() {
	if this.Status == SCRAPY_STATUS_RUNING {
		this.Status = SCRAPY_STATUS_PAUSE
	}
}
// UnPause resumes a paused scraper. Has no effect unless the scraper is
// currently paused.
func (this *Scraper) UnPause() {
	if this.Status == SCRAPY_STATUS_PAUSE {
		this.Status = SCRAPY_STATUS_RUNING
	}
}
// Scrapy is the scraper's main loop: it pops responses off the
// ResponseQuene and hands each one to its own goroutine for parsing.
// While paused it idles without consuming responses; once Status leaves
// the RUNING/PAUSE states it acknowledges the stop by setting
// SCRAPY_STATUS_STOPED and returns.
func (this *Scraper) Scrapy() {
	for {
		switch this.Status {
		case SCRAPY_STATUS_PAUSE:
			// Paused: sleep and re-check without touching the queue.
			time.Sleep(1 * time.Second)
			continue
		case SCRAPY_STATUS_RUNING:
			// Running: fall out of the switch and pop a response.
		default:
			// Stop requested: acknowledge and exit the loop.
			this.Status = SCRAPY_STATUS_STOPED
			return
		}
		if response := this.ResponseQuene.Pop(); response != nil {
			go this.scrapyAndPush(response)
		} else {
			// Nothing queued yet; back off briefly.
			time.Sleep(1 * time.Second)
		}
	}
}
// scrapyAndPush runs the spider's parser for one response (in its own
// goroutine, launched from Scrapy) and pushes a ScrapeResult onto the
// ResultQuene, so the crawler always learns the fate of the request —
// success, parser error, or parser panic.
func (this *Scraper) scrapyAndPush(response *http.Response) {
	log.Println(response.SpiderName, ":start to scrapy:", response.Request.GoRequest.URL.String())
	defer func() {
		// A panicking parser must not kill the process: report the
		// panic as a failed crawl instead.
		if err := recover(); err != nil {
			log.Println(response.SpiderName, " ", err)
			scrapeResult := &ScrapeResult{}
			scrapeResult.Request = response.Request
			// BUG FIX: the panic value is an interface{} and is not
			// guaranteed to implement error; the previous
			// err.(error).Error() assertion would itself panic (after
			// recover was already consumed) and crash the program when
			// a parser panicked with e.g. a string. fmt.Sprint renders
			// any panic value safely.
			scrapeResult.CrawlResult = fmt.Sprint(err)
			this.ResultQuene.Push(scrapeResult)
		}
	}()
	requestList, err := this.SpiderMap[response.SpiderName].ParseMap[response.ParserName](response)
	scrapeResult := &ScrapeResult{}
	scrapeResult.Request = response.Request
	if err != nil {
		log.Println(err)
		scrapeResult.CrawlResult = err.Error()
	}
	if requestList != nil {
		// requestList holds pointers, so setting Depth here mutates the
		// same requests that are pushed in ScrapedRequests below.
		for _, request := range requestList {
			request.Depth = response.Request.Depth + 1
		}
		scrapeResult.ScrapedRequests = requestList
		log.Println(response.SpiderName, ":scrapyed:", strconv.Itoa(len(requestList)), "requests from:", response.GoResponse.Request.URL.String())
	}
	this.ResultQuene.Push(scrapeResult)
}