/
crawler.go
81 lines (61 loc) · 2.18 KB
/
crawler.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
package main
import (
	"bytes"
	"encoding/gob"
	"log"
	"os"
	"strconv"
	"strings"
	"sync"

	"github.com/PuerkitoBio/goquery"
)
// crawl scrapes the explain xkcd comic index at the package-level URL,
// fetches each comic's explanation page concurrently, gob-encodes the
// result to comics.bin, and publishes the slice to the package-level
// `comics` variable. Errors fetching individual explanation pages are
// tolerated (the comic is kept without its text); errors persisting the
// file are logged but do not prevent `comics` from being updated.
//
// NOTE(review): assumes URL, comics, and XKCDComic are declared elsewhere
// in this package — confirm against the rest of the file.
func crawl() {
	doc, err := goquery.NewDocument(URL)
	if err != nil {
		log.Printf("crawl: fetching index %s: %v", URL, err)
		return
	}

	tmpComics := make([]XKCDComic, 0)
	mux := &sync.Mutex{}
	var wg sync.WaitGroup

	// One goroutine per table row. `row` is the Each callback's own
	// parameter, so each goroutine captures a distinct value (no loop-var
	// capture bug). NOTE(review): this is unbounded fan-out — one HTTP
	// fetch per comic in flight at once; consider a worker limit.
	doc.Find("tr").Each(func(i int, row *goquery.Selection) {
		wg.Add(1)
		go func() {
			defer wg.Done()
			comic := XKCDComic{}
			explanationURL := ""
			row.Find("td").Each(func(j int, col *goquery.Selection) {
				text := strings.TrimSpace(col.Text())
				switch j {
				case 0:
					// Cell holds the comic URL; the number follows the
					// first "/". If "/" is absent Index yields -1, so the
					// slice starts at 0 and Atoi simply fails (ignored).
					comic.URL = text
					comic.Number, _ = strconv.Atoi(text[strings.Index(text, "/")+1:])
				case 1:
					// Title cell may carry a trailing "(create)" edit link;
					// strip it only when present (the unguarded slice used
					// to panic with index -1 when it was missing).
					if cut := strings.Index(text, "(create)"); cut > 0 {
						comic.Title = strings.TrimSpace(text[:cut-1])
					} else {
						comic.Title = text
					}
					comic.TitleFields = strings.Fields(comic.Title)

					// Rewrite the wiki href into an edit-mode URL so the
					// page serves the raw wikitext in a <textarea>.
					// Guard the fixed-offset slicing against short hrefs.
					explanationURL, _ = col.Find("a").Attr("href")
					if len(explanationURL) > 16 {
						explanationURL = "http://www.explainxkcd.com" + explanationURL[:15] + "?action=edit&title=" + explanationURL[16:]
						exp, err := goquery.NewDocument(explanationURL)
						if err == nil {
							comic.Text = exp.Find("textarea").Text()
						}
						// Best-effort: a failed explanation fetch leaves
						// comic.Text empty rather than aborting the comic.
					}
				case 3:
					comic.Image = "https://imgs.xkcd.com/comics/" + strings.Replace(text, " ", "_", -1)
				case 4:
					comic.Date = text
				}
			})

			// Pull the title text out of the wikitext template field
			// `titletext = ... }`. Both delimiters are verified so a
			// malformed page can no longer panic the goroutine.
			if idx := strings.Index(comic.Text, "titletext = "); idx > 0 {
				rest := comic.Text[idx+len("titletext = "):]
				if end := strings.Index(rest, "}"); end > 0 {
					comic.TitleText = rest[:end-1]
				}
			}

			mux.Lock()
			tmpComics = append(tmpComics, comic)
			mux.Unlock()
		}()
	})
	wg.Wait()

	// Publish the result first: the crawl itself succeeded even if the
	// on-disk cache below cannot be written.
	comics = tmpComics

	var buf bytes.Buffer
	if err := gob.NewEncoder(&buf).Encode(tmpComics); err != nil {
		log.Printf("crawl: gob-encoding %d comics: %v", len(tmpComics), err)
		return
	}
	f, err := os.Create("comics.bin")
	if err != nil {
		log.Printf("crawl: creating comics.bin: %v", err)
		return
	}
	defer f.Close()
	if _, err := f.Write(buf.Bytes()); err != nil {
		log.Printf("crawl: writing comics.bin: %v", err)
	}
}