/
scrape_hn.go
44 lines (36 loc) · 1.08 KB
/
scrape_hn.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
package main
import (
"encoding/json"
"fmt"
"os"
"regexp"
"github.com/andrew-d/goscrape"
"github.com/andrew-d/goscrape/extract"
"github.com/andrew-d/goscrape/paginate"
)
// main scrapes the Hacker News front page (following the "More" link for
// up to 3 pages) and writes the extracted stories to stdout as JSON.
// On any failure it prints the error to stderr and exits with status 1.
func main() {
	config := &scrape.ScrapeConfig{
		// Each story on the HN front page occupies 3 table rows; this
		// selector picks the first row of each group and skips the
		// fixed-height spacer rows.
		DividePage: scrape.DividePageBySelector("tr:nth-child(3) tr:nth-child(3n-2):not([style='height:10px'])"),
		Pieces: []scrape.Piece{
			{Name: "title", Selector: "td.title > a", Extractor: extract.Text{}},
			{Name: "link", Selector: "td.title > a", Extractor: extract.Attr{Attr: "href"}},
			{Name: "rank", Selector: "td.title[align='right']",
				// Pull the numeric rank out of text like "1."
				Extractor: extract.Regex{Regex: regexp.MustCompile(`(\d+)`)}},
		},
		// Follow the "More" link at the bottom of each page to paginate.
		Paginator: paginate.BySelector("a[rel='nofollow']:last-child", "href"),
	}
	scraper, err := scrape.New(config)
	if err != nil {
		fmt.Fprintf(os.Stderr, "Error creating scraper: %s\n", err)
		os.Exit(1)
	}
	results, err := scraper.ScrapeWithOpts(
		"https://news.ycombinator.com",
		scrape.ScrapeOptions{MaxPages: 3},
	)
	if err != nil {
		fmt.Fprintf(os.Stderr, "Error scraping: %s\n", err)
		os.Exit(1)
	}
	// Fix: the Encode error was previously discarded, so a failed write
	// (e.g. a closed stdout pipe) exited 0 silently. Report it instead.
	if err := json.NewEncoder(os.Stdout).Encode(results); err != nil {
		fmt.Fprintf(os.Stderr, "Error encoding results: %s\n", err)
		os.Exit(1)
	}
}