Skip to content

Commit a43fa6a

Browse files
committed
Web crawler with concurrent crawling capability
1 parent 08b751b commit a43fa6a

File tree

3 files changed

+102
-0
lines changed

3 files changed

+102
-0
lines changed

06-web-crawler-concurrent/main.go

Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
1+
package main
2+
3+
import (
4+
"fmt"
5+
"net/http"
6+
"runtime"
7+
"time"
8+
9+
"golang.org/x/net/html"
10+
)
11+
12+
// fetched is the set of URLs for which a fetch has already been started.
// main allocates it before calling Crawl, and Crawl sets an entry to true
// for every URL it hands to a fetch goroutine so no URL is requested twice.
var fetched map[string]bool
13+
14+
// result is the message a fetch goroutine sends back to Crawl once it has
// finished with a single page.
type result struct {
	// url is the address of the page that was fetched.
	url string
	// urls holds the links extracted from that page.
	urls []string
	// err is the error returned by the fetch, or nil when it succeeded.
	err error
	// depth is the depth value the fetch was started with; Crawl only
	// follows the page's links while it is greater than zero.
	depth int
}
20+
21+
// Crawl uses findLinks to recursively crawl
22+
// pages starting with url, to a maximum of depth.
23+
func Crawl(url string, depth int) {
24+
runtime.GOMAXPROCS(runtime.NumCPU())
25+
results := make(chan *result)
26+
27+
fetch := func(url string, depth int) {
28+
urls, err := findLinks(url)
29+
results <- &result{url, urls, err, depth}
30+
}
31+
32+
go fetch(url, depth)
33+
fetched[url] = true
34+
35+
for fetching := 1; fetching > 0; fetching-- {
36+
res := <-results
37+
if res.err != nil {
38+
// fmt.Println(res.err)
39+
continue
40+
}
41+
42+
fmt.Printf("found: %s\n", res.url)
43+
if res.depth > 0 {
44+
for _, u := range res.urls {
45+
if !fetched[u] {
46+
fetching++
47+
go fetch(u, res.depth-1)
48+
fetched[u] = true
49+
}
50+
}
51+
}
52+
}
53+
close(results)
54+
}
55+
56+
func main() {
57+
fetched = make(map[string]bool)
58+
now := time.Now()
59+
Crawl("http://github.com/aditya43", 2)
60+
fmt.Println("time taken:", time.Since(now))
61+
}
62+
63+
func findLinks(url string) ([]string, error) {
64+
resp, err := http.Get(url)
65+
if err != nil {
66+
return nil, err
67+
}
68+
if resp.StatusCode != http.StatusOK {
69+
resp.Body.Close()
70+
return nil, fmt.Errorf("getting %s: %s", url, resp.Status)
71+
}
72+
doc, err := html.Parse(resp.Body)
73+
resp.Body.Close()
74+
if err != nil {
75+
return nil, fmt.Errorf("parsing %s as HTML: %v", url, err)
76+
}
77+
return visit(nil, doc), nil
78+
}
79+
80+
// visit appends to links each link found in n, and returns the result.
81+
func visit(links []string, n *html.Node) []string {
82+
if n.Type == html.ElementNode && n.Data == "a" {
83+
for _, a := range n.Attr {
84+
if a.Key == "href" {
85+
links = append(links, a.Val)
86+
}
87+
}
88+
}
89+
for c := n.FirstChild; c != nil; c = c.NextSibling {
90+
links = visit(links, c)
91+
}
92+
return links
93+
}

go.mod

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
11
module github.com/aditya43/golang_concurrency
22

33
go 1.16
4+
5+
require golang.org/x/net v0.0.0-20210525063256-abc453219eb5 // indirect

go.sum

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
golang.org/x/net v0.0.0-20210525063256-abc453219eb5 h1:wjuX4b5yYQnEQHzd+CBcrcC6OVR2J1CN6mUy0oSxIPo=
2+
golang.org/x/net v0.0.0-20210525063256-abc453219eb5/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
3+
golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
4+
golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
5+
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
6+
golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
7+
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=

0 commit comments

Comments
 (0)