/
commoncrawl.go
113 lines (96 loc) · 2.79 KB
/
commoncrawl.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
// Package commoncrawl implements a passive subdomain source that queries the Common Crawl URL index.
package commoncrawl
import (
"bufio"
"context"
"fmt"
"net/url"
"strings"
jsoniter "github.com/json-iterator/go"
"github.com/ZhuriLab/Starmap/pkg/subscraping"
)
// indexURL is the endpoint Run fetches first; its JSON response is decoded
// into a list of indexResponse entries.
const indexURL = "https://index.commoncrawl.org/collinfo.json"
// indexResponse is a single entry of the JSON array returned by indexURL.
// Only the two fields used by this package are decoded.
type indexResponse struct {
// ID is the index identifier; Run matches it against the entries of years
// with a substring test.
ID string `json:"id"`
// APIURL is decoded from the "cdx-api" key and is used as the base URL for
// the per-domain lookup in getSubdomains.
APIURL string `json:"cdx-api"`
}
// Source is the passive scraping agent for Common Crawl. It carries no
// state; all per-run inputs are passed to Run.
type Source struct{}
// years lists the year strings Run looks for inside index IDs. For each
// year, the first index whose ID contains the string is queried.
// NOTE(review): the list is hard-coded, so indexes from other years are
// never selected — confirm whether that is intended.
var years = [...]string{"2020", "2019", "2018", "2017"}
// Run returns a channel on which all subdomains found for domain are
// delivered, together with any errors encountered along the way.
//
// The work happens in a separate goroutine: it downloads the list of
// available indexes from indexURL, selects one index per entry in years
// (the first listed index whose ID contains the year), and then queries the
// selected indexes one after another via getSubdomains. The channel is
// closed once all work is done or a query signals that no further indexes
// should be tried.
func (s *Source) Run(ctx context.Context, domain string, session *subscraping.Session) <-chan subscraping.Result {
	results := make(chan subscraping.Result)

	go func() {
		defer close(results)

		resp, err := session.SimpleGet(ctx, indexURL)
		if err != nil {
			results <- subscraping.Result{Source: s.Name(), Type: subscraping.Error, Error: err}
			session.DiscardHTTPResponse(resp)
			return
		}

		var indexes []indexResponse
		err = jsoniter.NewDecoder(resp.Body).Decode(&indexes)
		// The body is not needed any more, whether or not decoding succeeded.
		resp.Body.Close()
		if err != nil {
			results <- subscraping.Result{Source: s.Name(), Type: subscraping.Error, Error: err}
			return
		}

		// Collect the endpoints in a slice rather than a map so that they are
		// queried in the order given by years. Ranging over a map would make
		// the query order (and which indexes are skipped after a failure)
		// vary from run to run.
		apiURLs := make([]string, 0, len(years))
		for _, year := range years {
			for _, index := range indexes {
				if strings.Contains(index.ID, year) {
					apiURLs = append(apiURLs, index.APIURL)
					break
				}
			}
		}

		for _, apiURL := range apiURLs {
			if !s.getSubdomains(ctx, apiURL, domain, session, results) {
				break
			}
		}
	}()

	return results
}
// Name reports the identifier under which this source tags its results.
func (s *Source) Name() string {
	const sourceName = "commoncrawl"
	return sourceName
}
// getSubdomains queries one index endpoint (searchURL) for URLs matching
// "*.<domain>" and sends every subdomain extracted from the response lines
// on results.
//
// It returns false when the caller should stop querying further indexes:
// either ctx is already done or the HTTP request failed (in which case an
// Error result is sent first). It returns true once the response has been
// scanned; a read error during scanning is reported as an Error result but
// does not stop the caller.
func (s *Source) getSubdomains(ctx context.Context, searchURL, domain string, session *subscraping.Session, results chan subscraping.Result) bool {
	// Do not start a new request if the context has already been cancelled.
	select {
	case <-ctx.Done():
		return false
	default:
	}

	headers := map[string]string{"Host": "index.commoncrawl.org"}
	resp, err := session.Get(ctx, fmt.Sprintf("%s?url=*.%s", searchURL, domain), "", headers)
	if err != nil {
		results <- subscraping.Result{Source: s.Name(), Type: subscraping.Error, Error: err}
		session.DiscardHTTPResponse(resp)
		return false
	}
	defer resp.Body.Close()

	scanner := bufio.NewScanner(resp.Body)
	for scanner.Scan() {
		line := scanner.Text()
		if line == "" {
			continue
		}

		// url.QueryUnescape returns an empty string when the input contains a
		// malformed percent escape. Keep the raw line in that case so the
		// record is still searched instead of being silently dropped.
		if unescaped, unescapeErr := url.QueryUnescape(line); unescapeErr == nil {
			line = unescaped
		}

		subdomain := session.Extractor.FindString(line)
		if subdomain == "" {
			continue
		}

		// fix for triple encoded URL: after a single round of unescaping,
		// leftovers of an escape sequence ("25", "2f") may still be glued to
		// the front of the match, so strip them.
		subdomain = strings.ToLower(subdomain)
		subdomain = strings.TrimPrefix(subdomain, "25")
		subdomain = strings.TrimPrefix(subdomain, "2f")

		results <- subscraping.Result{Source: s.Name(), Type: subscraping.Subdomain, Value: subdomain}
	}

	// Scan stops silently on read errors and on over-long lines; surface the
	// cause so that a truncated result set does not go unnoticed.
	if scanErr := scanner.Err(); scanErr != nil {
		results <- subscraping.Result{Source: s.Name(), Type: subscraping.Error, Error: scanErr}
	}

	return true
}