apply request modifiers to initial sitemap fetch
atomicptr committed Dec 28, 2022
1 parent f31c8ef commit 44d283b
Showing 3 changed files with 40 additions and 9 deletions.
pkg/cli/crawl/cmd_sitemap.go (22 additions, 1 deletion)

@@ -32,7 +32,12 @@ var SitemapCommand = &cobra.Command{
 
         sitemapPath := args[0]
 
-        urls, err := sitemap.FetchUrlsFromPath(sitemapPath, &http.Client{Timeout: sitemapCommandFlags.HttpTimeout})
+        client := http.Client{Timeout: sitemapCommandFlags.HttpTimeout}
+
+        sitemapModifiers := crawler.RequestModifier{}
+        applySitemapModifiers(&sitemapModifiers, sitemapCommandFlags)
+
+        urls, err := sitemap.FetchUrlsFromPath(sitemapPath, &client, &sitemapModifiers)
         if err != nil {
             fmt.Printf("Could not read sitemap from %s\n\t%s\n", sitemapPath, err)
             os.Exit(1)
@@ -46,6 +51,22 @@ var SitemapCommand = &cobra.Command{
     },
 }
 
+func applySitemapModifiers(modifier *crawler.RequestModifier, flagOptions crawlerFlagOptions) {
+    modifier.With(addUserAgentToRequest())
+
+    if len(flagOptions.AuthUsername) > 0 || len(flagOptions.AuthPassword) > 0 {
+        modifier.With(addHttpBasicAuthToRequest(flagOptions))
+    }
+
+    if len(flagOptions.CookieStrings) > 0 {
+        modifier.With(addCookiesToRequest(flagOptions))
+    }
+
+    if len(flagOptions.HeaderStrings) > 0 {
+        modifier.With(addHeadersToRequest(flagOptions))
+    }
+}
+
 func init() {
     registerStandardCrawlCommandFlags(SitemapCommand, &sitemapCommandFlags)
 }
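The crawler package itself is untouched by this commit, so the actual shape of RequestModifier is not visible in the diff. Judging only from the With/Do calls above, a minimal sketch of the pattern might look like the following; the ModifierFunc name and signature are assumptions, not the repository's real code.

// Hypothetical sketch of the modifier-chain pattern used above;
// not the actual pkg/crawler implementation.
package crawler

import "net/http"

// ModifierFunc mutates an outgoing request in place (assumed shape).
type ModifierFunc func(*http.Request)

// RequestModifier accumulates request mutations; its zero value is
// usable, which matches the crawler.RequestModifier{} literal above.
type RequestModifier struct {
    modifiers []ModifierFunc
}

// With registers another mutation on the chain.
func (m *RequestModifier) With(fn ModifierFunc) {
    m.modifiers = append(m.modifiers, fn)
}

// Do applies every registered mutation to req, in registration order.
func (m *RequestModifier) Do(req *http.Request) {
    for _, fn := range m.modifiers {
        fn(req)
    }
}

Under that shape, each helper such as addUserAgentToRequest() would return a ModifierFunc, which lines up with how applySitemapModifiers chains them.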
pkg/cli/tools/cmd_convert_sitemap_to_urllist.go (1 addition, 1 deletion)

@@ -26,7 +26,7 @@ var ConvertSitemapToUrllistCommand = &cobra.Command{
 
         sitemapPath := args[0]
 
-        urls, err := sitemap.FetchUrlsFromPath(sitemapPath, &http.Client{Timeout: 30 * time.Second})
+        urls, err := sitemap.FetchUrlsFromPath(sitemapPath, &http.Client{Timeout: 30 * time.Second}, nil)
         if err != nil {
             fmt.Printf("Could not read sitemap from %s\n\t%s\n", sitemapPath, err)
             os.Exit(1)
pkg/sitemap/sitemap.go (17 additions, 7 deletions)

@@ -1,6 +1,7 @@
 package sitemap
 
 import (
+    "github.com/atomicptr/crab/pkg/crawler"
     "github.com/beevik/etree"
     "github.com/pkg/errors"
     "io"
@@ -9,10 +10,10 @@ import (
     "strings"
 )
 
-func FetchUrlsFromPath(path string, client *http.Client) ([]string, error) {
+func FetchUrlsFromPath(path string, client *http.Client, modifier *crawler.RequestModifier) ([]string, error) {
     var urls []string
 
-    xmlDataBlob, err := fetchXml(path, client)
+    xmlDataBlob, err := fetchXml(path, client, modifier)
     if err != nil {
         return nil, err
     }
@@ -29,7 +30,7 @@ func FetchUrlsFromPath(path string, client *http.Client) ([]string, error) {
     for _, sitemap := range sitemapIndex.ChildElements() {
         loc := sitemap.FindElement("loc")
         if loc != nil {
-            sitemapUrls, err := FetchUrlsFromPath(loc.Text(), client)
+            sitemapUrls, err := FetchUrlsFromPath(loc.Text(), client, modifier)
             if err != nil {
                 return nil, err
             }
@@ -54,15 +55,24 @@ func FetchUrlsFromPath(path string, client *http.Client) ([]string, error) {
     return urls, nil
 }
 
-func fetchXml(path string, client *http.Client) (io.Reader, error) {
+func fetchXml(path string, client *http.Client, modifier *crawler.RequestModifier) (io.Reader, error) {
     if strings.HasPrefix(path, "http") {
-        return fetchXmlFromWeb(path, client)
+        return fetchXmlFromWeb(path, client, modifier)
     }
     return fetchXmlFromFile(path)
 }
 
-func fetchXmlFromWeb(path string, client *http.Client) (io.Reader, error) {
-    resp, err := client.Get(path)
+func fetchXmlFromWeb(path string, client *http.Client, modifier *crawler.RequestModifier) (io.Reader, error) {
+    req, err := http.NewRequest("GET", path, nil)
+    if err != nil {
+        return nil, err
+    }
+
+    if modifier != nil {
+        modifier.Do(req)
+    }
+
+    resp, err := client.Do(req)
     if err != nil {
         return nil, err
     }
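The two commands above exercise both paths of the new parameter: the crawl sitemap command passes a populated modifier, while the sitemap-to-urllist tool passes nil, which the if modifier != nil guard reduces to a plain unmodified GET. Here is a usage sketch under the assumed ModifierFunc shape from earlier; the URL, User-Agent value, and 30-second timeout are illustrative, not taken from the repository.

// Usage sketch; assumes the ModifierFunc shape guessed at above.
package main

import (
    "fmt"
    "log"
    "net/http"
    "time"

    "github.com/atomicptr/crab/pkg/crawler"
    "github.com/atomicptr/crab/pkg/sitemap"
)

func main() {
    client := &http.Client{Timeout: 30 * time.Second}

    mod := crawler.RequestModifier{}
    mod.With(func(req *http.Request) {
        // Illustrative header; the real commands build their
        // chain from CLI flags via applySitemapModifiers.
        req.Header.Set("User-Agent", "crab-sitemap-fetch")
    })

    // Modified fetch: the modifier is threaded through the recursive
    // FetchUrlsFromPath call, so nested sitemap-index fetches get it too.
    urls, err := sitemap.FetchUrlsFromPath("https://example.com/sitemap.xml", client, &mod)
    if err != nil {
        log.Fatal(err)
    }
    fmt.Println(len(urls), "urls found")

    // Unmodified fetch: passing nil skips modifier.Do entirely.
    _, _ = sitemap.FetchUrlsFromPath("https://example.com/sitemap.xml", client, nil)
}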