Skip to content

Commit

Permalink
add tool to convert sitemaps into url lists, closes #5
Browse files Browse the repository at this point in the history
  • Loading branch information
atomicptr committed Apr 30, 2020
1 parent 7f5d871 commit a3d1c31
Show file tree
Hide file tree
Showing 3 changed files with 114 additions and 0 deletions.
3 changes: 3 additions & 0 deletions pkg/cli/root.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package cli

import (
"github.com/atomicptr/crab/pkg/cli/crawl"
"github.com/atomicptr/crab/pkg/cli/tools"
"github.com/atomicptr/crab/pkg/meta"
"github.com/spf13/cobra"
)
Expand All @@ -16,4 +17,6 @@ func init() {
rootCommand.AddCommand(crawl.Command)
rootCommand.AddCommand(crawl.SitemapCommand)
rootCommand.AddCommand(crawl.ListCommand)

rootCommand.AddCommand(tools.ConvertSitemapToUrllistCommand)
}
78 changes: 78 additions & 0 deletions pkg/cli/tools/cmd_convert_sitemap_to_urllist.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
package tools

import (
"fmt"
"github.com/atomicptr/crab/pkg/sitemap"
"github.com/spf13/cobra"
"net/http"
"net/url"
"os"
"time"
)

// flagRemoveBaseUrl is bound to the --remove-base-url flag. When set, the
// command passes the URL list through removeBaseUrls before printing it.
// It starts out false (the bool zero value).
var flagRemoveBaseUrl bool

// ConvertSitemapToUrllistCommand is the cobra command registered as
// "tools:convert-sitemap-to-urllist".
//
// It requires exactly one positional argument, which is handed to
// sitemap.FetchUrlsFromPath together with an HTTP client that has a 30 second
// timeout. The resulting URLs are printed to stdout, one per line. When the
// --remove-base-url flag is set, the list is passed through removeBaseUrls
// first.
//
// Diagnostics are written to stderr so that stdout carries nothing but the
// URL list and stays safe to redirect or pipe. A wrong argument count or a
// fetch failure terminates the process with exit status 1.
var ConvertSitemapToUrllistCommand = &cobra.Command{
	Use:   "tools:convert-sitemap-to-urllist [sitemapPath]",
	Short: "Convert a sitemap to an url list and print it to stdout",
	Run: func(cmd *cobra.Command, args []string) {
		if len(args) != 1 {
			fmt.Fprintln(os.Stderr, "You have to specify exactly one url or file path to a sitemap xml\n"+
				"\tUsage: crab tools:convert-sitemap-to-urllist https://domain.com/sitemap.xml")
			os.Exit(1)
		}

		sitemapPath := args[0]

		urls, err := sitemap.FetchUrlsFromPath(sitemapPath, &http.Client{Timeout: 30 * time.Second})
		if err != nil {
			fmt.Fprintf(os.Stderr, "Could not read sitemap from %s\n\t%s\n", sitemapPath, err)
			os.Exit(1)
		}

		if flagRemoveBaseUrl {
			urls = removeBaseUrls(urls)
		}

		// The loop variable is deliberately not called "url": that would
		// shadow the imported net/url package inside the loop body.
		for _, entry := range urls {
			fmt.Println(entry)
		}
	},
}

// removeBaseUrls strips the base (scheme, user info and host) from every
// entry of urls and returns the remaining path, query and fragment.
//
// The returned slice always has the same length and order as urls:
//   - An entry that cannot be parsed as a URL is kept unchanged, so it is
//     neither dropped nor replaced by an empty string.
//   - Path and fragment are emitted in their escaped form, so percent-encoded
//     input such as "/a%20b" stays a valid URL instead of becoming "/a b".
//   - A URL that has a host but no path (e.g. "https://domain.com") yields
//     "/" rather than an empty string.
//
// Requires Go 1.15 or newer for (*url.URL).EscapedFragment.
func removeBaseUrls(urls []string) []string {
	newUrls := make([]string, len(urls))

	for i, oldUrl := range urls {
		u, err := url.Parse(oldUrl)
		if err != nil {
			// Keep the entry as-is: silently emitting "" would produce a
			// blank line in the output and hide the offending URL.
			newUrls[i] = oldUrl
			continue
		}

		// EscapedPath keeps the percent-encoding; u.Path is the decoded form.
		path := u.EscapedPath()
		if path == "" && u.Host != "" {
			// "https://domain.com" refers to the site root.
			path = "/"
		}

		query := u.RawQuery
		if query != "" {
			query = "?" + query
		}

		fragment := u.EscapedFragment()
		if fragment != "" {
			fragment = "#" + fragment
		}

		newUrls[i] = path + query + fragment
	}

	return newUrls
}

// init registers the --remove-base-url flag (no shorthand, default false) as
// a persistent flag on ConvertSitemapToUrllistCommand and binds it to
// flagRemoveBaseUrl.
func init() {
	flags := ConvertSitemapToUrllistCommand.PersistentFlags()
	flags.BoolVar(&flagRemoveBaseUrl, "remove-base-url", false, "remove base url from urls")
}
33 changes: 33 additions & 0 deletions pkg/cli/tools/cmd_convert_sitemap_to_urllist_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
package tools

import (
"github.com/stretchr/testify/assert"
"testing"
)

// TestRemoveBaseUrl feeds a set of absolute URLs through removeBaseUrls in a
// single call and checks that each result equals the expected path, query and
// fragment of the corresponding input.
func TestRemoveBaseUrl(t *testing.T) {
	// Maps each absolute input URL to the output expected for it.
	cases := map[string]string{
		"https://domain.com/test-url":                              "/test-url",
		"https://domain.com/test-url/":                             "/test-url/",
		"https://domain.com/test-url/test":                         "/test-url/test",
		"https://domain.com/test-url/test#test1234":                "/test-url/test#test1234",
		"https://domain.com/test-url/test?x=1234":                  "/test-url/test?x=1234",
		"https://domain.com/test-url/test?x=1234&y=12345":          "/test-url/test?x=1234&y=12345",
		"https://domain.com/test-url/test?x=1234&z=/test/asdf#yay": "/test-url/test?x=1234&z=/test/asdf#yay",
	}

	// Map iteration order is random, so remember the order actually used;
	// results are matched back to their input by index.
	inputs := make([]string, 0, len(cases))
	for in := range cases {
		inputs = append(inputs, in)
	}

	outputs := removeBaseUrls(inputs)

	for idx := range outputs {
		assert.Equal(t, cases[inputs[idx]], outputs[idx])
	}
}

0 comments on commit a3d1c31

Please sign in to comment.