Skip to content

Commit

Permalink
Don't require space after "Sitemap:"
Browse files Browse the repository at this point in the history
Fixes #9.
  • Loading branch information
pypt committed Jul 15, 2019
1 parent 29f3521 commit b40b3c3
Show file tree
Hide file tree
Showing 2 changed files with 52 additions and 1 deletion.
51 changes: 51 additions & 0 deletions tests/test_tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -779,3 +779,54 @@ def test_sitemap_tree_for_homepage_huge_sitemap(self):
actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL)

assert len(actual_sitemap_tree.all_pages()) == page_count

def test_sitemap_tree_for_homepage_robots_txt_weird_spacing(self):
    """Test sitemap_tree_for_homepage() with weird (but valid) spacing.

    The mocked robots.txt carries a "Sitemap:" line with a leading space,
    no space after the colon and a trailing space after the URL. The one
    page listed in the referenced sitemap must still be found.
    """
    base_url = self.TEST_BASE_URL

    # Extra space before "Sitemap:", no space after "Sitemap:", and extra space after sitemap URL
    robots_txt = "".join([
        "User-agent: *\n",
        " Sitemap:{base_url}/sitemap.xml ".format(base_url=base_url),
    ])

    # The template is indented only for readability: dedent() removes the
    # shared margin and strip() drops the surrounding blank lines.
    sitemap_template = """
        <?xml version="1.0" encoding="UTF-8"?>
        <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
        xmlns:news="http://www.google.com/schemas/sitemap-news/0.9">
        <url>
        <loc>{base_url}/news/first.html</loc>
        <news:news>
        <news:publication>
        <news:name>{publication_name}</news:name>
        <news:language>{publication_language}</news:language>
        </news:publication>
        <news:publication_date>{publication_date}</news:publication_date>
        <news:title>First story</news:title>
        </news:news>
        </url>
        </urlset>
    """
    sitemap_xml = textwrap.dedent(sitemap_template.format(
        base_url=base_url,
        publication_name=self.TEST_PUBLICATION_NAME,
        publication_language=self.TEST_PUBLICATION_LANGUAGE,
        publication_date=self.TEST_DATE_STR,
    )).strip()

    # Mock the homepage, robots.txt and the sitemap it points to.
    httpretty.register_uri(httpretty.GET, base_url + '/', body='This is a homepage.')
    httpretty.register_uri(
        httpretty.GET,
        base_url + '/robots.txt',
        adding_headers={'Content-Type': 'text/plain'},
        body=robots_txt,
    )
    httpretty.register_uri(httpretty.GET, base_url + '/sitemap.xml', body=sitemap_xml)

    tree = sitemap_tree_for_homepage(homepage_url=base_url)
    assert len(tree.all_pages()) == 1
2 changes: 1 addition & 1 deletion usp/fetchers.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,7 @@ def sitemap(self) -> AbstractSitemap:
robots_txt_line = robots_txt_line.strip()
# robots.txt is supposed to be case sensitive but who cares in these Node.js times?
robots_txt_line = robots_txt_line.lower()
sitemap_match = re.search(r'^sitemap: (.+?)$', robots_txt_line, flags=re.IGNORECASE)
sitemap_match = re.search(r'^sitemap:\s*(.+?)$', robots_txt_line, flags=re.IGNORECASE)
if sitemap_match:
sitemap_url = sitemap_match.group(1)
if is_http_url(sitemap_url):
Expand Down

0 comments on commit b40b3c3

Please sign in to comment.