Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions src/crawlee/_utils/sitemap.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@ def endElement(self, name: str) -> None:
elif name == 'changefreq' and text in VALID_CHANGE_FREQS:
self._current_url['changefreq'] = text

self.current_tag = None
self._current_tag = None

if name == 'url' and 'loc' in self._current_url:
self.items.append({'type': 'url', **self._current_url})
Expand Down Expand Up @@ -156,7 +156,7 @@ async def flush(self) -> AsyncGenerator[_SitemapItem, None]:
url = self._buffer.strip()
if url:
yield {'type': 'url', 'loc': url}
self.buffer = ''
self._buffer = ''

def close(self) -> None:
"""Clean up resources."""
Expand Down
36 changes: 35 additions & 1 deletion tests/unit/_utils/test_sitemap.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,15 @@

from yarl import URL

from crawlee._utils.sitemap import ParseSitemapOptions, Sitemap, SitemapUrl, discover_valid_sitemaps, parse_sitemap
from crawlee._utils.sitemap import (
ParseSitemapOptions,
Sitemap,
SitemapUrl,
_TxtSitemapParser,
_XMLSaxSitemapHandler,
discover_valid_sitemaps,
parse_sitemap,
)
from crawlee.http_clients._base import HttpClient, HttpResponse

BASIC_SITEMAP = """
Expand Down Expand Up @@ -347,6 +355,32 @@ async def test_discover_sitemaps_multiple_domains() -> None:
}


def test_xml_handler_resets_current_tag_on_end_element() -> None:
    """Ending a tracked element clears the handler's current tag, so text arriving between elements is dropped."""
    sax_handler = _XMLSaxSitemapHandler()

    # Open the nested elements down to the <loc> entry, outermost first.
    for tag_name in ('urlset', 'url', 'loc'):
        sax_handler.startElement(tag_name, MagicMock())

    sax_handler.characters('https://example.com/')
    sax_handler.endElement('loc')
    assert sax_handler._current_tag is None

    # Text that shows up after the element was closed must leave the buffer untouched.
    sax_handler.characters(' stray text ')
    assert sax_handler._buffer == 'https://example.com/'


async def test_txt_parser_flush_clears_buffer() -> None:
    """A URL emitted by flush() must not be glued onto data that is fed to the parser afterwards."""
    txt_parser = _TxtSitemapParser()
    collected = []

    # First chunk: one newline-terminated URL plus a trailing one without a newline.
    async for entry in txt_parser.process_chunk('https://a.com/\nhttps://b.com/'):
        collected.append(entry)

    # Flushing emits whatever is still pending in the parser.
    async for entry in txt_parser.flush():
        collected.append(entry)

    # Data fed after the flush must be parsed on its own.
    async for entry in txt_parser.process_chunk('https://c.com/\n'):
        collected.append(entry)

    found_locs = [entry['loc'] for entry in collected]
    assert found_locs == ['https://a.com/', 'https://b.com/', 'https://c.com/']


async def test_discover_sitemap_url_without_host_skipped() -> None:
"""URLs without a host are skipped."""
http_client = _make_mock_client({})
Expand Down
Loading