diff --git a/src/crawlee/_utils/sitemap.py b/src/crawlee/_utils/sitemap.py index e6930fc195..023897e5d9 100644 --- a/src/crawlee/_utils/sitemap.py +++ b/src/crawlee/_utils/sitemap.py @@ -122,7 +122,7 @@ def endElement(self, name: str) -> None: elif name == 'changefreq' and text in VALID_CHANGE_FREQS: self._current_url['changefreq'] = text - self.current_tag = None + self._current_tag = None if name == 'url' and 'loc' in self._current_url: self.items.append({'type': 'url', **self._current_url}) @@ -156,7 +156,7 @@ async def flush(self) -> AsyncGenerator[_SitemapItem, None]: url = self._buffer.strip() if url: yield {'type': 'url', 'loc': url} - self.buffer = '' + self._buffer = '' def close(self) -> None: """Clean up resources.""" diff --git a/tests/unit/_utils/test_sitemap.py b/tests/unit/_utils/test_sitemap.py index 9e6fd795b3..a450ff1262 100644 --- a/tests/unit/_utils/test_sitemap.py +++ b/tests/unit/_utils/test_sitemap.py @@ -6,7 +6,15 @@ from yarl import URL -from crawlee._utils.sitemap import ParseSitemapOptions, Sitemap, SitemapUrl, discover_valid_sitemaps, parse_sitemap +from crawlee._utils.sitemap import ( + ParseSitemapOptions, + Sitemap, + SitemapUrl, + _TxtSitemapParser, + _XMLSaxSitemapHandler, + discover_valid_sitemaps, + parse_sitemap, +) from crawlee.http_clients._base import HttpClient, HttpResponse BASIC_SITEMAP = """ @@ -347,6 +355,32 @@ async def test_discover_sitemaps_multiple_domains() -> None: } +def test_xml_handler_resets_current_tag_on_end_element() -> None: + """Closing a tracked tag resets the handler's current tag so stray text between elements is ignored.""" + handler = _XMLSaxSitemapHandler() + handler.startElement('urlset', MagicMock()) + handler.startElement('url', MagicMock()) + handler.startElement('loc', MagicMock()) + handler.characters('https://example.com/') + handler.endElement('loc') + + assert handler._current_tag is None + + # Stray text between elements must not be buffered. + handler.characters(' stray text ') + assert handler._buffer == 'https://example.com/' + + +async def test_txt_parser_flush_clears_buffer() -> None: + """Feeding more data after flush() must not concatenate the previously flushed URL.""" + parser = _TxtSitemapParser() + items = [item async for item in parser.process_chunk('https://a.com/\nhttps://b.com/')] + items += [item async for item in parser.flush()] + items += [item async for item in parser.process_chunk('https://c.com/\n')] + + assert [item['loc'] for item in items] == ['https://a.com/', 'https://b.com/', 'https://c.com/'] + + async def test_discover_sitemap_url_without_host_skipped() -> None: """URLs without a host are skipped.""" http_client = _make_mock_client({})