From dad3796b8ef6c4dd9f9286cd6c91cdd14e167fc2 Mon Sep 17 00:00:00 2001 From: Jan Buchar Date: Tue, 23 Jul 2024 11:51:50 +0200 Subject: [PATCH] Strip whitespace from href in enqueue_links --- src/crawlee/beautifulsoup_crawler/beautifulsoup_crawler.py | 2 +- src/crawlee/playwright_crawler/playwright_crawler.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/crawlee/beautifulsoup_crawler/beautifulsoup_crawler.py b/src/crawlee/beautifulsoup_crawler/beautifulsoup_crawler.py index c8c28a1d68..4e3da39c72 100644 --- a/src/crawlee/beautifulsoup_crawler/beautifulsoup_crawler.py +++ b/src/crawlee/beautifulsoup_crawler/beautifulsoup_crawler.py @@ -130,7 +130,7 @@ async def enqueue_links( link_user_data.setdefault('label', label) if (href := link.attrs.get('href')) is not None: - requests.append(BaseRequestData.from_url(href, user_data=link_user_data)) + requests.append(BaseRequestData.from_url(href.strip(), user_data=link_user_data)) await context.add_requests(requests, **kwargs) diff --git a/src/crawlee/playwright_crawler/playwright_crawler.py b/src/crawlee/playwright_crawler/playwright_crawler.py index d7cc80e0f5..94463fbcb1 100644 --- a/src/crawlee/playwright_crawler/playwright_crawler.py +++ b/src/crawlee/playwright_crawler/playwright_crawler.py @@ -134,7 +134,7 @@ async def enqueue_links( if label is not None: link_user_data.setdefault('label', label) - request = BaseRequestData.from_url(href, user_data=link_user_data) + request = BaseRequestData.from_url(href.strip(), user_data=link_user_data) requests.append(request) await context.add_requests(requests, **kwargs)