From 8a36fc81f420dd91e0d2addf3211c08de37f28b8 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Tue, 18 Nov 2025 17:23:38 +0100 Subject: [PATCH] Use `top_domain_under_public_suffix` instead of just `domain` when comparing domains --- src/crawlee/crawlers/_basic/_basic_crawler.py | 4 ++-- tests/unit/crawlers/_basic/test_basic_crawler.py | 7 ++++--- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py index 74d2aaff13..2074392972 100644 --- a/src/crawlee/crawlers/_basic/_basic_crawler.py +++ b/src/crawlee/crawlers/_basic/_basic_crawler.py @@ -1043,8 +1043,8 @@ def _check_enqueue_strategy( return target_url.hostname == origin_url.hostname if strategy == 'same-domain': - origin_domain = self._tld_extractor.extract_str(origin_url.hostname).domain - target_domain = self._tld_extractor.extract_str(target_url.hostname).domain + origin_domain = self._tld_extractor.extract_str(origin_url.hostname).top_domain_under_public_suffix + target_domain = self._tld_extractor.extract_str(target_url.hostname).top_domain_under_public_suffix return origin_domain == target_domain if strategy == 'same-origin': diff --git a/tests/unit/crawlers/_basic/test_basic_crawler.py b/tests/unit/crawlers/_basic/test_basic_crawler.py index c7dad2725c..29d7559663 100644 --- a/tests/unit/crawlers/_basic/test_basic_crawler.py +++ b/tests/unit/crawlers/_basic/test_basic_crawler.py @@ -347,6 +347,7 @@ class AddRequestsTestInput: 'https://blog.someplace.com/index.html', 'https://redirect.someplace.com', 'https://other.place.com/index.html', + 'https://someplace.jp/', ) INCLUDE_TEST_URLS = ( @@ -401,7 +402,7 @@ class AddRequestsTestInput: AddRequestsTestInput( start_url=STRATEGY_TEST_URLS[0], loaded_url=STRATEGY_TEST_URLS[0], - requests=STRATEGY_TEST_URLS[:4], + requests=STRATEGY_TEST_URLS, kwargs=EnqueueLinksKwargs(strategy='same-domain'), expected_urls=STRATEGY_TEST_URLS[1:4], ), @@ -411,7 +412,7 @@ class AddRequestsTestInput: AddRequestsTestInput( start_url=STRATEGY_TEST_URLS[0], loaded_url=STRATEGY_TEST_URLS[0], - requests=STRATEGY_TEST_URLS[:4], + requests=STRATEGY_TEST_URLS, kwargs=EnqueueLinksKwargs(strategy='same-hostname'), expected_urls=[STRATEGY_TEST_URLS[1]], ), @@ -421,7 +422,7 @@ class AddRequestsTestInput: AddRequestsTestInput( start_url=STRATEGY_TEST_URLS[0], loaded_url=STRATEGY_TEST_URLS[0], - requests=STRATEGY_TEST_URLS[:4], + requests=STRATEGY_TEST_URLS, kwargs=EnqueueLinksKwargs(strategy='same-origin'), expected_urls=[], ),