From fdaf1c1229452184944b3dacc6a09be2ce0cf324 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Wed, 1 Oct 2025 09:13:33 +0200 Subject: [PATCH 1/6] Multiple attempts to create context, add lock --- .../_playwright_browser_controller.py | 24 +++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/src/crawlee/browsers/_playwright_browser_controller.py b/src/crawlee/browsers/_playwright_browser_controller.py index 68c6167119..d4ce4ac64f 100644 --- a/src/crawlee/browsers/_playwright_browser_controller.py +++ b/src/crawlee/browsers/_playwright_browser_controller.py @@ -2,6 +2,7 @@ from __future__ import annotations +from asyncio import Lock from datetime import datetime, timedelta, timezone from typing import TYPE_CHECKING, Any, cast @@ -77,6 +78,29 @@ def __init__( self._total_opened_pages = 0 + self._context_creation_lock: Lock | None = None + + + async def _get_context_creation_lock(self) -> Lock: + """Context checking and creation should be done with lock to prevent multiple attempts to create context.""" + if self._context_creation_lock: + return self._context_creation_lock + self._context_creation_lock= Lock() + return self._context_creation_lock + + async def ensure_browser_context(self, + browser_new_context_options: Mapping[str, Any] | None = None, + proxy_info: ProxyInfo | None = None, + ) -> None: + async with await self._get_context_creation_lock(): + if not self._browser_context: + self._browser_context = await self._create_browser_context( + browser_new_context_options=browser_new_context_options, + proxy_info=proxy_info, + ) + + + @property @override def pages(self) -> list[Page]: From f2e1b6f77e603b5f82352d8da10ef81a0a1c2ff9 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Wed, 1 Oct 2025 09:37:21 +0200 Subject: [PATCH 2/6] Add some extra logs --- .../_playwright_browser_controller.py | 25 ++++++------------- 1 file changed, 8 insertions(+), 17 deletions(-) diff --git a/src/crawlee/browsers/_playwright_browser_controller.py b/src/crawlee/browsers/_playwright_browser_controller.py index d4ce4ac64f..54be49b769 100644 --- a/src/crawlee/browsers/_playwright_browser_controller.py +++ b/src/crawlee/browsers/_playwright_browser_controller.py @@ -88,7 +88,7 @@ async def _get_context_creation_lock(self) -> Lock: self._context_creation_lock= Lock() return self._context_creation_lock - async def ensure_browser_context(self, + async def _ensure_browser_context(self, browser_new_context_options: Mapping[str, Any] | None = None, proxy_info: ProxyInfo | None = None, ) -> None: @@ -99,8 +99,6 @@ async def ensure_browser_context(self, proxy_info=proxy_info, ) - - @property @override def pages(self) -> list[Page]: @@ -161,12 +159,6 @@ async def new_page( Raises: ValueError: If the browser has reached the maximum number of open pages. """ - if not self._browser_context: - self._browser_context = await self._create_browser_context( - browser_new_context_options=browser_new_context_options, - proxy_info=proxy_info, - ) - if not self.has_free_capacity: raise ValueError('Cannot open more pages in this browser.') @@ -178,11 +170,7 @@ async def new_page( ) page = await new_context.new_page() else: - if not self._browser_context: - self._browser_context = await self._create_browser_context( - browser_new_context_options=browser_new_context_options, - proxy_info=proxy_info, - ) + await self._ensure_browser_context() page = await self._browser_context.new_page() # Handle page close event @@ -229,8 +217,8 @@ async def _create_browser_context( Create context without headers and without fingerprints if neither `self._header_generator` nor `self._fingerprint_generator` is available. """ + start = datetime.now(timezone.utc) browser_new_context_options = dict(browser_new_context_options) if browser_new_context_options else {} - if proxy_info: if browser_new_context_options.get('proxy'): logger.warning("browser_new_context_options['proxy'] overriden by explicit `proxy_info` argument.") @@ -242,11 +230,14 @@ async def _create_browser_context( ) if self._fingerprint_generator: - return await AsyncNewContext( + + c = await AsyncNewContext( browser=self._browser, fingerprint=self._fingerprint_generator.generate(), **browser_new_context_options, ) + logger.warning(f"Fingerprint generation time [s]: {(datetime.now(timezone.utc) - start).total_seconds()}") + return c if self._header_generator: extra_http_headers = dict( @@ -268,5 +259,5 @@ async def _create_browser_context( browser_new_context_options['extra_http_headers'] = browser_new_context_options.get( 'extra_http_headers', extra_http_headers ) - + logger.warning(f"Fingerprint generation time [s]: {(datetime.now(timezone.utc)-start).total_seconds()}") return await self._browser.new_context(**browser_new_context_options) From 6f182b010a0c04563d3d479cc6770cc89661ab85 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Wed, 1 Oct 2025 10:18:49 +0200 Subject: [PATCH 3/6] Add fix. TODO: Figure out test --- .../_playwright_browser_controller.py | 32 +++++++++---------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/src/crawlee/browsers/_playwright_browser_controller.py b/src/crawlee/browsers/_playwright_browser_controller.py index 54be49b769..607a330feb 100644 --- a/src/crawlee/browsers/_playwright_browser_controller.py +++ b/src/crawlee/browsers/_playwright_browser_controller.py @@ -80,24 +80,30 @@ def __init__( self._context_creation_lock: Lock | None = None - async def _get_context_creation_lock(self) -> Lock: - """Context checking and creation should be done with lock to prevent multiple attempts to create context.""" + """Get context checking and creation lock. + + It should be done with lock to prevent multiple concurrent attempts to create context, which could lead to + memory leak as one of the two concurrently created contexts will become orphaned and not properly closed. + """ if self._context_creation_lock: return self._context_creation_lock - self._context_creation_lock= Lock() + self._context_creation_lock = Lock() return self._context_creation_lock - async def _ensure_browser_context(self, - browser_new_context_options: Mapping[str, Any] | None = None, - proxy_info: ProxyInfo | None = None, - ) -> None: + async def _get_browser_context( + self, + browser_new_context_options: Mapping[str, Any] | None = None, + proxy_info: ProxyInfo | None = None, + ) -> BrowserContext: + """Ensure that the browser context is set.""" async with await self._get_context_creation_lock(): if not self._browser_context: self._browser_context = await self._create_browser_context( browser_new_context_options=browser_new_context_options, proxy_info=proxy_info, ) + return self._browser_context @property @override @@ -170,8 +176,8 @@ async def new_page( ) page = await new_context.new_page() else: - await self._ensure_browser_context() - page = await self._browser_context.new_page() + _browser_context = await self._get_browser_context() + page = await _browser_context.new_page() # Handle page close event page.on(event='close', f=self._on_page_close) @@ -181,7 +187,6 @@ async def new_page( self._last_page_opened_at = datetime.now(timezone.utc) self._total_opened_pages += 1 - return page @override @@ -217,7 +222,6 @@ async def _create_browser_context( Create context without headers and without fingerprints if neither `self._header_generator` nor `self._fingerprint_generator` is available. """ - start = datetime.now(timezone.utc) browser_new_context_options = dict(browser_new_context_options) if browser_new_context_options else {} if proxy_info: if browser_new_context_options.get('proxy'): @@ -230,14 +234,11 @@ async def _create_browser_context( ) if self._fingerprint_generator: - - c = await AsyncNewContext( + return await AsyncNewContext( browser=self._browser, fingerprint=self._fingerprint_generator.generate(), **browser_new_context_options, ) - logger.warning(f"Fingerprint generation time [s]: {(datetime.now(timezone.utc) - start).total_seconds()}") - return c if self._header_generator: extra_http_headers = dict( @@ -259,5 +260,4 @@ async def _create_browser_context( browser_new_context_options['extra_http_headers'] = browser_new_context_options.get( 'extra_http_headers', extra_http_headers ) - logger.warning(f"Fingerprint generation time [s]: {(datetime.now(timezone.utc)-start).total_seconds()}") return await self._browser.new_context(**browser_new_context_options) From 5f8baea07c292ff04eec900df2dace5e9d1a390e Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Wed, 1 Oct 2025 13:20:50 +0200 Subject: [PATCH 4/6] Add test --- .../test_playwright_browser_controller.py | 31 +++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/tests/unit/browsers/test_playwright_browser_controller.py b/tests/unit/browsers/test_playwright_browser_controller.py index de1bb0ffcb..1509f4b29d 100644 --- a/tests/unit/browsers/test_playwright_browser_controller.py +++ b/tests/unit/browsers/test_playwright_browser_controller.py @@ -2,12 +2,14 @@ import asyncio from datetime import datetime, timedelta, timezone -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Any +from unittest import mock +from unittest.mock import AsyncMock import pytest from playwright.async_api import Browser, Playwright, async_playwright -from crawlee.browsers import PlaywrightBrowserController +from crawlee.browsers import PlaywrightBrowserController, PlaywrightBrowserPlugin, PlaywrightPersistentBrowser if TYPE_CHECKING: from collections.abc import AsyncGenerator @@ -106,3 +108,28 @@ async def test_close_browser_with_open_pages(browser: Browser) -> None: assert controller.pages_count == 0 assert not controller.is_browser_connected + + +async def test_memory_leak_on_concurrent_context_creation() -> None: + """Test that only one browser context is created when multiple pages are opened concurrently.""" + + # Prepare mocked browser with relevant methods and attributes + mocked_browser = AsyncMock() + mocked_context_launcher = AsyncMock() + async def delayed_launch_persistent_context(*args: Any, **kwargs: Any) -> AsyncMock: + """Ensure that both calls to create context overlap in time.""" + await asyncio.sleep(5) # Simulate delay in creation to make sure race condition happens + return await mocked_context_launcher(*args, **kwargs) + mocked_browser.launch_persistent_context = delayed_launch_persistent_context + + # Create minimal instance of PlaywrightBrowserController with mocked browser + controller = PlaywrightBrowserController( + PlaywrightPersistentBrowser(mocked_browser,None, {}), + header_generator=None, + fingerprint_generator=None + ) + + # Both calls will try to create browser context at the same time, but only one context should be created. + await asyncio.gather(controller.new_page(), controller.new_page()) + + assert mocked_context_launcher.call_count == 1 From 0189190a8957a6b46e7f15b7deadd48eed8d8364 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Wed, 1 Oct 2025 13:22:24 +0200 Subject: [PATCH 5/6] Format --- .../browsers/test_playwright_browser_controller.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/tests/unit/browsers/test_playwright_browser_controller.py b/tests/unit/browsers/test_playwright_browser_controller.py index 1509f4b29d..e7b8d517ff 100644 --- a/tests/unit/browsers/test_playwright_browser_controller.py +++ b/tests/unit/browsers/test_playwright_browser_controller.py @@ -3,13 +3,12 @@ import asyncio from datetime import datetime, timedelta, timezone from typing import TYPE_CHECKING, Any -from unittest import mock from unittest.mock import AsyncMock import pytest from playwright.async_api import Browser, Playwright, async_playwright -from crawlee.browsers import PlaywrightBrowserController, PlaywrightBrowserPlugin, PlaywrightPersistentBrowser +from crawlee.browsers import PlaywrightBrowserController, PlaywrightPersistentBrowser if TYPE_CHECKING: from collections.abc import AsyncGenerator @@ -116,17 +115,17 @@ async def test_memory_leak_on_concurrent_context_creation() -> None: # Prepare mocked browser with relevant methods and attributes mocked_browser = AsyncMock() mocked_context_launcher = AsyncMock() - async def delayed_launch_persistent_context(*args: Any, **kwargs: Any) -> AsyncMock: + + async def delayed_launch_persistent_context(*args: Any, **kwargs: Any) -> Any: """Ensure that both calls to create context overlap in time.""" await asyncio.sleep(5) # Simulate delay in creation to make sure race condition happens return await mocked_context_launcher(*args, **kwargs) + mocked_browser.launch_persistent_context = delayed_launch_persistent_context # Create minimal instance of PlaywrightBrowserController with mocked browser controller = PlaywrightBrowserController( - PlaywrightPersistentBrowser(mocked_browser,None, {}), - header_generator=None, - fingerprint_generator=None + PlaywrightPersistentBrowser(mocked_browser, None, {}), header_generator=None, fingerprint_generator=None ) # Both calls will try to create browser context at the same time, but only one context should be created. From ef063a1dd19ce60a5a4644c799078c12df18d5d7 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Wed, 1 Oct 2025 13:37:07 +0200 Subject: [PATCH 6/6] Fix failing ut --- .../_playwright_browser_controller.py | 23 ++++++------------- 1 file changed, 7 insertions(+), 16 deletions(-) diff --git a/src/crawlee/browsers/_playwright_browser_controller.py b/src/crawlee/browsers/_playwright_browser_controller.py index 607a330feb..9f04bb2861 100644 --- a/src/crawlee/browsers/_playwright_browser_controller.py +++ b/src/crawlee/browsers/_playwright_browser_controller.py @@ -91,20 +91,6 @@ async def _get_context_creation_lock(self) -> Lock: self._context_creation_lock = Lock() return self._context_creation_lock - async def _get_browser_context( - self, - browser_new_context_options: Mapping[str, Any] | None = None, - proxy_info: ProxyInfo | None = None, - ) -> BrowserContext: - """Ensure that the browser context is set.""" - async with await self._get_context_creation_lock(): - if not self._browser_context: - self._browser_context = await self._create_browser_context( - browser_new_context_options=browser_new_context_options, - proxy_info=proxy_info, - ) - return self._browser_context - @property @override def pages(self) -> list[Page]: @@ -176,8 +162,13 @@ async def new_page( ) page = await new_context.new_page() else: - _browser_context = await self._get_browser_context() - page = await _browser_context.new_page() + async with await self._get_context_creation_lock(): + if not self._browser_context: + self._browser_context = await self._create_browser_context( + browser_new_context_options=browser_new_context_options, + proxy_info=proxy_info, + ) + page = await self._browser_context.new_page() # Handle page close event page.on(event='close', f=self._on_page_close)