diff --git a/src/crawlee/browsers/_playwright_browser_controller.py b/src/crawlee/browsers/_playwright_browser_controller.py index 68c6167119..9f04bb2861 100644 --- a/src/crawlee/browsers/_playwright_browser_controller.py +++ b/src/crawlee/browsers/_playwright_browser_controller.py @@ -2,6 +2,7 @@ from __future__ import annotations +from asyncio import Lock from datetime import datetime, timedelta, timezone from typing import TYPE_CHECKING, Any, cast @@ -77,6 +78,19 @@ def __init__( self._total_opened_pages = 0 + self._context_creation_lock: Lock | None = None + + async def _get_context_creation_lock(self) -> Lock: + """Get context checking and creation lock. + + It should be done with lock to prevent multiple concurrent attempts to create context, which could lead to + memory leak as one of the two concurrently created contexts will become orphaned and not properly closed. + """ + if self._context_creation_lock: + return self._context_creation_lock + self._context_creation_lock = Lock() + return self._context_creation_lock + @property @override def pages(self) -> list[Page]: @@ -137,12 +151,6 @@ async def new_page( Raises: ValueError: If the browser has reached the maximum number of open pages. """ - if not self._browser_context: - self._browser_context = await self._create_browser_context( - browser_new_context_options=browser_new_context_options, - proxy_info=proxy_info, - ) - if not self.has_free_capacity: raise ValueError('Cannot open more pages in this browser.') @@ -154,11 +162,12 @@ async def new_page( ) page = await new_context.new_page() else: - if not self._browser_context: - self._browser_context = await self._create_browser_context( - browser_new_context_options=browser_new_context_options, - proxy_info=proxy_info, - ) + async with await self._get_context_creation_lock(): + if not self._browser_context: + self._browser_context = await self._create_browser_context( + browser_new_context_options=browser_new_context_options, + proxy_info=proxy_info, + ) page = await self._browser_context.new_page() # Handle page close event @@ -169,7 +178,6 @@ async def new_page( self._last_page_opened_at = datetime.now(timezone.utc) self._total_opened_pages += 1 - return page @override @@ -206,7 +214,6 @@ async def _create_browser_context( `self._fingerprint_generator` is available. """ browser_new_context_options = dict(browser_new_context_options) if browser_new_context_options else {} - if proxy_info: if browser_new_context_options.get('proxy'): logger.warning("browser_new_context_options['proxy'] overriden by explicit `proxy_info` argument.") @@ -244,5 +251,4 @@ async def _create_browser_context( browser_new_context_options['extra_http_headers'] = browser_new_context_options.get( 'extra_http_headers', extra_http_headers ) - return await self._browser.new_context(**browser_new_context_options) diff --git a/tests/unit/browsers/test_playwright_browser_controller.py b/tests/unit/browsers/test_playwright_browser_controller.py index de1bb0ffcb..e7b8d517ff 100644 --- a/tests/unit/browsers/test_playwright_browser_controller.py +++ b/tests/unit/browsers/test_playwright_browser_controller.py @@ -2,12 +2,13 @@ import asyncio from datetime import datetime, timedelta, timezone -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Any +from unittest.mock import AsyncMock import pytest from playwright.async_api import Browser, Playwright, async_playwright -from crawlee.browsers import PlaywrightBrowserController +from crawlee.browsers import PlaywrightBrowserController, PlaywrightPersistentBrowser if TYPE_CHECKING: from collections.abc import AsyncGenerator @@ -106,3 +107,28 @@ async def test_close_browser_with_open_pages(browser: Browser) -> None: assert controller.pages_count == 0 assert not controller.is_browser_connected + + +async def test_memory_leak_on_concurrent_context_creation() -> None: + """Test that only one browser context is created when multiple pages are opened concurrently.""" + + # Prepare mocked browser with relevant methods and attributes + mocked_browser = AsyncMock() + mocked_context_launcher = AsyncMock() + + async def delayed_launch_persistent_context(*args: Any, **kwargs: Any) -> Any: + """Ensure that both calls to create context overlap in time.""" + await asyncio.sleep(5) # Simulate delay in creation to make sure race condition happens + return await mocked_context_launcher(*args, **kwargs) + + mocked_browser.launch_persistent_context = delayed_launch_persistent_context + + # Create minimal instance of PlaywrightBrowserController with mocked browser + controller = PlaywrightBrowserController( + PlaywrightPersistentBrowser(mocked_browser, None, {}), header_generator=None, fingerprint_generator=None + ) + + # Both calls will try to create browser context at the same time, but only one context should be created. + await asyncio.gather(controller.new_page(), controller.new_page()) + + assert mocked_context_launcher.call_count == 1