Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 20 additions & 14 deletions src/crawlee/browsers/_playwright_browser_controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

from __future__ import annotations

from asyncio import Lock
from datetime import datetime, timedelta, timezone
from typing import TYPE_CHECKING, Any, cast

Expand Down Expand Up @@ -77,6 +78,19 @@ def __init__(

self._total_opened_pages = 0

self._context_creation_lock: Lock | None = None

async def _get_context_creation_lock(self) -> Lock:
"""Get context checking and creation lock.

It should be done with lock to prevent multiple concurrent attempts to create context, which could lead to
memory leak as one of the two concurrently created contexts will become orphaned and not properly closed.
"""
if self._context_creation_lock:
return self._context_creation_lock
self._context_creation_lock = Lock()
return self._context_creation_lock

@property
@override
def pages(self) -> list[Page]:
Expand Down Expand Up @@ -137,12 +151,6 @@ async def new_page(
Raises:
ValueError: If the browser has reached the maximum number of open pages.
"""
if not self._browser_context:
self._browser_context = await self._create_browser_context(
browser_new_context_options=browser_new_context_options,
proxy_info=proxy_info,
)

if not self.has_free_capacity:
raise ValueError('Cannot open more pages in this browser.')

Expand All @@ -154,11 +162,12 @@ async def new_page(
)
page = await new_context.new_page()
else:
if not self._browser_context:
self._browser_context = await self._create_browser_context(
browser_new_context_options=browser_new_context_options,
proxy_info=proxy_info,
)
async with await self._get_context_creation_lock():
if not self._browser_context:
self._browser_context = await self._create_browser_context(
browser_new_context_options=browser_new_context_options,
proxy_info=proxy_info,
)
page = await self._browser_context.new_page()

# Handle page close event
Expand All @@ -169,7 +178,6 @@ async def new_page(
self._last_page_opened_at = datetime.now(timezone.utc)

self._total_opened_pages += 1

return page

@override
Expand Down Expand Up @@ -206,7 +214,6 @@ async def _create_browser_context(
`self._fingerprint_generator` is available.
"""
browser_new_context_options = dict(browser_new_context_options) if browser_new_context_options else {}

if proxy_info:
if browser_new_context_options.get('proxy'):
logger.warning("browser_new_context_options['proxy'] overriden by explicit `proxy_info` argument.")
Expand Down Expand Up @@ -244,5 +251,4 @@ async def _create_browser_context(
browser_new_context_options['extra_http_headers'] = browser_new_context_options.get(
'extra_http_headers', extra_http_headers
)

return await self._browser.new_context(**browser_new_context_options)
30 changes: 28 additions & 2 deletions tests/unit/browsers/test_playwright_browser_controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,13 @@

import asyncio
from datetime import datetime, timedelta, timezone
from typing import TYPE_CHECKING
from typing import TYPE_CHECKING, Any
from unittest.mock import AsyncMock

import pytest
from playwright.async_api import Browser, Playwright, async_playwright

from crawlee.browsers import PlaywrightBrowserController
from crawlee.browsers import PlaywrightBrowserController, PlaywrightPersistentBrowser

if TYPE_CHECKING:
from collections.abc import AsyncGenerator
Expand Down Expand Up @@ -106,3 +107,28 @@ async def test_close_browser_with_open_pages(browser: Browser) -> None:

assert controller.pages_count == 0
assert not controller.is_browser_connected


async def test_memory_leak_on_concurrent_context_creation() -> None:
"""Test that only one browser context is created when multiple pages are opened concurrently."""

# Prepare mocked browser with relevant methods and attributes
mocked_browser = AsyncMock()
mocked_context_launcher = AsyncMock()

async def delayed_launch_persistent_context(*args: Any, **kwargs: Any) -> Any:
"""Ensure that both calls to create context overlap in time."""
await asyncio.sleep(5) # Simulate delay in creation to make sure race condition happens
return await mocked_context_launcher(*args, **kwargs)

mocked_browser.launch_persistent_context = delayed_launch_persistent_context

# Create minimal instance of PlaywrightBrowserController with mocked browser
controller = PlaywrightBrowserController(
PlaywrightPersistentBrowser(mocked_browser, None, {}), header_generator=None, fingerprint_generator=None
)

# Both calls will try to create browser context at the same time, but only one context should be created.
await asyncio.gather(controller.new_page(), controller.new_page())

assert mocked_context_launcher.call_count == 1
Loading