-
Notifications
You must be signed in to change notification settings - Fork 1.4k
refactor!: Introduce IBrowserPool interface and utilize it in BrowserCrawler #3669
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: v4
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -32,11 +32,12 @@ import type { | |
| BrowserPoolHooks, | ||
| BrowserPoolOptions, | ||
| CommonPage, | ||
| IBrowserController, | ||
| InferBrowserPluginArray, | ||
| LaunchContext, | ||
| } from '@crawlee/browser-pool'; | ||
| import { BrowserPool } from '@crawlee/browser-pool'; | ||
| import type { BatchAddRequestsResult, Cookie as CookieObject } from '@crawlee/types'; | ||
| import type { BatchAddRequestsResult, Cookie as CookieObject, IBrowserPool } from '@crawlee/types'; | ||
| import type { RobotsTxtFile } from '@crawlee/utils'; | ||
| import { CLOUDFLARE_RETRY_CSS_SELECTORS, RETRY_CSS_SELECTORS, sleep } from '@crawlee/utils'; | ||
| import ow from 'ow'; | ||
|
|
@@ -55,13 +56,11 @@ type ContextDifference<T, U> = Omit<U, keyof T> & Partial<U>; | |
| export interface BrowserCrawlingContext< | ||
| Page extends CommonPage = CommonPage, | ||
| Response extends BaseResponse = BaseResponse, | ||
| ProvidedController = BrowserController, | ||
| ProvidedController extends IBrowserController = IBrowserController, | ||
| UserData extends Dictionary = Dictionary, | ||
| > extends CrawlingContext<UserData> { | ||
| /** | ||
| * An instance of the {@apilink BrowserController} that manages the browser instance and provides access to its API. | ||
| */ | ||
| browserController: ProvidedController; | ||
| /** @internal Anchors the `ProvidedController` type parameter for subclass use. */ | ||
| readonly __controllerType?: ProvidedController; | ||
|
|
||
| /** | ||
| * The browser page object where the web page is loaded and rendered. | ||
|
|
@@ -92,7 +91,7 @@ export type BrowserHook<Context = BrowserCrawlingContext, GoToOptions extends Di | |
| export interface BrowserCrawlerOptions< | ||
| Page extends CommonPage = CommonPage, | ||
| Response extends BaseResponse = BaseResponse, | ||
| ProvidedController extends BrowserController = BrowserController, | ||
| ProvidedController extends IBrowserController = IBrowserController, | ||
| Context extends BrowserCrawlingContext<Page, Response, ProvidedController, Dictionary> = BrowserCrawlingContext< | ||
| Page, | ||
| Response, | ||
|
|
@@ -112,6 +111,13 @@ export interface BrowserCrawlerOptions< | |
| > { | ||
| launchContext?: BrowserLaunchContext<any, any>; | ||
|
|
||
| /** | ||
| * An existing browser pool instance to use. When provided, the crawler will use this pool directly instead of | ||
| * constructing a new one from `browserPoolOptions`, enabling browser sharing across multiple crawlers. The crawler | ||
| * will not tear down a shared pool — the caller is responsible for its lifecycle. | ||
| */ | ||
| browserPool?: IBrowserPool<Page>; | ||
|
|
||
| /** | ||
| * Function that is called to process each request. | ||
| * | ||
|
|
@@ -122,7 +128,6 @@ export interface BrowserCrawlerOptions< | |
| * - {@apilink BrowserCrawlingContext.page|`page`} is an instance of the | ||
| * Puppeteer [Page](https://pptr.dev/api/puppeteer.page) or | ||
| * Playwright [Page](https://playwright.dev/docs/api/class-page); | ||
| * - {@apilink BrowserCrawlingContext.browserController|`browserController`} is an instance of the {@apilink BrowserController}; | ||
| * - {@apilink BrowserCrawlingContext.response|`response`} is an instance of the | ||
| * Puppeteer [Response](https://pptr.dev/api/puppeteer.httpresponse) or | ||
| * Playwright [Response](https://playwright.dev/docs/api/class-response), | ||
|
|
@@ -284,7 +289,7 @@ export interface BrowserCrawlerOptions< | |
| export abstract class BrowserCrawler< | ||
| Page extends CommonPage = CommonPage, | ||
| Response extends BaseResponse = BaseResponse, | ||
| ProvidedController extends BrowserController = BrowserController, | ||
| ProvidedController extends IBrowserController = IBrowserController, | ||
| InternalBrowserPoolOptions extends BrowserPoolOptions = BrowserPoolOptions, | ||
| LaunchOptions extends Dictionary | undefined = Dictionary, | ||
| Context extends BrowserCrawlingContext<Page, Response, ProvidedController, Dictionary> = BrowserCrawlingContext< | ||
|
|
@@ -298,9 +303,18 @@ export abstract class BrowserCrawler< | |
| GoToOptions extends Dictionary = Dictionary, | ||
| > extends BasicCrawler<Context, ContextExtension, ExtendedContext> { | ||
| /** | ||
| * A reference to the underlying {@apilink BrowserPool} class that manages the crawler's browsers. | ||
| * A reference to the underlying browser pool that manages the crawler's browsers. Typed as | ||
| * {@apilink IBrowserPool} so custom implementations can be plugged in via the `browserPool` constructor option. | ||
| */ | ||
| browserPool: IBrowserPool<Page>; | ||
|
|
||
| /** | ||
| * Set when the crawler constructed its own {@apilink BrowserPool} (no `browserPool` option was provided). | ||
| * Holds the same instance as `browserPool`, but typed as the concrete class so the crawler can call | ||
| * lifecycle methods (`destroy`) that aren't part of {@apilink IBrowserPool}. A user-supplied pool is | ||
| * never owned and never torn down by the crawler. | ||
| */ | ||
| browserPool: BrowserPool<InternalBrowserPoolOptions>; | ||
| private ownedBrowserPool?: BrowserPool<InternalBrowserPoolOptions>; | ||
|
|
||
| launchContext: BrowserLaunchContext<LaunchOptions, unknown>; | ||
|
|
||
|
|
@@ -321,7 +335,8 @@ export abstract class BrowserCrawler< | |
|
|
||
| launchContext: ow.optional.object, | ||
| headless: ow.optional.any(ow.boolean, ow.string), | ||
| browserPoolOptions: ow.object, | ||
| browserPool: ow.optional.object.validate(validators.browserPool), | ||
| browserPoolOptions: ow.optional.object, | ||
| saveResponseCookies: ow.optional.boolean, | ||
| proxyConfiguration: ow.optional.object.validate(validators.proxyConfiguration), | ||
| }; | ||
|
|
@@ -346,6 +361,7 @@ export abstract class BrowserCrawler< | |
| navigationTimeoutSecs = 60, | ||
| saveResponseCookies = true, | ||
| launchContext = {}, | ||
| browserPool, | ||
| browserPoolOptions, | ||
| preNavigationHooks = [], | ||
| postNavigationHooks = [], | ||
|
|
@@ -381,15 +397,23 @@ export abstract class BrowserCrawler< | |
|
|
||
| this.saveResponseCookies = saveResponseCookies; | ||
|
|
||
| if (browserPool) { | ||
| this.browserPool = browserPool; | ||
| return; | ||
| } | ||
|
|
||
| const resolvedBrowserPoolOptions = browserPoolOptions ?? ({} as Partial<BrowserPoolOptions>); | ||
|
|
||
| if (launchContext?.userAgent) { | ||
| if (browserPoolOptions.useFingerprints) | ||
| if (resolvedBrowserPoolOptions.useFingerprints) | ||
| this.log.info('Custom user agent provided, disabling automatic browser fingerprint injection!'); | ||
| browserPoolOptions.useFingerprints = false; | ||
| resolvedBrowserPoolOptions.useFingerprints = false; | ||
| } | ||
|
|
||
| this.browserPool = new BrowserPool<InternalBrowserPoolOptions>({ | ||
| ...(browserPoolOptions as any), | ||
| this.ownedBrowserPool = new BrowserPool<InternalBrowserPoolOptions>({ | ||
| ...(resolvedBrowserPoolOptions as any), | ||
| }); | ||
| this.browserPool = this.ownedBrowserPool as IBrowserPool<Page>; | ||
| } | ||
|
|
||
| protected override buildContextPipeline(): ContextPipeline< | ||
|
|
@@ -401,23 +425,18 @@ export abstract class BrowserCrawler< | |
| cleanup: async (context: { | ||
| page: Page; | ||
| session: Session; | ||
| browserController: ProvidedController; | ||
| registerDeferredCleanup: BasicCrawlingContext['registerDeferredCleanup']; | ||
| }) => { | ||
| context.registerDeferredCleanup(async () => { | ||
| // In non-incognito mode the browser controller carries the session's cookies | ||
| // and storage across pages. If the session is no longer usable, retire the | ||
| // controller so its state can't leak into whichever session lands on it next. | ||
| if (!context.session.isUsable()) { | ||
| this.browserPool.retireBrowserController( | ||
| context.browserController as Parameters< | ||
| BrowserPool<InternalBrowserPoolOptions>['retireBrowserController'] | ||
| >[0], | ||
| const error = !context.session.isUsable() | ||
| ? new SessionError('Session is no longer usable') | ||
| : undefined; | ||
|
|
||
| await this.browserPool | ||
| .closePage(context.page, { error }) | ||
| .catch((closeError: Error) => | ||
| this.log.debug('Error while closing page', { error: closeError }), | ||
| ); | ||
| } | ||
| await context.page | ||
| .close() | ||
| .catch((error: Error) => this.log.debug('Error while closing page', { error })); | ||
| }); | ||
| }, | ||
| }); | ||
|
|
@@ -457,29 +476,38 @@ export abstract class BrowserCrawler< | |
| return false; | ||
| } | ||
|
|
||
| /** | ||
| * Resolves the {@apilink IBrowserController} that owns the given page, or | ||
| * `undefined` when the pool does not expose controllers (custom | ||
| * {@apilink IBrowserPool} implementations). | ||
| * | ||
| * This is intentionally **not** part of the public crawling context — it is | ||
| * an implementation detail used internally for cookie injection and by | ||
| * subclasses that need direct controller access (e.g. StagehandCrawler). | ||
| */ | ||
| protected _getBrowserControllerByPage(page: Page): ProvidedController | undefined { | ||
| if ('getBrowserControllerByPage' in this.browserPool) { | ||
| return ( | ||
| this.browserPool as unknown as { | ||
| getBrowserControllerByPage(page: Page): ProvidedController | undefined; | ||
| } | ||
| ).getBrowserControllerByPage(page); | ||
| } | ||
|
|
||
| return undefined; | ||
| } | ||
|
|
||
| private async preparePage( | ||
| crawlingContext: CrawlingContext, | ||
| ): Promise< | ||
| ContextDifference<CrawlingContext, BrowserCrawlingContext<Page, Response, ProvidedController, Dictionary>> | ||
| > { | ||
| const newPageOptions: Dictionary = { | ||
| const page = await this.browserPool.newPage({ | ||
| id: crawlingContext.id, | ||
| }; | ||
|
|
||
| if (crawlingContext.session?.proxyInfo) { | ||
| const proxyInfo = crawlingContext.session.proxyInfo; | ||
|
|
||
| newPageOptions.proxyUrl = proxyInfo?.url; | ||
| newPageOptions.ignoreTlsErrors = proxyInfo?.ignoreTlsErrors; | ||
| } | ||
|
|
||
| const page = (await this.browserPool.newPage(newPageOptions)) as Page; | ||
| session: crawlingContext.session, | ||
| }); | ||
| tryCancel(); | ||
|
|
||
| const browserControllerInstance = this.browserPool.getBrowserControllerByPage( | ||
| page as any, | ||
| ) as ProvidedController; | ||
|
|
||
| const contextEnqueueLinks = crawlingContext.enqueueLinks; | ||
|
|
||
| return { | ||
|
|
@@ -489,7 +517,6 @@ export abstract class BrowserCrawler< | |
| "The `response` property is not available. This might mean that you're trying to access it before navigation or that navigation resulted in `null` (this should only happen with `about:` URLs)", | ||
| ); | ||
| }, | ||
| browserController: browserControllerInstance, | ||
| enqueueLinks: async (enqueueOptions: EnqueueLinksOptions = {}) => { | ||
| return (await browserCrawlerEnqueueLinks({ | ||
| options: { ...enqueueOptions, limit: this.calculateEnqueuedRequestLimit(enqueueOptions?.limit) }, | ||
|
|
@@ -564,9 +591,13 @@ export abstract class BrowserCrawler< | |
| // save cookies | ||
| // TODO: Should we save the cookies also after/only the handle page? | ||
| if (this.saveResponseCookies) { | ||
| const cookies = await crawlingContext.browserController.getCookies(crawlingContext.page); | ||
| tryCancel(); | ||
| crawlingContext.session?.setCookies(cookies, crawlingContext.request.loadedUrl!); | ||
| const controller = this._getBrowserControllerByPage(crawlingContext.page); | ||
|
|
||
| if (controller) { | ||
| const cookies = await controller.getCookies(crawlingContext.page); | ||
| tryCancel(); | ||
| crawlingContext.session?.setCookies(cookies, crawlingContext.request.loadedUrl!); | ||
| } | ||
|
Comment on lines
+594
to
+600
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is probably the last legit use case for accessing the browser controller. How should we back-propagate the cookies to the
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
I'd rather not go that way, as it sounds quite opaque and I'm not too keen on the async "watcher" pattern in JS. Perhaps we could access |
||
| } | ||
|
|
||
| if (response !== undefined) { | ||
|
|
@@ -598,15 +629,18 @@ export abstract class BrowserCrawler< | |
| } | ||
|
|
||
| protected async _applyCookies( | ||
| { session, request, page, browserController }: BrowserCrawlingContext, | ||
| { session, request, page }: BrowserCrawlingContext, | ||
| preHooksCookies: string, | ||
| postHooksCookies: string, | ||
| ) { | ||
| const controller = this._getBrowserControllerByPage(page as Page); | ||
| if (!controller) return; | ||
|
|
||
| const sessionCookie = session?.getCookies(request.url) ?? []; | ||
| const parsedPreHooksCookies = preHooksCookies.split(/ *; */).map((c) => cookieStringToToughCookie(c)); | ||
| const parsedPostHooksCookies = postHooksCookies.split(/ *; */).map((c) => cookieStringToToughCookie(c)); | ||
|
|
||
| await browserController.setCookies( | ||
| await controller.setCookies( | ||
| page, | ||
| [...sessionCookie, ...parsedPreHooksCookies, ...parsedPostHooksCookies] | ||
| .filter((c): c is CookieObject => typeof c !== 'undefined' && c !== null) | ||
|
|
@@ -677,7 +711,7 @@ export abstract class BrowserCrawler< | |
| * @ignore | ||
| */ | ||
| override async teardown(): Promise<void> { | ||
| await this.browserPool.destroy(); | ||
| await this.ownedBrowserPool?.destroy(); | ||
| await super.teardown(); | ||
| } | ||
| } | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Left here for https://github.com/apify/crawlee/pull/3669/changes#r3319371997 and
StagehandCrawler. After we get rid of these dependencies, we can probably remove this, and theProvidedControllertype parameter.