Skip to content

Commit

Permalink
feat: add closeCookieModals context helper for Playwright and Puppe…
Browse files Browse the repository at this point in the history
…teer (#1927)

Still somewhat WIP:
- You have to detect the modal yourself (and call the helper). For user
comfort, it would probably be better if we did this somehow
automatically.
- We have no data about the performance impact of injecting a 1.3MB JS
file. Most of the script should get GC-ed immediately after running it,
but who knows.
  • Loading branch information
barjin committed Jul 28, 2023
1 parent cbd9d08 commit 98d93bb
Show file tree
Hide file tree
Showing 7 changed files with 375 additions and 322 deletions.
1 change: 1 addition & 0 deletions packages/playwright-crawler/package.json
Expand Up @@ -60,6 +60,7 @@
"@crawlee/types": "^3.4.2",
"@crawlee/utils": "^3.4.2",
"cheerio": "^1.0.0-rc.12",
"idcac-playwright": "^0.1.2",
"jquery": "^3.6.0",
"ow": "^0.28.1",
"tslib": "^2.4.0"
Expand Down
Expand Up @@ -28,6 +28,7 @@ import { validators, KeyValueStore, RequestState } from '@crawlee/browser';
import type { BatchAddRequestsResult } from '@crawlee/types';
import type { CheerioRoot, Dictionary } from '@crawlee/utils';
import * as cheerio from 'cheerio';
import { getInjectableScript as getCookieClosingScript } from 'idcac-playwright';
import ow from 'ow';
import type { Page, Response, Route } from 'playwright';

Expand Down Expand Up @@ -558,6 +559,12 @@ export async function parseWithCheerio(page: Page): Promise<CheerioRoot> {
return cheerio.load(pageContent);
}

/**
 * Tries to close cookie consent modals on the given page by evaluating, in the
 * page, the injectable script obtained from the `idcac-playwright` package.
 *
 * @param page Playwright page on which the cookie-closing script is evaluated.
 */
export async function closeCookieModals(page: Page): Promise<void> {
    // Validate the argument first, so an invalid page is rejected before the script is built.
    ow(page, ow.object.validate(validators.browserPage));

    const cookieClosingScript = getCookieClosingScript();
    await page.evaluate(cookieClosingScript);
}

/** @internal */
export interface PlaywrightContextUtils {
/**
Expand Down Expand Up @@ -729,6 +736,11 @@ export interface PlaywrightContextUtils {
* secured copies beforehand.
*/
compileScript(scriptString: string, ctx?: Dictionary): CompiledScriptFunction;

/**
* Tries to close cookie consent modals on the page. Based on the I Don't Care About Cookies browser extension.
*/
closeCookieModals(): Promise<void>;
}

export function registerUtilsToContext(context: PlaywrightCrawlingContext): void {
Expand All @@ -751,6 +763,7 @@ export function registerUtilsToContext(context: PlaywrightCrawlingContext): void
requestQueue: context.crawler.requestQueue!,
});
context.compileScript = (scriptString: string, ctx?: Dictionary) => compileScript(scriptString, ctx);
context.closeCookieModals = () => closeCookieModals(context.page);
}

export { enqueueLinksByClickingElements };
Expand All @@ -766,4 +779,5 @@ export const playwrightUtils = {
infiniteScroll,
saveSnapshot,
compileScript,
closeCookieModals,
};
1 change: 1 addition & 0 deletions packages/puppeteer-crawler/package.json
Expand Up @@ -61,6 +61,7 @@
"@crawlee/utils": "^3.4.2",
"cheerio": "^1.0.0-rc.12",
"devtools-protocol": "*",
"idcac-playwright": "^0.1.2",
"jquery": "^3.6.0",
"ow": "^0.28.1",
"tslib": "^2.4.0"
Expand Down
12 changes: 12 additions & 0 deletions packages/puppeteer-crawler/src/internals/utils/puppeteer_utils.ts
Expand Up @@ -29,6 +29,7 @@ import type { Dictionary, BatchAddRequestsResult } from '@crawlee/types';
import type { CheerioRoot } from '@crawlee/utils';
import * as cheerio from 'cheerio';
import type { ProtocolMapping } from 'devtools-protocol/types/protocol-mapping.js';
import { getInjectableScript } from 'idcac-playwright';
import ow from 'ow';
import type { Page, HTTPResponse, ResponseForRequest, HTTPRequest as PuppeteerRequest } from 'puppeteer';

Expand Down Expand Up @@ -673,6 +674,10 @@ export async function saveSnapshot(page: Page, options: SaveSnapshotOptions = {}
}
}

/**
 * Tries to close cookie consent modals on the given page by evaluating, in the
 * page, the injectable script obtained from the `idcac-playwright` package.
 *
 * @param page Puppeteer page on which the cookie-closing script is evaluated.
 */
export async function closeCookieModals(page: Page): Promise<void> {
    // NOTE(review): the Playwright counterpart validates `page` with `ow` before
    // evaluating; no such check is done here — confirm whether that is intentional.
    const injectableScript = getInjectableScript();
    await page.evaluate(injectableScript);
}

/** @internal */
export interface PuppeteerContextUtils {
/**
Expand Down Expand Up @@ -924,6 +929,11 @@ export interface PuppeteerContextUtils {
* Saves a full screenshot and HTML of the current page into a Key-Value store.
*/
saveSnapshot(options?: SaveSnapshotOptions): Promise<void>;

/**
* Tries to close cookie consent modals on the page. Based on the I Don't Care About Cookies browser extension.
*/
closeCookieModals(): Promise<void>;
}

/** @internal */
Expand Down Expand Up @@ -953,6 +963,7 @@ export function registerUtilsToContext(context: PuppeteerCrawlingContext): void
context.removeInterceptRequestHandler = (handler: InterceptHandler) => removeInterceptRequestHandler(context.page, handler);
context.infiniteScroll = (options?: InfiniteScrollOptions) => infiniteScroll(context.page, options);
context.saveSnapshot = (options?: SaveSnapshotOptions) => saveSnapshot(context.page, options);
context.closeCookieModals = () => closeCookieModals(context.page);
}

export {
Expand All @@ -976,4 +987,5 @@ export const puppeteerUtils = {
infiniteScroll,
saveSnapshot,
parseWithCheerio,
closeCookieModals,
};
3 changes: 2 additions & 1 deletion test/e2e/playwright-enqueue-links/actor/main.js
Expand Up @@ -11,12 +11,13 @@ const mainOptions = {
await Actor.main(async () => {
const crawler = new PlaywrightCrawler({
maxRequestsPerCrawl: 30,
requestHandler: async ({ page, request, enqueueLinks }) => {
requestHandler: async ({ page, request, enqueueLinks, closeCookieModals }) => {
const { url, loadedUrl } = request;

const pageTitle = await page.title();
log.info(`URL: ${url}; LOADED_URL: ${loadedUrl}; TITLE: ${pageTitle}`);

await closeCookieModals();
// Wait for the actor cards to render,
// otherwise enqueueLinks wouldn't enqueue anything.
await page.waitForSelector('.ActorStorePagination-buttons a');
Expand Down
4 changes: 3 additions & 1 deletion test/e2e/puppeteer-enqueue-links/actor/main.js
Expand Up @@ -10,12 +10,14 @@ const mainOptions = {
await Actor.main(async () => {
const crawler = new PuppeteerCrawler({
maxRequestsPerCrawl: 30,
async requestHandler({ page, enqueueLinks, request, log }) {
async requestHandler({ page, enqueueLinks, request, log, closeCookieModals }) {
const { url, loadedUrl } = request;

const pageTitle = await page.title();
log.info(`URL: ${url}; LOADED_URL: ${loadedUrl}; TITLE: ${pageTitle}`);

await closeCookieModals();

const results = await enqueueLinks();

if (loadedUrl.startsWith('https://drive')) {
Expand Down

0 comments on commit 98d93bb

Please sign in to comment.