Skip to content

Commit

Permalink
feat: add iframe expansion to parseWithCheerio in browsers (#2542)
Browse files Browse the repository at this point in the history
Replaces each `iframe` element with a `<div
class="crawlee-iframe-replacement"></div>` element that contains the iframe's contents.

Closes #2507
  • Loading branch information
barjin committed Jun 20, 2024
1 parent 39cf673 commit 328d085
Show file tree
Hide file tree
Showing 6 changed files with 112 additions and 0 deletions.
2 changes: 2 additions & 0 deletions packages/browser-crawler/src/internals/browser-crawler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -342,6 +342,7 @@ export abstract class BrowserCrawler<
persistCookiesPerSession: ow.optional.boolean,
useSessionPool: ow.optional.boolean,
proxyConfiguration: ow.optional.object.validate(validators.proxyConfiguration),
ignoreShadowRoots: ow.optional.boolean,
};

/**
Expand Down Expand Up @@ -370,6 +371,7 @@ export abstract class BrowserCrawler<
failedRequestHandler,
handleFailedRequestFunction,
headless,
ignoreShadowRoots,
...basicCrawlerOptions
} = options;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -600,6 +600,28 @@ export async function saveSnapshot(page: Page, options: SaveSnapshotOptions = {}
export async function parseWithCheerio(page: Page, ignoreShadowRoots = false): Promise<CheerioRoot> {
ow(page, ow.object.validate(validators.browserPage));

if (page.frames().length > 1) {
const frames = await page.$$('iframe');

await Promise.all(
frames.map(async (frame) => {
const iframe = await frame.contentFrame();

if (iframe) {
const contents = await iframe.content();

await frame.evaluate((f, c) => {
const replacementNode = document.createElement('div');
replacementNode.innerHTML = c;
replacementNode.className = 'crawlee-iframe-replacement';

f.replaceWith(replacementNode);
}, contents);
}
}),
);
}

const html = ignoreShadowRoots
? null
: ((await page.evaluate(`(${expandShadowRoots.toString()})(document)`)) as string);
Expand Down
22 changes: 22 additions & 0 deletions packages/puppeteer-crawler/src/internals/utils/puppeteer_utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,28 @@ export async function injectJQuery(page: Page, options?: { surviveNavigations?:
export async function parseWithCheerio(page: Page, ignoreShadowRoots = false): Promise<CheerioRoot> {
ow(page, ow.object.validate(validators.browserPage));

if (page.frames().length > 1) {
const frames = await page.$$('iframe');

await Promise.all(
frames.map(async (frame) => {
const iframe = await frame.contentFrame();

if (iframe) {
const contents = await iframe.content();

await frame.evaluate((f, c) => {
const replacementNode = document.createElement('div');
replacementNode.innerHTML = c;
replacementNode.className = 'crawlee-iframe-replacement';

f.replaceWith(replacementNode);
}, contents);
}
}),
);
}

const html = ignoreShadowRoots
? null
: ((await page.evaluate(`(${expandShadowRoots.toString()})(document)`)) as string);
Expand Down
18 changes: 18 additions & 0 deletions test/core/playwright_utils.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,24 @@ describe('playwrightUtils', () => {
}
});

// Checks that parseWithCheerio() exposes <h1> headings from both the outer
// document and the document embedded through its <iframe>, in that order.
test('parseWithCheerio() iframe expansion works', async () => {
    const browser = await launchPlaywright(launchContext);

    try {
        const page = await browser.newPage();
        const outerPageUrl = new URL('/special/outside-iframe', serverAddress);
        await page.goto(outerPageUrl.toString());

        const $ = await playwrightUtils.parseWithCheerio(page);

        // Collect the text of every <h1> found in the parsed markup.
        const headingTexts = $('h1')
            .get()
            .map((el) => $(el).text());

        expect(headingTexts).toEqual(['Outside iframe', 'In iframe']);
    } finally {
        // Close the browser even when navigation or an assertion fails.
        await browser.close();
    }
});

describe('blockRequests()', () => {
let browser: Browser = null;
beforeAll(async () => {
Expand Down
18 changes: 18 additions & 0 deletions test/core/puppeteer_utils.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,24 @@ describe('puppeteerUtils', () => {
}
});

// Checks that parseWithCheerio() exposes <h1> headings from both the outer
// document and the document embedded through its <iframe>, in that order.
test('parseWithCheerio() iframe expansion works', async () => {
    const browser = await launchPuppeteer(launchContext);

    try {
        const page = await browser.newPage();
        const outerPageUrl = new URL('/special/outside-iframe', serverAddress);
        await page.goto(outerPageUrl.toString());

        const $ = await puppeteerUtils.parseWithCheerio(page);

        // Collect the text of every <h1> found in the parsed markup.
        const headingTexts = $('h1')
            .get()
            .map((el) => $(el).text());

        expect(headingTexts).toEqual(['Outside iframe', 'In iframe']);
    } finally {
        // Close the browser even when navigation or an assertion fails.
        await browser.close();
    }
});

describe('blockRequests()', () => {
let browser: Browser = null;
beforeAll(async () => {
Expand Down
30 changes: 30 additions & 0 deletions test/shared/_helper.ts
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,28 @@ console.log('Hello world!');
</div>
</body>
</html>`,
outsideIframe: `
<!DOCTYPE html>
<html>
<head>
<title>Outside iframe</title>
</head>
<body>
<h1>Outside iframe</h1>
<iframe src="./inside-iframe"></iframe>
</body>
</html>`,
insideIframe: `
<!DOCTYPE html>
<html>
<head>
<title>In iframe</title>
</head>
<body>
<h1>In iframe</h1>
<p>Some content from inside of an iframe.</p>
</body>
</html>`,
};

export async function runExampleComServer(): Promise<[Server, number]> {
Expand Down Expand Up @@ -268,6 +290,14 @@ export async function runExampleComServer(): Promise<[Server, number]> {
special.get('/cloudflareBlocking', async (_req, res) => {
res.type('html').status(403).send(responseSamples.cloudflareBlocking);
});

special.get('/outside-iframe', (_req, res) => {
res.type('html').send(responseSamples.outsideIframe);
});

special.get('/inside-iframe', (_req, res) => {
res.type('html').send(responseSamples.insideIframe);
});
})();

// "cacheable" site with one page, scripts and stylesheets
Expand Down

0 comments on commit 328d085

Please sign in to comment.