Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: playwrightUtils.infiniteScroll #1543

Merged
merged 2 commits into from
Sep 19, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
139 changes: 139 additions & 0 deletions packages/playwright-crawler/src/internals/utils/playwright-utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -270,6 +270,142 @@ export async function blockRequests(page: Page, options: BlockRequestsOptions =
await client.send('Network.setBlockedURLs', { urls: patternsToBlock });
}

/**
* Options accepted by `infiniteScroll()`. Every field is optional.
*/
export interface InfiniteScrollOptions {
/**
* How many seconds to scroll for. If 0, no time limit is applied and scrolling
* continues until one of the other stop conditions is met.
* @default 0
*/
timeoutSecs?: number;

/**
* How many seconds to wait for no new content to load before exit.
* "New content" is detected by watching for new network requests; the check runs once per second.
* @default 4
*/
waitForSecs?: number;

/**
* If true, it will scroll up a bit after each scroll down. This is required on some websites for the scroll to work.
* @default false
*/
scrollDownAndUp?: boolean;

/**
* Optionally checks and clicks a button if it appears while scrolling. This is required on some websites for the scroll to work.
* The button is only clicked when it is found and visible.
*/
buttonSelector?: string;

/**
* This function is called after every scroll and stops the scrolling process if it returns a truthy value. The function can be `async`.
*/
stopScrollCallback?: () => unknown | Promise<unknown>;
}

/**
 * Scrolls to the bottom of a page, or until it times out.
 * Loads dynamic content when it hits the bottom of a page, and then continues scrolling.
 *
 * Scrolling stops as soon as one of the following happens:
 * - no new `xhr`, `fetch`, `websocket` or `other` request was observed for `waitForSecs`
 *   consecutive one-second checks,
 * - `timeoutSecs` is non-zero and that many seconds have elapsed,
 * - `stopScrollCallback` resolves to a truthy value.
 *
 * If no `<body>` element can be found after a few retries, the function returns without scrolling.
 * The request listener and the polling interval are always cleaned up, even when a Playwright
 * call throws (e.g. because the page was closed mid-scroll).
 *
 * @param page Playwright [`Page`](https://playwright.dev/docs/api/class-page) object.
 * @param [options]
 */
export async function infiniteScroll(page: Page, options: InfiniteScrollOptions = {}): Promise<void> {
    ow(page, ow.object.validate(validators.browserPage));
    ow(options, ow.object.exactShape({
        timeoutSecs: ow.optional.number,
        waitForSecs: ow.optional.number,
        scrollDownAndUp: ow.optional.boolean,
        buttonSelector: ow.optional.string,
        stopScrollCallback: ow.optional.function,
    }));

    const { timeoutSecs = 0, waitForSecs = 4, scrollDownAndUp = false, buttonSelector, stopScrollCallback } = options;

    let finished = false;
    const startTime = Date.now();
    const CHECK_INTERVAL_MILLIS = 1000;
    const SCROLL_HEIGHT_IF_ZERO = 10000;
    const maybeResourceTypesInfiniteScroll = ['xhr', 'fetch', 'websocket', 'other'];
    const resourcesStats = {
        newRequested: 0,
        oldRequested: 0,
        matchNumber: 0,
    };

    // Kept in a named constant so the very same listener can be detached in `finally`.
    const onRequest = (request: { resourceType(): string }): void => {
        if (maybeResourceTypesInfiniteScroll.includes(request.resourceType())) {
            resourcesStats.newRequested++;
        }
    };

    let checkFinished: ReturnType<typeof setInterval> | undefined;

    page.on('request', onRequest);

    try {
        // Move mouse to the center of the page, so we can scroll up-down
        let body = await page.$('body');

        for (let retry = 0; retry < 10; retry++) {
            if (body) break;
            await page.waitForTimeout(100);
            body = await page.$('body');
        }

        if (!body) {
            return;
        }

        // `boundingBox()` resolves to null when the element is not visible; in that case
        // the mouse is left where it is instead of crashing on a null dereference.
        const boundingBox = await body.boundingBox();
        if (boundingBox) {
            await page.mouse.move(
                boundingBox.x + boundingBox.width / 2,
                boundingBox.y + boundingBox.height / 2,
            );
        }

        checkFinished = setInterval(() => {
            if (resourcesStats.oldRequested === resourcesStats.newRequested) {
                // No new request since the previous check.
                resourcesStats.matchNumber++;
                if (resourcesStats.matchNumber >= waitForSecs) {
                    clearInterval(checkFinished);
                    finished = true;
                    return;
                }
            } else {
                resourcesStats.matchNumber = 0;
                resourcesStats.oldRequested = resourcesStats.newRequested;
            }
            // check if timeout has been reached
            if (timeoutSecs !== 0 && (Date.now() - startTime) / 1000 > timeoutSecs) {
                clearInterval(checkFinished);
                finished = true;
            }
        }, CHECK_INTERVAL_MILLIS);

        const doScroll = async (): Promise<void> => {
            const bodyScrollHeight = await page.evaluate(() => document.body.scrollHeight);
            const delta = bodyScrollHeight === 0 ? SCROLL_HEIGHT_IF_ZERO : bodyScrollHeight;

            await page.mouse.wheel(0, delta);
        };

        const maybeClickButton = async (selector: string): Promise<void> => {
            const button = await page.$(selector);
            // Box model returns null if the button is not visible
            if (button && await button.boundingBox()) {
                await button.click({ delay: 10 });
            }
        };

        while (!finished) {
            await doScroll();
            await page.waitForTimeout(250);
            if (scrollDownAndUp) {
                await page.mouse.wheel(0, -1000);
            }
            if (buttonSelector) {
                await maybeClickButton(buttonSelector);
            }
            if (stopScrollCallback && await stopScrollCallback()) {
                break;
            }
        }
    } finally {
        // Runs on every exit path: normal completion, early return, callback stop, or a thrown error.
        if (checkFinished !== undefined) {
            clearInterval(checkFinished);
        }
        page.off('request', onRequest);
    }
}

/**
* Returns Cheerio handle for `page.content()`, allowing to work with the data same way as with {@apilink CheerioCrawler}.
*
Expand All @@ -292,13 +428,15 @@ export interface PlaywrightContextUtils {
injectJQuery(): Promise<unknown>;
blockRequests(options?: BlockRequestsOptions): Promise<void>;
parseWithCheerio(): Promise<CheerioRoot>;
infiniteScroll(options?: InfiniteScrollOptions): Promise<void>;
}

/**
 * Attaches the page helper functions to the given crawling context.
 * Each attached helper forwards to the standalone function of the same name,
 * passing `context.page` (read at call time) as the first argument.
 */
export function registerUtilsToContext(context: PlaywrightCrawlingContext): void {
    context.injectFile = (filePath: string, options?: InjectFileOptions) => {
        return injectFile(context.page, filePath, options);
    };
    context.injectJQuery = () => {
        return injectJQuery(context.page);
    };
    context.blockRequests = (options?: BlockRequestsOptions) => {
        return blockRequests(context.page, options);
    };
    context.parseWithCheerio = () => {
        return parseWithCheerio(context.page);
    };
    context.infiniteScroll = (options?: InfiniteScrollOptions) => {
        return infiniteScroll(context.page, options);
    };
}

/** @internal */
Expand All @@ -308,4 +446,5 @@ export const playwrightUtils = {
gotoExtended,
blockRequests,
parseWithCheerio,
infiniteScroll,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You also need to add it to the `PlaywrightContextUtils` interface (so it gets added to the crawling context type automatically) and register it in the `registerUtilsToContext` function; both are just above this variable.

};
56 changes: 55 additions & 1 deletion test/core/playwright_utils.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ import log from '@apify/log';
import type { Server } from 'http';
import type { AddressInfo } from 'net';
import { Request, launchPlaywright, playwrightUtils } from '@crawlee/playwright';
import type { Browser } from 'playwright';
import type { Browser, Page } from 'playwright';
import { chromium } from 'playwright';
import { MemoryStorageEmulator } from 'test/shared/MemoryStorageEmulator';
import { startExpressAppPromise } from '../shared/_helper';
Expand Down Expand Up @@ -244,5 +244,59 @@ describe('playwrightUtils', () => {
await browser.close();
}
}, 60_000);

describe('infiniteScroll()', () => {
    /** Runs inside the browser: reports whether the viewport has reached the bottom of the document body. */
    function isAtBottom() {
        return (window.innerHeight + window.pageYOffset) >= document.body.offsetHeight;
    }

    let browser: Browser;
    beforeAll(async () => {
        browser = await chromium.launch({ headless: true });
    });
    afterAll(async () => {
        await browser.close();
    });

    let page: Page;
    beforeEach(async () => {
        page = await browser.newPage();
        // Build a page tall enough to require scrolling. The rows are joined explicitly:
        // interpolating the array directly would insert a comma between every <div>.
        const content = Array.from({ length: 1000 }, (_, index) => {
            return `<div style="border: 1px solid black">Div number: ${index}</div>`;
        }).join('');
        const contentHTML = `<html><body>${content}</body></html>`;
        await page.setContent(contentHTML);
    });
    afterEach(async () => {
        await page.close();
    });

    test('works', async () => {
        const before = await page.evaluate(isAtBottom);
        expect(before).toBe(false);

        await playwrightUtils.infiniteScroll(page, { waitForSecs: 0 });

        const after = await page.evaluate(isAtBottom);
        expect(after).toBe(true);
    });

    test('stopScrollCallback works', async () => {
        const before = await page.evaluate(isAtBottom);
        expect(before).toBe(false);

        await playwrightUtils.infiniteScroll(page, {
            waitForSecs: Infinity,
            stopScrollCallback: async () => true,
        });

        const after = await page.evaluate(isAtBottom);
        // It scrolls to the bottom in the first scroll so this is correct.
        // The test passes because the Infinite waitForSecs is broken by the callback.
        // If it didn't, the test would time out.
        expect(after).toBe(true);
    });
});
});
});