Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add puppeteer util to block ads, trackers, and annoyances #600

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@
"dependencies": {
"@apify/http-request": "^1.1.5",
"@apify/ps-tree": "^1.1.3",
"@cliqz/adblocker-puppeteer": "^1.9.0",
"@types/cheerio": "^0.22.15",
"@types/node": "^13.1.8",
"@types/puppeteer": "^2.0.0",
Expand All @@ -61,6 +62,7 @@
"apify-shared": "^0.1.66",
"cheerio": "^1.0.0-rc.3",
"content-type": "^1.0.4",
"cross-fetch": "^3.0.4",
"express": "^4.17.1",
"fs-extra": "^8.1.0",
"htmlparser2": "^3.10.1",
Expand Down
104 changes: 104 additions & 0 deletions src/puppeteer_utils.js
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import path from 'path';
import fs from 'fs';
import vm from 'vm';
import util from 'util';
Expand All @@ -7,6 +8,8 @@ import { checkParamOrThrow } from 'apify-client/build/utils';
import { checkParamPrototypeOrThrow } from 'apify-shared/utilities';
import LruCache from 'apify-shared/lru_cache';
import { Page, Response } from 'puppeteer'; // eslint-disable-line no-unused-vars
import { PuppeteerBlocker, adsList, adsAndTrackingLists, fullLists } from '@cliqz/adblocker-puppeteer';
import fetch from 'cross-fetch';

import { RequestQueue, RequestQueueLocal } from './request_queue';
import Request from './request';
Expand All @@ -18,6 +21,7 @@ import { openKeyValueStore } from './key_value_store';
const jqueryPath = require.resolve('jquery/dist/jquery.min');
const underscorePath = require.resolve('underscore/underscore-min');
const readFilePromised = util.promisify(fs.readFile);
const writeFilePromised = util.promisify(fs.writeFile);

const MAX_INJECT_FILE_CACHE_SIZE = 10;
const DEFAULT_BLOCK_REQUEST_URL_PATTERNS = ['.css', '.jpg', '.jpeg', '.png', '.svg', '.gif', '.woff', '.pdf', '.zip'];
Expand Down Expand Up @@ -274,6 +278,105 @@ const blockResources = async (page, resourceTypes = ['stylesheet', 'font', 'imag
});
};

/**
 * Cache of adblocker engines, keyed by blocking level (`'ads'`, `'trackers'`
 * or `'annoyances'`). Each entry is the *promise* of an engine, stored as soon
 * as initialization starts, so that concurrent calls to `blockAdsAndTrackers()`
 * share a single initialization instead of each fetching and parsing the lists.
 * Entries are created lazily and then re-used for any number of pages.
 */
const ADBLOCK_ENGINES = new Map();

/**
 * Returns the shared adblocker engine for the requested blocking level,
 * starting its initialization on first use.
 *
 * @param {Object} options
 * @param {Boolean} options.blockTrackers
 * @param {Boolean} options.blockAnnoyances
 * @return {Promise} Resolves to the engine returned by `PuppeteerBlocker.fromLists()`.
 * @ignore
 */
const getAdblockEngine = ({ blockTrackers, blockAnnoyances }) => {
    let lists = adsList;
    let name = 'ads';

    if (blockTrackers) {
        lists = adsAndTrackingLists;
        name = 'trackers';
    }

    // This takes priority over `blockTrackers` option, so this test should
    // be done last; this is because `fullLists` already includes everything
    // from `adsAndTrackingLists`.
    if (blockAnnoyances) {
        lists = fullLists;
        name = 'annoyances';
    }

    const cachedEngine = ADBLOCK_ENGINES.get(name);
    if (cachedEngine !== undefined) return cachedEngine;

    // This allows the adblocker engine to be serialized on disk to allow
    // very fast initialization for subsequent runs (this also means that
    // no network request has to be performed and the adblocker can be
    // ready in <100ms).
    const caching = {
        path: path.join(__dirname, `engine-${name}.bin`),
        read: readFilePromised,
        write: writeFilePromised,
    };

    // Initialize engine, either from cache (if already available on disk)
    // or by fetching rules from `lists` and parsing them.
    const enginePromise = PuppeteerBlocker.fromLists(
        fetch,
        lists,
        { enableCompression: true },
        caching,
    );
    ADBLOCK_ENGINES.set(name, enginePromise);

    // Do not keep a failed initialization around: evict it so that a later
    // call can retry. The rejection itself still reaches the caller through
    // `enginePromise`.
    enginePromise.catch(() => {
        if (ADBLOCK_ENGINES.get(name) === enginePromise) ADBLOCK_ENGINES.delete(name);
    });

    return enginePromise;
};

/**
 * Forces the Puppeteer browser tab to block all advertising requests. This is
 * useful to speed up crawling of websites, since it reduces the amount of data
 * that needs to be downloaded from the web, as well as the amount of
 * JavaScript which needs to run in a given page.
 *
 * By default, the function will block all ads as defined by rules from the
 * following widely-used subscriptions: Easylist, uBlock Origin filters, and
 * Peter Lowe serverlist.
 *
 * Additionally, blocking of trackers can be enabled using the `blockTrackers`
 * option and blocking extra annoyances (e.g. cookie notice banners, etc.) with
 * the `blockAnnoyances` option. This will load extra rules from subscriptions:
 * EasyPrivacy, uBlock Origin privacy list, Easylist Cookies, etc.
 *
 * The options are honored on every call: one engine is kept per blocking level
 * (ads, trackers, annoyances) and shared by all pages using that level.
 *
 * The function will never block main document loads and their respective redirects.
 *
 * **Example usage**
 * ```javascript
 * const Apify = require('apify');
 *
 * const browser = await Apify.launchPuppeteer();
 * const page = await browser.newPage();
 *
 * await Apify.utils.puppeteer.blockAdsAndTrackers(page, {
 *     blockTrackers: true,
 *     // blockAnnoyances: true,
 * });
 *
 * await page.goto('https://cnn.com');
 * ```
 *
 * @param {Page} page
 *   Puppeteer <a href="https://pptr.dev/#?product=Puppeteer&show=api-class-page" target="_blank"><code>Page</code></a> object.
 * @param {Object} [options]
 * @param {Boolean} [options.blockTrackers=false] Enable blocking of trackers.
 * @param {Boolean} [options.blockAnnoyances=false] Enable blocking of annoyances.
 *   Takes priority over `blockTrackers`.
 * @return {Promise}
 * @memberOf puppeteer
 */
const blockAdsAndTrackers = async (page, {
    blockTrackers = false,
    blockAnnoyances = false,
} = {}) => {
    // Engines are initialized on first use and then re-used for other pages,
    // so at most one engine per blocking level exists in memory at any time.
    const engine = await getAdblockEngine({ blockTrackers, blockAnnoyances });

    page.on('framenavigated', frame => engine.onFrameNavigated(frame));

    await addInterceptRequestHandler(page, (request) => {
        engine.onRequest(request);
    });
};

/**
* *NOTE:* In recent versions of Puppeteer, using this function entirely disables the browser cache, which results in sub-optimal
* performance. Until this is resolved, we suggest just relying on the in-browser cache unless absolutely necessary.
Expand Down Expand Up @@ -598,6 +701,7 @@ export const puppeteerUtils = {
enqueueLinksByClickingElements,
blockRequests,
blockResources,
blockAdsAndTrackers,
cacheResponses,
compileScript,
gotoExtended,
Expand Down