feat: got-scraping v4 (#2110)
vladfrangu committed Nov 14, 2023
1 parent 3d37f8d commit 2f05ed2
Showing 21 changed files with 127 additions and 181 deletions.
2 changes: 1 addition & 1 deletion .eslintrc.json
@@ -8,7 +8,7 @@
"extends": "@apify/eslint-config-ts",
"parserOptions": {
"project": "./tsconfig.eslint.json",
"ecmaVersion": 2020
"ecmaVersion": 2022
},
"ignorePatterns": [
"node_modules",
2 changes: 1 addition & 1 deletion package.json
@@ -102,7 +102,7 @@
"rimraf": "^5.0.0",
"ts-node": "^10.9.1",
"turbo": "1.10.16",
"typescript": "^5.0.0",
"typescript": "^5.2.2",
"vite-tsconfig-paths": "^4.2.1",
"vitest": "^0.34.6"
},
2 changes: 1 addition & 1 deletion packages/basic-crawler/package.json
@@ -51,7 +51,7 @@
"@crawlee/core": "3.5.8",
"@crawlee/types": "3.5.8",
"@crawlee/utils": "3.5.8",
"got-scraping": "^3.2.15",
"got-scraping": "^4.0.0",
"ow": "^0.28.1",
"tldts": "^6.0.0",
"tslib": "^2.4.0",
6 changes: 3 additions & 3 deletions packages/basic-crawler/src/internals/basic-crawler.ts
@@ -48,11 +48,11 @@ import {
validators,
} from '@crawlee/core';
import type { Awaitable, BatchAddRequestsResult, Dictionary, SetStatusMessageOptions } from '@crawlee/types';
import { ROTATE_PROXY_ERRORS } from '@crawlee/utils';
import { ROTATE_PROXY_ERRORS, gotScraping } from '@crawlee/utils';
import { stringify } from 'csv-stringify/sync';
import { ensureDir, writeFile, writeJSON } from 'fs-extra';
import type { Method, OptionsInit } from 'got-scraping';
import { gotScraping } from 'got-scraping';
// @ts-expect-error This throws a compilation error due to got-scraping being ESM only but we only import types, so its alllll gooooood
import type { OptionsInit, Method } from 'got-scraping';
import ow, { ArgumentError } from 'ow';
import { getDomain } from 'tldts';
import type { SetRequired } from 'type-fest';
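
The change above keeps only a type-only import from `got-scraping` in the basic crawler and takes the runtime `gotScraping` value from `@crawlee/utils` instead. A minimal sketch of why this works in CJS output, assuming the repository's Node16 module settings; the `buildRequestOptions` helper is illustrative only, not part of the commit:

```ts
// `import type` is erased by tsc, so the emitted CommonJS never executes
// require('got-scraping') and the ESM-only package cannot break module loading.
// @ts-expect-error got-scraping v4 ships ESM-only; only types are pulled in here
import type { Method, OptionsInit } from 'got-scraping';

// Hypothetical helper: the types are free to use at compile time, while any
// runtime call goes through the lazy wrapper re-exported from @crawlee/utils.
function buildRequestOptions(method: Method, url: string): OptionsInit {
    return { method, url };
}
```
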
1 change: 1 addition & 0 deletions packages/core/src/crawlers/crawler_commons.ts
@@ -1,4 +1,5 @@
import type { Dictionary, BatchAddRequestsResult } from '@crawlee/types';
// @ts-expect-error This throws a compilation error due to got-scraping being ESM only but we only import types, so its alllll gooooood
import type { Response as GotResponse, OptionsInit } from 'got-scraping';

import type { EnqueueLinksOptions } from '../enqueue_links/enqueue_links';
2 changes: 1 addition & 1 deletion packages/http-crawler/package.json
@@ -61,7 +61,7 @@
"@types/content-type": "^1.1.5",
"cheerio": "^1.0.0-rc.12",
"content-type": "^1.0.4",
"got-scraping": "^3.2.15",
"got-scraping": "^4.0.0",
"iconv-lite": "^0.6.3",
"mime-types": "^2.1.35",
"ow": "^0.28.1",
19 changes: 14 additions & 5 deletions packages/http-crawler/src/internals/http-crawler.ts
@@ -28,17 +28,19 @@ import {
SessionError,
} from '@crawlee/basic';
import type { Awaitable, Dictionary } from '@crawlee/types';
import { RETRY_CSS_SELECTORS } from '@crawlee/utils';
import { RETRY_CSS_SELECTORS, gotScraping } from '@crawlee/utils';
import * as cheerio from 'cheerio';
import type { RequestLike, ResponseLike } from 'content-type';
import contentTypeParser from 'content-type';
import type { OptionsInit, Method, Request as GotRequest, Options, PlainResponse } from 'got-scraping';
import { gotScraping, TimeoutError } from 'got-scraping';
// @ts-expect-error This throws a compilation error due to got-scraping being ESM only but we only import types, so its alllll gooooood
import type { OptionsInit, Method, Request as GotRequest, Options, PlainResponse, TimeoutError as TimeoutErrorClass } from 'got-scraping';
import iconv from 'iconv-lite';
import mime from 'mime-types';
import ow from 'ow';
import type { JsonValue } from 'type-fest';

let TimeoutError: typeof TimeoutErrorClass;

/**
* Default mime types, which HttpScraper supports.
*/
@@ -607,6 +609,10 @@ export class HttpCrawler<Context extends InternalHttpCrawlingContext<any, any, H
* received content type matches text/html, application/xml, application/xhtml+xml.
*/
protected async _requestFunction({ request, session, proxyUrl, gotOptions }: RequestFunctionOptions): Promise<PlainResponse> {
if (!TimeoutError) {
({ TimeoutError } = await import('got-scraping'));
}

const opts = this._getRequestOptions(request, session, proxyUrl, gotOptions);

try {
@@ -800,8 +806,11 @@ export class HttpCrawler<Context extends InternalHttpCrawlingContext<any, any, H
* @internal wraps public utility for mocking purposes
*/
private _requestAsBrowser = async (options: OptionsInit & { isStream: true }, session?: Session) => {
return new Promise<PlainResponse>((resolve, reject) => {
const stream = gotScraping(options);
// eslint-disable-next-line no-async-promise-executor
return new Promise<PlainResponse>(async (resolve, reject) => {
// This await may not be needed after the initial call, but this is needed to actually get got-scraping loaded
// eslint-disable-next-line @typescript-eslint/await-thenable
const stream = await gotScraping(options);

stream.on('redirect', (updatedOptions: Options, redirectResponse: PlainResponse) => {
if (this.persistCookiesPerSession) {
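
The http-crawler change above defers loading `TimeoutError` until the first request, because got-scraping v4 is ESM-only and can no longer be `require()`d when the CJS build of the crawler loads. A minimal sketch of that lazy-class-import pattern in isolation; the `isTimeoutError` helper is illustrative, not part of the commit:

```ts
// @ts-expect-error got-scraping is ESM only; importing just the type is erased at compile time
import type { TimeoutError as TimeoutErrorClass } from 'got-scraping';

// Cache the class after the first dynamic import so later checks skip the import.
let TimeoutError: typeof TimeoutErrorClass;

async function isTimeoutError(error: unknown): Promise<boolean> {
    if (!TimeoutError) {
        // Dynamic import() works from CommonJS and resolves the ESM-only package at runtime.
        ({ TimeoutError } = await import('got-scraping'));
    }
    return error instanceof TimeoutError;
}
```
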
@@ -15,6 +15,7 @@ import type {
import { HttpCrawler, enqueueLinks, Router, resolveBaseUrlForEnqueueLinksFiltering, tryAbsoluteURL } from '@crawlee/http';
import type { Dictionary } from '@crawlee/types';
import type * as cheerio from 'cheerio';
// @ts-expect-error This throws a compilation error due to TypeScript not inferring the module has CJS versions too
import { DOMParser } from 'linkedom/cached';

export type LinkeDOMErrorHandler<
2 changes: 1 addition & 1 deletion packages/utils/package.json
@@ -51,7 +51,7 @@
"@apify/ps-tree": "^1.2.0",
"@crawlee/types": "3.5.8",
"cheerio": "^1.0.0-rc.12",
"got-scraping": "^3.2.15",
"got-scraping": "^4.0.0",
"ow": "^0.28.1",
"tslib": "^2.4.0"
}
2 changes: 2 additions & 0 deletions packages/utils/src/index.ts
@@ -9,4 +9,6 @@ export * as social from './internals/social';
export * from './internals/typedefs';
export * from './internals/error_tracker';
export * from './internals/open_graph_parser';
export * from './internals/gotScraping';

export { Dictionary, Awaitable, Constructor } from '@crawlee/types';
2 changes: 1 addition & 1 deletion packages/utils/src/internals/extract-urls.ts
@@ -1,7 +1,7 @@
import { gotScraping } from 'got-scraping';
import ow from 'ow';

import { URL_NO_COMMAS_REGEX } from './general';
import { gotScraping } from './gotScraping';

export interface DownloadListOfUrlsOptions {
/**
11 changes: 11 additions & 0 deletions packages/utils/src/internals/gotScraping.ts
@@ -0,0 +1,11 @@
// @ts-expect-error This throws a compilation error due to got-scraping being ESM only but we only import types, so its alllll gooooood
import type { GotScraping } from 'got-scraping';

// eslint-disable-next-line import/no-mutable-exports -- Borrowing a book from NodeJS's code, we override the method with the imported one once the method is called
let gotScraping = (async (...args: Parameters<GotScraping>) => {
({ gotScraping } = await import('got-scraping'));

return gotScraping(...args);
}) as GotScraping;

export { gotScraping };
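
The new wrapper above exports a `gotScraping` that dynamically imports the ESM-only package on first call and then swaps itself out for the real implementation, and `packages/utils/src/index.ts` re-exports it. A minimal usage sketch from downstream CJS code; the `fetchBody` helper and its options are illustrative, not taken from the commit:

```ts
import { gotScraping } from '@crawlee/utils';

// The first call awaits the dynamic import of got-scraping; subsequent calls
// go straight to the real client because the wrapper reassigns itself.
async function fetchBody(url: string): Promise<string> {
    const response = await gotScraping({ url, responseType: 'text' });
    return response.body;
}
```
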
9 changes: 0 additions & 9 deletions test/core/crawlers/browser_crawler.test.ts
@@ -17,22 +17,13 @@ import {
Session,
} from '@crawlee/puppeteer';
import { sleep } from '@crawlee/utils';
import { gotScraping } from 'got-scraping';
import puppeteer from 'puppeteer';
import type { HTTPResponse } from 'puppeteer';
import { runExampleComServer } from 'test/shared/_helper';
import { MemoryStorageEmulator } from 'test/shared/MemoryStorageEmulator';

import { BrowserCrawlerTest } from './basic_browser_crawler';

vitest.mock('got-scraping', async () => {
const original: typeof import('got-scraping') = await vitest.importActual('got-scraping');
return {
...original,
gotScraping: vitest.fn(),
};
});

describe('BrowserCrawler', () => {
let prevEnvHeadless: string;
let logLevel: number;
1 change: 1 addition & 0 deletions test/core/crawlers/cheerio_crawler.test.ts
@@ -21,6 +21,7 @@ import {
} from '@crawlee/cheerio';
import { sleep } from '@crawlee/utils';
import type { Dictionary } from '@crawlee/utils';
// @ts-expect-error This throws a compilation error due to got-scraping being ESM only but we only import types, so its alllll gooooood
import type { OptionsInit } from 'got-scraping';
import iconv from 'iconv-lite';
import { runExampleComServer, responseSamples } from 'test/shared/_helper';
9 changes: 4 additions & 5 deletions test/core/request_list.test.ts
@@ -1,7 +1,6 @@
import log from '@apify/log';
import { Configuration, deserializeArray, EventType, KeyValueStore, ProxyConfiguration, Request, RequestList } from '@crawlee/core';
import { sleep } from '@crawlee/utils';
import { gotScraping } from 'got-scraping';
import { MemoryStorageEmulator } from 'test/shared/MemoryStorageEmulator';

/**
@@ -16,14 +15,14 @@ function shuffle(array: unknown[]) : unknown[] {
return out;
}

vitest.mock('got-scraping', async () => {
const original: typeof import('got-scraping') = await vitest.importActual('got-scraping');
vitest.mock('@crawlee/utils/src/internals/gotScraping', async () => {
return {
...original,
gotScraping: vitest.fn(original.gotScraping),
gotScraping: vitest.fn(),
};
});

const { gotScraping } = await import('@crawlee/utils/src/internals/gotScraping');

const gotScrapingSpy = vitest.mocked(gotScraping);

describe('RequestList', () => {
9 changes: 4 additions & 5 deletions test/core/storages/request_queue.test.ts
@@ -10,18 +10,17 @@ import {
ProxyConfiguration,
} from '@crawlee/core';
import { sleep } from '@crawlee/utils';
import { gotScraping } from 'got-scraping';

import { MemoryStorageEmulator } from '../../shared/MemoryStorageEmulator';

vitest.mock('got-scraping', async () => {
const original: typeof import('got-scraping') = await vitest.importActual('got-scraping');
vitest.mock('@crawlee/utils/src/internals/gotScraping', async () => {
return {
...original,
gotScraping: vitest.fn(original.gotScraping),
gotScraping: vitest.fn(),
};
});

const { gotScraping } = await import('@crawlee/utils/src/internals/gotScraping');

const gotScrapingSpy = vitest.mocked(gotScraping);

describe('RequestQueue remote', () => {
7 changes: 6 additions & 1 deletion test/tsconfig.json
@@ -9,6 +9,11 @@
"noUnusedLocals": false,
"noUnusedParameters": false,
"strictNullChecks": false,
"types": ["vitest/globals"]
"types": ["vitest/globals"],
"module": "Node16",
"moduleResolution": "Bundler",
"paths": {
"@crawlee/utils/src/*": ["packages/utils/src/*"]
}
}
}
15 changes: 5 additions & 10 deletions test/utils/extract-urls.test.ts
@@ -6,23 +6,18 @@ import {
extractUrls,
URL_WITH_COMMAS_REGEX,
} from '@crawlee/utils';
import { gotScraping } from 'got-scraping';

const baseDataPath = path.join(__dirname, '..', 'shared', 'data');

vitest.mock('got-scraping', async () => {
const original: typeof import('got-scraping') = await vitest.importActual('got-scraping');
vitest.mock('@crawlee/utils/src/internals/gotScraping', async () => {
return {
...original,
gotScraping: vitest.fn(),
};
});

const gotScrapingSpy = vitest.mocked(gotScraping);
const { gotScraping } = await import('@crawlee/utils/src/internals/gotScraping');

afterAll(() => {
vitest.doUnmock('got-scraping');
});
const baseDataPath = path.join(__dirname, '..', 'shared', 'data');

const gotScrapingSpy = vitest.mocked(gotScraping);

describe('downloadListOfUrls()', () => {
test('downloads a list of URLs', async () => {
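
The test files now mock the internal wrapper module instead of `got-scraping` itself, which is also why `test/tsconfig.json` gains the `@crawlee/utils/src/*` path mapping above: the mocked specifier has to resolve from the test tree. A self-contained sketch of how such a mocked spy is typically driven; the test body and stubbed response are illustrative, not taken from the commit:

```ts
import { expect, test, vitest } from 'vitest';

// Mirror the mock setup from the diff: replace the lazy wrapper with a plain spy.
vitest.mock('@crawlee/utils/src/internals/gotScraping', async () => {
    return {
        gotScraping: vitest.fn(),
    };
});

const { gotScraping } = await import('@crawlee/utils/src/internals/gotScraping');
const gotScrapingSpy = vitest.mocked(gotScraping);

test('code under test talks to the mocked client', async () => {
    // Stub a got-like response; real tests stub whatever shape the caller reads.
    gotScrapingSpy.mockResolvedValueOnce({ body: 'https://example.com\n' } as any);

    const { body } = await gotScraping({ url: 'https://example.com' } as any);

    expect(body).toBe('https://example.com\n');
    expect(gotScrapingSpy).toHaveBeenCalledTimes(1);
});
```
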
4 changes: 3 additions & 1 deletion tsconfig.build.json
@@ -11,7 +11,9 @@
"allowJs": true,
"skipLibCheck": true,
"resolveJsonModule": false,
"emitDecoratorMetadata": false
"emitDecoratorMetadata": false,
"module": "Node16",
"moduleResolution": "Node16"
},
"include": [
"./packages/*/src/**/*"
2 changes: 1 addition & 1 deletion vitest.config.ts
@@ -16,7 +16,7 @@ if (isCI) {
export default defineConfig({
plugins: [tsconfigPaths()],
esbuild: {
target: 'es2021',
target: 'es2022',
keepNames: true,
},
test: {
