From af8ccb5e1075bfbd053d99d3738128ea3f853db1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maty=C3=A1=C5=A1=20Cimbulka?= Date: Mon, 24 Mar 2025 15:18:52 +0100 Subject: [PATCH 1/5] refactor: Remove defaults from `processInputInternal` function --- src/input.ts | 32 ++++++++++++++++++++------------ src/types.ts | 4 ---- src/utils.ts | 15 ++++++++++++++- 3 files changed, 34 insertions(+), 17 deletions(-) diff --git a/src/input.ts b/src/input.ts index d0f9822..d831732 100644 --- a/src/input.ts +++ b/src/input.ts @@ -4,13 +4,14 @@ import { firefox } from 'playwright'; import { ContentCrawlerTypes, defaults } from './const.js'; import { UserInputError } from './errors.js'; -import type { Input, ContentScraperSettings, OutputFormats, StandbyInput, ContentCrawlerOptions } from './types.js'; +import type { Input, ContentScraperSettings, OutputFormats, ContentCrawlerOptions } from './types.js'; +import inputSchema from '../.actor/input_schema.json' with { type: 'json' }; /** * Processes the input and returns an array of crawler settings. This is ideal for startup of STANDBY mode * because it makes it simple to start all crawlers at once. */ -export async function processStandbyInput(originalInput: Partial | Partial) { +export async function processStandbyInput(originalInput: Partial) { const { input, searchCrawlerOptions, contentScraperSettings } = await processInputInternal(originalInput, true); const proxy = await Actor.createProxyConfiguration(input.proxyConfiguration); @@ -25,7 +26,7 @@ export async function processStandbyInput(originalInput: Partial | Partia /** * Processes the input and returns the settings for the crawler. 
*/ -export async function processInput(originalInput: Partial | Partial) { +export async function processInput(originalInput: Partial) { const { input, searchCrawlerOptions, contentScraperSettings } = await processInputInternal(originalInput); const proxy = await Actor.createProxyConfiguration(input.proxyConfiguration); @@ -40,15 +41,12 @@ export async function processInput(originalInput: Partial | Partial | Partial, + originalInput: Partial, standbyInit: boolean = false, ) { - if (originalInput.outputFormats && typeof originalInput.outputFormats === 'string') { - originalInput.outputFormats = originalInput.outputFormats.split(',').map((format) => format.trim()) as OutputFormats[]; - } - const input = { ...defaults, ...originalInput } as Input; + // const input = { ...defaults, ...originalInput } as Input; - validateAndFillInput(input, standbyInit); + const input = validateAndFillInput(originalInput, standbyInit); const { debugMode, @@ -139,7 +137,7 @@ function createCheerioCrawlerOptions(input: Input, proxy: ProxyConfiguration | u * Do not validate query parameter when standbyInit is true. * This is a bit ugly, but it's necessary to avoid throwing an error when the query is not provided in standby mode. */ -export function validateAndFillInput(input: Input, standbyInit: boolean) { +export function validateAndFillInput(input: Partial, standbyInit: boolean): Input { const validateRange = ( value: number | string | undefined, min: number, @@ -162,10 +160,16 @@ export function validateAndFillInput(input: Input, standbyInit: boolean) { } return value; }; + + // Throw an error if the query is not provided and standbyInit is false. 
if (!input.query && !standbyInit) { throw new UserInputError('The `query` parameter must be provided and non-empty.'); } + if (!input.keepAlive) { + input.keepAlive = true; + } + input.maxResults = validateRange(input.maxResults, 1, defaults.maxResultsMax, defaults.maxResults, 'maxResults'); input.requestTimeoutSecs = validateRange(input.requestTimeoutSecs, 1, defaults.requestTimeoutSecsMax, defaults.requestTimeoutSecs, 'requestTimeoutSecs'); input.serpMaxRetries = validateRange(input.serpMaxRetries, 0, defaults.serpMaxRetriesMax, defaults.serpMaxRetries, 'serpMaxRetries'); @@ -177,13 +181,17 @@ export function validateAndFillInput(input: Input, standbyInit: boolean) { } else if (input.outputFormats.some((format) => !['text', 'markdown', 'html'].includes(format))) { throw new UserInputError('The `outputFormats` array may only contain `text`, `markdown`, or `html`.'); } - if (input.serpProxyGroup !== 'GOOGLE_SERP' && input.serpProxyGroup !== 'SHADER') { + if (!input.serpProxyGroup || input.serpProxyGroup.length === 0) { + input.serpProxyGroup = inputSchema.properties.serpProxyGroup.default as 'GOOGLE_SERP' | 'SHADER'; + } else if (input.serpProxyGroup !== 'GOOGLE_SERP' && input.serpProxyGroup !== 'SHADER') { throw new UserInputError('The `serpProxyGroup` parameter must be either `GOOGLE_SERP` or `SHADER`.'); } - if (input.dynamicContentWaitSecs >= input.requestTimeoutSecs) { + if (!input.dynamicContentWaitSecs || input.dynamicContentWaitSecs >= input.requestTimeoutSecs) { input.dynamicContentWaitSecs = Math.round(input.requestTimeoutSecs / 2); } if (input.scrapingTool !== 'browser-playwright' && input.scrapingTool !== 'raw-http') { throw new UserInputError('The `scrapingTool` parameter must be either `browser-playwright` or `raw-http`.'); } + + return input as Input; } diff --git a/src/types.ts b/src/types.ts index 8cd14ab..3e4a580 100644 --- a/src/types.ts +++ b/src/types.ts @@ -34,10 +34,6 @@ export type Input = { scrapingTool: 'browser-playwright' | 'raw-http'; 
}; -export type StandbyInput = Input & { - outputFormats: OutputFormats[] | string -} - export type OrganicResult = { description?: string; title?: string; diff --git a/src/utils.ts b/src/utils.ts index b47819a..95e722e 100644 --- a/src/utils.ts +++ b/src/utils.ts @@ -2,7 +2,14 @@ import { RequestOptions, log, ProxyConfiguration } from 'crawlee'; import { parse, ParsedUrlQuery } from 'querystring'; import { defaults } from './const.js'; -import { OrganicResult, ContentScraperSettings, TimeMeasure, ContentCrawlerUserData, SearchCrawlerUserData } from './types.js'; +import { + OrganicResult, + ContentScraperSettings, + TimeMeasure, + ContentCrawlerUserData, + SearchCrawlerUserData, + type OutputFormats, +} from './types.js'; import inputSchema from '../.actor/input_schema.json' with { type: 'json' }; export function parseParameters(url: string): ParsedUrlQuery { @@ -17,10 +24,16 @@ export function parseParameters(url: string): ParsedUrlQuery { log.warning(`Unknown parameter: ${key}. Supported parameters: ${Object.keys(defaults).join(', ')}`); continue; } + const typedKey = key as SupportedParamKey; // Schema keys are subset of SupportedParams so we can safely cast type SchemaKey = keyof typeof inputSchema.properties; + // Handle output formats as array + if (typedKey === 'outputFormats' && typeof value === 'string') { + parsedValidatedParams[typedKey] = value.split(',').map((format) => format.trim()) as OutputFormats[]; + } + // Parse non-primitive parameters following input schema because querystring doesn't parse objects if ( !!inputSchema.properties[typedKey as SchemaKey] From 4447af6bfba998d71a734be7cf6b26915e72eb48 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maty=C3=A1=C5=A1=20Cimbulka?= Date: Mon, 24 Mar 2025 17:23:47 +0100 Subject: [PATCH 2/5] refactor: Rewrite `validateAndFillInput` function --- src/input.ts | 148 ++++++++++++++++++++++++++++++++++++++++++-------- src/search.ts | 6 +- src/types.ts | 9 ++- 3 files changed, 134 insertions(+), 29 deletions(-) diff 
--git a/src/input.ts b/src/input.ts index d831732..d167d39 100644 --- a/src/input.ts +++ b/src/input.ts @@ -1,10 +1,18 @@ -import { Actor } from 'apify'; +import { Actor, ProxyConfigurationOptions } from 'apify'; +import { val } from 'cheerio/lib/api/attributes'; import { BrowserName, CheerioCrawlerOptions, log, ProxyConfiguration } from 'crawlee'; import { firefox } from 'playwright'; import { ContentCrawlerTypes, defaults } from './const.js'; import { UserInputError } from './errors.js'; -import type { Input, ContentScraperSettings, OutputFormats, ContentCrawlerOptions } from './types.js'; +import type { + Input, + ContentScraperSettings, + OutputFormats, + ContentCrawlerOptions, + ScrapingTool, + SERPProxyGroup +} from './types.js'; import inputSchema from '../.actor/input_schema.json' with { type: 'json' }; /** @@ -31,8 +39,8 @@ export async function processInput(originalInput: Partial) { const proxy = await Actor.createProxyConfiguration(input.proxyConfiguration); const contentCrawlerOptions: ContentCrawlerOptions = input.scrapingTool === 'raw-http' - ? createCheerioCrawlerOptions(input, proxy) - : createPlaywrightCrawlerOptions(input, proxy); + ? 
createCheerioCrawlerOptions(input, proxy, false) + : createPlaywrightCrawlerOptions(input, proxy, false); return { input, searchCrawlerOptions, contentCrawlerOptions, contentScraperSettings }; } @@ -51,7 +59,6 @@ async function processInputInternal( const { debugMode, dynamicContentWaitSecs, - keepAlive, serpMaxRetries, serpProxyGroup, readableTextCharThreshold, @@ -62,7 +69,7 @@ async function processInputInternal( const proxySearch = await Actor.createProxyConfiguration({ groups: [serpProxyGroup] }); const searchCrawlerOptions: CheerioCrawlerOptions = { - keepAlive, + keepAlive: standbyInit, maxRequestRetries: serpMaxRetries, proxyConfiguration: proxySearch, autoscaledPoolOptions: { desiredConcurrency: 1 }, @@ -82,8 +89,12 @@ async function processInputInternal( return { input, searchCrawlerOptions, contentScraperSettings }; } -function createPlaywrightCrawlerOptions(input: Input, proxy: ProxyConfiguration | undefined): ContentCrawlerOptions { - const { keepAlive, maxRequestRetries, initialConcurrency, maxConcurrency, minConcurrency } = input; +function createPlaywrightCrawlerOptions( + input: Input, + proxy: ProxyConfiguration | undefined, + keepAlive: boolean = true, +): ContentCrawlerOptions { + const { maxRequestRetries, initialConcurrency, maxConcurrency, minConcurrency } = input; return { type: ContentCrawlerTypes.PLAYWRIGHT, @@ -113,8 +124,12 @@ function createPlaywrightCrawlerOptions(input: Input, proxy: ProxyConfiguration }; } -function createCheerioCrawlerOptions(input: Input, proxy: ProxyConfiguration | undefined): ContentCrawlerOptions { - const { keepAlive, maxRequestRetries, initialConcurrency, maxConcurrency, minConcurrency } = input; +function createCheerioCrawlerOptions( + input: Input, + proxy: ProxyConfiguration | undefined, + keepAlive: boolean = true, +): ContentCrawlerOptions { + const { maxRequestRetries, initialConcurrency, maxConcurrency, minConcurrency } = input; return { type: ContentCrawlerTypes.CHEERIO, @@ -137,7 +152,7 @@ function 
createCheerioCrawlerOptions(input: Input, proxy: ProxyConfiguration | u * Do not validate query parameter when standbyInit is true. * This is a bit ugly, but it's necessary to avoid throwing an error when the query is not provided in standby mode. */ -export function validateAndFillInput(input: Partial, standbyInit: boolean): Input { +function validateAndFillInput(input: Partial, standbyInit: boolean): Input { const validateRange = ( value: number | string | undefined, min: number, @@ -166,31 +181,120 @@ export function validateAndFillInput(input: Partial, standbyInit: boolean throw new UserInputError('The `query` parameter must be provided and non-empty.'); } - if (!input.keepAlive) { - input.keepAlive = true; - } - - input.maxResults = validateRange(input.maxResults, 1, defaults.maxResultsMax, defaults.maxResults, 'maxResults'); - input.requestTimeoutSecs = validateRange(input.requestTimeoutSecs, 1, defaults.requestTimeoutSecsMax, defaults.requestTimeoutSecs, 'requestTimeoutSecs'); - input.serpMaxRetries = validateRange(input.serpMaxRetries, 0, defaults.serpMaxRetriesMax, defaults.serpMaxRetries, 'serpMaxRetries'); - input.maxRequestRetries = validateRange(input.maxRequestRetries, 0, defaults.maxRequestRetriesMax, defaults.maxRequestRetries, 'maxRequestRetries'); + // Max results + input.maxResults = validateRange( + input.maxResults, + inputSchema.properties.maxResults.minimum, + inputSchema.properties.maxResults.maximum, + inputSchema.properties.maxResults.default, + 'maxResults', + ); + // Output formats if (!input.outputFormats || input.outputFormats.length === 0) { input.outputFormats = defaults.outputFormats as OutputFormats[]; log.warning(`The \`outputFormats\` parameter must be a non-empty array. 
Using default value \`${defaults.outputFormats}\`.`); } else if (input.outputFormats.some((format) => !['text', 'markdown', 'html'].includes(format))) { throw new UserInputError('The `outputFormats` array may only contain `text`, `markdown`, or `html`.'); } + + // Request timeout seconds + input.requestTimeoutSecs = validateRange( + input.requestTimeoutSecs, + inputSchema.properties.requestTimeoutSecs.minimum, + inputSchema.properties.requestTimeoutSecs.maximum, + inputSchema.properties.requestTimeoutSecs.default, + 'requestTimeoutSecs', + ); + + // SERP proxy group if (!input.serpProxyGroup || input.serpProxyGroup.length === 0) { - input.serpProxyGroup = inputSchema.properties.serpProxyGroup.default as 'GOOGLE_SERP' | 'SHADER'; + input.serpProxyGroup = inputSchema.properties.serpProxyGroup.default as SERPProxyGroup; } else if (input.serpProxyGroup !== 'GOOGLE_SERP' && input.serpProxyGroup !== 'SHADER') { throw new UserInputError('The `serpProxyGroup` parameter must be either `GOOGLE_SERP` or `SHADER`.'); } + + // SERP max retries + input.serpMaxRetries = validateRange( + input.serpMaxRetries, + inputSchema.properties.serpMaxRetries.minimum, + inputSchema.properties.serpMaxRetries.maximum, + inputSchema.properties.serpMaxRetries.default, + 'serpMaxRetries', + ); + + // Proxy configuration + if (!input.proxyConfiguration) { + input.proxyConfiguration = inputSchema.properties.proxyConfiguration.default as ProxyConfigurationOptions; + } + + // Scraping tool + if (!input.scrapingTool) { + input.scrapingTool = inputSchema.properties.scrapingTool.default as ScrapingTool; + } else if (input.scrapingTool !== 'browser-playwright' && input.scrapingTool !== 'raw-http') { + throw new UserInputError('The `scrapingTool` parameter must be either `browser-playwright` or `raw-http`.'); + } + + // Remove elements CSS selector + if (!input.removeElementsCssSelector) { + input.removeElementsCssSelector = inputSchema.properties.removeElementsCssSelector.default; + } + + // TODO: Is this
input necessary? + // HTML transformer + // if (!input.htmlTransformer) { + // input.htmlTransformer = inputSchema.properties.htmlTransformer.default; + // } + + // Initial concurrency + input.initialConcurrency = validateRange( + input.initialConcurrency, + inputSchema.properties.initialConcurrency.minimum, + inputSchema.properties.initialConcurrency.maximum, + inputSchema.properties.initialConcurrency.default, + 'initialConcurrency', + ); + + // Min concurrency + input.minConcurrency = validateRange( + input.minConcurrency, + inputSchema.properties.minConcurrency.minimum, + inputSchema.properties.minConcurrency.maximum, + inputSchema.properties.minConcurrency.default, + 'minConcurrency', + ); + + // Max concurrency + input.maxConcurrency = validateRange( + input.maxConcurrency, + inputSchema.properties.maxConcurrency.minimum, + inputSchema.properties.maxConcurrency.maximum, + inputSchema.properties.maxConcurrency.default, + 'maxConcurrency', + ); + + // Max request retries + input.maxRequestRetries = validateRange( + input.maxRequestRetries, + inputSchema.properties.maxRequestRetries.minimum, + inputSchema.properties.maxRequestRetries.maximum, + inputSchema.properties.maxRequestRetries.default, + 'maxRequestRetries', + ); + + // Dynamic content wait seconds if (!input.dynamicContentWaitSecs || input.dynamicContentWaitSecs >= input.requestTimeoutSecs) { input.dynamicContentWaitSecs = Math.round(input.requestTimeoutSecs / 2); } - if (input.scrapingTool !== 'browser-playwright' && input.scrapingTool !== 'raw-http') { - throw new UserInputError('The `scrapingTool` parameter must be either `browser-playwright` or `raw-http`.'); + + // Remove cookie warnings + if (input.removeCookieWarnings === undefined) { + input.removeCookieWarnings = inputSchema.properties.removeCookieWarnings.default; + } + + // Debug mode + if (input.debugMode === undefined) { + input.debugMode = inputSchema.properties.debugMode.default; } return input as Input; diff --git a/src/search.ts 
b/src/search.ts index 5942928..b31ef57 100644 --- a/src/search.ts +++ b/src/search.ts @@ -65,6 +65,10 @@ async function runSearchProcess(params: Partial): Promise { contentScraperSettings, } = await processInput(params); + // Set keepAlive to true to find the correct crawlers + searchCrawlerOptions.keepAlive = true; + contentCrawlerOptions.crawlerOptions.keepAlive = true; + await createAndStartSearchCrawler(searchCrawlerOptions); const { key: contentCrawlerKey } = await createAndStartContentCrawler(contentCrawlerOptions); @@ -137,8 +141,6 @@ export async function handleSearchNormalMode(input: Input, contentScraperSettings: ContentScraperSettings, ) { const startedTime = Date.now(); - searchCrawlerOptions.keepAlive = false; - contentCrawlerOptions.crawlerOptions.keepAlive = false; contentCrawlerOptions.crawlerOptions.requestHandlerTimeoutSecs = PLAYWRIGHT_REQUEST_TIMEOUT_NORMAL_MODE_SECS; const { crawler: searchCrawler } = await createAndStartSearchCrawler(searchCrawlerOptions, false); diff --git a/src/types.ts b/src/types.ts index 3e4a580..62cdd07 100644 --- a/src/types.ts +++ b/src/types.ts @@ -4,19 +4,18 @@ import { CheerioCrawlerOptions, PlaywrightCrawlerOptions } from 'crawlee'; import { ContentCrawlerTypes } from './const'; export type OutputFormats = 'text' | 'markdown' | 'html'; +export type SERPProxyGroup = 'GOOGLE_SERP' | 'SHADER'; +export type ScrapingTool = 'browser-playwright' | 'raw-http'; export type Input = { debugMode: boolean; requestTimeoutSecs: number; - // both - keepAlive: boolean; - // google search parameters countryCode: string; languageCode: string; maxResults: number; - serpProxyGroup: 'GOOGLE_SERP' | 'SHADER'; + serpProxyGroup: SERPProxyGroup; serpMaxRetries: number; query: string; @@ -31,7 +30,7 @@ export type Input = { readableTextCharThreshold: number; removeElementsCssSelector: string; removeCookieWarnings: boolean; - scrapingTool: 'browser-playwright' | 'raw-http'; + scrapingTool: ScrapingTool; }; export type OrganicResult = { From 
7401ffe2a5da71124d8798a4f55569ff732a1573 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maty=C3=A1=C5=A1=20Cimbulka?= Date: Wed, 26 Mar 2025 14:46:53 +0100 Subject: [PATCH 3/5] refactor: Remove `defaults` object --- src/const.ts | 29 ----------------------------- src/input.ts | 18 ++++++++---------- src/main.ts | 3 +-- src/types.ts | 1 + src/utils.ts | 46 +++++++++++++++++++++++++--------------------- 5 files changed, 35 insertions(+), 62 deletions(-) diff --git a/src/const.ts b/src/const.ts index 8df5514..6acafbd 100644 --- a/src/const.ts +++ b/src/const.ts @@ -1,5 +1,3 @@ -import inputSchema from '../.actor/input_schema.json' with { type: 'json' }; - export enum ContentCrawlerStatus { PENDING = 'pending', HANDLED = 'handled', @@ -18,30 +16,3 @@ export enum ContentCrawlerTypes { } export const PLAYWRIGHT_REQUEST_TIMEOUT_NORMAL_MODE_SECS = 60; - -// Default values parsed from input_schema.json -export const defaults = { - debugMode: inputSchema.properties.debugMode.default, - dynamicContentWaitSecs: inputSchema.properties.dynamicContentWaitSecs.default, - htmlTransformer: inputSchema.properties.htmlTransformer.default, - initialConcurrency: inputSchema.properties.initialConcurrency.default, - keepAlive: true, // Not in input_schema.json - maxConcurrency: inputSchema.properties.maxConcurrency.default, - maxRequestRetries: inputSchema.properties.maxRequestRetries.default, - maxRequestRetriesMax: inputSchema.properties.maxRequestRetries.maximum, - maxResults: inputSchema.properties.maxResults.default, - maxResultsMax: inputSchema.properties.maxResults.maximum, - minConcurrency: inputSchema.properties.minConcurrency.default, - outputFormats: inputSchema.properties.outputFormats.default, - proxyConfiguration: inputSchema.properties.proxyConfiguration.default, - query: undefined, // No default value in input_schema.json - readableTextCharThreshold: 100, // Not in input_schema.json - removeCookieWarnings: inputSchema.properties.removeCookieWarnings.default, - 
removeElementsCssSelector: inputSchema.properties.removeElementsCssSelector.default, - requestTimeoutSecs: inputSchema.properties.requestTimeoutSecs.default, - requestTimeoutSecsMax: inputSchema.properties.requestTimeoutSecs.maximum, - serpMaxRetries: inputSchema.properties.serpMaxRetries.default, - serpMaxRetriesMax: inputSchema.properties.serpMaxRetries.maximum, - serpProxyGroup: inputSchema.properties.serpProxyGroup.default, - scrapingTool: inputSchema.properties.scrapingTool.default, -}; diff --git a/src/input.ts b/src/input.ts index d167d39..bf3b029 100644 --- a/src/input.ts +++ b/src/input.ts @@ -1,9 +1,8 @@ import { Actor, ProxyConfigurationOptions } from 'apify'; -import { val } from 'cheerio/lib/api/attributes'; import { BrowserName, CheerioCrawlerOptions, log, ProxyConfiguration } from 'crawlee'; import { firefox } from 'playwright'; -import { ContentCrawlerTypes, defaults } from './const.js'; +import { ContentCrawlerTypes } from './const.js'; import { UserInputError } from './errors.js'; import type { Input, @@ -11,7 +10,7 @@ import type { OutputFormats, ContentCrawlerOptions, ScrapingTool, - SERPProxyGroup + SERPProxyGroup, } from './types.js'; import inputSchema from '../.actor/input_schema.json' with { type: 'json' }; @@ -162,7 +161,7 @@ function validateAndFillInput(input: Partial, standbyInit: boolean): Inpu ) => { // parse the value as a number to check if it's a valid number if (value === undefined) { - log.warning(`The \`${fieldName}\` parameter must be defined. Using the default value ${defaultValue} instead.`); + log.info(`The \`${fieldName}\` parameter is not defined. 
Using the default value ${defaultValue}.`); return defaultValue; } if (typeof value === 'string') { value = Number(value); @@ -192,8 +191,8 @@ function validateAndFillInput(input: Partial, standbyInit: boolean): Inpu // Output formats if (!input.outputFormats || input.outputFormats.length === 0) { - input.outputFormats = defaults.outputFormats as OutputFormats[]; - log.warning(`The \`outputFormats\` parameter must be a non-empty array. Using default value \`${defaults.outputFormats}\`.`); + input.outputFormats = inputSchema.properties.outputFormats.default as OutputFormats[]; + log.info(`The \`outputFormats\` parameter is not defined. Using default value \`${input.outputFormats}\`.`); } else if (input.outputFormats.some((format) => !['text', 'markdown', 'html'].includes(format))) { throw new UserInputError('The `outputFormats` array may only contain `text`, `markdown`, or `html`.'); } @@ -240,11 +239,10 @@ function validateAndFillInput(input: Partial, standbyInit: boolean): Inpu input.removeElementsCssSelector = inputSchema.properties.removeElementsCssSelector.default; } - // TODO: Is this input necessary? 
// HTML transformer - // if (!input.htmlTransformer) { - // input.htmlTransformer = inputSchema.properties.htmlTransformer.default; - // } + if (!input.htmlTransformer) { + input.htmlTransformer = inputSchema.properties.htmlTransformer.default; + } // Initial concurrency input.initialConcurrency = validateRange( diff --git a/src/main.ts b/src/main.ts index 68c7931..310415d 100644 --- a/src/main.ts +++ b/src/main.ts @@ -80,8 +80,6 @@ if (standbyMode) { `); app.listen(port, async () => { - log.info(`The Actor web server is listening for user requests at ${host}:${port}`); - const promises: Promise[] = []; promises.push(createAndStartSearchCrawler(searchCrawlerOptions)); for (const settings of contentCrawlerOptions) { @@ -89,6 +87,7 @@ if (standbyMode) { } await Promise.all(promises); + log.info(`The Actor web server is listening for user requests at ${host}:${port}`); }); } else { log.info('Actor is running in the NORMAL mode.'); diff --git a/src/types.ts b/src/types.ts index 62cdd07..83b1785 100644 --- a/src/types.ts +++ b/src/types.ts @@ -29,6 +29,7 @@ export type Input = { proxyConfiguration: ProxyConfigurationOptions; readableTextCharThreshold: number; removeElementsCssSelector: string; + htmlTransformer: string; removeCookieWarnings: boolean; scrapingTool: ScrapingTool; }; diff --git a/src/utils.ts b/src/utils.ts index 95e722e..6a88fb4 100644 --- a/src/utils.ts +++ b/src/utils.ts @@ -1,59 +1,63 @@ import { RequestOptions, log, ProxyConfiguration } from 'crawlee'; -import { parse, ParsedUrlQuery } from 'querystring'; +import { parse } from 'querystring'; -import { defaults } from './const.js'; import { OrganicResult, ContentScraperSettings, TimeMeasure, ContentCrawlerUserData, SearchCrawlerUserData, - type OutputFormats, + type OutputFormats, Input, } from './types.js'; import inputSchema from '../.actor/input_schema.json' with { type: 'json' }; -export function parseParameters(url: string): ParsedUrlQuery { +/** + * Parse the query parameters from the URL + */ 
+export function parseParameters(url: string): Partial { const params = parse(url.slice(1)); - type SupportedParamKey = keyof typeof defaults; + type SchemaKey = keyof typeof inputSchema.properties; - const parsedValidatedParams = {} as Record; + const parsedInput: Partial = {}; for (const [key, value] of Object.entries(params)) { + // If the value is undefined skip it + if (value === undefined) continue; + // If the key is not supported by schema or is not Apify API token, skip it - if (key !== 'token' && !Object.keys(defaults).includes(key)) { - log.warning(`Unknown parameter: ${key}. Supported parameters: ${Object.keys(defaults).join(', ')}`); + if (key !== 'token' && !Object.keys(inputSchema.properties).includes(key)) { + log.warning(`Unknown parameter: ${key}. Supported parameters: ${Object.keys(inputSchema.properties).join(', ')}`); continue; } - const typedKey = key as SupportedParamKey; - // Schema keys are subset of SupportedParams so we can safely cast - type SchemaKey = keyof typeof inputSchema.properties; + const typedKey = key as SchemaKey; + // const inputKey = key as keyof typeof parsedInput; - // Handle output formats as array + // Parse outputFormats parameter as an array of OutputFormats if (typedKey === 'outputFormats' && typeof value === 'string') { - parsedValidatedParams[typedKey] = value.split(',').map((format) => format.trim()) as OutputFormats[]; + parsedInput[typedKey] = value.split(',').map((format) => format.trim()) as OutputFormats[]; } // Parse non-primitive parameters following input schema because querystring doesn't parse objects if ( - !!inputSchema.properties[typedKey as SchemaKey] - && ['object', 'array'].includes(inputSchema.properties[typedKey as SchemaKey].type) + !!inputSchema.properties[typedKey] + && ['object', 'array'].includes(inputSchema.properties[typedKey].type) && typeof value === 'string' ) { try { - parsedValidatedParams[typedKey] = JSON.parse(value); - log.debug(`Parsed parameter ${key} from string: ${value} to 
object`, parsedValidatedParams[typedKey] as object); + parsedInput[typedKey] = JSON.parse(value); + log.debug(`Parsed parameter ${key} from string: ${value} to object`, parsedInput[typedKey] as object); } catch (e) { log.warning(`Failed to parse parameter ${key}, it must be valid JSON. Skipping it: ${e}`); } } else { - parsedValidatedParams[typedKey] = value; + // eslint-disable-next-line @typescript-eslint/ban-ts-comment + // @ts-expect-error + parsedInput[typedKey] = value; } } - // TODO: We should unify the type for parameters to single source, - // now we have ParsedUrlQuery, Input and SupportedParams - return parsedValidatedParams as ParsedUrlQuery; + return parsedInput; } export function randomId() { From ae1817e8199b29efd629edbde05fb74b715995d0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maty=C3=A1=C5=A1=20Cimbulka?= Date: Wed, 26 Mar 2025 14:59:46 +0100 Subject: [PATCH 4/5] refactor: Remove unnecessary input parameters --- .actor/input_schema.json | 24 +++--------------- src/input.ts | 53 +++++++++++++--------------------------- src/types.ts | 4 +-- 3 files changed, 21 insertions(+), 60 deletions(-) diff --git a/.actor/input_schema.json b/.actor/input_schema.json index eb50f52..eb9cec9 100644 --- a/.actor/input_schema.json +++ b/.actor/input_schema.json @@ -97,33 +97,15 @@ "prefill": "none", "editor": "hidden" }, - "initialConcurrency": { - "title": "Initial browsing concurrency", + "desiredConcurrency": { + "title": "Desired browsing concurrency", "type": "integer", - "description": "The initial number of web browsers running in parallel. The system automatically scales the number based on the CPU and memory usage, in the range specified by `minConcurrency` and `maxConcurrency`. If the initial value is `0`, the Actor picks the number automatically based on the available memory.", + "description": "The desired number of web browsers running in parallel. The system automatically scales the number based on the CPU and memory usage. 
If the initial value is `0`, the Actor picks the number automatically based on the available memory.", "minimum": 0, "maximum": 50, "default": 5, "editor": "hidden" }, - "minConcurrency": { - "title": "Minimum browsing concurrency", - "type": "integer", - "description": "The minimum number of web browsers running in parallel.", - "minimum": 1, - "maximum": 50, - "default": 3, - "editor": "hidden" - }, - "maxConcurrency": { - "title": "Maximum browsing concurrency", - "type": "integer", - "description": "The maximum number of web browsers running in parallel.", - "minimum": 1, - "maximum": 100, - "default": 50, - "editor": "hidden" - }, "maxRequestRetries": { "title": "Target page max retries", "type": "integer", diff --git a/src/input.ts b/src/input.ts index bf3b029..084d345 100644 --- a/src/input.ts +++ b/src/input.ts @@ -60,7 +60,10 @@ async function processInputInternal( dynamicContentWaitSecs, serpMaxRetries, serpProxyGroup, + outputFormats, readableTextCharThreshold, + removeElementsCssSelector, + htmlTransformer, removeCookieWarnings, } = input; @@ -77,12 +80,12 @@ async function processInputInternal( const contentScraperSettings: ContentScraperSettings = { debugMode, dynamicContentWaitSecs, - htmlTransformer: 'none', + htmlTransformer, maxHtmlCharsToProcess: 1.5e6, - outputFormats: input.outputFormats as OutputFormats[], + outputFormats, readableTextCharThreshold, removeCookieWarnings, - removeElementsCssSelector: input.removeElementsCssSelector, + removeElementsCssSelector, }; return { input, searchCrawlerOptions, contentScraperSettings }; @@ -93,7 +96,7 @@ function createPlaywrightCrawlerOptions( proxy: ProxyConfiguration | undefined, keepAlive: boolean = true, ): ContentCrawlerOptions { - const { maxRequestRetries, initialConcurrency, maxConcurrency, minConcurrency } = input; + const { maxRequestRetries, desiredConcurrency } = input; return { type: ContentCrawlerTypes.PLAYWRIGHT, @@ -115,9 +118,7 @@ function createPlaywrightCrawlerOptions( 
retireInactiveBrowserAfterSecs: 60, }, autoscaledPoolOptions: { - desiredConcurrency: initialConcurrency === 0 ? undefined : Math.min(initialConcurrency, maxConcurrency), - maxConcurrency, - minConcurrency, + desiredConcurrency, }, }, }; @@ -128,7 +129,7 @@ function createCheerioCrawlerOptions( proxy: ProxyConfiguration | undefined, keepAlive: boolean = true, ): ContentCrawlerOptions { - const { maxRequestRetries, initialConcurrency, maxConcurrency, minConcurrency } = input; + const { maxRequestRetries, desiredConcurrency } = input; return { type: ContentCrawlerTypes.CHEERIO, @@ -138,9 +139,7 @@ function createCheerioCrawlerOptions( proxyConfiguration: proxy, requestHandlerTimeoutSecs: input.requestTimeoutSecs, autoscaledPoolOptions: { - desiredConcurrency: initialConcurrency === 0 ? undefined : Math.min(initialConcurrency, maxConcurrency), - maxConcurrency, - minConcurrency, + desiredConcurrency, }, }, }; @@ -244,31 +243,13 @@ function validateAndFillInput(input: Partial, standbyInit: boolean): Inpu input.htmlTransformer = inputSchema.properties.htmlTransformer.default; } - // Initial concurrency - input.initialConcurrency = validateRange( - input.initialConcurrency, - inputSchema.properties.initialConcurrency.minimum, - inputSchema.properties.initialConcurrency.maximum, - inputSchema.properties.initialConcurrency.default, - 'initialConcurrency', - ); - - // Min concurrency - input.minConcurrency = validateRange( - input.minConcurrency, - inputSchema.properties.minConcurrency.minimum, - inputSchema.properties.minConcurrency.maximum, - inputSchema.properties.minConcurrency.default, - 'minConcurrency', - ); - - // Max concurrency - input.maxConcurrency = validateRange( - input.maxConcurrency, - inputSchema.properties.maxConcurrency.minimum, - inputSchema.properties.maxConcurrency.maximum, - inputSchema.properties.maxConcurrency.default, - 'maxConcurrency', + // Desired concurrency + input.desiredConcurrency = validateRange( + input.desiredConcurrency, + 
inputSchema.properties.desiredConcurrency.minimum, + inputSchema.properties.desiredConcurrency.maximum, + inputSchema.properties.desiredConcurrency.default, + 'desiredConcurrency', ); // Max request retries diff --git a/src/types.ts b/src/types.ts index 83b1785..6dd0b4a 100644 --- a/src/types.ts +++ b/src/types.ts @@ -22,10 +22,8 @@ export type Input = { // content crawler parameters dynamicContentWaitSecs: number; outputFormats: OutputFormats[]; - initialConcurrency: number; - maxConcurrency: number; + desiredConcurrency: number; maxRequestRetries: number; - minConcurrency: number; proxyConfiguration: ProxyConfigurationOptions; readableTextCharThreshold: number; removeElementsCssSelector: string; From ffa06cbafc0f52899abeea5b098b3c664b8f45ad Mon Sep 17 00:00:00 2001 From: Matyas Cimbulka Date: Fri, 28 Mar 2025 08:31:44 +0100 Subject: [PATCH 5/5] refactor: Remove table with query params from README.md --- README.md | 18 +----------------- 1 file changed, 1 insertion(+), 17 deletions(-) diff --git a/README.md b/README.md index 5821fd7..8f2cef2 100644 --- a/README.md +++ b/README.md @@ -105,23 +105,7 @@ The response is a JSON array with objects containing the web content from the fo #### Query parameters -The `/search` GET HTTP endpoint accepts the following query parameters: - -| Parameter | Type | Default | Description | -|------------------------------|---------|---------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| `query` | string | N/A | Enter Google Search keywords or a URL to a specific web 
page. The keywords might include the [advanced search operators](https://blog.apify.com/how-to-scrape-google-like-a-pro/). You need to percent-encode the value if it contains some special characters. | -| `maxResults` | number | `3` | The maximum number of top organic Google Search results whose web pages will be extracted. If `query` is a URL, then this parameter is ignored and the Actor only fetches the specific web page. | -| `outputFormats` | string | `markdown` | Select one or more formats to which the target web pages will be extracted. Use comma to separate multiple values (e.g. `text,markdown`) | -| `requestTimeoutSecs` | number | `30` | The maximum time in seconds available for the request, including querying Google Search and scraping the target web pages. For example, OpenAI allows only [45 seconds](https://platform.openai.com/docs/actions/production#timeouts) for custom actions. If a target page loading and extraction exceeds this timeout, the corresponding page will be skipped in results to ensure at least some results are returned within the timeout. If no page is extracted within the timeout, the whole request fails. | -| `serpProxyGroup` | string | `GOOGLE_SERP` | Enables overriding the default Apify Proxy group used for fetching Google Search results. | -| `serpMaxRetries` | number | `1` | The maximum number of times the Actor will retry fetching the Google Search results on error. If the last attempt fails, the entire request fails. | -| `scrapingTool` | string | `raw-http` | Selects which scraping tool is used to extract the target websites. `browser-playwright` uses browser and can handle complex Javascript heavy website. Meanwhile `raw-http` uses simple HTTP request to fetch the HTML provided by the URL, it can't handle websites that rely on Javascript but it's about two times faster. | -| `maxRequestRetries` | number | `1` | The maximum number of times the Actor will retry loading the target web page on error. 
If the last attempt fails, the page will be skipped in the results. | -| `dynamicContentWaitSecs` | number | `10` | The maximum time in seconds to wait for dynamic page content to load. The Actor considers the web page as fully loaded once this time elapses or when the network becomes idle. | -| `removeCookieWarnings` | boolean | `true` | If enabled, removes cookie consent dialogs to improve text extraction accuracy. This might increase latency. | -| `removeElementsCssSelector` | string | `see input` | A CSS selector matching HTML elements that will be removed from the DOM, before converting it to text, Markdown, or saving as HTML. This is useful to skip irrelevant page content. The value must be a valid CSS selector as accepted by the `document.querySelectorAll()` function. \n\nBy default, the Actor removes common navigation elements, headers, footers, modals, scripts, and inline image. You can disable the removal by setting this value to some non-existent CSS selector like `dummy_keep_everything`. | -| `debugMode` | boolean | `false` | If enabled, the Actor will store debugging information in the dataset's debug field. | -Standby mode also supports the rest of the input parameters [described on the Actor page](https://apify.com/apify/rag-web-browser/input-schema). Object parameters like `proxyConfiguration` should be passed as url-encoded JSON strings. +The `/search` GET HTTP endpoint accepts all the input parameters [described on the Actor page](https://apify.com/apify/rag-web-browser/input-schema). Object parameters like `proxyConfiguration` should be passed as url-encoded JSON strings. ## 🔌 Integration with LLMs