From c804219e65a606f21bc68016242151f24055a747 Mon Sep 17 00:00:00 2001 From: metalwarrior665 Date: Mon, 17 Mar 2025 15:42:32 +0100 Subject: [PATCH 1/4] fix: support all input fields as query params in standby (mainly proxy) --- README.md | 3 +-- src/utils.ts | 23 ++++++++++++++++++++++- 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index ccd329e..5821fd7 100644 --- a/README.md +++ b/README.md @@ -121,8 +121,7 @@ The `/search` GET HTTP endpoint accepts the following query parameters: | `removeCookieWarnings` | boolean | `true` | If enabled, removes cookie consent dialogs to improve text extraction accuracy. This might increase latency. | | `removeElementsCssSelector` | string | `see input` | A CSS selector matching HTML elements that will be removed from the DOM, before converting it to text, Markdown, or saving as HTML. This is useful to skip irrelevant page content. The value must be a valid CSS selector as accepted by the `document.querySelectorAll()` function. \n\nBy default, the Actor removes common navigation elements, headers, footers, modals, scripts, and inline image. You can disable the removal by setting this value to some non-existent CSS selector like `dummy_keep_everything`. | | `debugMode` | boolean | `false` | If enabled, the Actor will store debugging information in the dataset's debug field. | - - +Standby mode also supports the rest of the input parameters [described on the Actor page](https://apify.com/apify/rag-web-browser/input-schema). Object parameters like `proxyConfiguration` should be passed as URL-encoded JSON strings. 
## 🔌 Integration with LLMs diff --git a/src/utils.ts b/src/utils.ts index 4028e28..d303c00 100644 --- a/src/utils.ts +++ b/src/utils.ts @@ -3,9 +3,30 @@ import { parse, ParsedUrlQuery } from 'querystring'; import { defaults } from './const.js'; import { OrganicResult, ContentScraperSettings, TimeMeasure, ContentCrawlerUserData, SearchCrawlerUserData } from './types.js'; +import inputSchema from '../.actor/input_schema.json' with { type: 'json' }; export function parseParameters(url: string): ParsedUrlQuery { - return parse(url.slice(1)); + const params = parse(url.slice(1)); + + // Parse non-primitive parameters following input schema + type SupportedParams = keyof typeof inputSchema.properties + for (const [key, value] of Object.entries(params)) { + // If the key is not supported by schema, skip it + if (!Object.keys(inputSchema.properties).includes(key)) { + log.warning(`Unknown parameter: '${key}', skipping it. Supported parameters: ${Object.keys(defaults).join(', ')}`); + continue; + } + const typedKey = key as SupportedParams; + if (['object', 'array'].includes(inputSchema.properties[typedKey].type) && typeof value === 'string') { + try { + params[key] = JSON.parse(value); + } catch (e) { + log.warning(`Failed to parse parameter ${key}, it must be valid JSON. 
Skipping it: ${e}`); + } + } + } + + return params; } /** From 85a96ff29d5270d4c4bdd10bee6f73a0bb70f284 Mon Sep 17 00:00:00 2001 From: metalwarrior665 Date: Tue, 18 Mar 2025 19:41:32 +0100 Subject: [PATCH 2/4] fix: remove duplicate warning for extra params --- src/utils.ts | 1 - 1 file changed, 1 deletion(-) diff --git a/src/utils.ts b/src/utils.ts index d303c00..8f37354 100644 --- a/src/utils.ts +++ b/src/utils.ts @@ -13,7 +13,6 @@ export function parseParameters(url: string): ParsedUrlQuery { for (const [key, value] of Object.entries(params)) { // If the key is not supported by schema, skip it if (!Object.keys(inputSchema.properties).includes(key)) { - log.warning(`Unknown parameter: '${key}', skipping it. Supported parameters: ${Object.keys(defaults).join(', ')}`); continue; } const typedKey = key as SupportedParams; From 3ac1f49b83c1d8ff2cabfa3f03ec2e15300bd4ba Mon Sep 17 00:00:00 2001 From: metalwarrior665 Date: Wed, 19 Mar 2025 10:33:47 +0100 Subject: [PATCH 3/4] refactor: cleanup double checking of extra params --- src/search.ts | 2 -- src/utils.ts | 32 ++++++++++++-------------------- 2 files changed, 12 insertions(+), 22 deletions(-) diff --git a/src/search.ts b/src/search.ts index 306f7f4..5942928 100644 --- a/src/search.ts +++ b/src/search.ts @@ -9,7 +9,6 @@ import { createResponsePromise } from './responses.js'; import { Input, Output, ContentScraperSettings, ContentCrawlerOptions } from './types.js'; import { addTimeMeasureEvent, - checkAndRemoveExtraParams, createRequest, createSearchRequest, interpretAsUrl, @@ -100,7 +99,6 @@ export async function handleSearchRequest(request: IncomingMessage, response: Se try { const params = parseParameters(request.url?.slice(Routes.SEARCH.length) ?? 
''); log.info(`Received query parameters: ${JSON.stringify(params)}`); - checkAndRemoveExtraParams(params); const results = await runSearchProcess(params); diff --git a/src/utils.ts b/src/utils.ts index 8f37354..56df958 100644 --- a/src/utils.ts +++ b/src/utils.ts @@ -1,7 +1,6 @@ import { RequestOptions, log, ProxyConfiguration } from 'crawlee'; import { parse, ParsedUrlQuery } from 'querystring'; -import { defaults } from './const.js'; import { OrganicResult, ContentScraperSettings, TimeMeasure, ContentCrawlerUserData, SearchCrawlerUserData } from './types.js'; import inputSchema from '../.actor/input_schema.json' with { type: 'json' }; @@ -9,37 +8,30 @@ export function parseParameters(url: string): ParsedUrlQuery { const params = parse(url.slice(1)); // Parse non-primitive parameters following input schema - type SupportedParams = keyof typeof inputSchema.properties + type SupportedParams = keyof typeof inputSchema.properties; + + const parsedValidatedParams = {} as Record; for (const [key, value] of Object.entries(params)) { - // If the key is not supported by schema, skip it - if (!Object.keys(inputSchema.properties).includes(key)) { + // If the key is not supported by schema or is not Apify API token, skip it + if (key !== 'token' && !Object.keys(inputSchema.properties).includes(key)) { + log.warning(`Unknown parameter: ${key}. Supported parameters: ${Object.keys(inputSchema.properties).join(', ')}`); continue; } const typedKey = key as SupportedParams; if (['object', 'array'].includes(inputSchema.properties[typedKey].type) && typeof value === 'string') { try { - params[key] = JSON.parse(value); + parsedValidatedParams[typedKey] = JSON.parse(value); } catch (e) { log.warning(`Failed to parse parameter ${key}, it must be valid JSON. 
Skipping it: ${e}`); } + } else { + parsedValidatedParams[typedKey] = value; } } - return params; -} - -/** - * Check whether the query parameters are valid (do not support extra parameters) - */ -export function checkAndRemoveExtraParams(params: ParsedUrlQuery) { - const keys = Object.keys(defaults); - keys.push('token', '?token'); // token is a special parameter - for (const key of Object.keys(params)) { - if (!keys.includes(key)) { - log.warning(`Unknown parameter: ${key}. Supported parameters: ${Object.keys(defaults).join(', ')}`); - delete params[key]; - } - } + // TODO: We should unify the type for parameters to single source, + // now we have ParsedUrlQuery, Input and SupportedParams + return parsedValidatedParams as ParsedUrlQuery; } export function randomId() { From a984d89aff3a6539017ef13aa310b1dbc5a029c6 Mon Sep 17 00:00:00 2001 From: metalwarrior665 Date: Wed, 19 Mar 2025 15:14:21 +0100 Subject: [PATCH 4/4] fix: iterate all 'defaults' instead of just input schema props --- src/utils.ts | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/src/utils.ts b/src/utils.ts index 56df958..b47819a 100644 --- a/src/utils.ts +++ b/src/utils.ts @@ -1,26 +1,35 @@ import { RequestOptions, log, ProxyConfiguration } from 'crawlee'; import { parse, ParsedUrlQuery } from 'querystring'; +import { defaults } from './const.js'; import { OrganicResult, ContentScraperSettings, TimeMeasure, ContentCrawlerUserData, SearchCrawlerUserData } from './types.js'; import inputSchema from '../.actor/input_schema.json' with { type: 'json' }; export function parseParameters(url: string): ParsedUrlQuery { const params = parse(url.slice(1)); - // Parse non-primitive parameters following input schema - type SupportedParams = keyof typeof inputSchema.properties; + type SupportedParamKey = keyof typeof defaults; - const parsedValidatedParams = {} as Record; + const parsedValidatedParams = {} as Record; for (const [key, value] of Object.entries(params)) { 
// If the key is not supported by schema or is not Apify API token, skip it - if (key !== 'token' && !Object.keys(inputSchema.properties).includes(key)) { - log.warning(`Unknown parameter: ${key}. Supported parameters: ${Object.keys(inputSchema.properties).join(', ')}`); + if (key !== 'token' && !Object.keys(defaults).includes(key)) { + log.warning(`Unknown parameter: ${key}. Supported parameters: ${Object.keys(defaults).join(', ')}`); continue; } - const typedKey = key as SupportedParams; - if (['object', 'array'].includes(inputSchema.properties[typedKey].type) && typeof value === 'string') { + const typedKey = key as SupportedParamKey; + // Schema keys are subset of SupportedParams so we can safely cast + type SchemaKey = keyof typeof inputSchema.properties; + + // Parse non-primitive parameters following input schema because querystring doesn't parse objects + if ( + !!inputSchema.properties[typedKey as SchemaKey] + && ['object', 'array'].includes(inputSchema.properties[typedKey as SchemaKey].type) + && typeof value === 'string' + ) { try { parsedValidatedParams[typedKey] = JSON.parse(value); + log.debug(`Parsed parameter ${key} from string: ${value} to object`, parsedValidatedParams[typedKey] as object); } catch (e) { log.warning(`Failed to parse parameter ${key}, it must be valid JSON. Skipping it: ${e}`); }