Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -121,8 +121,7 @@ The `/search` GET HTTP endpoint accepts the following query parameters:
| `removeCookieWarnings` | boolean | `true` | If enabled, removes cookie consent dialogs to improve text extraction accuracy. This might increase latency. |
| `removeElementsCssSelector` | string | `see input` | A CSS selector matching HTML elements that will be removed from the DOM before converting it to text or Markdown, or saving it as HTML. This is useful to skip irrelevant page content. The value must be a valid CSS selector as accepted by the `document.querySelectorAll()` function. \n\nBy default, the Actor removes common navigation elements, headers, footers, modals, scripts, and inline images. You can disable the removal by setting this value to some non-existent CSS selector like `dummy_keep_everything`. |
| `debugMode` | boolean | `false` | If enabled, the Actor will store debugging information in the dataset's debug field. |

<!-- TODO: we should probably add proxyConfiguration -->
Standby mode also supports the rest of the input parameters [described on the Actor page](https://apify.com/apify/rag-web-browser/input-schema). Object parameters like `proxyConfiguration` should be passed as URL-encoded JSON strings.


## 🔌 Integration with LLMs
Expand Down
2 changes: 0 additions & 2 deletions src/search.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@ import { createResponsePromise } from './responses.js';
import { Input, Output, ContentScraperSettings, ContentCrawlerOptions } from './types.js';
import {
addTimeMeasureEvent,
checkAndRemoveExtraParams,
createRequest,
createSearchRequest,
interpretAsUrl,
Expand Down Expand Up @@ -100,7 +99,6 @@ export async function handleSearchRequest(request: IncomingMessage, response: Se
try {
const params = parseParameters(request.url?.slice(Routes.SEARCH.length) ?? '');
log.info(`Received query parameters: ${JSON.stringify(params)}`);
checkAndRemoveExtraParams(params);

const results = await runSearchProcess(params);

Expand Down
43 changes: 32 additions & 11 deletions src/utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,23 +3,44 @@ import { parse, ParsedUrlQuery } from 'querystring';

import { defaults } from './const.js';
import { OrganicResult, ContentScraperSettings, TimeMeasure, ContentCrawlerUserData, SearchCrawlerUserData } from './types.js';
import inputSchema from '../.actor/input_schema.json' with { type: 'json' };

export function parseParameters(url: string): ParsedUrlQuery {
return parse(url.slice(1));
}
const params = parse(url.slice(1));

/**
* Check whether the query parameters are valid (do not support extra parameters)
*/
export function checkAndRemoveExtraParams(params: ParsedUrlQuery) {
const keys = Object.keys(defaults);
keys.push('token', '?token'); // token is a special parameter
for (const key of Object.keys(params)) {
if (!keys.includes(key)) {
type SupportedParamKey = keyof typeof defaults;

const parsedValidatedParams = {} as Record<SupportedParamKey, unknown>;
for (const [key, value] of Object.entries(params)) {
// If the key is not supported by schema or is not Apify API token, skip it
if (key !== 'token' && !Object.keys(defaults).includes(key)) {
log.warning(`Unknown parameter: ${key}. Supported parameters: ${Object.keys(defaults).join(', ')}`);
delete params[key];
continue;
}
const typedKey = key as SupportedParamKey;
// Schema keys are subset of SupportedParams so we can safely cast
type SchemaKey = keyof typeof inputSchema.properties;

// Parse non-primitive parameters following input schema because querystring doesn't parse objects
if (
!!inputSchema.properties[typedKey as SchemaKey]
&& ['object', 'array'].includes(inputSchema.properties[typedKey as SchemaKey].type)
&& typeof value === 'string'
) {
try {
parsedValidatedParams[typedKey] = JSON.parse(value);
log.debug(`Parsed parameter ${key} from string: ${value} to object`, parsedValidatedParams[typedKey] as object);
} catch (e) {
log.warning(`Failed to parse parameter ${key}, it must be valid JSON. Skipping it: ${e}`);
}
} else {
parsedValidatedParams[typedKey] = value;
}
}

// TODO: We should unify the type for parameters to single source,
// now we have ParsedUrlQuery, Input and SupportedParams
return parsedValidatedParams as ParsedUrlQuery;
}

export function randomId() {
Expand Down
Loading