Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: ignore invalid URLs in enqueueLinks in browser crawlers #1803

Merged
merged 3 commits into from
Mar 6, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions packages/browser-crawler/src/internals/browser-crawler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ import {
BASIC_CRAWLER_TIMEOUT_BUFFER_SECS,
BasicCrawler,
RequestState,
tryAbsoluteURL,
} from '@crawlee/basic';
import type {
BrowserController,
Expand Down Expand Up @@ -723,8 +724,10 @@ async function extractUrlsFromPage(page: { $$eval: Function }, selector: string,
throw new Error(`An extracted URL: ${href} is relative and options.baseUrl is not set. `
+ 'Use options.baseUrl in enqueueLinks() to automatically resolve relative URLs.');
}

return baseUrl
? (new URL(href, baseUrl)).href
? tryAbsoluteURL(href, baseUrl)
: href;
});
})
.filter((href: string | undefined) => !!href);
}
11 changes: 2 additions & 9 deletions packages/cheerio-crawler/src/internals/cheerio-crawler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ import type {
RequestQueue,
Configuration,
} from '@crawlee/http';
import { HttpCrawler, enqueueLinks, Router, resolveBaseUrlForEnqueueLinksFiltering } from '@crawlee/http';
import { HttpCrawler, enqueueLinks, Router, resolveBaseUrlForEnqueueLinksFiltering, tryAbsoluteURL } from '@crawlee/http';
import type { Dictionary } from '@crawlee/types';
import type { CheerioOptions } from 'cheerio';
import * as cheerio from 'cheerio';
Expand Down Expand Up @@ -224,15 +224,8 @@ function extractUrlsFromCheerio($: cheerio.CheerioAPI, selector: string, baseUrl
throw new Error(`An extracted URL: ${href} is relative and options.baseUrl is not set. `
+ 'Use options.baseUrl in enqueueLinks() to automatically resolve relative URLs.');
}
const tryAbsolute = () => {
try {
return (new URL(href, baseUrl)).href;
} catch {
return undefined;
}
};
return baseUrl
? tryAbsolute()
? tryAbsoluteURL(href, baseUrl)
: href;
})
.filter((href) => !!href) as string[];
Expand Down
11 changes: 11 additions & 0 deletions packages/core/src/enqueue_links/shared.ts
Original file line number Diff line number Diff line change
Expand Up @@ -211,6 +211,17 @@ export function createRequestOptions(
});
}

/**
 * Resolves a link extracted from a page against a base URL without throwing.
 *
 * @param href The raw link value; may be absolute or relative.
 * @param baseUrl The URL that `href` is resolved against.
 * @returns The resolved absolute URL string, or `undefined` when the `URL`
 * constructor rejects the `href`/`baseUrl` combination.
 */
export function tryAbsoluteURL(href: string, baseUrl: string): string | undefined {
    let absoluteUrl: string | undefined;

    try {
        absoluteUrl = new URL(href, baseUrl).href;
    } catch {
        // The URL constructor throws on unparsable input; report "no URL" instead of propagating.
        absoluteUrl = undefined;
    }

    return absoluteUrl;
}

/**
* Takes an Apify {@apilink RequestOptions} object and changes its attributes in a desired way. This user-function is used
* by {@apilink enqueueLinks} to modify requests before enqueuing them.
Expand Down
9 changes: 2 additions & 7 deletions packages/jsdom-crawler/src/internals/jsdom-crawler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ import type {
RequestQueue,
Configuration,
} from '@crawlee/http';
import { HttpCrawler, enqueueLinks, Router, resolveBaseUrlForEnqueueLinksFiltering } from '@crawlee/http';
import { HttpCrawler, enqueueLinks, Router, resolveBaseUrlForEnqueueLinksFiltering, tryAbsoluteURL } from '@crawlee/http';
import type { BatchAddRequestsResult, Dictionary } from '@crawlee/types';
import { concatStreamToBuffer } from '@apify/utilities';
import type { DOMWindow } from 'jsdom';
Expand Down Expand Up @@ -265,12 +265,7 @@ function extractUrlsFromWindow(window: DOMWindow, selector: string, baseUrl: str
if (href === undefined) {
return undefined;
}

try {
return (new URL(href, baseUrl)).href;
} catch {
return undefined;
}
return tryAbsoluteURL(href, baseUrl);
})
.filter((href) => href !== undefined && href !== '') as string[];
}
Expand Down
1 change: 1 addition & 0 deletions test/core/enqueue_links/enqueue_links.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ const HTML = `
<a href="/x/absolutepath">This is a relative link.</a>
<a href="y/relativepath">This is a relative link.</a>
<a href="//example.absolute.com/hello">This is a link to a different subdomain</a>
<a href="http://">Invalid URL link, this needs to be ignored</a>
</body>
</html>
`;
Expand Down