Skip to content

Commit

Permalink
feat: add Sitemap.tryCommonNames to check well known sitemap locations (
Browse files Browse the repository at this point in the history
#2311)

Closes #2307

---------

Co-authored-by: Jindřich Bär <jindrichbar@gmail.com>
  • Loading branch information
janbuchar and barjin committed Feb 6, 2024
1 parent 741dbd0 commit 85589f1
Show file tree
Hide file tree
Showing 2 changed files with 45 additions and 0 deletions.
21 changes: 21 additions & 0 deletions packages/utils/src/internals/sitemap.ts
Expand Up @@ -114,6 +114,27 @@ export class Sitemap {
return parser;
}

/**
 * Try to load sitemap from the most common locations - `/sitemap.xml` and `/sitemap.txt`.
 * For loading based on `Sitemap` entries in `robots.txt`, the {@apilink RobotsFile} class should be used.
 *
 * Only the origin part of `url` (protocol, credentials, host and port) is used; its path,
 * query string and fragment are discarded before the candidate sitemap URLs are built.
 * The candidates are then passed to {@apilink Sitemap.load} in the order `/sitemap.xml`, `/sitemap.txt`.
 *
 * @param url The domain URL to fetch the sitemap for. Must be an absolute URL.
 * @param proxyUrl A proxy to be used for fetching the sitemap file.
 * @returns The `Sitemap` instance produced by `Sitemap.load` for the candidate URLs.
 * @throws {TypeError} If `url` cannot be parsed by the `URL` constructor.
 */
static async tryCommonNames(url: string, proxyUrl?: string): Promise<Sitemap> {
    const base = new URL(url);

    // Strip everything that is specific to the page the caller happened to pass in.
    // The fragment has to be cleared as well, otherwise `http://host/page#section`
    // would yield `http://host/sitemap.xml#section`.
    base.search = '';
    base.hash = '';

    const sitemapUrls = ['/sitemap.xml', '/sitemap.txt'].map((pathname) => {
        // `toString()` snapshots the URL, so reusing the same mutable instance is safe.
        base.pathname = pathname;
        return base.toString();
    });

    return Sitemap.load(sitemapUrls, proxyUrl);
}

/**
* Fetch sitemap content from given URL or URLs and return URLs of referenced pages.
* @param urls sitemap URL(s)
Expand Down
24 changes: 24 additions & 0 deletions packages/utils/test/sitemap.test.ts
Expand Up @@ -65,6 +65,20 @@ describe('Sitemap', () => {
'<A HREF="https://ads.google.com/home/">here</A>.',
'</BODY></HTML>',
].join('\n'))
.get('/sitemap.xml')
.reply(200, [
'<?xml version="1.0" encoding="UTF-8"?>',
'<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">',
'<url>',
'<loc>http://not-exists.com/catalog?item=80&amp;desc=vacation_turkey</loc>',
'<lastmod>2004-11-23</lastmod>',
'</url>',
'<url>',
'<loc>http://not-exists.com/catalog?item=81&amp;desc=vacation_maledives</loc>',
'<lastmod>2004-11-23</lastmod>',
'</url>',
'</urlset>',
].join('\n'))
.get('/sitemap.txt')
.reply(200, [
'http://not-exists.com/catalog?item=78&desc=vacation_crete',
Expand Down Expand Up @@ -117,6 +131,16 @@ describe('Sitemap', () => {
expect(sitemap.urls).toEqual([]);
});

it('autodetects sitemaps', async () => {
    // Page URLs expected from the mocked `/sitemap.xml` and `/sitemap.txt` responses combined.
    const expectedUrls = new Set([
        'http://not-exists.com/catalog?item=80&desc=vacation_turkey',
        'http://not-exists.com/catalog?item=81&desc=vacation_maledives',
        'http://not-exists.com/catalog?item=78&desc=vacation_crete',
        'http://not-exists.com/catalog?item=79&desc=vacation_somalia',
    ]);

    // The path and query of the input URL are arbitrary on purpose - only the host matters.
    const { urls } = await Sitemap.tryCommonNames('http://not-exists.com/arbitrary_url?search=xyz');

    // Compared as sets, so the order in which the sitemaps were processed is not asserted.
    expect(new Set(urls)).toEqual(expectedUrls);
});

it('handles sitemap.txt correctly', async () => {
const sitemap = await Sitemap.load('http://not-exists.com/sitemap.txt');
expect(new Set(sitemap.urls)).toEqual(new Set([
Expand Down

0 comments on commit 85589f1

Please sign in to comment.