Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 15 additions & 5 deletions src/helpers/get-page-urls.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import { extractMarkdownLinks } from '../checks/content-discoverability/llms-txt-valid.js';
import { MAX_SITEMAP_URLS } from '../constants.js';
import { isNonPageUrl } from './to-md-urls.js';
import { isNonPageUrl, isMdUrl, toHtmlUrl } from './to-md-urls.js';
import type { CheckContext, DiscoveredFile } from '../types.js';

/**
Expand Down Expand Up @@ -44,18 +44,28 @@ export async function getUrlsFromCachedLlmsTxt(ctx: CheckContext): Promise<strin
return walkAggregateLinks(ctx, urls);
}

/**
 * Map a discovered page URL onto its HTML form.
 *
 * Markdown URLs (.md/.mdx) are rewritten with toHtmlUrl() so that an llms.txt
 * entry such as `/docs/guide/index.md` collapses onto the matching sitemap
 * entry `/docs/guide/`; every other URL is returned untouched. Markdown-specific
 * checks are not affected, because they derive their .md candidates from HTML
 * URLs via toMdUrls().
 *
 * @param url - The page URL as discovered.
 * @returns The HTML equivalent for markdown URLs, otherwise `url` unchanged.
 */
function normalizePageUrl(url: string): string {
  if (!isMdUrl(url)) {
    // Not a markdown URL: nothing to normalize.
    return url;
  }
  return toHtmlUrl(url);
}

function extractLinksFromLlmsTxtFiles(files: DiscoveredFile[]): string[] {
const urls = new Set<string>();
for (const file of files) {
const links = extractMarkdownLinks(file.content);
for (const link of links) {
if (link.url.startsWith('http://') || link.url.startsWith('https://')) {
urls.add(link.url);
urls.add(normalizePageUrl(link.url));
} else if (link.url.startsWith('/')) {
// Resolve root-relative URLs against the source file's origin
try {
const base = new URL(file.url);
urls.add(new URL(link.url, base.origin).toString());
urls.add(normalizePageUrl(new URL(link.url, base.origin).toString()));
} catch {
// Skip malformed URLs
}
Expand Down Expand Up @@ -91,10 +101,10 @@ async function walkAggregateLinks(ctx: CheckContext, urls: string[]): Promise<st
} else if (parsed.origin === ctx.origin || parsed.origin === siteOrigin) {
// Only include same-origin page URLs; cross-origin links are
// external resources the site owner doesn't control.
pageUrls.push(url);
pageUrls.push(normalizePageUrl(url));
}
} catch {
pageUrls.push(url);
pageUrls.push(normalizePageUrl(url));
}
}

Expand Down
44 changes: 41 additions & 3 deletions test/unit/helpers/get-page-urls.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1191,9 +1191,10 @@ describe('getPageUrls', () => {

const ctx = makeCtx('http://walk-test.local', rootContent);
const result = await getPageUrls(ctx);
expect(result.urls).toContain('http://walk-test.local/workers/guide/index.md');
expect(result.urls).toContain('http://walk-test.local/workers/api/index.md');
expect(result.urls).toContain('http://walk-test.local/cache/overview/index.md');
// .md URLs from llms.txt are normalized to their HTML equivalents
expect(result.urls).toContain('http://walk-test.local/workers/guide/');
expect(result.urls).toContain('http://walk-test.local/workers/api/');
expect(result.urls).toContain('http://walk-test.local/cache/overview/');
expect(result.urls).toHaveLength(3);
});

Expand Down Expand Up @@ -1296,6 +1297,43 @@ describe('getPageUrls', () => {
expect(result.urls).toEqual(['http://walk-empty.local/docs/page']);
});

// ── .md URL normalization ──

it('normalizes .md URLs from llms.txt to HTML equivalents', async () => {
  // Two markdown links: a directory index file and a plain .md page.
  const llmsTxt = `# Docs\n- [Guide](http://md-norm.local/docs/guide/index.md): Guide\n- [API](http://md-norm.local/docs/api.md): API\n`;
  const { urls } = await getPageUrls(makeCtx('http://md-norm.local', llmsTxt));

  // The HTML equivalents must be reported...
  expect(urls).toContain('http://md-norm.local/docs/guide/');
  expect(urls).toContain('http://md-norm.local/docs/api');
  // ...and the raw markdown URLs must not leak through.
  expect(urls).not.toContain('http://md-norm.local/docs/guide/index.md');
  expect(urls).not.toContain('http://md-norm.local/docs/api.md');
});

it('deduplicates .md and HTML URLs for the same page', async () => {
  // The same page is listed twice: as a .md URL in llms.txt and as an HTML
  // URL in the sitemap. The sitemap also lists one page llms.txt does not.
  const llmsTxt = `# Docs\n- [Guide](http://md-dedup.local/docs/guide/index.md): Guide\n`;
  const sitemapXml = `<?xml version="1.0"?><urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"><url><loc>http://md-dedup.local/docs/guide/</loc></url><url><loc>http://md-dedup.local/docs/other/</loc></url></urlset>`;
  const ctx = makeCtx('http://md-dedup.local', llmsTxt);

  server.use(
    // robots.txt answers 404; the sitemap is served from /sitemap.xml.
    http.get('http://md-dedup.local/robots.txt', () => new HttpResponse('', { status: 404 })),
    http.get(
      'http://md-dedup.local/sitemap.xml',
      () =>
        new HttpResponse(sitemapXml, {
          status: 200,
          headers: { 'Content-Type': 'application/xml' },
        }),
    ),
  );

  const { urls } = await getPageUrls(ctx);

  // /docs/guide/ must be listed exactly once, not once per source format.
  const guideHits = urls.filter((candidate) => candidate === 'http://md-dedup.local/docs/guide/');
  expect(guideHits.length).toBe(1);
  // The sitemap-only page must still be present.
  expect(urls).toContain('http://md-dedup.local/docs/other/');
});

// ── Direct llms.txt fetch (standalone mode) ──

it('fetches llms.txt directly when llms-txt-exists has not run', async () => {
Expand Down
Loading