diff --git a/src/checks/observability/llms-txt-coverage.ts b/src/checks/observability/llms-txt-coverage.ts index 2c6704d..61a8c51 100644 --- a/src/checks/observability/llms-txt-coverage.ts +++ b/src/checks/observability/llms-txt-coverage.ts @@ -4,6 +4,7 @@ import { getUrlsFromSitemap, parseSitemapUrls, } from '../../helpers/get-page-urls.js'; +import { isSameSite } from '../../helpers/host-equivalence.js'; import { isNonPageUrl } from '../../helpers/to-md-urls.js'; import { isLocaleSegment, hasStructuralDuplication } from '../../helpers/locale-codes.js'; import { @@ -297,7 +298,7 @@ function scopeUrls(urls: string[], origin: string, baseUrlPath: string): string[ return urls.filter((url) => { try { const parsed = new URL(url); - if (parsed.origin !== origin) return false; + if (!isSameSite(url, origin)) return false; if (baseUrlPath && baseUrlPath !== '/') { if (!parsed.pathname.startsWith(baseUrlPath + '/') && parsed.pathname !== baseUrlPath) { return false; diff --git a/src/helpers/get-page-urls.ts b/src/helpers/get-page-urls.ts index 96bf66a..52064dd 100644 --- a/src/helpers/get-page-urls.ts +++ b/src/helpers/get-page-urls.ts @@ -2,6 +2,7 @@ import { extractMarkdownLinks } from '../checks/content-discoverability/llms-txt import { MAX_SITEMAP_URLS } from '../constants.js'; import { getLlmsTxtFilesForAnalysis, selectCanonicalLlmsTxt } from './llms-txt.js'; import { isNonPageUrl, isMdUrl, toHtmlUrl } from './to-md-urls.js'; +import { isSameSite } from './host-equivalence.js'; import { isLocaleSegment, hasStructuralDuplication } from './locale-codes.js'; import type { CheckContext, DiscoveredFile } from '../types.js'; @@ -162,6 +163,8 @@ async function walkAggregateLinksWithOriginals( const omittedTxtUrls: string[] = []; const siteOrigin = ctx.effectiveOrigin ?? 
/**
 * Decide whether a sitemap URL should be kept.
 *
 * A URL is kept when it is on the same site as `matchOrigin` (per
 * `isSameSite`) and, when `prefixPath` is set, `matchesPathPrefix` accepts it.
 *
 * Any exception raised while classifying the URL is treated as "do not
 * include", so a single bad sitemap entry cannot abort the whole collection.
 *
 * @param url - Candidate URL taken from the sitemap.
 * @returns `true` when the URL passes the site and path-prefix filters.
 */
function shouldInclude(url: string): boolean {
  try {
    // isSameSite parses both arguments itself and reports `false` for
    // malformed input, so no separate `new URL(url)` validation is needed.
    if (!isSameSite(url, matchOrigin)) return false;
    // Kept inside the try so a throwing prefix check is reported as
    // "exclude" rather than propagating to the caller.
    if (prefixPath) return matchesPathPrefix(url, prefixPath);
    return true;
  } catch {
    return false;
  }
}
The cap is applied @@ -800,10 +802,6 @@ export async function getUrlsFromSitemap( return deduplicated.slice(0, maxUrls); } -function isWwwVariant(hostname1: string, hostname2: string): boolean { - return hostname1 === `www.${hostname2}` || hostname2 === `www.${hostname1}`; -} - /** * Get the base URL for path-prefix filtering, accounting for cross-host redirects. * @@ -811,24 +809,22 @@ function isWwwVariant(hostname1: string, hostname2: string): boolean { * the original baseUrl path doesn't apply to the redirected host, so we return the * effectiveOrigin (a root URL) which makes path filtering a no-op. * - * When the redirect is www-canonicalization (e.g. alchemy.com → www.alchemy.com), - * the path structure is preserved, so we transfer the baseUrl's path to the - * effective origin to keep path-prefix filtering active. + * When the redirect stays on the same site (e.g. www-canonicalization or an + * http→https upgrade), the path structure is preserved, so we transfer the + * baseUrl's path to the effective origin to keep path-prefix filtering active. */ export function getPathFilterBase(ctx: CheckContext): string { if (!ctx.effectiveOrigin || ctx.effectiveOrigin === ctx.origin) { return ctx.baseUrl; } - try { - const originalHost = new URL(ctx.origin).hostname; - const effectiveHost = new URL(ctx.effectiveOrigin).hostname; - if (isWwwVariant(originalHost, effectiveHost)) { + if (isSameSite(ctx.origin, ctx.effectiveOrigin)) { + try { const basePath = new URL(ctx.baseUrl).pathname.replace(/\/$/, ''); return basePath ? `${ctx.effectiveOrigin}${basePath}` : ctx.effectiveOrigin; + } catch { + // fall through } - } catch { - // fall through } return ctx.effectiveOrigin; diff --git a/src/helpers/host-equivalence.ts b/src/helpers/host-equivalence.ts new file mode 100644 index 0000000..0b42668 --- /dev/null +++ b/src/helpers/host-equivalence.ts @@ -0,0 +1,32 @@ +/** + * Host equivalence: treat `www.host` and `host` as the same site. 
/**
 * Strip a leading `www.` from a hostname, if present.
 *
 * Only a leading label is removed; an interior `www` (as in
 * `docs.www.example.com`) is left untouched.
 *
 * @param host - Hostname without scheme or port, e.g. `www.swift.org`.
 * @returns The hostname with one leading `www.` removed, or `host` unchanged.
 */
export function canonicalHost(host: string): string {
  return host.startsWith('www.') ? host.slice(4) : host;
}

/**
 * True when two URLs (or origins) represent the same site: same hostname after
 * stripping a leading `www.`, and same port. Schemes are deliberately ignored
 * so that an http→https upgrade on the same host is not classified as a
 * different site. Path, query, and fragment play no part in the comparison.
 *
 * @param url1 - First absolute URL or origin.
 * @param url2 - Second absolute URL or origin.
 * @returns `false` when either input cannot be parsed as a URL or has no
 *   hostname; otherwise whether host (modulo `www.`) and port both match.
 */
export function isSameSite(url1: string, url2: string): boolean {
  let a: URL;
  let b: URL;
  try {
    a = new URL(url1);
    b = new URL(url2);
  } catch {
    return false;
  }

  // URLs without an authority (`mailto:`, `data:`, `file:///…`) all parse to
  // an empty hostname and port. Without this guard any two of them would
  // compare equal and be reported as the same site.
  if (a.hostname === '' || b.hostname === '') return false;

  return a.port === b.port && canonicalHost(a.hostname) === canonicalHost(b.hostname);
}
/**
 * Reports whether a redirect from `originalUrl` to `finalUrl` left the site.
 *
 * The same-site decision is delegated to `isSameSite`, so a www ↔ bare-domain
 * hop is not counted as cross-host.
 *
 * When either URL cannot be parsed the answer is `false`: an unclassifiable
 * redirect is treated as "not cross-host" rather than penalized.
 *
 * @param originalUrl - URL that was requested.
 * @param finalUrl - URL that was ultimately reached.
 * @returns `true` only when both URLs parse and are not the same site.
 */
export function isCrossHostRedirect(originalUrl: string, finalUrl: string): boolean {
  const parses = (candidate: string): boolean => {
    try {
      new URL(candidate);
      return true;
    } catch {
      return false;
    }
  };

  // Validate up front: isSameSite also yields false for malformed input, and
  // negating that would wrongly flag bad input as a cross-host redirect.
  if (!parses(originalUrl) || !parses(finalUrl)) {
    return false;
  }

  const staysOnSite = isSameSite(originalUrl, finalUrl);
  return !staysOnSite;
}
+ const wwwHost = 'www.www-cov.local'; + const bareHost = 'www-cov.local'; + const llmsTxtPages = [`http://${wwwHost}/docs/intro`, `http://${wwwHost}/docs/guide`]; + const sitemapPages = [ + `http://${bareHost}/docs/intro`, + `http://${bareHost}/docs/guide`, + `http://${bareHost}/docs/extra`, + ]; + + const ctx = makeCtx(wwwHost, llmsTxtPages, '/docs'); + + server.use( + http.get( + `http://${wwwHost}/robots.txt`, + () => new HttpResponse(`Sitemap: http://${wwwHost}/sitemap.xml`, { status: 200 }), + ), + http.get( + `http://${wwwHost}/sitemap.xml`, + () => + new HttpResponse(makeSitemap(sitemapPages), { + headers: { 'content-type': 'application/xml' }, + }), + ), + ); + + const result = await check.run(ctx); + expect(result.details?.sitemapDocPages).toBe(3); + }); + test('excludes paths relative to base URL prefix', async () => { const host = 'basepath-exclude.local'; const pages = [`http://${host}/docs/getting-started`, `http://${host}/docs/api-reference`]; diff --git a/test/unit/helpers/get-page-urls.test.ts b/test/unit/helpers/get-page-urls.test.ts index 6b1d252..96b403a 100644 --- a/test/unit/helpers/get-page-urls.test.ts +++ b/test/unit/helpers/get-page-urls.test.ts @@ -1207,6 +1207,104 @@ describe('getPageUrls', () => { ]); }); + it('accepts sitemap URLs published on bare-host when scored URL has www (issue #83)', async () => { + // swift.org-style: scored URL is www.host.local, but the sitemap lists URLs + // on the bare host. Without www-equivalence in the origin filter, every URL + // is discarded and afdocs falls back to single-page sampling. 
+ mockSitemapNotFound(server, 'http://www.www-bare.local/documentation/'); + server.use( + http.get( + 'http://www.www-bare.local/robots.txt', + () => new HttpResponse('User-agent: *\n', { status: 200 }), + ), + http.get( + 'http://www.www-bare.local/sitemap.xml', + () => + new HttpResponse( + ` + + http://www-bare.local/documentation/intro + http://www-bare.local/documentation/guide +`, + { status: 200, headers: { 'Content-Type': 'application/xml' } }, + ), + ), + ); + + const ctx = createContext('http://www.www-bare.local/documentation/', { requestDelay: 0 }); + const warnings: string[] = []; + const result = await getUrlsFromSitemap(ctx, warnings, { skipRefinement: true }); + expect(result).toEqual([ + 'http://www-bare.local/documentation/intro', + 'http://www-bare.local/documentation/guide', + ]); + }); + + it('accepts sitemap URLs published on www-host when scored URL is bare (issue #83)', async () => { + // Inverse scenario: scored URL is bare host, sitemap entries are www-prefixed. + mockSitemapNotFound(server, 'http://bare-www.local'); + server.use( + http.get( + 'http://bare-www.local/robots.txt', + () => new HttpResponse('User-agent: *\n', { status: 200 }), + ), + http.get( + 'http://bare-www.local/sitemap.xml', + () => + new HttpResponse( + ` + + http://www.bare-www.local/page-1 + http://www.bare-www.local/page-2 +`, + { status: 200, headers: { 'Content-Type': 'application/xml' } }, + ), + ), + ); + + const ctx = createContext('http://bare-www.local', { requestDelay: 0 }); + const warnings: string[] = []; + const result = await getUrlsFromSitemap(ctx, warnings, { skipRefinement: true }); + expect(result).toEqual([ + 'http://www.bare-www.local/page-1', + 'http://www.bare-www.local/page-2', + ]); + }); + + it('still rejects truly cross-host sitemap URLs (but allows scheme upgrade)', async () => { + // www-equivalence does not relax filtering for unrelated hosts. 
Scheme + // is intentionally ignored: an http→https sitemap entry resolves to the + // same site after the canonical scheme upgrade. + mockSitemapNotFound(server, 'http://strict-host.local'); + server.use( + http.get( + 'http://strict-host.local/robots.txt', + () => new HttpResponse('User-agent: *\n', { status: 200 }), + ), + http.get( + 'http://strict-host.local/sitemap.xml', + () => + new HttpResponse( + ` + + http://strict-host.local/keep + http://other-host.local/drop + https://strict-host.local/keep-scheme +`, + { status: 200, headers: { 'Content-Type': 'application/xml' } }, + ), + ), + ); + + const ctx = createContext('http://strict-host.local', { requestDelay: 0 }); + const warnings: string[] = []; + const result = await getUrlsFromSitemap(ctx, warnings, { skipRefinement: true }); + expect(result).toEqual([ + 'http://strict-host.local/keep', + 'https://strict-host.local/keep-scheme', + ]); + }); + it('warns and skips gzipped sitemap from robots.txt', async () => { server.use( http.get( diff --git a/test/unit/helpers/host-equivalence.test.ts b/test/unit/helpers/host-equivalence.test.ts new file mode 100644 index 0000000..b48090f --- /dev/null +++ b/test/unit/helpers/host-equivalence.test.ts @@ -0,0 +1,51 @@ +import { describe, it, expect } from 'vitest'; +import { canonicalHost, isSameSite } from '../../../src/helpers/host-equivalence.js'; + +describe('canonicalHost', () => { + it('strips a leading www.', () => { + expect(canonicalHost('www.swift.org')).toBe('swift.org'); + }); + + it('leaves bare hosts unchanged', () => { + expect(canonicalHost('swift.org')).toBe('swift.org'); + }); + + it('only strips a leading www., not interior', () => { + expect(canonicalHost('docs.www.example.com')).toBe('docs.www.example.com'); + }); +}); + +describe('isSameSite', () => { + it('returns true for identical URLs', () => { + expect(isSameSite('https://example.com/', 'https://example.com/')).toBe(true); + }); + + it('returns true for www vs bare-host (issue #83)', () => { 
+ expect(isSameSite('https://swift.org/x', 'https://www.swift.org/y')).toBe(true); + expect(isSameSite('https://www.swift.org/x', 'https://swift.org/y')).toBe(true); + }); + + it('ignores scheme — http→https on the same host is same site', () => { + expect(isSameSite('http://example.com/x', 'https://example.com/x')).toBe(true); + }); + + it('ignores path, query, and fragment', () => { + expect(isSameSite('https://example.com/a?q=1#x', 'https://example.com/b')).toBe(true); + }); + + it('returns false for different ports', () => { + expect(isSameSite('https://example.com:8443/', 'https://example.com/')).toBe(false); + }); + + it('returns false for unrelated hosts', () => { + expect(isSameSite('https://example.com/', 'https://other.com/')).toBe(false); + }); + + it('returns false for non-www subdomains (e.g. docs)', () => { + expect(isSameSite('https://docs.example.com/', 'https://example.com/')).toBe(false); + }); + + it('returns false for malformed URLs', () => { + expect(isSameSite('not-a-url', 'https://example.com/')).toBe(false); + }); +});