diff --git a/src/checks/observability/llms-txt-coverage.ts b/src/checks/observability/llms-txt-coverage.ts
index 2c6704d..61a8c51 100644
--- a/src/checks/observability/llms-txt-coverage.ts
+++ b/src/checks/observability/llms-txt-coverage.ts
@@ -4,6 +4,7 @@ import {
getUrlsFromSitemap,
parseSitemapUrls,
} from '../../helpers/get-page-urls.js';
+import { isSameSite } from '../../helpers/host-equivalence.js';
import { isNonPageUrl } from '../../helpers/to-md-urls.js';
import { isLocaleSegment, hasStructuralDuplication } from '../../helpers/locale-codes.js';
import {
@@ -297,7 +298,7 @@ function scopeUrls(urls: string[], origin: string, baseUrlPath: string): string[
return urls.filter((url) => {
try {
const parsed = new URL(url);
- if (parsed.origin !== origin) return false;
+ if (!isSameSite(url, origin)) return false;
if (baseUrlPath && baseUrlPath !== '/') {
if (!parsed.pathname.startsWith(baseUrlPath + '/') && parsed.pathname !== baseUrlPath) {
return false;
diff --git a/src/helpers/get-page-urls.ts b/src/helpers/get-page-urls.ts
index 96bf66a..52064dd 100644
--- a/src/helpers/get-page-urls.ts
+++ b/src/helpers/get-page-urls.ts
@@ -2,6 +2,7 @@ import { extractMarkdownLinks } from '../checks/content-discoverability/llms-txt
import { MAX_SITEMAP_URLS } from '../constants.js';
import { getLlmsTxtFilesForAnalysis, selectCanonicalLlmsTxt } from './llms-txt.js';
import { isNonPageUrl, isMdUrl, toHtmlUrl } from './to-md-urls.js';
+import { isSameSite } from './host-equivalence.js';
import { isLocaleSegment, hasStructuralDuplication } from './locale-codes.js';
import type { CheckContext, DiscoveredFile } from '../types.js';
@@ -162,6 +163,8 @@ async function walkAggregateLinksWithOriginals(
const omittedTxtUrls: string[] = [];
const siteOrigin = ctx.effectiveOrigin ?? ctx.origin;
+ const isAcceptedOrigin = (url: string): boolean =>
+ isSameSite(url, ctx.origin) || isSameSite(url, siteOrigin);
for (const entry of entries) {
try {
@@ -169,10 +172,10 @@ async function walkAggregateLinksWithOriginals(
if (/\.txt$/i.test(parsed.pathname)) {
// .txt files are either aggregate indexes to walk (same origin)
// or external resources to skip — never page URLs themselves
- if (parsed.origin === ctx.origin || parsed.origin === siteOrigin) {
+ if (isAcceptedOrigin(entry.url)) {
aggregateUrls.push(entry.url);
}
- } else if (parsed.origin === ctx.origin || parsed.origin === siteOrigin) {
+ } else if (isAcceptedOrigin(entry.url)) {
// Only include same-origin page URLs; cross-origin links are
// external resources the site owner doesn't control.
pageUrls.push(entry);
@@ -207,8 +210,7 @@ async function walkAggregateLinksWithOriginals(
for (const subEntry of subEntries) {
try {
const parsed = new URL(subEntry.url);
- const isSameOrigin = parsed.origin === ctx.origin || parsed.origin === siteOrigin;
- if (!isSameOrigin) continue;
+ if (!isAcceptedOrigin(subEntry.url)) continue;
if (/\.txt$/i.test(parsed.pathname)) {
// Depth-1 .txt link: record as omitted rather than descending
@@ -739,13 +741,13 @@ export async function getUrlsFromSitemap(
function shouldInclude(url: string): boolean {
- try {
- const u = new URL(url);
- if (u.origin !== matchOrigin) return false;
- if (prefixPath) return matchesPathPrefix(url, prefixPath);
- return true;
- } catch {
- return false;
- }
+ // Keep a URL only when it is on the same site as matchOrigin and, if a
+ // path prefix is configured, falls under that prefix.
+ // isSameSite() already returns false for unparseable URLs, so no separate
+ // `new URL()` guard is needed before the site check.
+ if (!isSameSite(url, matchOrigin)) return false;
+ if (prefixPath) return matchesPathPrefix(url, prefixPath);
+ return true;
}
// Collect up to collectLimit URLs before refinement. The cap is applied
@@ -800,10 +802,6 @@ export async function getUrlsFromSitemap(
return deduplicated.slice(0, maxUrls);
}
-function isWwwVariant(hostname1: string, hostname2: string): boolean {
- return hostname1 === `www.${hostname2}` || hostname2 === `www.${hostname1}`;
-}
-
/**
* Get the base URL for path-prefix filtering, accounting for cross-host redirects.
*
@@ -811,24 +809,22 @@ function isWwwVariant(hostname1: string, hostname2: string): boolean {
* the original baseUrl path doesn't apply to the redirected host, so we return the
* effectiveOrigin (a root URL) which makes path filtering a no-op.
*
- * When the redirect is www-canonicalization (e.g. alchemy.com → www.alchemy.com),
- * the path structure is preserved, so we transfer the baseUrl's path to the
- * effective origin to keep path-prefix filtering active.
+ * When the redirect stays on the same site (e.g. www-canonicalization or an
+ * http→https upgrade), the path structure is preserved, so we transfer the
+ * baseUrl's path to the effective origin to keep path-prefix filtering active.
*/
export function getPathFilterBase(ctx: CheckContext): string {
if (!ctx.effectiveOrigin || ctx.effectiveOrigin === ctx.origin) {
return ctx.baseUrl;
}
- try {
- const originalHost = new URL(ctx.origin).hostname;
- const effectiveHost = new URL(ctx.effectiveOrigin).hostname;
- if (isWwwVariant(originalHost, effectiveHost)) {
+ if (isSameSite(ctx.origin, ctx.effectiveOrigin)) {
+ try {
const basePath = new URL(ctx.baseUrl).pathname.replace(/\/$/, '');
return basePath ? `${ctx.effectiveOrigin}${basePath}` : ctx.effectiveOrigin;
+ } catch {
+ // fall through
}
- } catch {
- // fall through
}
return ctx.effectiveOrigin;
diff --git a/src/helpers/host-equivalence.ts b/src/helpers/host-equivalence.ts
new file mode 100644
index 0000000..0b42668
--- /dev/null
+++ b/src/helpers/host-equivalence.ts
@@ -0,0 +1,32 @@
+/**
+ * Host equivalence: treat `www.host` and `host` as the same site.
+ *
+ * Documentation sites mix the two forms in several ways that all need the
+ * same treatment: redirect classification, sitemap URL filtering, path-filter
+ * base derivation, and aggregate-link walking. Keeping the rule in one place
+ * means future tweaks (e.g. recognizing additional canonical prefixes)
+ * propagate to every call site automatically.
+ */
+
+/**
+ * Normalize a hostname for same-site comparison: lowercase it and drop a
+ * single leading `www.` label.
+ *
+ * Hostnames are case-insensitive, so `WWW.Example.com` and `example.com`
+ * canonicalize to the same value. Only a leading `www.` is removed; an
+ * interior `www` label (e.g. `docs.www.example.com`) is left in place.
+ *
+ * @param host - Bare hostname (no scheme, port, or path).
+ * @returns The lowercased hostname without a leading `www.`.
+ */
+export function canonicalHost(host: string): string {
+  const normalized = host.toLowerCase();
+  return normalized.startsWith('www.') ? normalized.slice(4) : normalized;
+}
+
+/**
+ * Decide whether two URLs (or origins) belong to the same site.
+ *
+ * The inputs match when their ports are equal and their hostnames are equal
+ * after `canonicalHost` normalization (leading `www.` removed). The scheme is
+ * not compared, so an http→https upgrade on one host still counts as the
+ * same site.
+ *
+ * @returns `false` when either input cannot be parsed as a URL.
+ */
+export function isSameSite(url1: string, url2: string): boolean {
+  let first: URL;
+  let second: URL;
+  try {
+    first = new URL(url1);
+    second = new URL(url2);
+  } catch {
+    // Unparseable input can never be classified as the same site.
+    return false;
+  }
+  if (first.port !== second.port) return false;
+  return canonicalHost(first.hostname) === canonicalHost(second.hostname);
+}
diff --git a/src/helpers/to-md-urls.ts b/src/helpers/to-md-urls.ts
index 78e9cfd..0513c10 100644
--- a/src/helpers/to-md-urls.ts
+++ b/src/helpers/to-md-urls.ts
@@ -1,25 +1,21 @@
-/**
- * Strip the leading "www." from a hostname, if present.
- */
-function stripWww(host: string): string {
- return host.startsWith('www.') ? host.slice(4) : host;
-}
+import { isSameSite } from './host-equivalence.js';
/**
* Returns true if the two URLs have different hosts (i.e. a cross-host redirect).
* A www ↔ bare-domain redirect (e.g. mongodb.com → www.mongodb.com) is NOT
* considered cross-host because every HTTP client and agent follows it.
+ *
+ * Returns false for malformed URLs — when we can't classify, default to
+ * "not cross-host" so we don't penalize on bad inputs.
*/
export function isCrossHostRedirect(originalUrl: string, finalUrl: string): boolean {
try {
- const original = new URL(originalUrl);
- const final_ = new URL(finalUrl);
- if (original.host === final_.host) return false;
- // www ↔ bare-domain is same-site, not cross-host
- return stripWww(original.host) !== stripWww(final_.host);
+ // Parse only to validate: isSameSite() reports false for unparseable input,
+ // which the negation below would otherwise turn into "cross-host".
+ new URL(originalUrl);
+ new URL(finalUrl);
} catch {
return false;
}
+ return !isSameSite(originalUrl, finalUrl);
}
/**
diff --git a/test/unit/checks/llms-txt-coverage.test.ts b/test/unit/checks/llms-txt-coverage.test.ts
index 1a6b6df..7e70491 100644
--- a/test/unit/checks/llms-txt-coverage.test.ts
+++ b/test/unit/checks/llms-txt-coverage.test.ts
@@ -708,6 +708,38 @@ describe('llms-txt-coverage', () => {
expect(result.details?.sitemapDocPages).toBe(2);
});
+  test('scopes sitemap URLs across www vs bare-host (issue #83)', async () => {
+    // The scored site uses the `www.` host while every sitemap entry uses the
+    // bare host (the swift.org pattern). Scoping has to accept both forms,
+    // otherwise no sitemap page survives and coverage collapses to 0%.
+    const scoredHost = 'www.www-cov.local';
+    const sitemapHost = 'www-cov.local';
+    const scoredBase = `http://${scoredHost}`;
+
+    const llmsTxtPages = ['intro', 'guide'].map((slug) => `${scoredBase}/docs/${slug}`);
+    const sitemapPages = ['intro', 'guide', 'extra'].map(
+      (slug) => `http://${sitemapHost}/docs/${slug}`,
+    );
+
+    const ctx = makeCtx(scoredHost, llmsTxtPages, '/docs');
+
+    // robots.txt advertises the sitemap; the sitemap lists bare-host pages.
+    const robotsHandler = http.get(
+      `${scoredBase}/robots.txt`,
+      () => new HttpResponse(`Sitemap: ${scoredBase}/sitemap.xml`, { status: 200 }),
+    );
+    const sitemapHandler = http.get(
+      `${scoredBase}/sitemap.xml`,
+      () =>
+        new HttpResponse(makeSitemap(sitemapPages), {
+          headers: { 'content-type': 'application/xml' },
+        }),
+    );
+    server.use(robotsHandler, sitemapHandler);
+
+    const { details } = await check.run(ctx);
+    expect(details?.sitemapDocPages).toBe(3);
+  });
+
test('excludes paths relative to base URL prefix', async () => {
const host = 'basepath-exclude.local';
const pages = [`http://${host}/docs/getting-started`, `http://${host}/docs/api-reference`];
diff --git a/test/unit/helpers/get-page-urls.test.ts b/test/unit/helpers/get-page-urls.test.ts
index 6b1d252..96b403a 100644
--- a/test/unit/helpers/get-page-urls.test.ts
+++ b/test/unit/helpers/get-page-urls.test.ts
@@ -1207,6 +1207,104 @@ describe('getPageUrls', () => {
]);
});
+  it('accepts sitemap URLs published on bare-host when scored URL has www (issue #83)', async () => {
+    // swift.org-style: scored URL is www.host.local, but the sitemap lists URLs
+    // on the bare host. Without www-equivalence in the origin filter, every URL
+    // is discarded and afdocs falls back to single-page sampling.
+    mockSitemapNotFound(server, 'http://www.www-bare.local/documentation/');
+    server.use(
+      http.get(
+        'http://www.www-bare.local/robots.txt',
+        () => new HttpResponse('User-agent: *\n', { status: 200 }),
+      ),
+      http.get(
+        'http://www.www-bare.local/sitemap.xml',
+        () =>
+          new HttpResponse(
+            `<?xml version="1.0" encoding="UTF-8"?>
+<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
+  <url><loc>http://www-bare.local/documentation/intro</loc></url>
+  <url><loc>http://www-bare.local/documentation/guide</loc></url>
+</urlset>`,
+            { status: 200, headers: { 'Content-Type': 'application/xml' } },
+          ),
+      ),
+    );
+
+    const ctx = createContext('http://www.www-bare.local/documentation/', { requestDelay: 0 });
+    const warnings: string[] = [];
+    const result = await getUrlsFromSitemap(ctx, warnings, { skipRefinement: true });
+    expect(result).toEqual([
+      'http://www-bare.local/documentation/intro',
+      'http://www-bare.local/documentation/guide',
+    ]);
+  });
+
+  it('accepts sitemap URLs published on www-host when scored URL is bare (issue #83)', async () => {
+    // Inverse scenario: scored URL is bare host, sitemap entries are www-prefixed.
+    mockSitemapNotFound(server, 'http://bare-www.local');
+    server.use(
+      http.get(
+        'http://bare-www.local/robots.txt',
+        () => new HttpResponse('User-agent: *\n', { status: 200 }),
+      ),
+      http.get(
+        'http://bare-www.local/sitemap.xml',
+        () =>
+          new HttpResponse(
+            `<?xml version="1.0" encoding="UTF-8"?>
+<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
+  <url><loc>http://www.bare-www.local/page-1</loc></url>
+  <url><loc>http://www.bare-www.local/page-2</loc></url>
+</urlset>`,
+            { status: 200, headers: { 'Content-Type': 'application/xml' } },
+          ),
+      ),
+    );
+
+    const ctx = createContext('http://bare-www.local', { requestDelay: 0 });
+    const warnings: string[] = [];
+    const result = await getUrlsFromSitemap(ctx, warnings, { skipRefinement: true });
+    expect(result).toEqual([
+      'http://www.bare-www.local/page-1',
+      'http://www.bare-www.local/page-2',
+    ]);
+  });
+
+  it('still rejects truly cross-host sitemap URLs (but allows scheme upgrade)', async () => {
+    // www-equivalence does not relax filtering for unrelated hosts. Scheme
+    // is intentionally ignored: an http→https sitemap entry resolves to the
+    // same site after the canonical scheme upgrade.
+    mockSitemapNotFound(server, 'http://strict-host.local');
+    server.use(
+      http.get(
+        'http://strict-host.local/robots.txt',
+        () => new HttpResponse('User-agent: *\n', { status: 200 }),
+      ),
+      http.get(
+        'http://strict-host.local/sitemap.xml',
+        () =>
+          new HttpResponse(
+            `<?xml version="1.0" encoding="UTF-8"?>
+<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
+  <url><loc>http://strict-host.local/keep</loc></url>
+  <url><loc>http://other-host.local/drop</loc></url>
+  <url><loc>https://strict-host.local/keep-scheme</loc></url>
+</urlset>`,
+            { status: 200, headers: { 'Content-Type': 'application/xml' } },
+          ),
+      ),
+    );
+
+    const ctx = createContext('http://strict-host.local', { requestDelay: 0 });
+    const warnings: string[] = [];
+    const result = await getUrlsFromSitemap(ctx, warnings, { skipRefinement: true });
+    expect(result).toEqual([
+      'http://strict-host.local/keep',
+      'https://strict-host.local/keep-scheme',
+    ]);
+  });
+
it('warns and skips gzipped sitemap from robots.txt', async () => {
server.use(
http.get(
diff --git a/test/unit/helpers/host-equivalence.test.ts b/test/unit/helpers/host-equivalence.test.ts
new file mode 100644
index 0000000..b48090f
--- /dev/null
+++ b/test/unit/helpers/host-equivalence.test.ts
@@ -0,0 +1,51 @@
+import { describe, it, expect } from 'vitest';
+import { canonicalHost, isSameSite } from '../../../src/helpers/host-equivalence.js';
+
+// canonicalHost: only a leading `www.` label may be removed.
+describe('canonicalHost', () => {
+  it('strips a leading www.', () => {
+    const stripped = canonicalHost('www.swift.org');
+    expect(stripped).toBe('swift.org');
+  });
+
+  it('leaves bare hosts unchanged', () => {
+    const bare = 'swift.org';
+    expect(canonicalHost(bare)).toBe(bare);
+  });
+
+  it('only strips a leading www., not interior', () => {
+    const interior = 'docs.www.example.com';
+    expect(canonicalHost(interior)).toBe(interior);
+  });
+});
+
+// isSameSite: www/bare and scheme differences match; port, host, and
+// subdomain differences (and unparseable input) do not.
+describe('isSameSite', () => {
+  // Reference URL that most cases compare against.
+  const reference = 'https://example.com/';
+
+  it('returns true for identical URLs', () => {
+    expect(isSameSite(reference, reference)).toBe(true);
+  });
+
+  it('returns true for www vs bare-host (issue #83)', () => {
+    const bare = 'https://swift.org';
+    const www = 'https://www.swift.org';
+    expect(isSameSite(`${bare}/x`, `${www}/y`)).toBe(true);
+    expect(isSameSite(`${www}/x`, `${bare}/y`)).toBe(true);
+  });
+
+  it('ignores scheme — http→https on the same host is same site', () => {
+    const matches = isSameSite('http://example.com/x', 'https://example.com/x');
+    expect(matches).toBe(true);
+  });
+
+  it('ignores path, query, and fragment', () => {
+    const matches = isSameSite('https://example.com/a?q=1#x', 'https://example.com/b');
+    expect(matches).toBe(true);
+  });
+
+  it('returns false for different ports', () => {
+    expect(isSameSite('https://example.com:8443/', reference)).toBe(false);
+  });
+
+  it('returns false for unrelated hosts', () => {
+    expect(isSameSite(reference, 'https://other.com/')).toBe(false);
+  });
+
+  it('returns false for non-www subdomains (e.g. docs)', () => {
+    expect(isSameSite('https://docs.example.com/', reference)).toBe(false);
+  });
+
+  it('returns false for malformed URLs', () => {
+    expect(isSameSite('not-a-url', reference)).toBe(false);
+  });
+});