Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion src/checks/observability/llms-txt-coverage.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import {
getUrlsFromSitemap,
parseSitemapUrls,
} from '../../helpers/get-page-urls.js';
import { isSameSite } from '../../helpers/host-equivalence.js';
import { isNonPageUrl } from '../../helpers/to-md-urls.js';
import { isLocaleSegment, hasStructuralDuplication } from '../../helpers/locale-codes.js';
import {
Expand Down Expand Up @@ -297,7 +298,7 @@ function scopeUrls(urls: string[], origin: string, baseUrlPath: string): string[
return urls.filter((url) => {
try {
const parsed = new URL(url);
if (parsed.origin !== origin) return false;
if (!isSameSite(url, origin)) return false;
if (baseUrlPath && baseUrlPath !== '/') {
if (!parsed.pathname.startsWith(baseUrlPath + '/') && parsed.pathname !== baseUrlPath) {
return false;
Expand Down
38 changes: 17 additions & 21 deletions src/helpers/get-page-urls.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ import { extractMarkdownLinks } from '../checks/content-discoverability/llms-txt
import { MAX_SITEMAP_URLS } from '../constants.js';
import { getLlmsTxtFilesForAnalysis, selectCanonicalLlmsTxt } from './llms-txt.js';
import { isNonPageUrl, isMdUrl, toHtmlUrl } from './to-md-urls.js';
import { isSameSite } from './host-equivalence.js';
import { isLocaleSegment, hasStructuralDuplication } from './locale-codes.js';
import type { CheckContext, DiscoveredFile } from '../types.js';

Expand Down Expand Up @@ -162,17 +163,19 @@ async function walkAggregateLinksWithOriginals(
const omittedTxtUrls: string[] = [];

const siteOrigin = ctx.effectiveOrigin ?? ctx.origin;
const isAcceptedOrigin = (url: string): boolean =>
isSameSite(url, ctx.origin) || isSameSite(url, siteOrigin);

for (const entry of entries) {
try {
const parsed = new URL(entry.url);
if (/\.txt$/i.test(parsed.pathname)) {
// .txt files are either aggregate indexes to walk (same origin)
// or external resources to skip — never page URLs themselves
if (parsed.origin === ctx.origin || parsed.origin === siteOrigin) {
if (isAcceptedOrigin(entry.url)) {
aggregateUrls.push(entry.url);
}
} else if (parsed.origin === ctx.origin || parsed.origin === siteOrigin) {
} else if (isAcceptedOrigin(entry.url)) {
// Only include same-origin page URLs; cross-origin links are
// external resources the site owner doesn't control.
pageUrls.push(entry);
Expand Down Expand Up @@ -207,8 +210,7 @@ async function walkAggregateLinksWithOriginals(
for (const subEntry of subEntries) {
try {
const parsed = new URL(subEntry.url);
const isSameOrigin = parsed.origin === ctx.origin || parsed.origin === siteOrigin;
if (!isSameOrigin) continue;
if (!isAcceptedOrigin(subEntry.url)) continue;

if (/\.txt$/i.test(parsed.pathname)) {
// Depth-1 .txt link: record as omitted rather than descending
Expand Down Expand Up @@ -739,13 +741,13 @@ export async function getUrlsFromSitemap(

function shouldInclude(url: string): boolean {
try {
const u = new URL(url);
if (u.origin !== matchOrigin) return false;
if (prefixPath) return matchesPathPrefix(url, prefixPath);
return true;
new URL(url);
} catch {
return false;
}
if (!isSameSite(url, matchOrigin)) return false;
if (prefixPath) return matchesPathPrefix(url, prefixPath);
return true;
}

// Collect up to collectLimit URLs before refinement. The cap is applied
Expand Down Expand Up @@ -800,35 +802,29 @@ export async function getUrlsFromSitemap(
return deduplicated.slice(0, maxUrls);
}

function isWwwVariant(hostname1: string, hostname2: string): boolean {
return hostname1 === `www.${hostname2}` || hostname2 === `www.${hostname1}`;
}

/**
* Get the base URL for path-prefix filtering, accounting for cross-host redirects.
*
* When a true cross-host redirect is in play (e.g. example.com/docs → docs.example.com),
* the original baseUrl path doesn't apply to the redirected host, so we return the
* effectiveOrigin (a root URL) which makes path filtering a no-op.
*
* When the redirect is www-canonicalization (e.g. alchemy.com → www.alchemy.com),
* the path structure is preserved, so we transfer the baseUrl's path to the
* effective origin to keep path-prefix filtering active.
* When the redirect stays on the same site (e.g. www-canonicalization or an
* http→https upgrade), the path structure is preserved, so we transfer the
* baseUrl's path to the effective origin to keep path-prefix filtering active.
*/
export function getPathFilterBase(ctx: CheckContext): string {
if (!ctx.effectiveOrigin || ctx.effectiveOrigin === ctx.origin) {
return ctx.baseUrl;
}

try {
const originalHost = new URL(ctx.origin).hostname;
const effectiveHost = new URL(ctx.effectiveOrigin).hostname;
if (isWwwVariant(originalHost, effectiveHost)) {
if (isSameSite(ctx.origin, ctx.effectiveOrigin)) {
try {
const basePath = new URL(ctx.baseUrl).pathname.replace(/\/$/, '');
return basePath ? `${ctx.effectiveOrigin}${basePath}` : ctx.effectiveOrigin;
} catch {
// fall through
}
} catch {
// fall through
}

return ctx.effectiveOrigin;
Expand Down
32 changes: 32 additions & 0 deletions src/helpers/host-equivalence.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
/**
* Host equivalence: treat `www.host` and `host` as the same site.
*
* Documentation sites mix the two forms in several ways that all need the
* same treatment: redirect classification, sitemap URL filtering, path-filter
* base derivation, and aggregate-link walking. Keeping the rule in one place
* means future tweaks (e.g. recognizing additional canonical prefixes)
* propagate to every call site automatically.
*/

/**
 * Normalize a hostname for same-site comparison by dropping one leading
 * `www.` prefix.
 *
 * Only the very start of the string is examined, so an interior `www.` label
 * (as in `docs.www.example.com`) is kept, and the match is case-sensitive.
 *
 * @param host - Hostname to normalize, e.g. `www.example.com`.
 * @returns `host` without its leading `www.`, or `host` unchanged when the
 *   prefix is absent.
 */
export function canonicalHost(host: string): string {
  const wwwPrefix = 'www.';
  if (!host.startsWith(wwwPrefix)) {
    return host;
  }
  return host.slice(wwwPrefix.length);
}

/**
 * Decide whether two URLs (or bare origins) belong to the same site.
 *
 * The inputs match when their ports are equal and their hostnames are equal
 * once a leading `www.` has been removed. The scheme is not compared, so an
 * http→https upgrade on one host still counts as the same site. Path, query
 * and fragment are not compared either.
 *
 * @param url1 - First absolute URL or origin.
 * @param url2 - Second absolute URL or origin.
 * @returns `true` when both inputs parse and agree on port and canonical
 *   hostname; `false` otherwise, including when either input cannot be parsed.
 */
export function isSameSite(url1: string, url2: string): boolean {
  try {
    const [first, second] = [new URL(url1), new URL(url2)];
    // Cheap check first: a port mismatch settles the answer without touching hosts.
    if (first.port !== second.port) {
      return false;
    }
    return canonicalHost(first.hostname) === canonicalHost(second.hostname);
  } catch {
    // Unparseable input cannot be classified, so it is reported as "not the same site".
    return false;
  }
}
18 changes: 7 additions & 11 deletions src/helpers/to-md-urls.ts
Original file line number Diff line number Diff line change
@@ -1,25 +1,21 @@
/**
* Strip the leading "www." from a hostname, if present.
*/
function stripWww(host: string): string {
return host.startsWith('www.') ? host.slice(4) : host;
}
import { isSameSite } from './host-equivalence.js';

/**
* Returns true if the two URLs have different hosts (i.e. a cross-host redirect).
* A www ↔ bare-domain redirect (e.g. mongodb.com → www.mongodb.com) is NOT
* considered cross-host because every HTTP client and agent follows it.
*
* Returns false for malformed URLs — when we can't classify, default to
* "not cross-host" so we don't penalize on bad inputs.
*/
export function isCrossHostRedirect(originalUrl: string, finalUrl: string): boolean {
try {
const original = new URL(originalUrl);
const final_ = new URL(finalUrl);
if (original.host === final_.host) return false;
// www ↔ bare-domain is same-site, not cross-host
return stripWww(original.host) !== stripWww(final_.host);
new URL(originalUrl);
new URL(finalUrl);
} catch {
return false;
}
return !isSameSite(originalUrl, finalUrl);
}

/**
Expand Down
32 changes: 32 additions & 0 deletions test/unit/checks/llms-txt-coverage.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -708,6 +708,38 @@ describe('llms-txt-coverage', () => {
expect(result.details?.sitemapDocPages).toBe(2);
});

// Regression test for issue #83: the context host is www-prefixed while every
// sitemap <loc> uses the bare host. The reported sitemap page count must still
// include those entries.
test('scopes sitemap URLs across www vs bare-host (issue #83)', async () => {
// swift.org-style: scored URL is `www.host`, but sitemap entries are on bare host.
// Coverage scoping must treat these as same-origin or coverage = 0%.
const wwwHost = 'www.www-cov.local';
const bareHost = 'www-cov.local';
const llmsTxtPages = [`http://${wwwHost}/docs/intro`, `http://${wwwHost}/docs/guide`];
const sitemapPages = [
`http://${bareHost}/docs/intro`,
`http://${bareHost}/docs/guide`,
`http://${bareHost}/docs/extra`,
];

// NOTE(review): makeCtx is defined outside this hunk; presumably it builds a
// check context for wwwHost whose llms.txt lists llmsTxtPages, scoped to the
// '/docs' base path — confirm against the helper.
const ctx = makeCtx(wwwHost, llmsTxtPages, '/docs');

// robots.txt and sitemap.xml are both served from the www host; only the
// page URLs handed to makeSitemap use the bare host.
server.use(
http.get(
`http://${wwwHost}/robots.txt`,
() => new HttpResponse(`Sitemap: http://${wwwHost}/sitemap.xml`, { status: 200 }),
),
http.get(
`http://${wwwHost}/sitemap.xml`,
() =>
new HttpResponse(makeSitemap(sitemapPages), {
headers: { 'content-type': 'application/xml' },
}),
),
);

const result = await check.run(ctx);
// All three bare-host entries sit under /docs, so all three must be counted.
expect(result.details?.sitemapDocPages).toBe(3);
});

test('excludes paths relative to base URL prefix', async () => {
const host = 'basepath-exclude.local';
const pages = [`http://${host}/docs/getting-started`, `http://${host}/docs/api-reference`];
Expand Down
98 changes: 98 additions & 0 deletions test/unit/helpers/get-page-urls.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1207,6 +1207,104 @@ describe('getPageUrls', () => {
]);
});

// Regression test for issue #83 (www context, bare-host sitemap entries):
// getUrlsFromSitemap must keep sitemap URLs whose host differs from the
// context host only by the leading `www.`.
it('accepts sitemap URLs published on bare-host when scored URL has www (issue #83)', async () => {
// swift.org-style: scored URL is www.host.local, but the sitemap lists URLs
// on the bare host. Without www-equivalence in the origin filter, every URL
// is discarded and afdocs falls back to single-page sampling.
// NOTE(review): mockSitemapNotFound is defined outside this hunk; presumably
// it registers not-found responses for sitemap locations under the given
// base URL — confirm against the helper.
mockSitemapNotFound(server, 'http://www.www-bare.local/documentation/');
// robots.txt carries no Sitemap: directive; the sitemap itself is served from
// /sitemap.xml on the www host and lists only bare-host URLs.
server.use(
http.get(
'http://www.www-bare.local/robots.txt',
() => new HttpResponse('User-agent: *\n', { status: 200 }),
),
http.get(
'http://www.www-bare.local/sitemap.xml',
() =>
new HttpResponse(
`<?xml version="1.0"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<url><loc>http://www-bare.local/documentation/intro</loc></url>
<url><loc>http://www-bare.local/documentation/guide</loc></url>
</urlset>`,
{ status: 200, headers: { 'Content-Type': 'application/xml' } },
),
),
);

const ctx = createContext('http://www.www-bare.local/documentation/', { requestDelay: 0 });
const warnings: string[] = [];
// NOTE(review): skipRefinement presumably bypasses the post-collection
// refinement step so the filtered list is returned as collected — confirm.
const result = await getUrlsFromSitemap(ctx, warnings, { skipRefinement: true });
// The URLs come back in their original bare-host form, in sitemap order.
expect(result).toEqual([
'http://www-bare.local/documentation/intro',
'http://www-bare.local/documentation/guide',
]);
});

// Mirror of the www→bare case for issue #83: the context host is bare and the
// sitemap entries are www-prefixed; they must still be accepted.
it('accepts sitemap URLs published on www-host when scored URL is bare (issue #83)', async () => {
// Inverse scenario: scored URL is bare host, sitemap entries are www-prefixed.
// NOTE(review): mockSitemapNotFound is defined outside this hunk; presumably
// it registers not-found responses for sitemap locations under the given
// base URL — confirm against the helper.
mockSitemapNotFound(server, 'http://bare-www.local');
// robots.txt carries no Sitemap: directive; /sitemap.xml on the bare host
// lists only www-prefixed URLs.
server.use(
http.get(
'http://bare-www.local/robots.txt',
() => new HttpResponse('User-agent: *\n', { status: 200 }),
),
http.get(
'http://bare-www.local/sitemap.xml',
() =>
new HttpResponse(
`<?xml version="1.0"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<url><loc>http://www.bare-www.local/page-1</loc></url>
<url><loc>http://www.bare-www.local/page-2</loc></url>
</urlset>`,
{ status: 200, headers: { 'Content-Type': 'application/xml' } },
),
),
);

const ctx = createContext('http://bare-www.local', { requestDelay: 0 });
const warnings: string[] = [];
const result = await getUrlsFromSitemap(ctx, warnings, { skipRefinement: true });
// The URLs come back in their original www-prefixed form, in sitemap order.
expect(result).toEqual([
'http://www.bare-www.local/page-1',
'http://www.bare-www.local/page-2',
]);
});

// Guard test: the relaxed host comparison must not admit unrelated hosts,
// while a scheme-only difference on the same host is still accepted.
it('still rejects truly cross-host sitemap URLs (but allows scheme upgrade)', async () => {
// www-equivalence does not relax filtering for unrelated hosts. Scheme
// is intentionally ignored: an http→https sitemap entry resolves to the
// same site after the canonical scheme upgrade.
// NOTE(review): mockSitemapNotFound is defined outside this hunk; presumably
// it registers not-found responses for sitemap locations under the given
// base URL — confirm against the helper.
mockSitemapNotFound(server, 'http://strict-host.local');
// The sitemap mixes three cases: same host, unrelated host, and same host
// over https.
server.use(
http.get(
'http://strict-host.local/robots.txt',
() => new HttpResponse('User-agent: *\n', { status: 200 }),
),
http.get(
'http://strict-host.local/sitemap.xml',
() =>
new HttpResponse(
`<?xml version="1.0"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<url><loc>http://strict-host.local/keep</loc></url>
<url><loc>http://other-host.local/drop</loc></url>
<url><loc>https://strict-host.local/keep-scheme</loc></url>
</urlset>`,
{ status: 200, headers: { 'Content-Type': 'application/xml' } },
),
),
);

const ctx = createContext('http://strict-host.local', { requestDelay: 0 });
const warnings: string[] = [];
const result = await getUrlsFromSitemap(ctx, warnings, { skipRefinement: true });
// other-host.local is dropped; the https entry is kept with its scheme intact.
expect(result).toEqual([
'http://strict-host.local/keep',
'https://strict-host.local/keep-scheme',
]);
});

it('warns and skips gzipped sitemap from robots.txt', async () => {
server.use(
http.get(
Expand Down
51 changes: 51 additions & 0 deletions test/unit/helpers/host-equivalence.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
import { describe, it, expect } from 'vitest';
import { canonicalHost, isSameSite } from '../../../src/helpers/host-equivalence.js';

// Unit tests for canonicalHost: only a leading `www.` is removed.
describe('canonicalHost', () => {
  it('strips a leading www.', () => {
    const stripped = canonicalHost('www.swift.org');
    expect(stripped).toBe('swift.org');
  });

  it('leaves bare hosts unchanged', () => {
    const unchanged = canonicalHost('swift.org');
    expect(unchanged).toBe('swift.org');
  });

  it('only strips a leading www., not interior', () => {
    // The `www.` here is not at the start, so the host must come back intact.
    const interior = canonicalHost('docs.www.example.com');
    expect(interior).toBe('docs.www.example.com');
  });
});

// Unit tests for isSameSite: host (modulo leading `www.`) and port decide the
// result; scheme, path, query and fragment do not.
describe('isSameSite', () => {
  it('returns true for identical URLs', () => {
    const same = isSameSite('https://example.com/', 'https://example.com/');
    expect(same).toBe(true);
  });

  it('returns true for www vs bare-host (issue #83)', () => {
    // Checked in both argument orders so the comparison is shown to be symmetric.
    const pairs: Array<[string, string]> = [
      ['https://swift.org/x', 'https://www.swift.org/y'],
      ['https://www.swift.org/x', 'https://swift.org/y'],
    ];
    for (const [left, right] of pairs) {
      expect(isSameSite(left, right)).toBe(true);
    }
  });

  it('ignores scheme — http→https on the same host is same site', () => {
    const upgraded = isSameSite('http://example.com/x', 'https://example.com/x');
    expect(upgraded).toBe(true);
  });

  it('ignores path, query, and fragment', () => {
    const sameHost = isSameSite('https://example.com/a?q=1#x', 'https://example.com/b');
    expect(sameHost).toBe(true);
  });

  it('returns false for different ports', () => {
    const portMismatch = isSameSite('https://example.com:8443/', 'https://example.com/');
    expect(portMismatch).toBe(false);
  });

  it('returns false for unrelated hosts', () => {
    const unrelated = isSameSite('https://example.com/', 'https://other.com/');
    expect(unrelated).toBe(false);
  });

  it('returns false for non-www subdomains (e.g. docs)', () => {
    const subdomain = isSameSite('https://docs.example.com/', 'https://example.com/');
    expect(subdomain).toBe(false);
  });

  it('returns false for malformed URLs', () => {
    const malformed = isSameSite('not-a-url', 'https://example.com/');
    expect(malformed).toBe(false);
  });
});