diff --git a/packages/angular/cli/BUILD.bazel b/packages/angular/cli/BUILD.bazel index b0a4a1dee0ea..6cbb09b36c31 100644 --- a/packages/angular/cli/BUILD.bazel +++ b/packages/angular/cli/BUILD.bazel @@ -58,6 +58,7 @@ ts_project( ":node_modules/jsonc-parser", ":node_modules/npm-package-arg", ":node_modules/pacote", + ":node_modules/parse5-html-rewriting-stream", ":node_modules/resolve", ":node_modules/yargs", ":node_modules/zod", diff --git a/packages/angular/cli/package.json b/packages/angular/cli/package.json index 857c2a44a9a6..6cf48fadbb03 100644 --- a/packages/angular/cli/package.json +++ b/packages/angular/cli/package.json @@ -36,6 +36,7 @@ "listr2": "9.0.4", "npm-package-arg": "13.0.1", "pacote": "21.0.3", + "parse5-html-rewriting-stream": "8.0.0", "resolve": "1.22.10", "semver": "7.7.3", "yargs": "18.0.0", diff --git a/packages/angular/cli/src/commands/mcp/tools/doc-search.ts b/packages/angular/cli/src/commands/mcp/tools/doc-search.ts index 4c6831dbbaa0..e57d50c6f500 100644 --- a/packages/angular/cli/src/commands/mcp/tools/doc-search.ts +++ b/packages/angular/cli/src/commands/mcp/tools/doc-search.ts @@ -8,6 +8,7 @@ import type { LegacySearchMethodProps, SearchResponse } from 'algoliasearch'; import { createDecipheriv } from 'node:crypto'; +import { Readable } from 'node:stream'; import { z } from 'zod'; import { at, iv, k1 } from '../constants'; import { McpToolContext, declareTool } from './tool-registry'; @@ -198,12 +199,10 @@ function createDocSearchHandler({ logger }: McpToolContext) { // Only fetch content from angular.dev if (url.hostname === 'angular.dev' || url.hostname.endsWith('.angular.dev')) { const response = await fetch(url); - if (response.ok) { - const html = await response.text(); - const mainContent = extractMainContent(html); - if (mainContent) { - topContent = stripHtml(mainContent); - } + if (response.ok && response.body) { + topContent = await extractMainContent( + Readable.fromWeb(response.body, { encoding: 'utf-8' }), + ); } } } catch (e) { @@ -246,46 +245,53 @@ function createDocSearchHandler({ logger }: McpToolContext) { } /** - * Strips HTML tags from a string using a regular expression. + * Extracts the text content of the `
` element by streaming an HTML response. * - * NOTE: This is a basic implementation and is not a full, correct HTML parser. It is, however, - * appropriate for this tool's specific use case because its input is always from a - * trusted source (angular.dev) and its output is consumed by a non-browser environment (an LLM). - * - * The regex first tries to match a complete tag (`<...>`). If it fails, it falls back to matching - * an incomplete tag (e.g., `` element, or `undefined` if not found. */ -function stripHtml(html: string): string { - return html - .replace(/<[^>]*>|<[a-zA-Z0-9/]+/g, '') - .replace(/</g, '<') - .replace(/>/g, '>') - .replace(/&/g, '&') - .trim(); -} +async function extractMainContent(htmlStream: Readable): Promise { + const { RewritingStream } = await import('parse5-html-rewriting-stream'); -/** - * Extracts the content of the `
` element from an HTML string. - * - * @param html The HTML content of a page. - * @returns The content of the `
` element, or `undefined` if not found. - */ -function extractMainContent(html: string): string | undefined { - const mainTagStart = html.indexOf(' { + if (tag.tagName === 'main') { + inMainElement = true; + mainTagFound = true; + } + }); + + rewriter.on('endTag', (tag) => { + if (tag.tagName === 'main') { + inMainElement = false; + } + }); - const mainTagEnd = html.lastIndexOf('
'); - if (mainTagEnd <= mainTagStart) { - return undefined; - } + // Only capture text content, and only when inside the
element. + rewriter.on('text', (text) => { + if (inMainElement) { + mainTextContent += text.text; + } + }); + + return new Promise((resolve, reject) => { + htmlStream + .pipe(rewriter) + .on('finish', () => { + if (!mainTagFound) { + resolve(undefined); + + return; + } - // Add 7 to include '
' - return html.substring(mainTagStart, mainTagEnd + 7); + resolve(mainTextContent.trim()); + }) + .on('error', reject); + }); } /** diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 3b97a55c8d64..4f0f13c4cb26 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -504,6 +504,9 @@ importers: pacote: specifier: 21.0.3 version: 21.0.3 + parse5-html-rewriting-stream: + specifier: 8.0.0 + version: 8.0.0 resolve: specifier: 1.22.10 version: 1.22.10