From dc2a73ed27397b322655702f5d5207a81d2ff6dc Mon Sep 17 00:00:00 2001 From: Charles Lyding <19598772+clydin@users.noreply.github.com> Date: Mon, 20 Oct 2025 13:13:38 -0400 Subject: [PATCH] refactor(@angular/cli): use streaming HTML parser in search documentation tool The `search_documentation` MCP tool previously used a regular expression and string searching to extract and clean documentation content from fetched HTML. This approach was not robust and could produce incorrect results. It also buffered the entire HTML response in memory before processing. This commit refactors the implementation to use `parse5-html-rewriting-stream`, which is already a dependency in the workspace. The new implementation streams the `fetch` response directly into a single-pass parser that simultaneously extracts the `<main>
` element's content and strips all HTML tags. This change makes the parsing more reliable, efficient, and memory-friendly. --- packages/angular/cli/BUILD.bazel | 1 + packages/angular/cli/package.json | 1 + .../cli/src/commands/mcp/tools/doc-search.ts | 88 ++++++++++--------- pnpm-lock.yaml | 3 + 4 files changed, 52 insertions(+), 41 deletions(-) diff --git a/packages/angular/cli/BUILD.bazel b/packages/angular/cli/BUILD.bazel index b0a4a1dee0ea..6cbb09b36c31 100644 --- a/packages/angular/cli/BUILD.bazel +++ b/packages/angular/cli/BUILD.bazel @@ -58,6 +58,7 @@ ts_project( ":node_modules/jsonc-parser", ":node_modules/npm-package-arg", ":node_modules/pacote", + ":node_modules/parse5-html-rewriting-stream", ":node_modules/resolve", ":node_modules/yargs", ":node_modules/zod", diff --git a/packages/angular/cli/package.json b/packages/angular/cli/package.json index 857c2a44a9a6..6cf48fadbb03 100644 --- a/packages/angular/cli/package.json +++ b/packages/angular/cli/package.json @@ -36,6 +36,7 @@ "listr2": "9.0.4", "npm-package-arg": "13.0.1", "pacote": "21.0.3", + "parse5-html-rewriting-stream": "8.0.0", "resolve": "1.22.10", "semver": "7.7.3", "yargs": "18.0.0", diff --git a/packages/angular/cli/src/commands/mcp/tools/doc-search.ts b/packages/angular/cli/src/commands/mcp/tools/doc-search.ts index 4c6831dbbaa0..e57d50c6f500 100644 --- a/packages/angular/cli/src/commands/mcp/tools/doc-search.ts +++ b/packages/angular/cli/src/commands/mcp/tools/doc-search.ts @@ -8,6 +8,7 @@ import type { LegacySearchMethodProps, SearchResponse } from 'algoliasearch'; import { createDecipheriv } from 'node:crypto'; +import { Readable } from 'node:stream'; import { z } from 'zod'; import { at, iv, k1 } from '../constants'; import { McpToolContext, declareTool } from './tool-registry'; @@ -198,12 +199,10 @@ function createDocSearchHandler({ logger }: McpToolContext) { // Only fetch content from angular.dev if (url.hostname === 'angular.dev' || url.hostname.endsWith('.angular.dev')) { const 
response = await fetch(url); - if (response.ok) { - const html = await response.text(); - const mainContent = extractMainContent(html); - if (mainContent) { - topContent = stripHtml(mainContent); - } + if (response.ok && response.body) { + topContent = await extractMainContent( + Readable.fromWeb(response.body, { encoding: 'utf-8' }), + ); } } } catch (e) { @@ -246,46 +245,53 @@ function createDocSearchHandler({ logger }: McpToolContext) { } /** - * Strips HTML tags from a string using a regular expression. + * Extracts the text content of the `<main>
` element by streaming an HTML response. * - * NOTE: This is a basic implementation and is not a full, correct HTML parser. It is, however, - * appropriate for this tool's specific use case because its input is always from a - * trusted source (angular.dev) and its output is consumed by a non-browser environment (an LLM). - * - * The regex first tries to match a complete tag (`<...>`). If it fails, it falls back to matching - * an incomplete tag (e.g., `` element, or `undefined` if not found. */ -function stripHtml(html: string): string { - return html - .replace(/<[^>]*>|<[a-zA-Z0-9/]+/g, '') - .replace(/</g, '<') - .replace(/>/g, '>') - .replace(/&/g, '&') - .trim(); -} +async function extractMainContent(htmlStream: Readable): Promise { + const { RewritingStream } = await import('parse5-html-rewriting-stream'); -/** - * Extracts the content of the `<main>
` element from an HTML string. - * - * @param html The HTML content of a page. - * @returns The content of the `<main>
` element, or `undefined` if not found. - */ -function extractMainContent(html: string): string | undefined { - const mainTagStart = html.indexOf(' { + if (tag.tagName === 'main') { + inMainElement = true; + mainTagFound = true; + } + }); + + rewriter.on('endTag', (tag) => { + if (tag.tagName === 'main') { + inMainElement = false; + } + }); - const mainTagEnd = html.lastIndexOf('
'); - if (mainTagEnd <= mainTagStart) { - return undefined; - } + // Only capture text content, and only when inside the <main>
 element. + rewriter.on('text', (text) => { + if (inMainElement) { + mainTextContent += text.text; + } + }); + + return new Promise((resolve, reject) => { + htmlStream + .pipe(rewriter) + .on('finish', () => { + if (!mainTagFound) { + resolve(undefined); + + return; + } - // Add 7 to include '</main>
' - return html.substring(mainTagStart, mainTagEnd + 7); + resolve(mainTextContent.trim()); + }) + .on('error', reject); + }); } /** diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 3b97a55c8d64..4f0f13c4cb26 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -504,6 +504,9 @@ importers: pacote: specifier: 21.0.3 version: 21.0.3 + parse5-html-rewriting-stream: + specifier: 8.0.0 + version: 8.0.0 resolve: specifier: 1.22.10 version: 1.22.10