From dc2a73ed27397b322655702f5d5207a81d2ff6dc Mon Sep 17 00:00:00 2001
From: Charles Lyding <19598772+clydin@users.noreply.github.com>
Date: Mon, 20 Oct 2025 13:13:38 -0400
Subject: [PATCH] refactor(@angular/cli): use streaming HTML parser in search
 documentation tool

The `search_documentation` MCP tool previously used a regular expression and string searching to extract and clean documentation content from fetched HTML. This approach was not robust and could produce incorrect results. It also buffered the entire HTML response in memory before processing.

This commit refactors the implementation to use `parse5-html-rewriting-stream`, which is already a dependency in the workspace. The new implementation streams the `fetch` response directly into a single-pass parser that simultaneously extracts the `<main>` element's content and strips all HTML tags.

This change makes the parsing more reliable, efficient, and memory-friendly.
---
 packages/angular/cli/BUILD.bazel              |  1 +
 packages/angular/cli/package.json             |  1 +
 .../cli/src/commands/mcp/tools/doc-search.ts  | 88 ++++++++++---------
 pnpm-lock.yaml                                |  3 +
 4 files changed, 52 insertions(+), 41 deletions(-)
diff --git a/packages/angular/cli/BUILD.bazel b/packages/angular/cli/BUILD.bazel
index b0a4a1dee0ea..6cbb09b36c31 100644
--- a/packages/angular/cli/BUILD.bazel
+++ b/packages/angular/cli/BUILD.bazel
@@ -58,6 +58,7 @@ ts_project(
         ":node_modules/jsonc-parser",
         ":node_modules/npm-package-arg",
         ":node_modules/pacote",
+        ":node_modules/parse5-html-rewriting-stream",
         ":node_modules/resolve",
         ":node_modules/yargs",
         ":node_modules/zod",
diff --git a/packages/angular/cli/package.json b/packages/angular/cli/package.json
index 857c2a44a9a6..6cf48fadbb03 100644
--- a/packages/angular/cli/package.json
+++ b/packages/angular/cli/package.json
@@ -36,6 +36,7 @@
     "listr2": "9.0.4",
     "npm-package-arg": "13.0.1",
     "pacote": "21.0.3",
+    "parse5-html-rewriting-stream": "8.0.0",
     "resolve": "1.22.10",
     "semver": "7.7.3",
     "yargs": "18.0.0",
diff --git a/packages/angular/cli/src/commands/mcp/tools/doc-search.ts b/packages/angular/cli/src/commands/mcp/tools/doc-search.ts
index 4c6831dbbaa0..e57d50c6f500 100644
--- a/packages/angular/cli/src/commands/mcp/tools/doc-search.ts
+++ b/packages/angular/cli/src/commands/mcp/tools/doc-search.ts
@@ -8,6 +8,7 @@
 
 import type { LegacySearchMethodProps, SearchResponse } from 'algoliasearch';
 import { createDecipheriv } from 'node:crypto';
+import { Readable } from 'node:stream';
 import { z } from 'zod';
 import { at, iv, k1 } from '../constants';
 import { McpToolContext, declareTool } from './tool-registry';
@@ -198,12 +199,10 @@ function createDocSearchHandler({ logger }: McpToolContext) {
         // Only fetch content from angular.dev
         if (url.hostname === 'angular.dev' || url.hostname.endsWith('.angular.dev')) {
           const response = await fetch(url);
-          if (response.ok) {
-            const html = await response.text();
-            const mainContent = extractMainContent(html);
-            if (mainContent) {
-              topContent = stripHtml(mainContent);
-            }
+          if (response.ok && response.body) {
+            topContent = await extractMainContent(
+              Readable.fromWeb(response.body, { encoding: 'utf-8' }),
+            );
           }
         }
       } catch (e) {
@@ -246,46 +245,72 @@ function createDocSearchHandler({ logger }: McpToolContext) {
 }
 
 /**
- * Strips HTML tags from a string using a regular expression.
+ * Extracts the text content of the `<main>` element by streaming an HTML response.
  *
- * NOTE: This is a basic implementation and is not a full, correct HTML parser. It is, however,
- * appropriate for this tool's specific use case because its input is always from a
- * trusted source (angular.dev) and its output is consumed by a non-browser environment (an LLM).
- *
- * The regex first tries to match a complete tag (`<...>`). If it fails, it falls back to matching
- * an incomplete tag (e.g., `<script`).
- *
- * @param html The HTML string to strip.
- * @returns The text content of the HTML.
+ * The stream is piped through an HTML tokenizer and only text tokens seen while inside
+ * `<main>` are accumulated. Both streams are destroyed if either one fails.
+ *
+ * @param htmlStream A readable stream of the HTML content of a page.
+ * @returns A promise that resolves to the trimmed text content of the `<main>` element, or
+ * `undefined` if no `<main>` element was found. Rejects if either stream emits an error.
  */
-function stripHtml(html: string): string {
-  return html
-    .replace(/<[^>]*>|<[a-zA-Z0-9/]+/g, '')
-    .replace(/&lt;/g, '<')
-    .replace(/&gt;/g, '>')
-    .replace(/&amp;/g, '&')
-    .trim();
-}
+async function extractMainContent(htmlStream: Readable): Promise<string | undefined> {
+  const { RewritingStream } = await import('parse5-html-rewriting-stream');
 
-/**
- * Extracts the content of the `<main>` element from an HTML string.
- *
- * @param html The HTML content of a page.
- * @returns The content of the `<main>` element, or `undefined` if not found.
- */
-function extractMainContent(html: string): string | undefined {
-  const mainTagStart = html.indexOf('<main');
-  if (mainTagStart === -1) {
-    return undefined;
-  }
+  const rewriter = new RewritingStream();
+  let mainTextContent = '';
+  let inMainElement = false;
+  let mainTagFound = false;
+
+  rewriter.on('startTag', (tag) => {
+    if (tag.tagName === 'main') {
+      inMainElement = true;
+      mainTagFound = true;
+    }
+  });
+
+  rewriter.on('endTag', (tag) => {
+    if (tag.tagName === 'main') {
+      inMainElement = false;
+    }
+  });
 
-  const mainTagEnd = html.lastIndexOf('</main>');
-  if (mainTagEnd <= mainTagStart) {
-    return undefined;
-  }
+  // Only capture text content, and only when inside the <main> element.
+  rewriter.on('text', (text) => {
+    if (inMainElement) {
+      mainTextContent += text.text;
+    }
+  });
+
+  return new Promise((resolve, reject) => {
+    // Release both streams on failure so a partially read response is not left open.
+    const fail = (error: unknown) => {
+      htmlStream.destroy();
+      rewriter.destroy();
+      reject(error);
+    };
+
+    // `pipe()` does not forward source errors to the destination, so the source needs its
+    // own handler; otherwise a failed download would leave this promise pending forever.
+    htmlStream.on('error', fail);
+
+    // Tokens without a listener (comments, doctype) are re-emitted on the readable side.
+    // Nothing reads that output, so drain it to keep backpressure from stalling the parser.
+    rewriter.resume();
+
+    htmlStream
+      .pipe(rewriter)
+      .on('finish', () => {
+        if (!mainTagFound) {
+          resolve(undefined);
+
+          return;
+        }
 
-  // Add 7 to include '</main>'
-  return html.substring(mainTagStart, mainTagEnd + 7);
+        resolve(mainTextContent.trim());
+      })
+      .on('error', fail);
+  });
 }
 
 /**
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml
index 3b97a55c8d64..4f0f13c4cb26 100644
--- a/pnpm-lock.yaml
+++ b/pnpm-lock.yaml
@@ -504,6 +504,9 @@ importers:
       pacote:
         specifier: 21.0.3
         version: 21.0.3
+      parse5-html-rewriting-stream:
+        specifier: 8.0.0
+        version: 8.0.0
       resolve:
         specifier: 1.22.10
         version: 1.22.10