angular · clydin · Oct 21, 2025 · Oct 20, 2025
diff --git a/packages/angular/cli/BUILD.bazel b/packages/angular/cli/BUILD.bazel
@@ -58,6 +58,7 @@ ts_project(
         ":node_modules/jsonc-parser",
         ":node_modules/npm-package-arg",
         ":node_modules/pacote",
+        ":node_modules/parse5-html-rewriting-stream",
         ":node_modules/resolve",
         ":node_modules/yargs",
         ":node_modules/zod",

diff --git a/packages/angular/cli/package.json b/packages/angular/cli/package.json
@@ -36,6 +36,7 @@
     "listr2": "9.0.4",
     "npm-package-arg": "13.0.1",
     "pacote": "21.0.3",
+    "parse5-html-rewriting-stream": "8.0.0",
     "resolve": "1.22.10",
     "semver": "7.7.3",
     "yargs": "18.0.0",

diff --git a/packages/angular/cli/src/commands/mcp/tools/doc-search.ts b/packages/angular/cli/src/commands/mcp/tools/doc-search.ts
@@ -8,6 +8,7 @@
 
 import type { LegacySearchMethodProps, SearchResponse } from 'algoliasearch';
 import { createDecipheriv } from 'node:crypto';
+import { Readable } from 'node:stream';
 import { z } from 'zod';
 import { at, iv, k1 } from '../constants';
 import { McpToolContext, declareTool } from './tool-registry';
@@ -198,12 +199,10 @@ function createDocSearchHandler({ logger }: McpToolContext) {
         // Only fetch content from angular.dev
         if (url.hostname === 'angular.dev' || url.hostname.endsWith('.angular.dev')) {
           const response = await fetch(url);
-          if (response.ok) {
-            const html = await response.text();
-            const mainContent = extractMainContent(html);
-            if (mainContent) {
-              topContent = stripHtml(mainContent);
-            }
+          if (response.ok && response.body) {
+            topContent = await extractMainContent(
+              Readable.fromWeb(response.body, { encoding: 'utf-8' }),
+            );
           }
         }
       } catch (e) {
@@ -246,46 +245,53 @@ function createDocSearchHandler({ logger }: McpToolContext) {
 }
 
 /**
- * Strips HTML tags from a string using a regular expression.
+ * Extracts the text content of the `<main>` element by streaming an HTML response.
  *
- * NOTE: This is a basic implementation and is not a full, correct HTML parser. It is, however,
- * appropriate for this tool's specific use case because its input is always from a
- * trusted source (angular.dev) and its output is consumed by a non-browser environment (an LLM).
- *
- * The regex first tries to match a complete tag (`<...>`). If it fails, it falls back to matching
- * an incomplete tag (e.g., `<script`).
- *
- * @param html The HTML string to strip.
- * @returns The text content of the HTML.
+ * @param htmlStream A readable stream of the HTML content of a page.
+ * @returns A promise that resolves to the text content of the `<main>` element, or `undefined` if not found.
  */
-function stripHtml(html: string): string {
-  return html
-    .replace(/<[^>]*>|<[a-zA-Z0-9/]+/g, '')
-    .replace(/&lt;/g, '<')
-    .replace(/&gt;/g, '>')
-    .replace(/&amp;/g, '&')
-    .trim();
-}
+async function extractMainContent(htmlStream: Readable): Promise<string | undefined> {
+  const { RewritingStream } = await import('parse5-html-rewriting-stream');
 
-/**
- * Extracts the content of the `<main>` element from an HTML string.
- *
- * @param html The HTML content of a page.
- * @returns The content of the `<main>` element, or `undefined` if not found.
- */
-function extractMainContent(html: string): string | undefined {
-  const mainTagStart = html.indexOf('<main');
-  if (mainTagStart === -1) {
-    return undefined;
-  }
+  const rewriter = new RewritingStream();
+  let mainTextContent = '';
+  let inMainElement = false;
+  let mainTagFound = false;
+
+  rewriter.on('startTag', (tag) => {
+    if (tag.tagName === 'main') {
+      inMainElement = true;
+      mainTagFound = true;
+    }
+  });
+
+  rewriter.on('endTag', (tag) => {
+    if (tag.tagName === 'main') {
+      inMainElement = false;
+    }
+  });
 
-  const mainTagEnd = html.lastIndexOf('</main>');
-  if (mainTagEnd <= mainTagStart) {
-    return undefined;
-  }
+  // Only capture text content, and only when inside the <main> element.
+  rewriter.on('text', (text) => {
+    if (inMainElement) {
+      mainTextContent += text.text;
+    }
+  });
+
+  return new Promise((resolve, reject) => {
+    htmlStream
+      .pipe(rewriter)
+      .on('finish', () => {
+        if (!mainTagFound) {
+          resolve(undefined);
+
+          return;
+        }
 
-  // Add 7 to include '</main>'
-  return html.substring(mainTagStart, mainTagEnd + 7);
+        resolve(mainTextContent.trim());
+      })
+      .on('error', reject);
+  });
 }
 
 /**

diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml