Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions packages/angular/cli/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ ts_project(
":node_modules/jsonc-parser",
":node_modules/npm-package-arg",
":node_modules/pacote",
":node_modules/parse5-html-rewriting-stream",
":node_modules/resolve",
":node_modules/yargs",
":node_modules/zod",
Expand Down
1 change: 1 addition & 0 deletions packages/angular/cli/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
"listr2": "9.0.4",
"npm-package-arg": "13.0.1",
"pacote": "21.0.3",
"parse5-html-rewriting-stream": "8.0.0",
"resolve": "1.22.10",
"semver": "7.7.3",
"yargs": "18.0.0",
Expand Down
88 changes: 47 additions & 41 deletions packages/angular/cli/src/commands/mcp/tools/doc-search.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

import type { LegacySearchMethodProps, SearchResponse } from 'algoliasearch';
import { createDecipheriv } from 'node:crypto';
import { Readable } from 'node:stream';
import { z } from 'zod';
import { at, iv, k1 } from '../constants';
import { McpToolContext, declareTool } from './tool-registry';
Expand Down Expand Up @@ -198,12 +199,10 @@ function createDocSearchHandler({ logger }: McpToolContext) {
// Only fetch content from angular.dev
if (url.hostname === 'angular.dev' || url.hostname.endsWith('.angular.dev')) {
const response = await fetch(url);
if (response.ok) {
const html = await response.text();
const mainContent = extractMainContent(html);
if (mainContent) {
topContent = stripHtml(mainContent);
}
if (response.ok && response.body) {
topContent = await extractMainContent(
Readable.fromWeb(response.body, { encoding: 'utf-8' }),
);
}
}
} catch (e) {
Expand Down Expand Up @@ -246,46 +245,53 @@ function createDocSearchHandler({ logger }: McpToolContext) {
}

/**
* Strips HTML tags from a string using a regular expression.
* Extracts the text content of the `<main>` element by streaming an HTML response.
*
* NOTE: This is a basic implementation and is not a full, correct HTML parser. It is, however,
* appropriate for this tool's specific use case because its input is always from a
* trusted source (angular.dev) and its output is consumed by a non-browser environment (an LLM).
*
* The regex first tries to match a complete tag (`<...>`). If it fails, it falls back to matching
* an incomplete tag (e.g., `<script`).
*
* @param html The HTML string to strip.
* @returns The text content of the HTML.
* @param htmlStream A readable stream of the HTML content of a page.
* @returns A promise that resolves to the text content of the `<main>` element, or `undefined` if not found.
*/
function stripHtml(html: string): string {
  // Drop every complete tag (`<...>`); when a tag is never closed, drop just
  // its opening fragment (e.g. `<script`).
  const withoutTags = html.replace(/<[^>]*>|<[a-zA-Z0-9/]+/g, '');

  // Entities are decoded in this exact order. `&amp;` is intentionally last so
  // that an escaped entity such as `&amp;lt;` yields `&lt;` rather than `<`.
  const entityTable: ReadonlyArray<readonly [string, string]> = [
    ['&lt;', '<'],
    ['&gt;', '>'],
    ['&amp;', '&'],
  ];

  let decoded = withoutTags;
  for (const [entity, character] of entityTable) {
    decoded = decoded.split(entity).join(character);
  }

  return decoded.trim();
}
async function extractMainContent(htmlStream: Readable): Promise<string | undefined> {
const { RewritingStream } = await import('parse5-html-rewriting-stream');

/**
* Extracts the content of the `<main>` element from an HTML string.
*
* @param html The HTML content of a page.
* @returns The content of the `<main>` element, or `undefined` if not found.
*/
function extractMainContent(html: string): string | undefined {
const mainTagStart = html.indexOf('<main');
if (mainTagStart === -1) {
return undefined;
}
const rewriter = new RewritingStream();
let mainTextContent = '';
let inMainElement = false;
let mainTagFound = false;

rewriter.on('startTag', (tag) => {
if (tag.tagName === 'main') {
inMainElement = true;
mainTagFound = true;
}
});

rewriter.on('endTag', (tag) => {
if (tag.tagName === 'main') {
inMainElement = false;
}
});

const mainTagEnd = html.lastIndexOf('</main>');
if (mainTagEnd <= mainTagStart) {
return undefined;
}
// Only capture text content, and only when inside the <main> element.
rewriter.on('text', (text) => {
if (inMainElement) {
mainTextContent += text.text;
}
});

return new Promise((resolve, reject) => {
htmlStream
.pipe(rewriter)
.on('finish', () => {
if (!mainTagFound) {
resolve(undefined);

return;
}

// Add 7 to include '</main>'
return html.substring(mainTagStart, mainTagEnd + 7);
resolve(mainTextContent.trim());
})
.on('error', reject);
});
}

/**
Expand Down
3 changes: 3 additions & 0 deletions pnpm-lock.yaml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.