From 31867147ee9e3e7b40010950966c08be7f0fbe48 Mon Sep 17 00:00:00 2001 From: Charles Lyding <19598772+clydin@users.noreply.github.com> Date: Tue, 2 Sep 2025 14:07:14 -0400 Subject: [PATCH] refactor(@angular/cli): add logging and HTML removal to doc search tool This commit enhances the `search_documentation` MCP tool by improving its error handling and the quality of the data it returns. The key changes are: - **Error Logging:** The content fetching logic now logs a warning if it fails to retrieve or parse the documentation page, providing better visibility for debugging without crashing the tool. - **HTML Removal:** The fetched HTML content now has all tags removed, providing the AI with clean, plain-text content. This reduces noise, lowers the token count, and improves the quality of the input for the language model. --- .../cli/src/commands/mcp/tools/doc-search.ts | 36 ++++++++++++++----- 1 file changed, 27 insertions(+), 9 deletions(-) diff --git a/packages/angular/cli/src/commands/mcp/tools/doc-search.ts b/packages/angular/cli/src/commands/mcp/tools/doc-search.ts index 1e5b92b21797..da34f97b8b4d 100644 --- a/packages/angular/cli/src/commands/mcp/tools/doc-search.ts +++ b/packages/angular/cli/src/commands/mcp/tools/doc-search.ts @@ -10,7 +10,7 @@ import type { LegacySearchMethodProps, SearchResponse } from 'algoliasearch'; import { createDecipheriv } from 'node:crypto'; import { z } from 'zod'; import { at, iv, k1 } from '../constants'; -import { declareTool } from './tool-registry'; +import { McpToolContext, declareTool } from './tool-registry'; const ALGOLIA_APP_ID = 'L1XWT2UJ7F'; // https://www.algolia.com/doc/guides/security/api-keys/#search-only-api-key @@ -84,7 +84,7 @@ tutorials, concepts, and best practices. 
factory: createDocSearchHandler, }); -function createDocSearchHandler() { +function createDocSearchHandler({ logger }: McpToolContext) { let client: import('algoliasearch').SearchClient | undefined; return async ({ query, includeTopContent }: DocSearchInput) => { @@ -124,21 +124,23 @@ function createDocSearchHandler() { const { title: topTitle, breadcrumb: topBreadcrumb } = formatHitToParts(topHit); let topContent: string | undefined; - try { - if (includeTopContent && typeof topHit.url === 'string') { - const url = new URL(topHit.url); - + if (includeTopContent && typeof topHit.url === 'string') { + const url = new URL(topHit.url); + try { // Only fetch content from angular.dev if (url.hostname === 'angular.dev' || url.hostname.endsWith('.angular.dev')) { const response = await fetch(url); if (response.ok) { const html = await response.text(); - topContent = extractMainContent(html); + const mainContent = extractMainContent(html); + if (mainContent) { + topContent = stripHtml(mainContent); + } } } + } catch (e) { + logger.warn(`Failed to fetch or parse content from ${url}: ${e}`); } - } catch { - // Ignore errors fetching content } structuredResults.push({ @@ -175,6 +177,22 @@ function createDocSearchHandler() { }; } +/** + * Strips HTML tags from a string. + * @param html The HTML string to strip. + * @returns The text content of the HTML. + */ +function stripHtml(html: string): string { + // This is a basic regex to remove HTML tags. + // It also decodes common HTML entities. + return html + .replace(/<[^>]*>/g, '') + .replace(/&lt;/g, '<') + .replace(/&gt;/g, '>') + .replace(/&amp;/g, '&') + .trim(); +} + /** * Extracts the content of the `<main>` element from an HTML string. *